# Feature Engineering 

In [39]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import plotly.express as px
import plotly.graph_objects as go

from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline


In [11]:
auto = pd.read_csv('../data/auto.csv')
auto

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,1,ford torino
...,...,...,...,...,...,...,...,...,...
387,27.0,4,140.0,86.0,2790,15.6,82,1,ford mustang gl
388,44.0,4,97.0,52.0,2130,24.6,82,2,vw pickup
389,32.0,4,135.0,84.0,2295,11.6,82,1,dodge rampage
390,28.0,4,120.0,79.0,2625,18.6,82,1,ford ranger


In [12]:
px.scatter(data_frame=auto, x='horsepower', y='mpg')

In [13]:
X = auto[['horsepower']]
y = auto['mpg'] 
first_degree_model = LinearRegression().fit(X, y)
y_pred = first_degree_model.predict(X)
first_degree_mse = mean_squared_error(y, y_pred)

In [14]:
auto['hp2'] = ''
auto['hp2'] = auto['horsepower'] ** 2

In [15]:
X = auto[['horsepower', 'hp2']]
y = auto['mpg']
quadratic_model = LinearRegression().fit(X, y)
y_quad_pred = quadratic_model.predict(X)
quad_mse = mean_squared_error(y, y_quad_pred)
auto['predictions'] = y_quad_pred
auto


Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name,hp2,predictions
0,18.0,8,307.0,130.0,3504,12.0,70,1,chevrolet chevelle malibu,16900.0,17.091508
1,15.0,8,350.0,165.0,3693,11.5,70,1,buick skylark 320,27225.0,13.480156
2,18.0,8,318.0,150.0,3436,11.0,70,1,plymouth satellite,22500.0,14.658717
3,16.0,8,304.0,150.0,3433,12.0,70,1,amc rebel sst,22500.0,14.658717
4,17.0,8,302.0,140.0,3449,10.5,70,1,ford torino,19600.0,15.752059
...,...,...,...,...,...,...,...,...,...,...,...
387,27.0,4,140.0,86.0,2790,15.6,82,1,ford mustang gl,7396.0,25.908837
388,44.0,4,97.0,52.0,2130,24.6,82,2,vw pickup,2704.0,35.985609
389,32.0,4,135.0,84.0,2295,11.6,82,1,dodge rampage,7056.0,26.422834
390,28.0,4,120.0,79.0,2625,18.6,82,1,ford ranger,6241.0,27.750895


In [16]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=auto['horsepower'], y=auto['mpg'],
                    mode='markers',
                    name='Actual MPG'))
fig.add_trace(go.Scatter(x=auto['horsepower'], y=auto['predictions'],
                    mode='lines', name='Predicted MPG'))
fig.update_layout(title='Horsepower vs MPG with Predictions',
                     xaxis_title='MPG')
fig.update_layout(width=800, height=600)
fig.show()


# Scikit-Learn Transformers

## Fit and Transform

In [26]:
df = pd.DataFrame({

    "alpha": [1, 2, 3, 4],

    "beta": [5, 6, 7, 8]

})
# Create a PolynomialFeatures transformer which can generate features of up to degree 2

poly_transform = PolynomialFeatures(degree=2)
# Determine the number and name of the features of up to degree 2

# For this example, there will be a bias term, alpha, beta, alpha^2

# alpha*beta, and beta^2, for a total of six features

poly_transform.fit(df[["alpha", "beta"]])

# For each row of df, generate the values of our six output features

transformed_data = poly_transform.transform(df[["alpha", "beta"]])
transformed_data

array([[ 1.,  1.,  5.,  1.,  5., 25.],
       [ 1.,  2.,  6.,  4., 12., 36.],
       [ 1.,  3.,  7.,  9., 21., 49.],
       [ 1.,  4.,  8., 16., 32., 64.]])

Fit: When you create a PolynomialFeatures object and call its fit method with your data, the transformer analyzes the data to determine the number and names of the output features. The actual values of the output features are not calculated yet. The number of output features depends on the degree of the polynomial and the number of input features.

Transform: Once the transformer is fitted, you can call its transform method with new data to generate polynomial features based on the degree you specified. For each sample in the input data, the transformer creates all possible polynomial combinations of the features up to the specified degree. The output of this method is an array containing the polynomial combinations of the input features starting from degree 0 up to the specified degree.

For example, the input dataframe provided as input to the fit and transform methods in the example above is:

Since we’ve asked for the degree 2 features (bias term, alpha, beta, alpha2, alpha × beta, beta2) the output of the transform method is the array. For example, on the second row, the 12 is alpha × beta = 2 × 6 = 12.

## get_feature_names_out

In [34]:
# Create a PolynomialFeatures transformer which can generate features of up to degree 2

poly_transform = PolynomialFeatures(degree=2)
# Determine the number and name of the features of up to degree 2

# For this example, there will be a bias term, alpha, beta, alpha^2

# alpha*beta, and beta^2, for a total of six features

poly_transform.fit(df[["alpha", "beta"]])
# For each row of df, generate the values of our six output features

transformed_data = pd.DataFrame(poly_transform.transform(df[["alpha", "beta"]]), columns=poly_transform.get_feature_names_out())
transformed_data

Unnamed: 0,1,alpha,beta,alpha^2,alpha beta,beta^2
0,1.0,1.0,5.0,1.0,5.0,25.0
1,1.0,2.0,6.0,4.0,12.0,36.0
2,1.0,3.0,7.0,9.0,21.0,49.0
3,1.0,4.0,8.0,16.0,32.0,64.0


## Fit Transform

In [35]:
poly_transform = PolynomialFeatures(degree=2)
transformed_data = poly_transform.fit_transform(df[["alpha", "beta"]])
transformed_data

array([[ 1.,  1.,  5.,  1.,  5., 25.],
       [ 1.,  2.,  6.,  4., 12., 36.],
       [ 1.,  3.,  7.,  9., 21., 49.],
       [ 1.,  4.,  8., 16., 32., 64.]])

## Pipelines in Data Preprocessing - Fitting a regression model

In [37]:
df = pd.DataFrame({

    "alpha": [1, 2, 3, 4],

    "beta": [5, 6, 7, 8],

    "y": [7.9, 16.1, 29.9, 42.1]

})
poly_transform = PolynomialFeatures(degree=2)
transformed_data = poly_transform.fit_transform(df[["alpha", "beta"]])

model = LinearRegression()

model.fit(transformed_data, df["y"])



0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [None]:
model.predict([[3, 5]]) # will error because the model expects six features and not two.

In [40]:
pipelined_model = Pipeline([

    ('josh_transform', PolynomialFeatures(degree = 3)),

    ('josh_regression', LinearRegression())   

])

pipelined_model.fit(df[["alpha", "beta"]], df["y"])

pipelined_model.predict([[3, 5]])


X does not have valid feature names, but PolynomialFeatures was fitted with feature names



array([66.39181554])

In [41]:
pipelined_model.named_steps['josh_regression'].coef_

array([ 6.72240041e-14, -2.17241379e-01, -2.17241379e-01,  1.07619443e+00,
        2.07228916e-01, -6.61736602e-01, -3.28130453e+00,  1.02347320e+00,
        1.85238887e+00, -7.94557541e-01])

## Perform Polynomial Transform on Auto dataset.

In [30]:
# Create a PolynomialFeatures transformer which can generate features of up to degree 2

poly_transform = PolynomialFeatures(degree=4, include_bias=False)

In [31]:
poly_transform.fit(auto[["horsepower"]])

0,1,2
,degree,4
,interaction_only,False
,include_bias,False
,order,'C'


In [32]:
transformed_data = pd.DataFrame(poly_transform.transform(auto[["horsepower"]]), columns=poly_transform.get_feature_names_out())
transformed_data

Unnamed: 0,horsepower,horsepower^2,horsepower^3,horsepower^4
0,130.0,16900.0,2197000.0,285610000.0
1,165.0,27225.0,4492125.0,741200625.0
2,150.0,22500.0,3375000.0,506250000.0
3,150.0,22500.0,3375000.0,506250000.0
4,140.0,19600.0,2744000.0,384160000.0
...,...,...,...,...
387,86.0,7396.0,636056.0,54700816.0
388,52.0,2704.0,140608.0,7311616.0
389,84.0,7056.0,592704.0,49787136.0
390,79.0,6241.0,493039.0,38950081.0
