# Feature Engineering Review

A machine learning workflow with some feature engineering, using seaborn's `mpg` (cars) dataset. The goal here is to get more comfortable with feature engineering.

In [65]:
#usual imports
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

Load the `mpg` dataset from seaborn, which contains information about car fuel efficiency.

In [66]:
# Fetch the 'mpg' sample dataset through seaborn's dataset loader.
df_cars = sns.load_dataset(name='mpg')

#### Basic dataset inspection

In [67]:
# Peek at the first two rows to see the columns and their raw values.
df_cars.head(2)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320


In [68]:
# Summary statistics for the numeric columns; a `count` lower than the
# other columns' counts signals missing values in that column.
df_cars.describe()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year
count,398.0,398.0,398.0,392.0,398.0,398.0,398.0
mean,23.514573,5.454774,193.425879,104.469388,2970.424623,15.56809,76.01005
std,7.815984,1.701004,104.269838,38.49116,846.841774,2.757689,3.697627
min,9.0,3.0,68.0,46.0,1613.0,8.0,70.0
25%,17.5,4.0,104.25,75.0,2223.75,13.825,73.0
50%,23.0,4.0,148.5,93.5,2803.5,15.5,76.0
75%,29.0,8.0,262.0,126.0,3608.0,17.175,79.0
max,46.6,8.0,455.0,230.0,5140.0,24.8,82.0


In [69]:
# Column dtypes: the `object` columns hold strings and must be encoded or
# dropped before modelling.
df_cars.dtypes

mpg             float64
cylinders         int64
displacement    float64
horsepower      float64
weight            int64
acceleration    float64
model_year        int64
origin           object
name             object
dtype: object

#### Drop the name column

In [71]:
# Drop the free-text car name. Rebind instead of mutating in place, and
# ignore a missing column so re-running this cell does not raise KeyError.
df_cars = df_cars.drop(columns='name', errors='ignore')

In [72]:
# Confirm that the `name` column is gone.
df_cars.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
0,18.0,8,307.0,130.0,3504,12.0,70,usa
1,15.0,8,350.0,165.0,3693,11.5,70,usa
2,18.0,8,318.0,150.0,3436,11.0,70,usa
3,16.0,8,304.0,150.0,3433,12.0,70,usa
4,17.0,8,302.0,140.0,3449,10.5,70,usa


### Deal with missing values

In [73]:
# Count missing values per column.
df_cars.isna().sum()

mpg             0
cylinders       0
displacement    0
horsepower      6
weight          0
acceleration    0
model_year      0
origin          0
dtype: int64

In [78]:
# Remove every row that contains a missing value. Rebind the name rather
# than mutating in place so the cell reads as a pure transformation.
df_cars = df_cars.dropna()
# Verify that no missing values remain.
df_cars.isna().sum()

mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
model_year      0
origin          0
dtype: int64

### Make X and y
We'll predict mpg using the remaining columns (after some engineering)

In [81]:
# Column labels as a plain Python list.
cols = df_cars.columns.tolist()
cols

['mpg',
 'cylinders',
 'displacement',
 'horsepower',
 'weight',
 'acceleration',
 'model_year',
 'origin']

In [82]:
# Feature matrix: every column except the target `mpg`.
X = df_cars.drop(columns='mpg')
X.head(n=2)

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
0,8,307.0,130.0,3504,12.0,70,usa
1,8,350.0,165.0,3693,11.5,70,usa


In [84]:
# Target vector: the `mpg` column.
y = df_cars.loc[:, 'mpg']
y.shape

(392,)

In [85]:
# Check which feature columns are still non-numeric.
X.dtypes

cylinders         int64
displacement    float64
horsepower      float64
weight            int64
acceleration    float64
model_year        int64
origin           object
dtype: object

## Encode string values 

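Strictly speaking, encoding must happen after splitting into train and test sets if some values might only show up in the test data. In that case, fit the encoder using only the data available in X_train, then make sure X_test and X_train end up with the same columns. Here `origin` has just three well-populated values, so we encode before splitting for simplicity.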

In [89]:
# Distinct categories in `origin` and how many rows each one has.
X['origin'].value_counts()

usa       245
japan      79
europe     68
Name: origin, dtype: int64

### Dummify/OneHotEncode

In [97]:
# One-hot encode `origin`; drop_first removes one category column so the
# remaining dummy columns are not redundant with each other.
X_dummies = pd.get_dummies(
    data=X,
    columns=['origin'],
    drop_first=True,
)

## Separate into training and test sets

In [96]:
from sklearn.model_selection import train_test_split

In [98]:
# Hold out a test set. A fixed random_state makes the split, and every score
# computed from it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X_dummies, y, random_state=42)

In [99]:
# The split shuffles rows, so the original index labels appear out of order.
X_train.head(2)

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model_year,origin_japan,origin_usa
129,4,79.0,67.0,1950,19.0,74,1,0
211,6,168.0,120.0,3820,16.7,76,0,0


In [100]:
# Row count, dtypes and non-null counts of the training features.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 294 entries, 129 to 216
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   cylinders     294 non-null    int64  
 1   displacement  294 non-null    float64
 2   horsepower    294 non-null    float64
 3   weight        294 non-null    int64  
 4   acceleration  294 non-null    float64
 5   model_year    294 non-null    int64  
 6   origin_japan  294 non-null    uint8  
 7   origin_usa    294 non-null    uint8  
dtypes: float64(3), int64(3), uint8(2)
memory usage: 16.7 KB


### Standardize and scale our data

#### Import, instantiate, fit_transform, and transform

In [102]:
from sklearn.preprocessing import StandardScaler

In [104]:
# Fit the scaler on the training rows only, so that no information from the
# test set leaks into the preprocessing step.
ss = StandardScaler()
X_train_scaled = ss.fit_transform(X_train)
# The result is a NumPy array, so slice it instead of calling .head().
X_train_scaled[:1]

array([[-0.84709621, -1.07438742, -0.93987847, -1.19352569,  1.21513202,
        -0.63128493,  1.89571886, -1.21267813]])

In [106]:
# Test features before scaling, for comparison with the scaled values.
X_test.head(2)

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model_year,origin_japan,origin_usa
204,4,85.0,70.0,1990,17.0,76,1,0
252,6,231.0,105.0,3535,19.2,78,0,1


In [107]:
# transform only (no fit): reuse the statistics learned from X_train.
X_test_scaled = ss.transform(X_test)
X_test_scaled[:2]

array([[-0.84709621, -1.01753453, -0.86210637, -1.14662254,  0.49246829,
        -0.06715797,  1.89571886, -1.21267813],
       [ 0.33602154,  0.36588571,  0.04523479,  0.66501172,  1.28739839,
         0.49696898, -0.52750438,  0.82462113]])

## Polynomial Features

#### import and instantiate

In [108]:
from sklearn.preprocessing import PolynomialFeatures

In [170]:
# Generate every polynomial and interaction term up to the third degree.
poly_model = PolynomialFeatures(degree=3)

#### fit_transform and transform

In [171]:
# Fit on the scaled training features and expand them. With 8 input columns
# and degree 3 this produces 165 columns, including the constant bias term.
X_train_poly = poly_model.fit_transform(X_train_scaled)

In [172]:
# First expanded row: it starts with the bias term 1.0, followed by the
# original scaled features and then their products.
X_train_poly[0]

array([ 1.        , -0.84709621, -1.07438742, -0.93987847, -1.19352569,
        1.21513202, -0.63128493,  1.89571886, -1.21267813,  0.71757199,
        0.91010952,  0.7961675 ,  1.0110311 , -1.02933373,  0.53475907,
       -1.60585627,  1.02725505,  1.15430833,  1.00979361,  1.28230899,
       -1.30552256,  0.67824458, -2.0367365 ,  1.30288612,  0.88337155,
        1.12176911, -1.14207643,  0.59333111, -1.78174535,  1.13977007,
        1.42450358, -1.45029129,  0.75345478, -2.26258917,  1.4473625 ,
        1.47654583, -0.76709453,  2.30354869, -1.47356402,  0.39852066,
       -1.19673874,  0.76554542,  3.59375   , -2.29889679,  1.47058824,
       -0.60785252, -0.77095032, -0.67443047, -0.85644061,  0.87194471,
       -0.45299238,  1.36031476, -0.87018386, -0.97781021, -0.85539234,
       -1.08623909,  1.10590321, -0.57453842,  1.72531177, -1.1036699 ,
       -0.74830069, -0.95024636,  0.96744862, -0.50260854,  1.50930974,
       -0.96549491, -1.20669159,  1.22853626, -0.63824869,  1.91

##### What do you get back?

In [173]:
# transform only: apply the expansion already fitted on the training data.
X_test_poly = poly_model.transform(X_test_scaled)
X_test_poly[0]

array([ 1.00000000e+00, -8.47096213e-01, -1.01753453e+00, -8.62106374e-01,
       -1.14662254e+00,  4.92468292e-01, -6.71579708e-02,  1.89571886e+00,
       -1.21267813e+00,  7.17571994e-01,  8.61949650e-01,  7.30287044e-01,
        9.71299613e-01, -4.17168025e-01,  5.68892627e-02, -1.60585627e+00,
        1.02725505e+00,  1.03537653e+00,  8.77223007e-01,  1.16672803e+00,
       -5.01103494e-01,  6.83355545e-02, -1.92895941e+00,  1.23394187e+00,
        7.43227400e-01,  9.88510602e-01, -4.24560054e-01,  5.78973146e-02,
       -1.63431131e+00,  1.04545754e+00,  1.31474325e+00, -5.64675245e-01,
        7.70048432e-02, -2.17367398e+00,  1.39048407e+00,  2.42525019e-01,
       -3.30731712e-02,  9.33581431e-01, -5.97205526e-01,  4.51019304e-03,
       -1.27312632e-01,  8.14410021e-02,  3.59375000e+00, -2.29889679e+00,
        1.47058824e+00, -6.07852518e-01, -7.30154284e-01, -6.18623389e-01,
       -8.22784224e-01,  3.53381454e-01, -4.81906790e-02,  1.36031476e+00,
       -8.70183860e-01, -

#### import and instantiate a Linear Regression Model

In [174]:
from sklearn.linear_model import LinearRegression

In [179]:
# Ordinary least-squares regression with default settings (no regularization).
lr = LinearRegression()

#### import cross_val_score and score the model on the training data

In [180]:
from sklearn.model_selection import cross_val_score

In [181]:
# Average the per-fold scores on the training data only; LinearRegression's
# default score is R².
np.mean(cross_val_score(estimator=lr, X=X_train_poly, y=y_train))

-2.2625909991953898e+20

#### Result

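Basically no predictive power: an R² this far below zero means the model does vastly worse than simply predicting the mean. With degree-3 polynomial features there are 165 columns for only 294 training rows, so plain linear regression overfits badly and becomes numerically unstable.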

Next, you could try other degrees for the polynomial features (or try other models -- soon 😀). See which performs best with cross-validation on your training data, then use that best model to score on your holdout/test data.

#### Make a baseline model.

In [151]:
# The baseline prediction: the mean mpg of the training targets.
y_train.mean()

23.652721088435374

In [152]:
# Start the baseline table from the true test targets (one `mpg` column).
df_baseline = y_test.to_frame()
df_baseline.head(n=5)

Unnamed: 0,mpg
204,32.0
252,19.2
28,9.0
141,29.0
391,36.0


In [153]:
# Attach the constant baseline prediction (training-set mean) to every row.
df_baseline = df_baseline.assign(y_train_mean=y_train.mean())
df_baseline.head(n=2)

Unnamed: 0,mpg,y_train_mean
204,32.0,23.652721
252,19.2,23.652721


In [139]:
# Sanity check: one row per test sample and no nulls in either column.
df_baseline.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 98 entries, 204 to 175
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           98 non-null     float64
 1   y_train_mean  98 non-null     float64
dtypes: float64(2)
memory usage: 2.3 KB


#### Score the baseline model

In [140]:
from sklearn.metrics import r2_score

In [141]:
# R² of the constant-mean baseline against the true test targets.
r2_score(
    y_true=df_baseline['mpg'],
    y_pred=df_baseline['y_train_mean'],
)

-0.012726193558043386

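Basically 0, as expected: always predicting the training mean explains none of the variance in the test targets.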