# Feature Engineering

In this lesson we discuss some methods for *automated* feature engineering, specifically feature selection.

While these methods can produce useful results, they are but a single piece of the feature engineering puzzle.

## Setup

In [39]:
#first begin with imports
import warnings
warnings.filterwarnings("ignore")
import env
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pydataset import data
from statsmodels.formula.api import ols
import feature as fe

from scipy import stats
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score

from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import f_regression, RFE
from sklearn.metrics import mean_squared_error
from math import sqrt
import matplotlib.pyplot as plt

from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler


import pandas as pd
import numpy as np
import pydataset

from sklearn.feature_selection import SelectKBest, f_regression, RFE
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the tips dataset and encode the smoker / dinner indicators as 0/1 integers.
tips = pydataset.data('tips')
tips = tips.assign(
    smoker=tips['smoker'].eq('Yes').astype(int),
    dinner=tips['time'].eq('Dinner').astype(int),
)

In [51]:
# Feature matrix and target for the tips regression.
X = tips.loc[:, ['total_bill', 'size', 'smoker', 'dinner']]
y = tips['tip']

# Hold out 20% of the rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.2, random_state=123
)

# Learn the scaling parameters from the training rows only, then apply them to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Select K Best

- uses an F-regression test (`f_regression`)
- looks at each feature in isolation
- asks: is a model with that feature better than no model at all?

In [52]:
# Score every feature on its own with f_regression and keep only the single best one.
kbest = SelectKBest(score_func=f_regression, k=1)
kbest.fit(X=X_train_scaled, y=y_train)

SelectKBest(k=1, score_func=<function f_regression at 0x7fd185e9c550>)

In [53]:
# Boolean mask over the input columns: True marks a feature the selector kept.
kbest.get_support()

array([ True, False, False, False])

In [54]:
# Apply the mask to the unscaled frame's column labels to recover the selected feature names.
X_train.columns[kbest.get_support()]

Index(['total_bill'], dtype='object')

In [55]:
# Reduce the scaled training data to the selected column(s) and check the resulting shape.
X_kbest = kbest.transform(X_train_scaled)
X_kbest.shape

(195, 1)

<div style="border: 1px solid black; border-radius: 3px; background: palegreen; padding: .5em 1em;">
    <p style="font-size: 1.3em; font-weight: bold">Mini Exercise</p>
    <ol>
        <li>Use <code>pydataset</code> to load the <code>swiss</code> dataset.</li>
        <li>Split the swiss dataset into X and y, and train and test. The goal is to predict <code>Fertility</code>.</li>
        <li>Use <code>SelectKBest</code> to find the top 3 features that predict fertility in the swiss data set.</li>
    </ol>
</div>

In [66]:
# Load the swiss dataset using the `data` function imported from pydataset.
swiss_df = data('swiss')

In [67]:
# Preview the first rows; Fertility is the column the exercise asks us to predict.
swiss_df.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [68]:
# No earlier cell splits the swiss data, so on a fresh kernel X_train would still be the
# tips training set and the later use of X_validate would fail. Split here so the cells
# below run top to bottom.
# NOTE(review): test_size and random_state are assumptions. Assuming the 47-row swiss
# dataset, holding out 20% and then 30% of the remainder leaves 25 training rows, which
# matches the length of the y_train output; confirm the seed before relying on the exact rows.
swiss_train_validate, swiss_test = train_test_split(swiss_df, test_size=.2, random_state=123)
swiss_train, swiss_validate = train_test_split(swiss_train_validate, test_size=.3, random_state=123)

# Fertility is the target; every other column is a candidate feature.
X_train, y_train = swiss_train.drop(columns=['Fertility']), swiss_train.Fertility
X_validate, y_validate = swiss_validate.drop(columns=['Fertility']), swiss_validate.Fertility
X_test, y_test = swiss_test.drop(columns=['Fertility']), swiss_test.Fertility

X_train

Unnamed: 0,Agriculture,Examination,Education,Catholic,Infant.Mortality
Rolle,60.8,16,10,7.72,16.3
Lavaux,73.0,19,9,2.84,20.0
Nyone,50.9,22,12,15.14,16.7
Conthey,85.9,3,2,99.71,15.1
Yverdon,49.5,15,8,6.1,22.5
Oron,71.2,12,1,2.4,21.0
Cossonay,69.3,22,5,2.82,18.7
St Maurice,75.9,9,9,99.06,17.8
Franches-Mnt,39.7,5,5,93.4,20.2
Orbe,54.1,20,6,4.2,15.3


In [69]:
# Target values for the training rows (the Fertility column).
y_train

Rolle           60.5
Lavaux          65.1
Nyone           56.6
Conthey         75.5
Yverdon         65.4
Oron            72.5
Cossonay        61.7
St Maurice      65.0
Franches-Mnt    92.5
Orbe            57.4
Sarine          82.9
La Chauxdfnd    65.7
Neuchatel       64.4
Monthey         79.4
Broye           83.8
Paysd'enhaut    72.0
Le Locle        72.7
Sion            79.3
Herens          77.3
Glane           92.4
Neuveville      76.9
Grandson        71.7
Vevey           58.3
Martigwy        70.5
Avenches        68.9
Name: Fertility, dtype: float64

In [70]:
# Ask the project's `feature` helper module for the object-typed columns of X_train.
# NOTE(review): the helper's source is not shown here; presumably it returns column names
# with object dtype -- confirm in feature.py.
obj_cols = fe.get_object_cols(X_train)
obj_cols

[]

In [71]:
# Collect the numeric feature columns, passing the object columns found above to the helper.
# NOTE(review): presumably the helper excludes obj_cols from the result -- confirm in feature.py.
num_cols = fe.get_numeric_X_cols(X_train, obj_cols)
num_cols

['Agriculture', 'Examination', 'Education', 'Catholic', 'Infant.Mortality']

In [72]:
# Scale the numeric columns of all three splits with the project's min-max helper.
# Expects X_train, X_validate and X_test to already exist in the kernel.
# NOTE(review): presumably the scaler is fit on X_train only and then applied to the
# other splits -- confirm in feature.py.
X_train_scaled, X_validate_scaled, X_test_scaled = fe.min_max_scale(X_train, X_validate, X_test, num_cols)

In [73]:
# Inspect the scaled training features.
X_train_scaled

Unnamed: 0,Agriculture,Examination,Education,Catholic,Infant.Mortality
Rolle,0.647561,0.40625,0.290323,0.054508,0.122449
Lavaux,0.796341,0.5,0.258065,0.004508,0.5
Nyone,0.526829,0.59375,0.354839,0.130533,0.163265
Conthey,0.953659,0.0,0.032258,0.997029,0.0
Yverdon,0.509756,0.375,0.225806,0.03791,0.755102
Oron,0.77439,0.28125,0.0,0.0,0.602041
Cossonay,0.75122,0.59375,0.129032,0.004303,0.367347
St Maurice,0.831707,0.1875,0.258065,0.990369,0.27551
Franches-Mnt,0.390244,0.0625,0.129032,0.932377,0.520408
Orbe,0.565854,0.53125,0.16129,0.018443,0.020408


In [74]:
# Top 3 features according to the project's select_kbest helper.
# NOTE(review): this passes the full swiss data rather than the scaled training split --
# confirm that is intended, since selecting on all rows lets test data influence the choice.
fe.select_kbest(swiss_df.drop(columns=['Fertility']), swiss_df.Fertility, top=3)

['Examination', 'Education', 'Catholic']

In [75]:
# Top 3 features according to the project's rfe helper.
# NOTE(review): like the select_kbest call, this uses the full swiss data rather than the
# training split -- confirm that is intended.
fe.rfe(swiss_df.drop(columns=['Fertility']), swiss_df.Fertility,3)

3 selected features
['Examination', 'Education', 'Infant.Mortality']


## Recursive Feature Elimination

- Fits a model and eliminates the worst performing features
- More computationally expensive
- Looks at all the features together

In [76]:
# Recursively drop the weakest feature of a linear model until three remain,
# then show the mask of the features that survived.
rfe = RFE(estimator=LinearRegression(), n_features_to_select=3).fit(X_train_scaled, y_train)
rfe.get_support()

array([ True,  True, False, False,  True])

In [77]:
# Rank of every feature: 1 marks a selected feature, larger numbers were eliminated earlier.
rfe.ranking_

array([1, 1, 3, 2, 1])

<div style="border: 1px solid black; border-radius: 3px; background: palegreen; padding: .5em 1em;">
    <p style="font-size: 1.3em; font-weight: bold">Mini Exercise</p>
    <ol>
        <li>Use <code>RFE</code> and <code>LinearRegression</code> to find the top 3 features that predict fertility in the swiss data set.</li>
        <li>Are the results different than what select k best gave you?</li>
    </ol>
</div>

In [78]:
# Recursive feature elimination with a linear model, keeping the two strongest features.
# NOTE(review): the exercise text asks for the top 3 features; this cell keeps 2, which is
# what the displayed ranking table reflects.
lm = LinearRegression()

# n_features_to_select is keyword-only in scikit-learn >= 1.0, so the positional form
# RFE(lm, 2) raises a TypeError there; the keyword form works on old and new versions.
rfe = RFE(lm, n_features_to_select=2)
X_rfe = rfe.fit_transform(X_train_scaled, y_train)

# RFE works on its own copy of the estimator, so fit lm itself on the reduced feature set.
lm.fit(X_rfe, y_train)

# Pair each column name with its elimination rank (1 = selected).
var_ranks = rfe.ranking_
var_names = X_train_scaled.columns.tolist()

pd.DataFrame({'Var': var_names, 'Rank': var_ranks})

Unnamed: 0,Var,Rank
0,Agriculture,2
1,Examination,1
2,Education,4
3,Catholic,3
4,Infant.Mortality,1
