# Feature Engineering

In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from pydataset import data

from scipy import stats

import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
import sklearn.metrics
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import f_regression
from sklearn.metrics import r2_score
import sklearn.preprocessing

# feature engineering: 
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE

from math import sqrt

from env import host, user, password


## Tips dataset

In [3]:
# acquire tips data from seaborn
tips = sns.load_dataset("tips")
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [4]:
tips.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.4 KB


a. Create a column named tip_percentage. This should be the tip amount divided by the total bill.


In [5]:
tips['tip_percentage'] = tips['tip'] / tips['total_bill']
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_percentage
0,16.99,1.01,Female,No,Sun,Dinner,2,0.059447
1,10.34,1.66,Male,No,Sun,Dinner,3,0.160542
2,21.01,3.5,Male,No,Sun,Dinner,3,0.166587
3,23.68,3.31,Male,No,Sun,Dinner,2,0.13978
4,24.59,3.61,Female,No,Sun,Dinner,4,0.146808


b. Create a column named price_per_person. This should be the total bill divided by the party size.

In [6]:
tips['price_per_person'] = tips['total_bill'] / tips['size']
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_percentage,price_per_person
0,16.99,1.01,Female,No,Sun,Dinner,2,0.059447,8.495
1,10.34,1.66,Male,No,Sun,Dinner,3,0.160542,3.446667
2,21.01,3.5,Male,No,Sun,Dinner,3,0.166587,7.003333
3,23.68,3.31,Male,No,Sun,Dinner,2,0.13978,11.84
4,24.59,3.61,Female,No,Sun,Dinner,4,0.146808,6.1475


c. Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount? The tip percentage?

answer:

tip amount = total_bill, size

tip percentage = tip, size

d. Use select k best and recursive feature elimination to select the top 2 features for predicting tip amount. What are they?

Answer: 

SelectKBest: ['total_bill', 'size']

RFE: ['total_bill', 'tip_percentage']

### Prepare tips data

In [7]:
# Make categorical columns numeric (or could aslo drop or add only numeric columns in a new df)
col_replace = {'sex': {'Male': 1, 'Female': 0}, 'smoker': {'Yes': 1, 'No': 0}, 'time': {'Lunch': 1, 'Dinner': 0}, 'day': {'Thur': 0, 'Fri': 1, 'Sat': 2, 'Sun': 4}}
tips.replace(col_replace, inplace=True)

In [8]:
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_percentage,price_per_person
0,16.99,1.01,0,0,4,0,2,0.059447,8.495
1,10.34,1.66,1,0,4,0,3,0.160542,3.446667
2,21.01,3.5,1,0,4,0,3,0.166587,7.003333
3,23.68,3.31,1,0,4,0,2,0.13978,11.84
4,24.59,3.61,0,0,4,0,4,0.146808,6.1475


In [9]:
# function to split data
def tips_split(tips):
    
    train_validate, test = train_test_split(tips, test_size=.2, random_state=123)
    train, validate = train_test_split(train_validate, test_size=.3, random_state=123)

    return train, validate, test

In [10]:
train, validate, test = tips_split(tips)

In [11]:
train.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_percentage,price_per_person
18,16.97,3.5,0,0,4,0,3,0.206246,5.656667
172,7.25,5.15,1,1,4,0,2,0.710345,3.625
118,12.43,1.8,0,0,0,1,2,0.144811,6.215
28,21.7,4.3,1,0,2,0,2,0.198157,10.85
237,32.83,1.17,1,1,2,0,2,0.035638,16.415


In [12]:
train.shape, validate.shape, test.shape

((136, 9), (59, 9), (49, 9))

In [13]:
target = tips.tip

In [14]:
# split into x, y 
def split_tvt_into_variables(train, validate, test, target):

# split train into X (dataframe, drop target) & y (series, keep target only)
    X_train = train.drop(columns=[target])
    y_train = train[target]
    
    # split validate into X (dataframe, drop target) & y (series, keep target only)
    X_validate = validate.drop(columns=[target])
    y_validate = validate[target]
    
    # split test into X (dataframe, drop target) & y (series, keep target only)
    X_test = test.drop(columns=[target])
    y_test = test[target]
    
    return train, validate, test, X_train, y_train, X_validate, y_validate, X_test, y_test


In [15]:
train, validate, test, X_train, y_train, X_validate, y_validate, X_test, y_test = split_tvt_into_variables(train, validate, test, target='tip')

In [16]:
def Standard_Scaler(X_train, X_validate, X_test):
    """
    Takes in X_train, X_validate and X_test dfs with numeric values only
    Returns scaler, X_train_scaled, X_validate_scaled, X_test_scaled dfs
    """

    scaler = sklearn.preprocessing.StandardScaler().fit(X_train)
    X_train_scaled = pd.DataFrame(scaler.transform(X_train), index = X_train.index, columns = X_train.columns)
    X_validate_scaled = pd.DataFrame(scaler.transform(X_validate), index = X_validate.index, columns = X_validate.columns)
    X_test_scaled = pd.DataFrame(scaler.transform(X_test), index = X_test.index, columns = X_test.columns)
    
    return scaler, X_train_scaled, X_validate_scaled, X_test_scaled

In [17]:
scaler, X_train_scaled, X_validate_scaled, X_test_scaled = Standard_Scaler(X_train, X_validate, X_test)

### SelectKBest

In [18]:
# parameters of SelectKBest: find top 2 features
f_selector = SelectKBest(f_regression, k=2)

In [19]:
f_selector.fit(X_train_scaled, y_train)

SelectKBest(k=2, score_func=<function f_regression at 0x7fb573f4f820>)

In [20]:
feature_mask = f_selector.get_support()

In [21]:
f_feature = X_train_scaled.iloc[:,feature_mask].columns.tolist()

In [22]:
f_feature

['total_bill', 'size']

### Recursive Feature Elimination

In [23]:
# create moddel
lm = LinearRegression()

In [24]:
# initialize RFE object and set parameters to return two features
rfe = RFE(lm, 2)

In [25]:
X_rfe = rfe.fit_transform(X_train_scaled,y_train) 

In [26]:
# fit model
lm.fit(X_rfe,y_train)

LinearRegression()

In [27]:
mask = rfe.support_

In [28]:
rfe_features = X_train_scaled.loc[:,mask].columns.tolist()

In [29]:
print(str(len(rfe_features)), 'selected features')
print(rfe_features)

2 selected features
['total_bill', 'tip_percentage']


e. Use select k best and recursive feature elimination to select the top 2 features for predicting tip percentage. What are they?

Answer: 

SelectKBest: ['tip', 'price_per_person']

RFE: ['total_bill', 'tip']

In [30]:
target = tips.tip_percentage

In [31]:
train, validate, test, X_train, y_train, X_validate, y_validate, X_test, y_test = split_tvt_into_variables(train, validate, test, target='tip_percentage')

In [32]:
#y_train.head()

In [33]:
scaler, X_train_scaled, X_validate_scaled, X_test_scaled = Standard_Scaler(X_train, X_validate, X_test)

### SelectKBest

In [34]:
# parameters of SelectKBest: find top 2 features
f_selector = SelectKBest(f_regression, k=2)

In [35]:
f_selector.fit(X_train_scaled, y_train)

SelectKBest(k=2, score_func=<function f_regression at 0x7fb573f4f820>)

In [36]:
feature_mask = f_selector.get_support()

In [37]:
f_feature = X_train_scaled.iloc[:,feature_mask].columns.tolist()

In [38]:
f_feature

['tip', 'price_per_person']

### Recursive Feature Elimination

In [39]:
lm2 = LinearRegression()

In [40]:
# initialize RFE object and set parameters to return two features
rfe2 = RFE(lm2, 2)

In [41]:
X_rfe2 = rfe2.fit_transform(X_train_scaled,y_train)

In [42]:
# fit model
lm2.fit(X_rfe2,y_train)

LinearRegression()

In [43]:
mask = rfe2.support_

In [44]:
rfe2_features = X_train_scaled.loc[:,mask].columns.tolist()

In [45]:
print(str(len(rfe2_features)), 'selected features')
print(rfe2_features)

2 selected features
['total_bill', 'tip']


f. Why do you think select k best and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features your are selecting?


2. Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [46]:
def select_kbest(X, y, k):
    # make the object
    kbest = sklearn.feature_selection.SelectKBest(sklearn.feature_selection.f_regression, k=k)

    # fit the object
    kbest.fit(X, y)
    
    # use the object (.get_support() is that array of booleans to filter the list of column names)
    return X.columns[kbest.get_support()].tolist()

select_kbest(X_train, y_train, 2)

['tip', 'price_per_person']

In [47]:
def show_features_rankings(X_train, rfe):
    """
    Takes in a dataframe and a fit RFE object in order to output the rank of all features
    """
    # rfe here is reference rfe from cell 15
    var_ranks = rfe.ranking_
    var_names = X_train.columns.tolist()
    ranks = pd.DataFrame({'Var': var_names, 'Rank': var_ranks})
    ranks = ranks.sort_values(by="Rank", ascending=True)
    return ranks

show_features_rankings(X_train, rfe)

Unnamed: 0,Var,Rank
0,total_bill,1
6,size,1
2,sex,2
5,time,3
3,smoker,4
7,price_per_person,5
4,day,6
1,tip,7


3. Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [48]:
def select_rfe(X_train_scaled, y_train, k):
    # make the thing
    lm = sklearn.linear_model.LinearRegression()
    rfe = sklearn.feature_selection.RFE(lm, n_features_to_select=k)

    # Fit the thing
    rfe.fit(X_train_scaled, y_train)
    
    # use the thing
    features_to_use = X_train_scaled.columns[rfe.support_].tolist()
    
    # we need to send show_feature_rankings a trained/fit RFE object
    all_rankings = show_features_rankings(X_train_scaled, rfe)
    
    return features_to_use, all_rankings
select_rfe(X_train_scaled, y_train, 2)

(['total_bill', 'tip'],
                 Var  Rank
 0        total_bill     1
 1               tip     1
 6              size     2
 7  price_per_person     3
 3            smoker     4
 4               day     5
 5              time     6
 2               sex     7)

In [49]:
# RFE output as dataframe
var_ranks = rfe.ranking_
var_names = X_train.columns.tolist()
ranks = pd.DataFrame({'Var': var_names, 'Rank': var_ranks})
ranks.sort_values(by="Rank", ascending=True)

Unnamed: 0,Var,Rank
0,total_bill,1
6,size,1
2,sex,2
5,time,3
3,smoker,4
7,price_per_person,5
4,day,6
1,tip,7


4. Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).

In [50]:
swiss = data('swiss')
swiss.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [59]:
target = swiss.Fertility

In [61]:
def split(df):
    '''
    take in a DataFrame and return train, validate, and test DataFrames.
    return train, validate, test DataFrames.
    '''
    train_validate, test = train_test_split(df, test_size=.2, random_state=123)
    train, validate = train_test_split(train_validate, 
                                       test_size=.3, 
                                       random_state=123)
    return train, validate, test

In [62]:
# Split the data
train, validate, test = split(swiss)

# Setup X and y
X_train = train.drop(columns='Fertility')
y_train = train.Fertility

X_validate = validate.drop(columns='Fertility')
y_validate = validate.Fertility

X_test = test.drop(columns='Fertility')
y_test = test.Fertility

In [63]:
train, validate, test, X_train, y_train, X_validate, y_validate, X_test, y_test = split_tvt_into_variables(train, validate, test, target='Fertility')

In [64]:
scaler, X_train_scaled, X_validate_scaled, X_test_scaled = Standard_Scaler(X_train, X_validate, X_test)

In [65]:
# Find the top 3 features using kbest
select_kbest(X_train_scaled, y_train, 3)

['Examination', 'Catholic', 'Infant.Mortality']

In [66]:
# Find the top 3 features using RFE
selected_features, all_rankings = select_rfe(X_train, y_train, 3)
print(selected_features)
all_rankings

['Agriculture', 'Examination', 'Infant.Mortality']


Unnamed: 0,Var,Rank
0,Agriculture,1
1,Examination,1
4,Infant.Mortality,1
2,Education,2
3,Catholic,3
