### Feature Enginering Exercises

Do your work for this exercise in a jupyter notebook named feature_engineering within the regression-exercises repo. Add, commit, and push your work.

In [96]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
import sklearn.linear_model
import sklearn.feature_selection
import sklearn.preprocessing
from sklearn.feature_selection import SelectKBest, f_regression, RFE
from sklearn.linear_model import LinearRegression
import seaborn as sns
from pydataset import data

1. Load the tips dataset.

    a. Create a column named tip_percentage. This should be the tip amount divided by the total bill.
    
    b. Create a column named price_per_person. This should be the total bill divided by the party size.
    
    c. Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount? The tip percentage?
    
    d. Use select k best and recursive feature elimination to select the top 2 features for predicting tip amount. What are they?
    
    e. Use select k best and recursive feature elimination to select the top 2 features for predicting tip percentage. What are they?
    
    f. Why do you think select k best and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features your are selecting?

In [3]:
# 1
df = sns.load_dataset('tips')
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [5]:
# 1.a
# adding tip_percentage column: df['tip']/df['total_bill']
df['tip_percentage'] = df['tip']/df['total_bill']
df.head(2)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_percentage
0,16.99,1.01,Female,No,Sun,Dinner,2,0.059447
1,10.34,1.66,Male,No,Sun,Dinner,3,0.160542


In [6]:
# 1.b
# adding price_per_person column: df['total_bill']/df['size']
df['price_per_person'] = df['total_bill']/df['size']
df.head(2)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_percentage,price_per_person
0,16.99,1.01,Female,No,Sun,Dinner,2,0.059447,8.495
1,10.34,1.66,Male,No,Sun,Dinner,3,0.160542,3.446667


In [None]:
# 1.c
# I think total bill would be the most important in predicting the tip and tip percentage.

In [23]:
str_col = list(df.select_dtypes(include = 'category'))

In [31]:
df.drop(columns = str_col, inplace = True)

In [70]:
# using kbest and recursive features to select the top 2 features in predicting tip
train_validate, test = train_test_split(df, test_size = .2, random_state = 123)
train, validate = train_test_split(train_validate, test_size = .3, random_state = 123)

In [79]:
def standard_scaler_tvt(train, validate, test):
    # list of columns float and int dtypes
    num_cols = list(train.select_dtypes(include = ['float64', 'int64', 'complex']).columns)
    
    # standard scaler object
    scaler = sklearn.preprocessing.StandardScaler()
    
    # fit scaler
    scaler.fit(train[num_cols])
    
    # scale
    train_scaled = scaler.transform(train[num_cols])
    validate_scaled = scaler.transform(validate[num_cols])
    test_scaled = scaler.transform(test[num_cols])

    # new column names
    #new_column_names = [c + '_scaled' for c in num_cols]

    # add scaled columns to input dataset
    #train[new_column_names] = scaler.transform(train[num_cols])
    #validate[new_column_names] = scaler.transform(train[num_cols])
    #test[new_column_names] = scaler.transform(train[num_cols])
    
    # add column names if scaled df is needed separately
    train_scaled = pd.DataFrame(train_scaled, columns = num_cols)
    validate_scaled = pd.DataFrame(validate_scaled, columns = num_cols)
    test_scaled = pd.DataFrame(test_scaled, columns = num_cols)
    
    return train_scaled, validate_scaled, test_scaled

In [80]:
train_scaled, validate_scaled, test_scaled = standard_scaler_tvt(train, validate, test)

In [33]:
target = "tip"

# split train into X (dataframe, drop target) & y (series, keep target only)
X_train = train.drop(columns = [target])
y_train = train[target]

# split validate into X (dataframe, drop target) & y (series, keep target only)
X_validate = validate.drop(columns = [target])
y_validate = validate[target]

# split test into X (dataframe, drop target) & y (series, keep target only)
X_test = test.drop(columns = [target])
y_test = test[target]

X_train.head()

Unnamed: 0,total_bill,size,tip_percentage,price_per_person
18,16.97,3,0.206246,5.656667
172,7.25,2,0.710345,3.625
118,12.43,2,0.144811,6.215
28,21.7,2,0.198157,10.85
237,32.83,2,0.035638,16.415


In [83]:
# 1.d
target = "tip"

# split train into X (dataframe, drop target) & y (series, keep target only)
X_train_scaled = train_scaled.drop(columns = [target])
y_train_scaled = train_scaled[target]

# split validate into X (dataframe, drop target) & y (series, keep target only)
X_validate_scaled = validate_scaled.drop(columns = [target])
y_validate_scaled = validate_scaled[target]

# split test into X (dataframe, drop target) & y (series, keep target only)
X_test_scaled = test_scaled.drop(columns = [target])
y_test_scaled = test_scaled[target]

X_train_scaled.head()

Unnamed: 0,total_bill,size,tip_percentage,price_per_person
0,-0.208121,0.463203,0.557201,-0.673931
1,-1.319309,-0.552855,7.633113,-1.395522
2,-0.727132,-0.552855,-0.305153,-0.475627
3,0.332612,-0.552855,0.443649,1.170595
4,1.60499,-0.552855,-1.837586,3.147127


In [39]:
# 1.d
# select kbest
k = 2

kbest = SelectKBest(f_regression, k = 2)

kbest.fit(X_train, y_train)

kbest_features = X_train.columns[kbest.get_support()].tolist()

print('The best two features are: ', kbest_features)

The best two features are:  ['total_bill', 'size']


In [86]:
# 1.d
# rfe

lrm = LinearRegression()
rfe = RFE(lrm, n_features_to_select = 2)

rfe.fit(X_train_scaled, y_train)

rfe_columns = X_train_scaled.columns[rfe.support_].tolist()
print('The best two features are: ', rfe_columns)

The best two features are:  ['total_bill', 'tip_percentage']


In [87]:
lrm = LinearRegression()
rfe = RFE(lrm, n_features_to_select = 2)

rfe.fit(X_train, y_train)

rfe_columns = X_train_scaled.columns[rfe.support_].tolist()
print('The best two features are: ', rfe_columns)

The best two features are:  ['size', 'tip_percentage']


In [88]:
lrm = LinearRegression()
rfe = RFE(lrm, n_features_to_select = 2)

rfe.fit(X_train_scaled, y_train_scaled)

rfe_columns = X_train_scaled.columns[rfe.support_].tolist()
print('The best two features are: ', rfe_columns)

The best two features are:  ['total_bill', 'tip_percentage']


In [89]:
# 1.e
target = "tip_percentage"

# split train into X (dataframe, drop target) & y (series, keep target only)
X_train_scaled = train_scaled.drop(columns = [target])
y_train_scaled = train_scaled[target]

# split validate into X (dataframe, drop target) & y (series, keep target only)
X_validate_scaled = validate_scaled.drop(columns = [target])
y_validate_scaled = validate_scaled[target]

# split test into X (dataframe, drop target) & y (series, keep target only)
X_test_scaled = test_scaled.drop(columns = [target])
y_test_scaled = test_scaled[target]

X_train_scaled.head()

Unnamed: 0,total_bill,tip,size,price_per_person
0,-0.208121,0.381062,0.463203,-0.673931
1,-1.319309,1.518016,-0.552855,-1.395522
2,-0.727132,-0.790345,-0.552855,-0.475627
3,0.332612,0.932312,-0.552855,1.170595
4,1.60499,-1.224455,-0.552855,3.147127


In [90]:
# 1.e
# select kbest
k = 2

kbest = SelectKBest(f_regression, k = 2)

kbest.fit(X_train, y_train)

kbest_features = X_train.columns[kbest.get_support()].tolist()

print('The best two features are: ', kbest_features)

The best two features are:  ['total_bill', 'size']


In [91]:
# 1.e
# rfe

lrm = LinearRegression()
rfe = RFE(lrm, n_features_to_select = 2)

rfe.fit(X_train_scaled, y_train_scaled)

rfe_columns = X_train_scaled.columns[rfe.support_].tolist()
print('The best two features are: ', rfe_columns)

The best two features are:  ['total_bill', 'tip']


2. Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [92]:
def select_kbest(X, y, k):
    # make the object
    kbest = sklearn.feature_selection.SelectKBest(sklearn.feature_selection.f_regression, k=k)

    # fit the object
    kbest.fit(X, y)
    
    # use the object (.get_support() is that array of booleans to filter the list of column names)
    return X.columns[kbest.get_support()].tolist()

select_kbest(X_train, y_train, 2)

['total_bill', 'size']

In [93]:
def show_features_rankings(X_train, rfe):
    """
    Takes in a dataframe and a fit RFE object in order to output the rank of all features
    """
    # rfe here is reference rfe from cell 15
    var_ranks = rfe.ranking_
    var_names = X_train.columns.tolist()
    ranks = pd.DataFrame({'Var': var_names, 'Rank': var_ranks})
    ranks = ranks.sort_values(by="Rank", ascending=True)
    return ranks

3. Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [94]:
def select_rfe(X, y, k):
    # make the thing
    lm = sklearn.linear_model.LinearRegression()
    rfe = sklearn.feature_selection.RFE(lm, n_features_to_select=k)

    # Fit the thing
    rfe.fit(X, y)
    
    # use the thing
    features_to_use = X.columns[rfe.support_].tolist()
    
    # we need to send show_feature_rankings a trained/fit RFE object
    all_rankings = show_features_rankings(X, rfe)
    
    return features_to_use, all_rankings

4. Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).

In [97]:
swiss = data('swiss')
swiss.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [99]:
swiss.info()

<class 'pandas.core.frame.DataFrame'>
Index: 47 entries, Courtelary to Rive Gauche
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Fertility         47 non-null     float64
 1   Agriculture       47 non-null     float64
 2   Examination       47 non-null     int64  
 3   Education         47 non-null     int64  
 4   Catholic          47 non-null     float64
 5   Infant.Mortality  47 non-null     float64
dtypes: float64(4), int64(2)
memory usage: 2.6+ KB


In [98]:
# using kbest and recursive features to select the top 2 features in predicting tip
train_validate, test = train_test_split(swiss, test_size = .2, random_state = 123)
train, validate = train_test_split(train_validate, test_size = .3, random_state = 123)

In [100]:
train_scaled, validate_scaled, test_scaled = standard_scaler_tvt(train, validate, test)

In [101]:
target = "Fertility"

# split train into X (dataframe, drop target) & y (series, keep target only)
X_train = train.drop(columns = [target])
y_train = train[target]

# split validate into X (dataframe, drop target) & y (series, keep target only)
X_validate = validate.drop(columns = [target])
y_validate = validate[target]

# split test into X (dataframe, drop target) & y (series, keep target only)
X_test = test.drop(columns = [target])
y_test = test[target]

# split train into X (dataframe, drop target) & y (series, keep target only)
X_train_scaled = train_scaled.drop(columns = [target])
y_train_scaled = train_scaled[target]

# split validate into X (dataframe, drop target) & y (series, keep target only)
X_validate_scaled = validate_scaled.drop(columns = [target])
y_validate_scaled = validate_scaled[target]

# split test into X (dataframe, drop target) & y (series, keep target only)
X_test_scaled = test_scaled.drop(columns = [target])
y_test_scaled = test_scaled[target]

In [102]:
# select kbest
k = 3

select_kbest(X_train, y_train, k)

['Examination', 'Catholic', 'Infant.Mortality']

In [104]:
select_rfe(X_train_scaled, y_train_scaled, k)

(['Examination', 'Catholic', 'Infant.Mortality'],
                 Var  Rank
 1       Examination     1
 3          Catholic     1
 4  Infant.Mortality     1
 0       Agriculture     2
 2         Education     3)