In [2]:
import pandas as pd
import numpy as np
import env

from pydataset import data
from utilities import split_dataframe, generate_xy_splits
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SelectKBest, f_regression, RFE

## 1. Load the tips dataset.

In [6]:
tips_df = data("tips")
tips_df.sample(14)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
231,24.01,2.0,Male,Yes,Sat,Dinner,4
127,8.52,1.48,Male,No,Thur,Lunch,2
82,16.66,3.4,Male,No,Thur,Lunch,2
80,17.29,2.71,Male,No,Thur,Lunch,2
149,9.78,1.73,Male,No,Thur,Lunch,2
241,27.18,2.0,Female,Yes,Sat,Dinner,2
120,24.08,2.92,Female,No,Thur,Lunch,4
238,32.83,1.17,Male,Yes,Sat,Dinner,2
171,50.81,10.0,Male,Yes,Sat,Dinner,3
35,17.78,3.27,Male,No,Sat,Dinner,2


* Create a column named tip_percentage. This should be the tip amount divided by the total bill.

In [7]:
tips_df['tip_percentage'] = tips_df.tip / tips_df.total_bill
tips_df.sample(5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_percentage
70,15.01,2.09,Male,Yes,Sat,Dinner,2,0.139241
243,17.82,1.75,Male,No,Sat,Dinner,2,0.098204
202,12.74,2.01,Female,Yes,Thur,Lunch,2,0.157771
201,18.71,4.0,Male,Yes,Thur,Lunch,3,0.213789
220,30.14,3.09,Female,Yes,Sat,Dinner,4,0.102522


* Create a column named price_per_person. This should be the total bill divided by the party size.

In [8]:
tips_df['price_per_person'] = tips_df.total_bill / tips_df['size']
tips_df.sample(5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_percentage,price_per_person
128,14.52,2.0,Female,No,Thur,Lunch,2,0.137741,7.26
124,15.95,2.0,Male,No,Thur,Lunch,2,0.125392,7.975
225,13.42,1.58,Male,Yes,Fri,Lunch,2,0.117735,6.71
229,13.28,2.72,Male,No,Sat,Dinner,2,0.204819,6.64
117,29.93,5.07,Male,No,Sun,Dinner,4,0.169395,7.4825


* Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount? The tip percentage?

I think total_bill is the most important feature for predicting tip amount. I think sex is the most important feature for predicting tip percentage.

* Use all the other numeric features to predict tip amount. Use select k best and recursive feature elimination to select the top 2 features. What are they?

In [9]:
tips_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 244 entries, 1 to 244
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   total_bill        244 non-null    float64
 1   tip               244 non-null    float64
 2   sex               244 non-null    object 
 3   smoker            244 non-null    object 
 4   day               244 non-null    object 
 5   time              244 non-null    object 
 6   size              244 non-null    int64  
 7   tip_percentage    244 non-null    float64
 8   price_per_person  244 non-null    float64
dtypes: float64(4), int64(1), object(4)
memory usage: 19.1+ KB


In [10]:
numeric_features_df = tips_df[['total_bill', 'tip', 'size', 'tip_percentage', 'price_per_person']]
numeric_features_df.sample(5)

Unnamed: 0,total_bill,tip,size,tip_percentage,price_per_person
219,7.74,1.44,2,0.186047,3.87
188,30.46,2.0,5,0.06566,6.092
156,29.85,5.14,5,0.172194,5.97
96,40.17,4.73,4,0.11775,10.0425
60,48.27,6.73,4,0.139424,12.0675


In [13]:
train, validate, test = split_dataframe(numeric_features_df)
splits = generate_xy_splits(train, validate, test, 'tip', [])

In [14]:
# Select K best
f_selector = SelectKBest(f_regression, k=2)
f_selector.fit(splits['X_train'], splits['y_train'])

feature_mask = f_selector.get_support()

f_feature = splits['X_train'].iloc[:, feature_mask].columns.tolist()
f_feature

['total_bill', 'size']

In [15]:
# RFE
lm = LinearRegression()
rfe = RFE(lm, 2)
rfe.fit(splits['X_train'], splits['y_train'])

rfe_feature_mask = rfe.support_

rfe_feature = splits['X_train'].iloc[:, rfe_feature_mask].columns.tolist()
rfe_feature

['size', 'tip_percentage']

* Use all the other numeric features to predict tip percentage. Use select k best and recursive feature elimination to select the top 2 features. What are they?

In [18]:
tip_percent_splits = generate_xy_splits(train, validate, test, 'tip_percentage', [])

In [19]:
# K best
f_selector.fit(tip_percent_splits['X_train'], tip_percent_splits['y_train'])

feature_mask = f_selector.get_support()

f_feature = tip_percent_splits['X_train'].iloc[:, feature_mask].columns.tolist()
f_feature

['total_bill', 'price_per_person']

In [20]:
rfe.fit(tip_percent_splits['X_train'], tip_percent_splits['y_train'])

rfe_feature_mask = rfe.support_

rfe_feature = tip_percent_splits['X_train'].iloc[:, rfe_feature_mask].columns.tolist()
rfe_feature

['tip', 'size']

* Why do you think select k best and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features your are selecting?

The features may not be independent of each other, so as the target changes, the top two features may change as well. Also, ass you add more features, the interaction between newly added features and previous features may cause those combinations to be better predictors of the target.

## 2. Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [21]:
def select_kbest(predictors, targets, k):
    f_selector = SelectKBest(f_regression, k=k)
    f_selector.fit(predictors, targets)

    feature_mask = f_selector.get_support()

    return predictors.iloc[:, feature_mask].columns.tolist()

In [22]:
select_kbest(splits['X_train'], splits['y_train'], 2)

['total_bill', 'size']

In [23]:
select_kbest(tip_percent_splits['X_train'], tip_percent_splits['y_train'], 2)

['total_bill', 'price_per_person']

## 3. Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [24]:
def rfe(predictors, targets, k, model_type):
    model = model_type
    
    rfe = RFE(model, k)
    rfe.fit(predictors, targets)

    rfe_feature_mask = rfe.support_

    return predictors.iloc[:, rfe_feature_mask].columns.tolist()

In [25]:
rfe(splits['X_train'], splits['y_train'], 2, LinearRegression())

['size', 'tip_percentage']

In [26]:
rfe(tip_percent_splits['X_train'], tip_percent_splits['y_train'], 2, LinearRegression())

['tip', 'size']

## 4. Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).

In [27]:
swiss_df = data("swiss")
swiss_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 47 entries, Courtelary to Rive Gauche
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Fertility         47 non-null     float64
 1   Agriculture       47 non-null     float64
 2   Examination       47 non-null     int64  
 3   Education         47 non-null     int64  
 4   Catholic          47 non-null     float64
 5   Infant.Mortality  47 non-null     float64
dtypes: float64(4), int64(2)
memory usage: 2.6+ KB


In [28]:
swiss_df = swiss_df.rename(columns={'Infant.Mortality' : 'Infant_Mortality'})
swiss_df.sample(14)

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant_Mortality
Martigwy,70.5,78.2,12,6,98.96,19.4
Entremont,69.3,84.9,7,6,99.68,19.8
Lavaux,65.1,73.0,19,9,2.84,20.0
Monthey,79.4,64.9,7,3,98.22,20.2
Yverdon,65.4,49.5,15,8,6.1,22.5
Conthey,75.5,85.9,3,2,99.71,15.1
Aigle,64.1,62.0,21,12,8.52,16.5
Vevey,58.3,26.8,25,19,18.46,20.9
La Vallee,54.3,15.2,31,20,2.15,10.8
Courtelary,80.2,17.0,15,12,9.96,22.2


In [31]:
train, validate, test = split_dataframe(swiss_df)
splits = generate_xy_splits(train, validate, test, target='Fertility', drop_columns=[])

In [32]:
select_kbest(splits['X_train'], splits['y_train'], 3)

['Agriculture', 'Examination', 'Education']

In [33]:
rfe(splits['X_train'], splits['y_train'], 2, LinearRegression())

['Education', 'Infant_Mortality']