In [1]:
# imports
import pandas as pd
import numpy as np
import seaborn as sns
import wrangle
import warnings
warnings.filterwarnings("ignore")

import sklearn.preprocessing as skl_pp

from sklearn.feature_selection import SelectKBest, RFE, f_regression, SequentialFeatureSelector, f_classif

from sklearn.linear_model import LinearRegression

1. Load the tips dataset.

- Create a column named price_per_person. This should be the total bill divided by the party size.
- Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount?
- Use select k best to select the top 2 features for predicting tip amount. What are they?
- Use recursive feature elimination to select the top 2 features for tip amount. What are they?
- Why do you think select k best and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features your are selecting?

In [2]:
# load the 'tips' dataset
df = sns.load_dataset('tips')

In [3]:
# # Create a column named price_per_person. This should be the total bill divided by the party size.
df['price_per_person'] = df['total_bill'] / df['size']
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person
0,16.99,1.01,Female,No,Sun,Dinner,2,8.495
1,10.34,1.66,Male,No,Sun,Dinner,3,3.446667
2,21.01,3.5,Male,No,Sun,Dinner,3,7.003333
3,23.68,3.31,Male,No,Sun,Dinner,2,11.84
4,24.59,3.61,Female,No,Sun,Dinner,4,6.1475


In [4]:
# convert smoker into is_smoker 1 or 0
df['is_smoker'] = df.smoker.isin(['Yes']).astype(int)
# convert sex into boolean 1 = male, 0 = dinner
df['is_male'] = df.sex.isin(['Male']).astype(int)
# create dummy columns for day
dummy_df = pd.get_dummies(df[['day']], dummy_na=False, drop_first=[True])
df = pd.concat([df, dummy_df], axis=1)
# create a new column named day_Thur if all other days are 0 than it is 1
df['day_Thur'] = (df.day_Fri + df.day_Sat + df.day_Sun == 0).astype(int)
# drop the columns we don't need
df.drop(['day', 'time', 'sex', 'smoker'], axis=1, inplace=True)
df.head()

Unnamed: 0,total_bill,tip,size,price_per_person,is_smoker,is_male,day_Fri,day_Sat,day_Sun,day_Thur
0,16.99,1.01,2,8.495,0,0,0,0,1,0
1,10.34,1.66,3,3.446667,0,1,0,0,1,0
2,21.01,3.5,3,7.003333,0,1,0,0,1,0
3,23.68,3.31,2,11.84,0,1,0,0,1,0
4,24.59,3.61,4,6.1475,0,0,0,0,1,0


Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount?

- The most important features I chose to determinr the tip amount are:
    - total_bill
    - size of the party
    - price per person


In [5]:
# split the data into training and test sets
train, validate, test = wrangle.split_dataframe(df)
# seperate the target variable from the features
y_train = train.tip
X_train = train.drop('tip', axis=1)
y_validate = validate.tip
X_validate = validate.drop('tip', axis=1)
y_test = test.tip
X_test = test.drop('tip', axis=1)

print(train.shape)
print(validate.shape)
print(test.shape)

(136, 10)
(59, 10)
(49, 10)


In [6]:
# scale the data 
scaler = skl_pp.MinMaxScaler()
# Fit it
scaler.fit(X_train)
# Use it
X_train_scaled = scaler.transform(X_train)
X_validate_scaled = scaler.transform(X_validate)
X_test_scaled = scaler.transform(X_test)

In [7]:
# convert the data into a dataframe
X_train_df = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_train_df

Unnamed: 0,total_bill,size,price_per_person,is_smoker,is_male,day_Fri,day_Sat,day_Sun,day_Thur
0,0.210561,0.2,0.187736,1.0,1.0,0.0,1.0,0.0,0.0
1,0.188688,0.2,0.158965,0.0,1.0,0.0,1.0,0.0,0.0
2,0.407203,0.6,0.133973,0.0,1.0,0.0,0.0,1.0,0.0
3,0.284578,0.2,0.285092,0.0,1.0,0.0,0.0,0.0,1.0
4,0.291648,0.2,0.294391,1.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
131,0.228679,0.2,0.211566,1.0,1.0,1.0,0.0,0.0,0.0
132,0.353734,0.2,0.376053,0.0,1.0,0.0,0.0,0.0,1.0
133,0.447636,0.2,0.499564,1.0,1.0,0.0,0.0,1.0,0.0
134,0.160407,0.2,0.121767,0.0,0.0,0.0,0.0,0.0,1.0


Use select k best to select the top 2 features for predicting tip amount. What are they?


In [8]:
# Use select k best to select the top 2 features for predicting tip amount.
kbest = SelectKBest(f_regression, k=2)
# kbest_class = SelectKBest(f_classif, k=2)
kbest.fit(X_train_df, y_train)
# kbest_class.fit(X_train_c, y_train)
kbest_results_num = pd.DataFrame(dict(p=kbest.pvalues_, f=kbest.scores_), index=X_train_df.columns)
kbest_results_num

Unnamed: 0,p,f
total_bill,5.022341e-17,92.909255
size,6.609409e-11,50.406244
price_per_person,0.002103984,9.835918
is_smoker,0.4162147,0.665093
is_male,0.09994374,2.744264
day_Fri,0.4022814,0.70597
day_Sat,0.4951505,0.467869
day_Sun,0.239863,1.393757
day_Thur,0.1338335,2.27495


In [9]:
X_train_df.columns[kbest.get_support()]

Index(['total_bill', 'size'], dtype='object')

In [10]:
X_train_transformed = pd.DataFrame(
    kbest.transform(X_train_df),
    index=X_train_df.index,
    columns=X_train_df.columns[kbest.get_support()]
)
X_train_transformed.head()

Unnamed: 0,total_bill,size
0,0.210561,0.2
1,0.188688,0.2
2,0.407203,0.6
3,0.284578,0.2
4,0.291648,0.2


Use recursive feature elimination to select the top 2 features for tip amount. What are they?


In [11]:
# Use recursive feature elimination to select the top 2 features for tip amount.
model = LinearRegression()
rfe = RFE(model, n_features_to_select=2)
rfe.fit(X_train_df, y_train)

RFE(estimator=LinearRegression(), n_features_to_select=2)

In [12]:
pd.DataFrame({'rfe_ranking': rfe.ranking_}, index=X_train_df.columns)

Unnamed: 0,rfe_ranking
total_bill,1
size,7
price_per_person,1
is_smoker,8
is_male,6
day_Fri,2
day_Sat,5
day_Sun,3
day_Thur,4


In [13]:
X_train_df.columns[rfe.get_support()]

Index(['total_bill', 'price_per_person'], dtype='object')

In [15]:
X_train_transformed = pd.DataFrame(
    rfe.transform(X_train_scaled),
    index=X_train_df.index,
    columns=X_train_df.columns[rfe.support_]
)
X_train_transformed.head()

Unnamed: 0,total_bill,price_per_person
0,0.210561,0.187736
1,0.188688,0.158965
2,0.407203,0.133973
3,0.284578,0.285092
4,0.291648,0.294391


Why do you think select k best and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features your are selecting?