## Feature Engineering / Feature Selection

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_regression

##### 1. Load the tips dataset.



In [2]:
from pydataset import data

In [3]:
tips = data('tips')
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4


    a. Create a column named price_per_person. This should be the total bill divided by the party size.


In [4]:
tips['price_per_person'] = tips.total_bill / tips.size

In [5]:
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person
1,16.99,1.01,Female,No,Sun,Dinner,2,0.009947
2,10.34,1.66,Male,No,Sun,Dinner,3,0.006054
3,21.01,3.5,Male,No,Sun,Dinner,3,0.012301
4,23.68,3.31,Male,No,Sun,Dinner,2,0.013864
5,24.59,3.61,Female,No,Sun,Dinner,4,0.014397


In [6]:
# tips.size is producing a length, not the value in the column because it's a reserved word.
# change this.

In [7]:
tips = tips.rename(columns={'size':'size_of_party'})

In [8]:
tips['price_per_person'] = tips.total_bill / tips.size_of_party

In [9]:
tips.head()
# now price_per_person looks right.

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size_of_party,price_per_person
1,16.99,1.01,Female,No,Sun,Dinner,2,8.495
2,10.34,1.66,Male,No,Sun,Dinner,3,3.446667
3,21.01,3.5,Male,No,Sun,Dinner,3,7.003333
4,23.68,3.31,Male,No,Sun,Dinner,2,11.84
5,24.59,3.61,Female,No,Sun,Dinner,4,6.1475


    b. Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount?


    - total bill most important and then maybe time of day

    c. Use select k best to select the top 2 features for predicting tip amount. What are they?

In [10]:
#  first we need to wrangle the tips data
tips.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 244 entries, 1 to 244
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   total_bill        244 non-null    float64
 1   tip               244 non-null    float64
 2   sex               244 non-null    object 
 3   smoker            244 non-null    object 
 4   day               244 non-null    object 
 5   time              244 non-null    object 
 6   size_of_party     244 non-null    int64  
 7   price_per_person  244 non-null    float64
dtypes: float64(3), int64(1), object(4)
memory usage: 17.2+ KB


In [11]:
# data looks clean. There are no nulls and all of the dtypes look appropriate.
# labels are pythonic. Proceed to encoding cols for modeling
# Then splitting and scaling.

In [12]:
tips.sex.value_counts()

Male      157
Female     87
Name: sex, dtype: int64

In [13]:
tips['sex_encoded'] = tips.sex.map({'Female': 1, 'Male': 0})

In [14]:
tips.smoker.value_counts()

No     151
Yes     93
Name: smoker, dtype: int64

In [15]:
tips['smoker_encoded'] = tips.smoker.map({'No': 0, 'Yes': 1})

In [16]:
tips.day.value_counts()

Sat     87
Sun     76
Thur    62
Fri     19
Name: day, dtype: int64

In [17]:
dummy_df = pd.get_dummies(tips[['day']], drop_first=True)
dummy_df.head()

Unnamed: 0,day_Sat,day_Sun,day_Thur
1,0,1,0
2,0,1,0
3,0,1,0
4,0,1,0
5,0,1,0


In [18]:
tips = pd.concat([tips, dummy_df], axis=1)
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size_of_party,price_per_person,sex_encoded,smoker_encoded,day_Sat,day_Sun,day_Thur
1,16.99,1.01,Female,No,Sun,Dinner,2,8.495,1,0,0,1,0
2,10.34,1.66,Male,No,Sun,Dinner,3,3.446667,0,0,0,1,0
3,21.01,3.5,Male,No,Sun,Dinner,3,7.003333,0,0,0,1,0
4,23.68,3.31,Male,No,Sun,Dinner,2,11.84,0,0,0,1,0
5,24.59,3.61,Female,No,Sun,Dinner,4,6.1475,1,0,0,1,0


In [19]:
tips.time.value_counts()

Dinner    176
Lunch      68
Name: time, dtype: int64

In [20]:
# map time to Lunch and Dinner encoded column
tips['time_encoded'] = tips.time.map({'Lunch': 0, 'Dinner': 1})
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size_of_party,price_per_person,sex_encoded,smoker_encoded,day_Sat,day_Sun,day_Thur,time_encoded
1,16.99,1.01,Female,No,Sun,Dinner,2,8.495,1,0,0,1,0,1
2,10.34,1.66,Male,No,Sun,Dinner,3,3.446667,0,0,0,1,0,1
3,21.01,3.5,Male,No,Sun,Dinner,3,7.003333,0,0,0,1,0,1
4,23.68,3.31,Male,No,Sun,Dinner,2,11.84,0,0,0,1,0,1
5,24.59,3.61,Female,No,Sun,Dinner,4,6.1475,1,0,0,1,0,1


In [21]:
from sklearn.model_selection import train_test_split

In [22]:
train_val, test = train_test_split(tips, test_size=0.2, random_state=9751)
train, val = train_test_split(train_val, test_size=0.3, random_state=9751)

In [23]:
# now scale the continuous feature 'total_bill' and
# 'price_per_person'
import sklearn.preprocessing

In [24]:
# make the scaler
scaler = sklearn.preprocessing.MinMaxScaler()

In [25]:
cols_to_scale = train[['total_bill', 'price_per_person']]

In [26]:
df_scaled = pd.DataFrame(
    scaler.fit_transform(cols_to_scale), index=train.index, columns=('total_bill_scaled', 'price_per_person_scaled'))

In [27]:
df_scaled.columns

Index(['total_bill_scaled', 'price_per_person_scaled'], dtype='object')

In [28]:
train_scaled = pd.concat([train, df_scaled], axis=1)
train_scaled.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size_of_party,price_per_person,sex_encoded,smoker_encoded,day_Sat,day_Sun,day_Thur,time_encoded,total_bill_scaled,price_per_person_scaled
82,16.66,3.4,Male,No,Thur,Lunch,2,8.33,0,0,0,0,1,0,0.284667,0.387934
18,16.29,3.71,Male,No,Sun,Dinner,3,5.43,0,0,0,1,0,1,0.276917,0.1817
169,10.59,1.61,Female,Yes,Sat,Dinner,2,5.295,1,1,1,0,0,1,0.15752,0.172099
6,25.29,4.71,Male,No,Sun,Dinner,4,6.3225,0,0,0,1,0,1,0.465438,0.24517
232,15.69,3.0,Male,Yes,Sat,Dinner,3,5.23,0,1,1,0,0,1,0.264349,0.167477


In [29]:
train.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size_of_party,price_per_person,sex_encoded,smoker_encoded,day_Sat,day_Sun,day_Thur,time_encoded
82,16.66,3.4,Male,No,Thur,Lunch,2,8.33,0,0,0,0,1,0
18,16.29,3.71,Male,No,Sun,Dinner,3,5.43,0,0,0,1,0,1
169,10.59,1.61,Female,Yes,Sat,Dinner,2,5.295,1,1,1,0,0,1
6,25.29,4.71,Male,No,Sun,Dinner,4,6.3225,0,0,0,1,0,1
232,15.69,3.0,Male,Yes,Sat,Dinner,3,5.23,0,1,1,0,0,1


In [32]:
# now scale the total_bill column in the val and test sets
val[['total_bill_scaled', 'price_per_person_scaled']] = scaler.transform(
    val[['total_bill', 'price_per_person']])
test[['total_bill_scaled', 'price_per_person_scaled']] = scaler.transform(
    test[['total_bill', 'price_per_person']])

In [None]:
# make list of best cols for modeling:
train.columns

In [None]:
X_cols = []

In [None]:
# now split the features from the target
train.columns

In [None]:
# make and fit the KBest Selector with f_regression for the target 'tip' which is continuous
f_selector = SelectKBest()