# Feature Engineering Exercises

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Exercise 1
Load the tips dataset.

In [2]:
tips = sns.load_dataset('tips')

In [3]:
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [4]:
tips.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.3 KB


##### category columns

In [5]:
# boolean array with one entry per column: True where the dtype is 'category'
mask = (tips.dtypes == "category").to_numpy()

In [6]:
# keep every row, but only the columns the boolean mask marks as 'category'
cat_tips = tips.loc[:, mask]

In [7]:
# print how often each level occurs in every categorical column,
# with blank lines separating one column's table from the next
for name, levels in cat_tips.items():
    print(levels.value_counts())
    print("\n")

Male      157
Female     87
Name: sex, dtype: int64


No     151
Yes     93
Name: smoker, dtype: int64


Sat     87
Sun     76
Thur    62
Fri     19
Name: day, dtype: int64


Dinner    176
Lunch      68
Name: time, dtype: int64




##### dummy variables

In [8]:
# one-hot encode every categorical column; the first level of each column is
# dropped, so a row of all zeros stands for that dropped level
dummy_tips = pd.get_dummies(cat_tips, drop_first=True)

In [9]:
# adding the dummy variables to the main dataframe as extra columns (axis=1);
# the original categorical columns are still present after this step
tips = pd.concat([tips, dummy_tips], axis=1)

In [10]:
# dropping the original categorical columns, which are now represented
# by the dummy variables (this modifies `tips` in place)
tips.drop(columns=cat_tips.columns, inplace=True)

In [11]:
tips.head()

Unnamed: 0,total_bill,tip,size,sex_Female,smoker_No,day_Fri,day_Sat,day_Sun,time_Dinner
0,16.99,1.01,2,1,1,0,0,1,1
1,10.34,1.66,3,0,1,0,0,1,1
2,21.01,3.5,3,0,1,0,0,1,1
3,23.68,3.31,2,0,1,0,0,1,1
4,24.59,3.61,4,1,1,0,0,1,1


### Exercise 1a
Create a column named tip_percentage. This should be the tip amount divided by the total bill.

In [12]:
# tip expressed as a percentage of the total bill, rounded to two decimals
tips['tip_percentage'] = (tips['tip'] / tips['total_bill'] * 100).round(2)

In [13]:
tips.head()

Unnamed: 0,total_bill,tip,size,sex_Female,smoker_No,day_Fri,day_Sat,day_Sun,time_Dinner,tip_percentage
0,16.99,1.01,2,1,1,0,0,1,1,5.94
1,10.34,1.66,3,0,1,0,0,1,1,16.05
2,21.01,3.5,3,0,1,0,0,1,1,16.66
3,23.68,3.31,2,0,1,0,0,1,1,13.98
4,24.59,3.61,4,1,1,0,0,1,1,14.68


### Exercise 1b
Create a column named price_per_person. This should be the total bill divided by the party size.

In [14]:
# total bill split evenly across the party; 'size' is read with brackets
# because `tips.size` is the DataFrame attribute, not the column
tips['price_per_person'] = tips['total_bill'].div(tips['size'])

In [15]:
tips.head()

Unnamed: 0,total_bill,tip,size,sex_Female,smoker_No,day_Fri,day_Sat,day_Sun,time_Dinner,tip_percentage,price_per_person
0,16.99,1.01,2,1,1,0,0,1,1,5.94,8.495
1,10.34,1.66,3,0,1,0,0,1,1,16.05,3.446667
2,21.01,3.5,3,0,1,0,0,1,1,16.66,7.003333
3,23.68,3.31,2,0,1,0,0,1,1,13.98,11.84
4,24.59,3.61,4,1,1,0,0,1,1,14.68,6.1475


### Exercise 1c
Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount? The tip percentage?

**I believe the most important features in predicting the tip amount and tip percentage will be total bill, time and day.**


### Exercise 1d
Use all the other numeric features to predict tip amount. Use select k best and recursive feature elimination to select the top 2 features. What are they?

##### splitting the data

In [16]:
from sklearn.model_selection import train_test_split

# first hold out 20% of the rows as the test set; random_state fixes the
# shuffle so the split is reproducible
train_validate, test = train_test_split(tips,
                                        test_size=.2,
                                        random_state=123)
# then split the remaining rows 70/30 into train and validate
train, validate = train_test_split(train_validate, 
                                   test_size=.3,
                                   random_state=123)

In [17]:
train.shape, validate.shape, test.shape

((136, 11), (59, 11), (49, 11))

##### split into X and y dataframes

In [18]:
# 'price_per_person' already combines 'total_bill' and 'size', so both are
# left out of the predictors; 'tip' is removed because it is our target
X_train, X_validate, X_test = (
    split.drop(columns=['tip', 'total_bill', 'size'])
    for split in (train, validate, test)
)

# the target is kept as a single-column dataframe for each split
y_train, y_validate, y_test = (
    split[['tip']] for split in (train, validate, test)
)

In [19]:
X_train.head()

Unnamed: 0,sex_Female,smoker_No,day_Fri,day_Sat,day_Sun,time_Dinner,tip_percentage,price_per_person
18,1,1,0,0,1,1,20.62,5.656667
172,0,0,0,0,1,1,71.03,3.625
118,1,1,0,0,0,0,14.48,6.215
28,0,1,0,1,0,1,19.82,10.85
237,0,0,0,1,0,1,3.56,16.415


##### Scale

In [20]:
from sklearn.preprocessing import MinMaxScaler

# learn each feature's minimum and maximum from the training split only
scaler = MinMaxScaler(copy=True).fit(X_train)

# apply that same train-fitted scaling to all three splits
X_train_scaled, X_validate_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_validate, X_test)
)

In [21]:
# wrap each scaled array back into a dataframe carrying the column names
# and row labels of the unscaled split it was computed from
X_train_scaled = pd.DataFrame(X_train_scaled,
                              columns=X_train.columns.values,
                              index=X_train.index.values)

X_validate_scaled = pd.DataFrame(X_validate_scaled,
                                 columns=X_validate.columns.values,
                                 index=X_validate.index.values)

X_test_scaled = pd.DataFrame(X_test_scaled,
                             columns=X_test.columns.values,
                             index=X_test.index.values)

In [22]:
X_train_scaled.head()

Unnamed: 0,sex_Female,smoker_No,day_Fri,day_Sat,day_Sun,time_Dinner,tip_percentage,price_per_person
18,1.0,1.0,0.0,0.0,1.0,1.0,0.252853,0.150344
172,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.032258
118,1.0,1.0,0.0,0.0,0.0,0.0,0.16185,0.182796
28,0.0,1.0,0.0,1.0,0.0,1.0,0.240996,0.452194
237,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.775647


##### SelectKbest

In [23]:
from sklearn.feature_selection import SelectKBest, f_regression

In [24]:
# score every feature against the target with f_regression and
# keep the two highest-scoring ones
f_selector = SelectKBest(f_regression, k=2).fit(X_train_scaled, y_train['tip'])

# the training data reduced to only the selected features
X_train_reduced = f_selector.transform(X_train_scaled)

# boolean mask over the columns: True for each selected feature
f_support = f_selector.get_support()

# index the column labels with the mask to get the selected feature names
f_feature = X_train_scaled.columns[f_support].tolist()

f_feature

['tip_percentage', 'price_per_person']

##### RFE

In [25]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

In [26]:
# linear regression is the estimator RFE refits while eliminating features
lm = LinearRegression()

# keep the top 2 features; n_features_to_select is passed by keyword because
# newer scikit-learn releases reject it as a positional argument
rfe = RFE(lm, n_features_to_select=2)

# fit RFE on the training data (it repeatedly fits the estimator and prunes
# the least important feature until 2 remain) and reduce X to those 2 features
X_rfe = rfe.fit_transform(X_train_scaled, y_train.tip)

# boolean mask over the columns: True for each feature RFE kept
mask = rfe.support_

# index the column labels with the mask to get the selected feature names
X_reduced_scaled_rfe = X_train_scaled.columns[mask].tolist()

X_reduced_scaled_rfe

['tip_percentage', 'price_per_person']

### Exercise 1e
Use all the other numeric features to predict tip percentage. Use select k best and recursive feature elimination to select the top 2 features. What are they?

In [27]:
# 'price_per_person' already combines 'total_bill' and 'size', so both are
# left out of the predictors; 'tip_percentage' is removed because it is
# our target
X_train2, X_validate2, X_test2 = (
    split.drop(columns=['tip_percentage', 'total_bill', 'size'])
    for split in (train, validate, test)
)

# the target is kept as a single-column dataframe for each split
y_train2, y_validate2, y_test2 = (
    split[['tip_percentage']] for split in (train, validate, test)
)

In [28]:
X_train2.head()

Unnamed: 0,tip,sex_Female,smoker_No,day_Fri,day_Sat,day_Sun,time_Dinner,price_per_person
18,3.5,1,1,0,0,1,1,5.656667
172,5.15,0,0,0,0,1,1,3.625
118,1.8,1,1,0,0,0,0,6.215
28,4.3,0,1,0,1,0,1,10.85
237,1.17,0,0,0,1,0,1,16.415


In [29]:
# create a fresh scaler and learn each feature's minimum and maximum
# from the training split only
scaler = MinMaxScaler(copy=True).fit(X_train2)

# apply that same train-fitted scaling to all three splits
X_train_scaled2, X_validate_scaled2, X_test_scaled2 = (
    scaler.transform(split) for split in (X_train2, X_validate2, X_test2)
)

In [30]:
# storing the scaled data as dataframes; every frame takes its column names
# and row labels from the matching *2 split, because the first feature set
# (X_validate / X_test) has different columns in a different order
X_train_scaled2 = pd.DataFrame(X_train_scaled2,
                               columns=X_train2.columns.values,
                               index=X_train2.index.values)

X_validate_scaled2 = pd.DataFrame(X_validate_scaled2,
                                  columns=X_validate2.columns.values,
                                  index=X_validate2.index.values)

X_test_scaled2 = pd.DataFrame(X_test_scaled2,
                              columns=X_test2.columns.values,
                              index=X_test2.index.values)

In [31]:
X_train_scaled2.head()

Unnamed: 0,tip,sex_Female,smoker_No,day_Fri,day_Sat,day_Sun,time_Dinner,price_per_person
18,0.3125,1.0,1.0,0.0,0.0,1.0,1.0,0.150344
172,0.51875,0.0,0.0,0.0,0.0,1.0,1.0,0.032258
118,0.1,1.0,1.0,0.0,0.0,0.0,0.0,0.182796
28,0.4125,0.0,1.0,0.0,1.0,0.0,1.0,0.452194
237,0.02125,0.0,0.0,0.0,1.0,0.0,1.0,0.775647


##### SelectKBest

In [32]:
# score every feature against the target with f_regression and
# keep the two highest-scoring ones
f_selector = SelectKBest(f_regression, k=2).fit(
    X_train_scaled2, y_train2['tip_percentage']
)

# the training data reduced to only the selected features
X_train_reduced = f_selector.transform(X_train_scaled2)

# boolean mask over the columns: True for each selected feature
f_support = f_selector.get_support()

# index the column labels with the mask to get the selected feature names
f_feature = X_train_scaled2.columns[f_support].tolist()

f_feature

['tip', 'price_per_person']

##### RFE

In [33]:
# linear regression is the estimator RFE refits while eliminating features
lm = LinearRegression()

# keep the top 2 features; n_features_to_select is passed by keyword because
# newer scikit-learn releases reject it as a positional argument
rfe = RFE(lm, n_features_to_select=2)

# fit RFE on the training data (it repeatedly fits the estimator and prunes
# the least important feature until 2 remain) and reduce X to those 2 features
X_rfe = rfe.fit_transform(X_train_scaled2, y_train2.tip_percentage)

# boolean mask over the columns: True for each feature RFE kept
mask = rfe.support_

# index the column labels with the mask to get the selected feature names
X_reduced_scaled_rfe = X_train_scaled2.columns[mask].tolist()

X_reduced_scaled_rfe

['tip', 'price_per_person']

### Exercise 1f
Why do you think select k best and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features you are selecting?

**SelectKBest ranks features by how strongly each one, on its own, is correlated with the target variable. RFE with a linear regression algorithm instead returns the features that lead to the best performing linear regression model. Although the two approaches are similar, they are not exactly the same, which is why we can get two different answers.**

**The top features still differ even when you increase the number of features selected, but the two lists have a lot of overlapping features.**

### Exercise 2
Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [34]:
def select_kbest(predictors, target, number_of_features):
    """Return the names of the top features chosen by SelectKBest.

    Parameters
    ----------
    predictors : DataFrame of candidate features (its column labels are
        what gets returned).
    target : the values to predict, aligned row-for-row with `predictors`.
    number_of_features : how many features to keep (SelectKBest's `k`).

    Returns
    -------
    list of the selected column names, in the order they appear in
    `predictors`.
    """
    # f_regression scores each feature individually against the target
    selector = SelectKBest(f_regression, k=number_of_features)
    selector.fit(predictors, target)

    # get_support() is a boolean mask over the columns; indexing the column
    # labels with it leaves only the selected names
    return predictors.columns[selector.get_support()].tolist()

In [35]:
select_kbest(X_train_scaled, y_train.tip, 7)

['sex_Female',
 'smoker_No',
 'day_Fri',
 'day_Sun',
 'time_Dinner',
 'tip_percentage',
 'price_per_person']

### Exercise 3
Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [36]:
def rfe(predictors, target, number_of_features):
    """Return the names of the features kept by recursive feature elimination.

    A LinearRegression estimator is used as the model RFE refits while it
    prunes features.

    Parameters
    ----------
    predictors : DataFrame of candidate features (its column labels are
        what gets returned).
    target : the values to predict, aligned row-for-row with `predictors`.
    number_of_features : how many features RFE should keep.

    Returns
    -------
    list of the selected column names, in the order they appear in
    `predictors`.
    """
    lm = LinearRegression()

    # n_features_to_select is passed by keyword because newer scikit-learn
    # releases reject it as a positional argument; the selector gets its own
    # local name so it does not shadow this function
    selector = RFE(lm, n_features_to_select=number_of_features)

    # only the fitted support mask is needed, so fit() is enough and the
    # transformed array is not computed
    selector.fit(predictors, target)

    # support_ is a boolean mask over the columns; indexing the column
    # labels with it leaves only the selected names
    return predictors.columns[selector.support_].tolist()

In [37]:
rfe(X_train_scaled, y_train.tip, 7)

['sex_Female',
 'smoker_No',
 'day_Fri',
 'day_Sat',
 'day_Sun',
 'tip_percentage',
 'price_per_person']

### Exercise 4
Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).

In [38]:
from pydataset import data
import modeling
import prepare

In [39]:
swiss = data('swiss')
swiss.info()

<class 'pandas.core.frame.DataFrame'>
Index: 47 entries, Courtelary to Rive Gauche
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Fertility         47 non-null     float64
 1   Agriculture       47 non-null     float64
 2   Examination       47 non-null     int64  
 3   Education         47 non-null     int64  
 4   Catholic          47 non-null     float64
 5   Infant.Mortality  47 non-null     float64
dtypes: float64(4), int64(2)
memory usage: 2.6+ KB


In [40]:
swiss.describe()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
count,47.0,47.0,47.0,47.0,47.0,47.0
mean,70.142553,50.659574,16.489362,10.978723,41.14383,19.942553
std,12.491697,22.711218,7.977883,9.615407,41.70485,2.912697
min,35.0,1.2,3.0,1.0,2.15,10.8
25%,64.7,35.9,12.0,6.0,5.195,18.15
50%,70.4,54.1,16.0,8.0,15.14,20.0
75%,78.45,67.65,22.0,12.0,93.125,21.7
max,92.5,89.7,37.0,53.0,100.0,26.6


In [42]:
train, validate, test = prepare.telco_split(swiss)

In [43]:
train.shape, validate.shape, test.shape

((27, 6), (10, 6), (10, 6))

In [44]:
# every column except the target 'Fertility' is a predictor
X_train, X_validate, X_test = (
    split.drop(columns=['Fertility']) for split in (train, validate, test)
)

# the target is kept as a single-column dataframe for each split
y_train, y_validate, y_test = (
    split[['Fertility']] for split in (train, validate, test)
)

In [45]:
X_train_scaled, X_validate_scaled, X_test_scaled = modeling.scaling(X_train, X_validate, X_test)

In [46]:
X_train_scaled.head()

Unnamed: 0,Agriculture,Examination,Education,Catholic,Infant.Mortality
Rive Droite,0.47439,0.40625,0.903226,0.492786,0.316327
Aubonne,0.729268,0.34375,0.193548,0.0,0.408163
Rolle,0.647561,0.40625,0.290323,0.055766,0.122449
Lavaux,0.796341,0.5,0.258065,0.005832,0.5
Nyone,0.526829,0.59375,0.354839,0.131689,0.163265


In [47]:
modeling.select_kbest(X_train_scaled, y_train.Fertility, 3)

['Examination', 'Catholic', 'Infant.Mortality']

In [48]:
modeling.rfe(X_train_scaled, y_train.Fertility, 3)

['Examination', 'Education', 'Infant.Mortality']

In [None]:
def scaling(train, validate, test):
    """Min-max scale three splits using parameters learned from `train` only.

    The scaler is fitted once, on `train`, and that same fitted scaler is
    applied to `validate` and `test`. Re-fitting on each split would scale
    every split by its own min/max, leaking information from the held-out
    data and making the splits incomparable. As a consequence, values in
    `validate`/`test` may fall slightly outside [0, 1].

    Parameters
    ----------
    train, validate, test : DataFrames with the same feature columns.

    Returns
    -------
    (train_scaled, validate_scaled, test_scaled) : DataFrames that keep the
    column names and row labels of their unscaled counterparts.
    """
    scaler = MinMaxScaler(copy=True)
    scaler.fit(train)

    def _scaled_frame(split):
        # transform with the train-fitted scaler and restore the labels
        return pd.DataFrame(scaler.transform(split),
                            columns=split.columns.values,
                            index=split.index.values)

    return _scaled_frame(train), _scaled_frame(validate), _scaled_frame(test)