In [1]:
# imports

import pandas as pd
import numpy as np

from pydataset import data

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, f_regression, RFE
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

import prepare
import wrangle


#### 1. Load the tips dataset.

#### - a. Create a column named price_per_person. This should be the total bill divided by the party size.

In [2]:
# load the tips dataset from pydataset into a DataFrame
tips= data('tips')

# peek at the first rows
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4


In [3]:
# check each column's dtype and non-null count
tips.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 244 entries, 1 to 244
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   total_bill  244 non-null    float64
 1   tip         244 non-null    float64
 2   sex         244 non-null    object 
 3   smoker      244 non-null    object 
 4   day         244 non-null    object 
 5   time        244 non-null    object 
 6   size        244 non-null    int64  
dtypes: float64(2), int64(1), object(4)
memory usage: 15.2+ KB


In [4]:
# add price_per_person: the total bill divided by the party size
tips['price_per_person'] = tips['total_bill'] / tips['size']

# peek at the first rows
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person
1,16.99,1.01,Female,No,Sun,Dinner,2,8.495
2,10.34,1.66,Male,No,Sun,Dinner,3,3.446667
3,21.01,3.5,Male,No,Sun,Dinner,3,7.003333
4,23.68,3.31,Male,No,Sun,Dinner,2,11.84
5,24.59,3.61,Female,No,Sun,Dinner,4,6.1475


#### - b. Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount?

My guess: `total_bill` and `size` (the party size).

In [5]:
# pairwise correlations; only the numeric columns appear in the result
# NOTE(review): on pandas >= 2.0 this presumably needs numeric_only=True while
# the object columns (sex, smoker, day, time) are still present -- confirm
tips.corr()

Unnamed: 0,total_bill,tip,size,price_per_person
total_bill,1.0,0.675734,0.598315,0.647497
tip,0.675734,1.0,0.489299,0.347393
size,0.598315,0.489299,1.0,-0.175412
price_per_person,0.647497,0.347393,-0.175412,1.0


In [6]:
# create dummy variables for the categorical columns; size is listed too, so the
# numeric size column is replaced by size_* indicator columns
# NOTE: tips is reassigned here, so this cell cannot be re-run without first
# reloading the data (the encoded columns no longer exist afterwards)
tips = pd.get_dummies(tips, columns = ['sex', 'smoker', 'day', 'time','size'])

# peek at the first rows
tips.head()

Unnamed: 0,total_bill,tip,price_per_person,sex_Female,sex_Male,smoker_No,smoker_Yes,day_Fri,day_Sat,day_Sun,day_Thur,time_Dinner,time_Lunch,size_1,size_2,size_3,size_4,size_5,size_6
1,16.99,1.01,8.495,1,0,1,0,0,0,1,0,1,0,0,1,0,0,0,0
2,10.34,1.66,3.446667,0,1,1,0,0,0,1,0,1,0,0,0,1,0,0,0
3,21.01,3.5,7.003333,0,1,1,0,0,0,1,0,1,0,0,0,1,0,0,0
4,23.68,3.31,11.84,0,1,1,0,0,0,1,0,1,0,0,1,0,0,0,0
5,24.59,3.61,6.1475,1,0,1,0,0,0,1,0,1,0,0,0,0,1,0,0


In [7]:
# drop redundant dummy columns: each is the complement of sex_Male / smoker_Yes
# NOTE(review): the day_*, time_* and size_* groups still keep every level, so
# each group remains redundant in the same way -- consider dropping one level
# of each as well
tips = tips.drop(columns= ['sex_Female', 'smoker_No'])

# peek at the first rows
tips.head()

Unnamed: 0,total_bill,tip,price_per_person,sex_Male,smoker_Yes,day_Fri,day_Sat,day_Sun,day_Thur,time_Dinner,time_Lunch,size_1,size_2,size_3,size_4,size_5,size_6
1,16.99,1.01,8.495,0,0,0,0,1,0,1,0,0,1,0,0,0,0
2,10.34,1.66,3.446667,1,0,0,0,1,0,1,0,0,0,1,0,0,0
3,21.01,3.5,7.003333,1,0,0,0,1,0,1,0,0,0,1,0,0,0
4,23.68,3.31,11.84,1,0,0,0,1,0,1,0,0,1,0,0,0,0
5,24.59,3.61,6.1475,0,0,0,0,1,0,1,0,0,0,0,1,0,0


In [8]:
# split the data into train, validate and test sets using a helper from the wrangle module
# NOTE(review): no seed is passed here -- confirm wrangle.train_val_test fixes
# its own random state so the split is reproducible
train, validate, test= wrangle.train_val_test(tips)

# show the shape of each split
train.shape,validate.shape, test.shape

((136, 17), (59, 17), (49, 17))

In [9]:
# scale every feature column (all columns except the target, tip) using a
# helper from the prepare module; the scaler itself is not requested back
train_scaled, validate_scaled, test_scaled= prepare.scale_data(train, validate, test, 
                                               columns_to_scale=train.drop(columns='tip').columns.tolist(),
                                                return_scaler=False)

# peek at the first rows
train_scaled.head()

Unnamed: 0,total_bill,tip,price_per_person,sex_Male,smoker_Yes,day_Fri,day_Sat,day_Sun,day_Thur,time_Dinner,time_Lunch,size_1,size_2,size_3,size_4,size_5,size_6
13,0.187557,1.57,0.298393,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
149,0.058081,1.73,0.092403,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
53,0.632691,5.2,0.370891,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
116,0.230946,3.5,0.367421,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
48,0.577365,6.0,0.326881,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [10]:
# independent variables (features): every scaled column except the target
X_train_scaled = train_scaled.drop(columns='tip')

# dependent variable (target): the tip amount, taken from the unscaled train
# split (tip was not among the columns passed for scaling)
y_train = train.tip

#### c. Use select k best to select the top 2 features for predicting tip amount. What are they?

In [11]:
# SelectKBest

# initialize the selector: score features with f_regression and keep the best 2
f_selector = SelectKBest(f_regression, k=2 )

# fit the selector on the scaled training features and the target
f_selector.fit(X_train_scaled, y_train)

# boolean mask: True for each selected column
kbest_feature_mask = f_selector.get_support()

# uncomment to inspect the mask
# print(kbest_feature_mask)

# uncomment to inspect the column names the mask lines up with
# print(X_train_scaled.columns)

# names of the selected features
kbest_f_feature = X_train_scaled.iloc[:,kbest_feature_mask].columns.tolist()

# display the selected features
kbest_f_feature

['total_bill', 'size_2']

#### d. Use recursive feature elimination to select the top 2 features for tip amount. What are they?

In [12]:
# Recursive Feature Elimination


# estimator that RFE fits repeatedly to rank the features
lm = LinearRegression()

# initialize the RFE object to keep 2 features
# NOTE(review): the name rfe is reused by the rfe() function defined later in
# this notebook; re-running this cell after that definition rebinds rfe to this
# RFE object and breaks the later rfe(...) calls
rfe = RFE(lm, n_features_to_select=2)

# fit the object on the scaled training features and the target
rfe.fit(X_train_scaled, y_train)

# boolean mask: True for each selected column
rfe_feature_mask = rfe.support_

# uncomment to inspect the mask
# print(rfe_feature_mask)

# uncomment to inspect the column names the mask lines up with
# print(X_train_scaled.columns)

# names of the selected features
rfe_feature = X_train_scaled.iloc[:, rfe_feature_mask].columns.tolist()

# display the selected features
rfe_feature 

['total_bill', 'price_per_person']

#### 2. Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [13]:
def select_kbest(X, y, number):
    """Return the names of the `number` best predictors according to SelectKBest.

    X      -- DataFrame of predictors; its column labels are what gets returned
    y      -- target values
    number -- how many features to keep (passed to SelectKBest as k)

    Features are scored with f_regression. The result is a list of column
    names, in the same order as they appear in X.
    """
    # fit() returns the fitted selector, so construction and fitting can be chained
    selector = SelectKBest(score_func=f_regression, k=number).fit(X, y)

    # get_support() yields a boolean mask aligned with X's columns
    return X.columns[selector.get_support()].tolist()
    

In [14]:
# top 2 tips features via select_kbest; should match the manual SelectKBest result above
select_kbest(train_scaled.drop(columns='tip'),train_scaled.tip, 2)

['total_bill', 'size_2']

#### 3. Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [15]:
def rfe(X, y, number):
    """Return the names of the `number` predictors kept by recursive feature elimination.

    X      -- DataFrame of predictors; its column labels are what gets returned
    y      -- target values
    number -- how many features to keep (passed to RFE as n_features_to_select)

    A LinearRegression model is the estimator RFE fits. The result is a list
    of column names, in the same order as they appear in X.
    """
    selector = RFE(LinearRegression(), n_features_to_select=number)
    selector.fit(X, y)

    # support_ is a boolean mask aligned with X's columns
    return X.columns[selector.support_].tolist()


In [16]:
# top 2 tips features via rfe; should match the manual RFE result above
rfe(train_scaled.drop(columns='tip'),train_scaled.tip, 2)

['total_bill', 'price_per_person']

#### 4. Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).

In [17]:
# load the swiss dataset from pydataset into a DataFrame
swiss = data('swiss')

# peek at the first rows
swiss.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [18]:
# check each column's dtype and non-null count
swiss.info()

<class 'pandas.core.frame.DataFrame'>
Index: 47 entries, Courtelary to Rive Gauche
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Fertility         47 non-null     float64
 1   Agriculture       47 non-null     float64
 2   Examination       47 non-null     int64  
 3   Education         47 non-null     int64  
 4   Catholic          47 non-null     float64
 5   Infant.Mortality  47 non-null     float64
dtypes: float64(4), int64(2)
memory usage: 2.6+ KB


In [19]:
# split the swiss data into train, validate and test sets using a helper from the wrangle module
# NOTE: train, validate and test are rebound here, replacing the tips splits
# created earlier in the notebook
train, validate, test= wrangle.train_val_test(swiss)

# show the shape of each split
train.shape,validate.shape, test.shape

((25, 6), (12, 6), (10, 6))

In [20]:
# scale every swiss feature column (all columns except the target, Fertility);
# the *_scaled names are rebound, replacing the scaled tips frames
train_scaled, validate_scaled, test_scaled= prepare.scale_data(train, validate, test, 
                                               columns_to_scale=train.drop(columns='Fertility').columns.tolist(),
                                                return_scaler=False)

# peek at the first rows
train_scaled.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Herens,77.3,1.0,0.0,0.0,1.0,0.214286
Glane,92.4,0.752542,0.28125,0.117647,0.970776,1.0
Sarine,82.9,0.497175,0.34375,0.215686,0.911299,0.940476
Monthey,79.4,0.719774,0.0625,0.019608,0.981683,0.440476
Gruyere,82.4,0.588701,0.21875,0.098039,0.976024,0.535714


In [21]:
# top 3 swiss features for predicting Fertility according to SelectKBest
select_kbest(train_scaled.drop(columns='Fertility'),train_scaled.Fertility, 3)

['Examination', 'Education', 'Infant.Mortality']

In [22]:
# top 3 swiss features for predicting Fertility according to recursive feature elimination
rfe(train_scaled.drop(columns='Fertility'),train_scaled.Fertility, 3)

['Agriculture', 'Education', 'Catholic']