In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns
import wrangle as w
import env
import warnings
warnings.filterwarnings("ignore")

from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

### 1. Load the tips dataset.

In [2]:
tips = sns.load_dataset('tips')
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


#### a. Create a column named price_per_person. This should be the total bill divided by the party size.

In [3]:
tips['price_per_person'] = tips['total_bill'] / tips['size']

In [4]:
tips.head(2)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person
0,16.99,1.01,Female,No,Sun,Dinner,2,8.495
1,10.34,1.66,Male,No,Sun,Dinner,3,3.446667


In [66]:
tips.corr()

Unnamed: 0,total_bill,tip,price_per_person,sex_Male,sex_Female,smoker_Yes,smoker_No,day_Thur,day_Fri,day_Sat,day_Sun,time_Lunch,time_Dinner
total_bill,1.0,0.675734,0.647497,0.144877,-0.144877,0.085721,-0.085721,-0.138174,-0.086168,0.054919,0.122953,-0.183118,0.183118
tip,0.675734,1.0,0.347393,0.088862,-0.088862,0.005929,-0.005929,-0.095879,-0.055463,-0.00279,0.125114,-0.121629,0.121629
price_per_person,0.647497,0.347393,1.0,0.108604,-0.108604,0.229938,-0.229938,-0.09329,0.024442,0.07653,-0.005598,-0.122431,0.122431
sex_Male,0.144877,0.088862,0.108604,1.0,-1.0,0.002816,-0.002816,-0.194445,-0.07106,0.053957,0.168106,-0.205231,0.205231
sex_Female,-0.144877,-0.088862,-0.108604,-1.0,1.0,-0.002816,0.002816,0.194445,0.07106,-0.053957,-0.168106,0.205231,-0.205231
smoker_Yes,0.085721,0.005929,0.229938,0.002816,-0.002816,1.0,-1.0,-0.128534,0.244316,0.155744,-0.181624,-0.054921,0.054921
smoker_No,-0.085721,-0.005929,-0.229938,-0.002816,0.002816,-1.0,1.0,0.128534,-0.244316,-0.155744,0.181624,0.054921,-0.054921
day_Thur,-0.138174,-0.095879,-0.09329,-0.194445,0.194445,-0.128534,0.128534,1.0,-0.169608,-0.43448,-0.392566,0.917996,-0.917996
day_Fri,-0.086168,-0.055463,0.024442,-0.07106,0.07106,0.244316,-0.244316,-0.169608,1.0,-0.216319,-0.195451,0.058159,-0.058159
day_Sat,0.054919,-0.00279,0.07653,0.053957,-0.053957,0.155744,-0.155744,-0.43448,-0.216319,1.0,-0.500682,-0.462709,0.462709


In [6]:
#Making dummy variables
# Get dummies for non-binary categorical variables
dummy_df = pd.get_dummies(tips[['sex', \
                              'smoker', \
                              'day', \
                              'time']], dummy_na=False, \
                              drop_first=False)

In [7]:
#Add this df back to tips
tips = pd.concat([tips, dummy_df], axis=1)

In [8]:
tips.head(2)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,sex_Male,sex_Female,smoker_Yes,smoker_No,day_Thur,day_Fri,day_Sat,day_Sun,time_Lunch,time_Dinner
0,16.99,1.01,Female,No,Sun,Dinner,2,8.495,0,1,0,1,0,0,0,1,0,1
1,10.34,1.66,Male,No,Sun,Dinner,3,3.446667,1,0,0,1,0,0,0,1,0,1


In [10]:
#Drop columns no longer needed, including size (since we have price per person)
tips = tips.drop(columns=['sex', 'smoker', 'day', 'time', 'size'])

In [11]:
tips.head(2)

Unnamed: 0,total_bill,tip,price_per_person,sex_Male,sex_Female,smoker_Yes,smoker_No,day_Thur,day_Fri,day_Sat,day_Sun,time_Lunch,time_Dinner
0,16.99,1.01,8.495,0,1,0,1,0,0,0,1,0,1
1,10.34,1.66,3.446667,1,0,0,1,0,0,0,1,0,1


#### b. Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount?

In [None]:
## The most important predictor should be total bill

#### b. Use select k best to select the top 2 features for predicting tip amount. What are they?

In [12]:
#split our data
def split_data(df):
    '''
    take in a DataFrame and return train, validate, and test DataFrames.
    return train, validate, test DataFrames.
    '''
    
    train_validate, test = train_test_split(df, test_size=.2, random_state=123)
    
    train, validate = train_test_split(train_validate, 
                                       test_size=.3, 
                                       random_state=123)
    return train, validate, test

In [13]:
train, validate, test = split_data(tips)

In [14]:
train.head(2)

Unnamed: 0,total_bill,tip,price_per_person,sex_Male,sex_Female,smoker_Yes,smoker_No,day_Thur,day_Fri,day_Sat,day_Sun,time_Lunch,time_Dinner
18,16.97,3.5,5.656667,0,1,0,1,0,0,0,1,0,1
172,7.25,5.15,3.625,1,0,1,0,0,0,0,1,0,1


In [15]:
#Scaling Train column 'total bill' and price per person
# create the scaler and fit/transform
mms = MinMaxScaler()
train['total_bill'] = mms.fit_transform(train[['total_bill']])
train['price_per_person'] = mms.fit_transform(train[['price_per_person']])

In [16]:
train.head(2)

Unnamed: 0,total_bill,tip,price_per_person,sex_Male,sex_Female,smoker_Yes,smoker_No,day_Thur,day_Fri,day_Sat,day_Sun,time_Lunch,time_Dinner
18,0.307114,3.5,0.150344,0,1,0,1,0,0,0,1,0,1
172,0.092355,5.15,0.032258,1,0,1,0,0,0,0,1,0,1


In [17]:
def x_and_y_train(train,validate,test):
    '''
    This function prepares train, validate, test for modeling
    '''
    # Split data into predicting variables (X) and target variable (y) and reset the index for each dataframe
    X_train_scaled = train.drop(columns='tip').reset_index(drop=True)
    y_train = train[['tip']].reset_index(drop=True)

    X_validate_scaled = validate.drop(columns='tip').reset_index(drop=True)
    y_validate = validate[['tip']].reset_index(drop=True)

    X_test_scaled = test.drop(columns='tip').reset_index(drop=True)
    y_test = test[['tip']].reset_index(drop=True)
    
    return X_train_scaled, X_validate_scaled, X_test_scaled, y_train, y_validate, y_test

In [18]:
X_train_scaled, X_validate_scaled, X_test_scaled, y_train, y_validate, y_test = x_and_y_train(train,validate,test)

In [19]:
X_train_scaled.head(2)

Unnamed: 0,total_bill,price_per_person,sex_Male,sex_Female,smoker_Yes,smoker_No,day_Thur,day_Fri,day_Sat,day_Sun,time_Lunch,time_Dinner
0,0.307114,0.150344,0,1,0,1,0,0,0,1,0,1
1,0.092355,0.032258,1,0,1,0,0,0,0,1,0,1


In [20]:
# parameters: f_regression stats test, give me 2 features
f_selector = SelectKBest(f_regression, k=2)

In [21]:
# find the top 2 X's correlated with y
f_selector.fit(X_train_scaled, y_train)

SelectKBest(k=2, score_func=<function f_regression at 0x1557d60d0>)

In [22]:
# boolean mask of whether the column was selected or not. 
feature_mask = f_selector.get_support()

In [65]:
f_selector.get_support()

array([ True,  True, False, False, False, False, False, False, False,
       False, False, False])

In [23]:
# get list of top K features. 
f_feature = X_train_scaled.iloc[:,feature_mask].columns.tolist()

In [24]:
f_feature

['total_bill', 'price_per_person']

#### Best two features are total_bill and price_per_person

### d. Use recursive feature elimination to select the top 2 features for tip amount. What are they?

In [25]:
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE

In [26]:
#Initalize linear regression object
lm = LinearRegression()

In [27]:
#Initialize the RFE object, setting the hyperparameters to be our linear regression object created
# above (as the algorithm to test the features on) and the number of features to return to be 2.
rfe = RFE(lm, n_features_to_select=2)

In [28]:
# Transforming data using RFE
X_rfe = rfe.fit_transform(X_train_scaled,y_train)

In [29]:
#Fitting the data to model
lm.fit(X_rfe,y_train)

LinearRegression()

In [34]:
X_rfe

array([[0., 1.],
       [0., 1.],
       [0., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 0.],
       [1., 0.],
       [0., 0.],
       [0., 1.],
       [0., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 0.],
       [0., 0.],
       [1., 0.],
       [0., 0.],
       [0., 0.],
       [1., 0.],
       [0., 0.],
       [0., 1.],
       [1., 0.],
       [0., 0.],
       [0., 1.],
       [0., 0.],
       [0., 0.],
       [1., 0.],
       [0., 0.],
       [0., 1.],
       [0., 0.],
       [0., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 0.],
       [1., 0.],
       [0., 0.],
       [0., 1.],
       [1., 0.],
       [0., 0.],
       [0., 0.],
       [0., 1.],
       [0., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.

In [30]:
# If we want a list of the features that remain, we can use .support_ similar to how 
# we used .get_support() with SelectKBest.
mask = rfe.support_

In [35]:
rfe.support_

array([False, False, False, False, False, False, False, False,  True,
        True, False, False])

In [31]:
rfe_features = X_train_scaled.loc[:,mask].columns.tolist()

In [32]:
print(str(len(rfe_features)), 'selected features')
print(rfe_features)

2 selected features
['day_Sat', 'day_Sun']


In [None]:
# We can also get a ranking of the features using rfe.ranking_. This will return a 1 for the features
# that were selected. So, since we said we wanted 2 features to remain, the top two features will have
# a rank of 1. The features that were eliminated will be ranked accordingly. In this case, the third feature
# will have a rank of 2. However, if we had more than 1 feature that was eliminated, they would all have 
# different ranks.

In [33]:
var_ranks = rfe.ranking_
var_names = X_train_scaled.columns.tolist()

pd.DataFrame({'Var': var_names, 'Rank': var_ranks})

Unnamed: 0,Var,Rank
0,total_bill,10
1,price_per_person,11
2,sex_Male,7
3,sex_Female,6
4,smoker_Yes,4
5,smoker_No,5
6,day_Thur,3
7,day_Fri,2
8,day_Sat,1
9,day_Sun,1


### e. Why do you think select k best and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features you are selecting?

#### I

In [36]:
#Trying different way

# initialize the ML algorithm
lm = LinearRegression()

# create the rfe object, indicating the ML object (lm) and the number of features I want to end up with. 
rfe = RFE(lm, n_features_to_select=2)

# fit the data using RFE
rfe.fit(X_train_scaled,y_train)  

# get the mask of the columns selected
feature_mask = rfe.support_

# get list of the column names. 
rfe_feature = X_train_scaled.iloc[:,feature_mask].columns.tolist()

In [37]:
rfe_feature

['day_Sat', 'day_Sun']

In [38]:
# view list of columns and their ranking

# get the ranks
var_ranks = rfe.ranking_
# get the variable names
var_names = X_train_scaled.columns.tolist()
# combine ranks and names into a df for clean viewing
rfe_ranks_df = pd.DataFrame({'Var': var_names, 'Rank': var_ranks})
# sort the df by rank
rfe_ranks_df.sort_values('Rank')

Unnamed: 0,Var,Rank
8,day_Sat,1
9,day_Sun,1
7,day_Fri,2
6,day_Thur,3
4,smoker_Yes,4
5,smoker_No,5
3,sex_Female,6
2,sex_Male,7
10,time_Lunch,8
11,time_Dinner,9


In [39]:
##Still the same output

### 2. Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [62]:
def select_kbest(X,y,num_features):
    # parameters: f_regression stats test, give me 2 features
    f_selector = SelectKBest(f_regression, k=num_features)
    # find the top 2 X's correlated with y
    f_selector.fit(X, y)
    # boolean mask of whether the column was selected or not. 
    feature_mask = f_selector.get_support()
    # get list of top K features. 
    f_feature = X.iloc[:,feature_mask].columns.tolist()
    return f_feature

In [63]:
f_feature = select_kbest(X_train_scaled, y_train, 3)

In [64]:
f_feature

['total_bill', 'price_per_person', 'day_Sun']

### 3. Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [46]:
def rfe(X,y, k=2):
    #Set Regression Model
    lm = LinearRegression()
    #Initialize RFE
    rfe = RFE(lm, n_features_to_select=2)
    rfe.fit(X, y)
    #Making results into a readable dataframe
    rfe_rank=pd.DataFrame(
    {
        'rfe_ranking': rfe.ranking_
    },index = X.columns)
    rfe_rank.sort_values(by='rfe_ranking')
    return rfe_rank

In [48]:
rfe_rank = rfe(X_train_scaled, y_train)

In [49]:
rfe_rank

Unnamed: 0,rfe_ranking
total_bill,10
price_per_person,11
sex_Male,7
sex_Female,6
smoker_Yes,4
smoker_No,5
day_Thur,3
day_Fri,2
day_Sat,1
day_Sun,1


### 4. Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).

In [68]:
from pydataset import data

In [69]:
swiss = data('swiss')

In [70]:
swiss.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [71]:
swiss.info()

<class 'pandas.core.frame.DataFrame'>
Index: 47 entries, Courtelary to Rive Gauche
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Fertility         47 non-null     float64
 1   Agriculture       47 non-null     float64
 2   Examination       47 non-null     int64  
 3   Education         47 non-null     int64  
 4   Catholic          47 non-null     float64
 5   Infant.Mortality  47 non-null     float64
dtypes: float64(4), int64(2)
memory usage: 2.6+ KB


In [73]:
X = swiss.drop(columns=['Fertility'])
y = swiss['Fertility']

In [74]:
select_kbest(X,y,3)

['Examination', 'Education', 'Catholic']

In [75]:
rfe(X,y,3)

Unnamed: 0,rfe_ranking
Agriculture,3
Examination,2
Education,1
Catholic,4
Infant.Mortality,1
