In [1]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import seaborn as sns

from pydataset import data

from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

### 1. Load the `tips` dataset.

In [2]:
df = sns.load_dataset('tips')

df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.3 KB


### a. Create a column named `tip_percentage`. This should be the tip amount divided by the total bill.

In [4]:
df['tip_percentage'] = df['tip'] / df['total_bill']

df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_percentage
0,16.99,1.01,Female,No,Sun,Dinner,2,0.059447
1,10.34,1.66,Male,No,Sun,Dinner,3,0.160542
2,21.01,3.5,Male,No,Sun,Dinner,3,0.166587
3,23.68,3.31,Male,No,Sun,Dinner,2,0.13978
4,24.59,3.61,Female,No,Sun,Dinner,4,0.146808


### b. Create a column named `price_per_person`. This should be the total bill divided by the party size.

In [5]:
df['price_per_person'] = df['total_bill'] / df['size']

df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_percentage,price_per_person
0,16.99,1.01,Female,No,Sun,Dinner,2,0.059447,8.495
1,10.34,1.66,Male,No,Sun,Dinner,3,0.160542,3.446667
2,21.01,3.5,Male,No,Sun,Dinner,3,0.166587,7.003333
3,23.68,3.31,Male,No,Sun,Dinner,2,0.13978,11.84
4,24.59,3.61,Female,No,Sun,Dinner,4,0.146808,6.1475


### c. Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount? The tip percentage?

Features predicting tip amount:
- total_bill
- size
- day
- time

Features predicting tip percentage:
- total_bill
- tip
- size
- day
- time

In [6]:
df.dtypes

total_bill           float64
tip                  float64
sex                 category
smoker              category
day                 category
time                category
size                   int64
tip_percentage       float64
price_per_person     float64
dtype: object

In [7]:
# Flag the category-dtype columns; these are the ones that get dummy-encoded.

mask = (df.dtypes == "category").to_numpy()
mask

array([False, False,  True,  True,  True,  True, False, False, False])

In [8]:
obj_df = df.iloc[:, mask]
obj_df.columns

Index(['sex', 'smoker', 'day', 'time'], dtype='object')

In [9]:
# Create a new dummy dataframe:

dummy_df = pd.get_dummies(obj_df, dummy_na = False, drop_first = True)
dummy_df

Unnamed: 0,sex_Female,smoker_No,day_Fri,day_Sat,day_Sun,time_Dinner
0,1,1,0,0,1,1
1,0,1,0,0,1,1
2,0,1,0,0,1,1
3,0,1,0,0,1,1
4,1,1,0,0,1,1
...,...,...,...,...,...,...
239,0,1,0,1,0,1
240,1,0,0,1,0,1
241,0,0,0,1,0,1
242,0,1,0,1,0,1


In [10]:
# Append the dummy columns, then drop the original categorical columns they replace.
df = pd.concat([df, dummy_df], axis=1).drop(columns=obj_df.columns)
df.head()

Unnamed: 0,total_bill,tip,size,tip_percentage,price_per_person,sex_Female,smoker_No,day_Fri,day_Sat,day_Sun,time_Dinner
0,16.99,1.01,2,0.059447,8.495,1,1,0,0,1,1
1,10.34,1.66,3,0.160542,3.446667,0,1,0,0,1,1
2,21.01,3.5,3,0.166587,7.003333,0,1,0,0,1,1
3,23.68,3.31,2,0.13978,11.84,0,1,0,0,1,1
4,24.59,3.61,4,0.146808,6.1475,1,1,0,0,1,1


In [11]:
# Making the dummy columns all lower case
# (vectorised string accessor returns a proper Index instead of a one-shot map iterator)

df.columns = df.columns.str.lower()
df.head()

Unnamed: 0,total_bill,tip,size,tip_percentage,price_per_person,sex_female,smoker_no,day_fri,day_sat,day_sun,time_dinner
0,16.99,1.01,2,0.059447,8.495,1,1,0,0,1,1
1,10.34,1.66,3,0.160542,3.446667,0,1,0,0,1,1
2,21.01,3.5,3,0.166587,7.003333,0,1,0,0,1,1
3,23.68,3.31,2,0.13978,11.84,0,1,0,0,1,1
4,24.59,3.61,4,0.146808,6.1475,1,1,0,0,1,1


In [12]:
def split(df, test_size=.2, validate_size=.3, random_state=123):
    '''
    Split a dataframe into train, validate and test dataframes.

    df:            the dataframe to split
    test_size:     fraction of df held out as the test set
    validate_size: fraction of the remaining (non-test) rows held out
                   as the validate set
    random_state:  seed passed to both train_test_split calls so the
                   split is reproducible

    The defaults reproduce the original hard-coded 20% / 30% / seed 123 split.

    Returns train, validate, test (in that order).
    '''
    # first carve off the test set ...
    train_validate, test = train_test_split(
        df, test_size=test_size, random_state=random_state
    )
    # ... then split what is left into train and validate
    train, validate = train_test_split(
        train_validate, test_size=validate_size, random_state=random_state
    )

    return train, validate, test

In [13]:
train, validate, test = split(df)

train.shape, validate.shape, test.shape

((136, 11), (59, 11), (49, 11))

### d. Use all the other numeric features to predict tip amount. Use select k best and recursive feature elimination to select the top 2 features. What are they?

In [14]:
# set up all variables except tip
X_train = train.drop(columns=['tip'])
X_validate = validate.drop(columns=['tip'])
X_test = test.drop(columns=['tip'])

# Setting up just tip as my target variable
y_train = train[['tip']]
y_validate = validate[['tip']]
y_test = test[['tip']]

X_train.head()

Unnamed: 0,total_bill,size,tip_percentage,price_per_person,sex_female,smoker_no,day_fri,day_sat,day_sun,time_dinner
18,16.97,3,0.206246,5.656667,1,1,0,0,1,1
172,7.25,2,0.710345,3.625,0,0,0,0,1,1
118,12.43,2,0.144811,6.215,1,1,0,0,0,0
28,21.7,2,0.198157,10.85,0,1,0,1,0,1
237,32.83,2,0.035638,16.415,0,0,0,1,0,1


In [15]:
# make the scaler and fit it on the training predictors only,
# so validate and test are scaled with train's min/max
scaler = MinMaxScaler(copy=True).fit(X_train)

# use: apply the train-fitted scaling to every split
X_train_scaled = scaler.transform(X_train)
X_validate_scaled = scaler.transform(X_validate)
X_test_scaled = scaler.transform(X_test)

In [16]:
# scaler.transform returns bare arrays; wrap them back into DataFrames that
# reuse each split's own column labels and row index so rows stay aligned
# with the matching y_* targets.
X_train_scaled = pd.DataFrame(X_train_scaled,
                              columns=X_train.columns,
                              index=X_train.index)

X_validate_scaled = pd.DataFrame(X_validate_scaled,
                                 columns=X_validate.columns,
                                 index=X_validate.index)

X_test_scaled = pd.DataFrame(X_test_scaled,
                             columns=X_test.columns,
                             index=X_test.index)
X_train_scaled.head()

Unnamed: 0,total_bill,size,tip_percentage,price_per_person,sex_female,smoker_no,day_fri,day_sat,day_sun,time_dinner
18,0.307114,0.4,0.252863,0.150344,1.0,1.0,0.0,0.0,1.0,1.0
172,0.092355,0.2,1.0,0.032258,0.0,0.0,0.0,0.0,1.0,1.0
118,0.206805,0.2,0.161808,0.182796,1.0,1.0,0.0,0.0,0.0,0.0
28,0.411622,0.2,0.240873,0.452194,0.0,1.0,0.0,1.0,0.0,1.0
237,0.657534,0.2,0.0,0.775647,0.0,0.0,0.0,1.0,0.0,1.0


#### Select K Best

In [17]:
# make + fit in one step: score every scaled predictor against the target
# with the f_regression test and keep the 2 best
f_selector = SelectKBest(f_regression, k=2).fit(X_train_scaled, y_train)

# use: boolean array flagging which columns were kept
feature_mask = f_selector.get_support()

In [18]:
# boolean mask is a name for an array of booleans
feature_mask

array([ True,  True, False, False, False, False, False, False, False,
       False])

In [19]:
# get list of top K features. 
f_feature = X_train_scaled.iloc[:,feature_mask].columns.tolist()

f_feature

['total_bill', 'size']

#### Recursive Feature Elimination

In [20]:
# initialize the ML algorithm
lm = LinearRegression()

# create the rfe object, indicating the ML object (lm) and the number of features I want to end up with.
# n_features_to_select is passed by keyword: recent scikit-learn releases reject it as a positional argument.
rfe = RFE(lm, n_features_to_select=2)

# fit the data using RFE
rfe.fit(X_train_scaled, y_train)

# get the mask of the columns selected
feature_mask = rfe.support_

# get list of the column names.
rfe_feature = X_train_scaled.iloc[:, feature_mask].columns.tolist()


rfe_feature

['total_bill', 'tip_percentage']

In [21]:
# pair every predictor name with the rank RFE assigned to it

# predictor names, in column order
var_names = X_train_scaled.columns.tolist()

# matching ranks from the fitted RFE object
var_ranks = rfe.ranking_

# one row per predictor
rfe_ranks_df = pd.DataFrame({'Var': var_names, 'Rank': var_ranks})

# lowest rank number first
rfe_ranks_df.sort_values(by='Rank')

Unnamed: 0,Var,Rank
0,total_bill,1
2,tip_percentage,1
1,size,2
3,price_per_person,3
5,smoker_no,4
6,day_fri,5
8,day_sun,6
4,sex_female,7
9,time_dinner,8
7,day_sat,9


In [22]:
print(f'Top 2 features to predict tip amount using Select K Best are: {f_feature}')
print(f'Top 2 features to predict tip amount using Recursive Feature Elimination are: {rfe_feature}')

Top 2 features to predict tip amount using Select K Best are: ['total_bill', 'size']
Top 2 features to predict tip amount using Recursive Feature Elimination are: ['total_bill', 'tip_percentage']


### e. Use all the other numeric features to predict tip percentage. Use select k best and recursive feature elimination to select the top 2 features. What are they?

In [23]:
# set up all variables except tip_percentage
X_train = train.drop(columns=['tip_percentage'])
X_validate = validate.drop(columns=['tip_percentage'])
X_test = test.drop(columns=['tip_percentage'])

# Setting up just tip_percentage as my target variable
y_train = train[['tip_percentage']]
y_validate = validate[['tip_percentage']]
y_test = test[['tip_percentage']]

X_train.head()

Unnamed: 0,total_bill,tip,size,price_per_person,sex_female,smoker_no,day_fri,day_sat,day_sun,time_dinner
18,16.97,3.5,3,5.656667,1,1,0,0,1,1
172,7.25,5.15,2,3.625,0,0,0,0,1,1
118,12.43,1.8,2,6.215,1,1,0,0,0,0
28,21.7,4.3,2,10.85,0,1,0,1,0,1
237,32.83,1.17,2,16.415,0,0,0,1,0,1


In [24]:
scaler = MinMaxScaler(copy=True).fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_validate_scaled = scaler.transform(X_validate)
X_test_scaled = scaler.transform(X_test)

In [25]:
# scaler.transform returns bare arrays; wrap them back into DataFrames that
# reuse each split's own column labels and row index so rows stay aligned
# with the matching y_* targets.
X_train_scaled = pd.DataFrame(X_train_scaled,
                              columns=X_train.columns,
                              index=X_train.index)

X_validate_scaled = pd.DataFrame(X_validate_scaled,
                                 columns=X_validate.columns,
                                 index=X_validate.index)

X_test_scaled = pd.DataFrame(X_test_scaled,
                             columns=X_test.columns,
                             index=X_test.index)
X_train_scaled.head()

Unnamed: 0,total_bill,tip,size,price_per_person,sex_female,smoker_no,day_fri,day_sat,day_sun,time_dinner
18,0.307114,0.3125,0.4,0.150344,1.0,1.0,0.0,0.0,1.0,1.0
172,0.092355,0.51875,0.2,0.032258,0.0,0.0,0.0,0.0,1.0,1.0
118,0.206805,0.1,0.2,0.182796,1.0,1.0,0.0,0.0,0.0,0.0
28,0.411622,0.4125,0.2,0.452194,0.0,1.0,0.0,1.0,0.0,1.0
237,0.657534,0.02125,0.2,0.775647,0.0,0.0,0.0,1.0,0.0,1.0


#### Select K Best

In [26]:
# make + fit in one step: score every scaled predictor against the target
# with the f_regression test and keep the 2 best
f_selector = SelectKBest(f_regression, k=2).fit(X_train_scaled, y_train)

# use: boolean array flagging which columns were kept
feature_mask = f_selector.get_support()

In [27]:
# boolean mask is a name for an array of booleans
feature_mask

array([False,  True, False,  True, False, False, False, False, False,
       False])

In [28]:
# get list of top K features. 
f_feature = X_train_scaled.iloc[:,feature_mask].columns.tolist()

f_feature

['tip', 'price_per_person']

#### Recursive Feature Elimination

In [29]:
# initialize the ML algorithm
lm = LinearRegression()

# create the rfe object, indicating the ML object (lm) and the number of features I want to end up with.
# n_features_to_select is passed by keyword: recent scikit-learn releases reject it as a positional argument.
rfe = RFE(lm, n_features_to_select=2)

# fit the data using RFE
rfe.fit(X_train_scaled, y_train)

# get the mask of the columns selected
feature_mask = rfe.support_

# get list of the column names.
rfe_feature = X_train_scaled.iloc[:, feature_mask].columns.tolist()


rfe_feature

['total_bill', 'tip']

In [30]:
# pair every predictor name with the rank RFE assigned to it

# predictor names, in column order
var_names = X_train_scaled.columns.tolist()

# matching ranks from the fitted RFE object
var_ranks = rfe.ranking_

# one row per predictor
rfe_ranks_df = pd.DataFrame({'Var': var_names, 'Rank': var_ranks})

# lowest rank number first
rfe_ranks_df.sort_values(by='Rank')

Unnamed: 0,Var,Rank
0,total_bill,1
1,tip,1
2,size,2
3,price_per_person,3
5,smoker_no,4
8,day_sun,5
6,day_fri,6
9,time_dinner,7
7,day_sat,8
4,sex_female,9


In [31]:
print(f'Top 2 features to predict tip percentage using Select K Best are: {f_feature}')
print(f'Top 2 features to predict tip percentage using Recursive Feature Elimination are: {rfe_feature}')

Top 2 features to predict tip percentage using Select K Best are: ['tip', 'price_per_person']
Top 2 features to predict tip percentage using Recursive Feature Elimination are: ['total_bill', 'tip']


### f. Why do you think select k best and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features you are selecting?

- SelectKBest was used to select the top 2 features by scoring each feature on its own against the target variable (a univariate F-test via `f_regression`), without considering the other features.




- RFE with a linear regression estimator was used to keep the top 2 features: it repeatedly fits the model on the remaining features and drops the one with the smallest coefficient magnitude, so each feature is judged in the context of the others.




- As the target changes, the top two features change. 




- Features interplay together and as we tack on more features, it can affect the importance of the feature. 




- The importance of the feature is not independent. 




- The importance of a feature depends on which other features are in the model, so it can change with the number of features being selected.




- As more features are added, the interaction between newly added features and previous features may cause those combinations to be better predictors of the target.

### 2. Write a function named `select_kbest` that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the `SelectKBest` class. Test your function with the `tips` dataset. You should see the same results as when you did the process manually.

In [32]:
def select_kbest(X, y, k_num):
    '''
    Return the names of the top k_num features chosen by SelectKBest
    using the f_regression scoring function.

    X:     predictors as a DataFrame (e.g. X_train or X_train_scaled);
           column names are read from X.columns
    y:     target (e.g. y_train); a Series or a single-column DataFrame
    k_num: number of features to select

    Returns a list of the selected column names, in X's column order.
    '''

    # flatten a single-column DataFrame target to 1-D so the fit does not
    # depend on the notebook-wide warning filter to hide a
    # column-vector conversion warning
    y_flat = np.ravel(y)

    # make + fit
    f_selector = SelectKBest(f_regression, k=k_num).fit(X, y_flat)

    # use: boolean mask of the selected columns -> their names
    return X.columns[f_selector.get_support()].tolist()

In [33]:
#test out function by starting from scratch
df = sns.load_dataset('tips')
df['tip_percentage'] = df['tip'] / df['total_bill']
df['price_per_person'] = df['total_bill'] / df['size']

In [34]:
# rebuild X_train_scaled / y_train from the fresh df so the function is tested
# on numeric columns only -- target: tip

# dummy-encode the category columns and swap them in for the originals
mask = (df.dtypes == "category").to_numpy()
obj_df = df.iloc[:, mask]
dummy_df = pd.get_dummies(obj_df, drop_first=True)
df = pd.concat([df, dummy_df], axis=1).drop(columns=obj_df.columns)

# same 80/20 then 70/30 split and seed as before
train_validate, test = train_test_split(df, test_size=.2, random_state=123)
train, validate = train_test_split(train_validate, test_size=.3, random_state=123)

# predictors vs. target
X_train, y_train = train.drop(columns=['tip']), train[['tip']]

# fit the scaler on train and wrap the scaled array back into a labelled frame
scaler = MinMaxScaler(copy=True)
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train),
    columns=X_train.columns.values,
    index=X_train.index.values,
)
X_train_scaled.head()

Unnamed: 0,total_bill,size,tip_percentage,price_per_person,sex_Female,smoker_No,day_Fri,day_Sat,day_Sun,time_Dinner
18,0.307114,0.4,0.252863,0.150344,1.0,1.0,0.0,0.0,1.0,1.0
172,0.092355,0.2,1.0,0.032258,0.0,0.0,0.0,0.0,1.0,1.0
118,0.206805,0.2,0.161808,0.182796,1.0,1.0,0.0,0.0,0.0,0.0
28,0.411622,0.2,0.240873,0.452194,0.0,1.0,0.0,1.0,0.0,1.0
237,0.657534,0.2,0.0,0.775647,0.0,0.0,0.0,1.0,0.0,1.0


In [35]:
#calling function with X_train_scaled (predictors), y_train (target), and k
print(f'Top 2 features to predict tip amount are: {select_kbest(X_train_scaled, y_train, 2)}')

Top 2 features to predict tip amount are: ['total_bill', 'size']


In [36]:
#test out function by starting from scratch
df = sns.load_dataset('tips')
df['tip_percentage'] = df['tip'] / df['total_bill']
df['price_per_person'] = df['total_bill'] / df['size']

In [37]:
# rebuild X_train_scaled / y_train from the fresh df so the function is tested
# on numeric columns only -- target: tip_percentage

# dummy-encode the category columns and swap them in for the originals
mask = (df.dtypes == "category").to_numpy()
obj_df = df.iloc[:, mask]
dummy_df = pd.get_dummies(obj_df, drop_first=True)
df = pd.concat([df, dummy_df], axis=1).drop(columns=obj_df.columns)

# same 80/20 then 70/30 split and seed as before
train_validate, test = train_test_split(df, test_size=.2, random_state=123)
train, validate = train_test_split(train_validate, test_size=.3, random_state=123)

# predictors vs. target
X_train, y_train = train.drop(columns=['tip_percentage']), train[['tip_percentage']]

# fit the scaler on train and wrap the scaled array back into a labelled frame
scaler = MinMaxScaler(copy=True)
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train),
    columns=X_train.columns.values,
    index=X_train.index.values,
)
X_train_scaled.head()

Unnamed: 0,total_bill,tip,size,price_per_person,sex_Female,smoker_No,day_Fri,day_Sat,day_Sun,time_Dinner
18,0.307114,0.3125,0.4,0.150344,1.0,1.0,0.0,0.0,1.0,1.0
172,0.092355,0.51875,0.2,0.032258,0.0,0.0,0.0,0.0,1.0,1.0
118,0.206805,0.1,0.2,0.182796,1.0,1.0,0.0,0.0,0.0,0.0
28,0.411622,0.4125,0.2,0.452194,0.0,1.0,0.0,1.0,0.0,1.0
237,0.657534,0.02125,0.2,0.775647,0.0,0.0,0.0,1.0,0.0,1.0


In [38]:
print(f'Top 2 features to predict tip percentage are: {select_kbest(X_train_scaled, y_train, 2)}')

Top 2 features to predict tip percentage are: ['tip', 'price_per_person']


### 3. Write a function named `rfe` that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the `RFE` class. Test your function with the `tips` dataset. You should see the same results as when you did the process manually.

In [39]:
def rfe(X, y, k_num):
    '''
    Return the names of the top k_num features chosen by recursive
    feature elimination (RFE) around a LinearRegression estimator.

    X:     predictors as a DataFrame (e.g. X_train or X_train_scaled);
           column names are read from X.columns
    y:     target (e.g. y_train)
    k_num: number of features to keep

    Returns a list of the selected column names, in X's column order.
    '''

    # make -- the selector gets its own name so it does not shadow this
    # function, and n_features_to_select is passed by keyword because
    # recent scikit-learn releases reject it as a positional argument
    lm = LinearRegression()
    selector = RFE(lm, n_features_to_select=k_num)

    # fit
    selector.fit(X, y)

    # use: boolean mask of the columns RFE kept
    feature_mask = selector.support_

    # get list
    rfe_feature = X.columns[feature_mask].tolist()

    return rfe_feature

In [40]:
#test out function by starting from scratch
df = sns.load_dataset('tips')
df['tip_percentage'] = df['tip'] / df['total_bill']
df['price_per_person'] = df['total_bill'] / df['size']

In [41]:
# rebuild X_train_scaled / y_train from the fresh df so the function is tested
# on numeric columns only -- target: tip

# dummy-encode the category columns and swap them in for the originals
mask = (df.dtypes == "category").to_numpy()
obj_df = df.iloc[:, mask]
dummy_df = pd.get_dummies(obj_df, drop_first=True)
df = pd.concat([df, dummy_df], axis=1).drop(columns=obj_df.columns)

# same 80/20 then 70/30 split and seed as before
train_validate, test = train_test_split(df, test_size=.2, random_state=123)
train, validate = train_test_split(train_validate, test_size=.3, random_state=123)

# predictors vs. target
X_train, y_train = train.drop(columns=['tip']), train[['tip']]

# fit the scaler on train and wrap the scaled array back into a labelled frame
scaler = MinMaxScaler(copy=True)
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train),
    columns=X_train.columns.values,
    index=X_train.index.values,
)
X_train_scaled.head()

Unnamed: 0,total_bill,size,tip_percentage,price_per_person,sex_Female,smoker_No,day_Fri,day_Sat,day_Sun,time_Dinner
18,0.307114,0.4,0.252863,0.150344,1.0,1.0,0.0,0.0,1.0,1.0
172,0.092355,0.2,1.0,0.032258,0.0,0.0,0.0,0.0,1.0,1.0
118,0.206805,0.2,0.161808,0.182796,1.0,1.0,0.0,0.0,0.0,0.0
28,0.411622,0.2,0.240873,0.452194,0.0,1.0,0.0,1.0,0.0,1.0
237,0.657534,0.2,0.0,0.775647,0.0,0.0,0.0,1.0,0.0,1.0


In [42]:
#calling function with X_train_scaled (predictors), y_train (target), and k
print(f'Top 2 features to predict tip amount are: {rfe(X_train_scaled, y_train, 2)}')


Top 2 features to predict tip amount are: ['total_bill', 'tip_percentage']


In [43]:
#test out function by starting from scratch
df = sns.load_dataset('tips')
df['tip_percentage'] = df['tip'] / df['total_bill']
df['price_per_person'] = df['total_bill'] / df['size']

In [44]:
# rebuild X_train_scaled / y_train from the fresh df so the function is tested
# on numeric columns only -- target: tip_percentage

# dummy-encode the category columns and swap them in for the originals
mask = (df.dtypes == "category").to_numpy()
obj_df = df.iloc[:, mask]
dummy_df = pd.get_dummies(obj_df, drop_first=True)
df = pd.concat([df, dummy_df], axis=1).drop(columns=obj_df.columns)

# same 80/20 then 70/30 split and seed as before
train_validate, test = train_test_split(df, test_size=.2, random_state=123)
train, validate = train_test_split(train_validate, test_size=.3, random_state=123)

# predictors vs. target
X_train, y_train = train.drop(columns=['tip_percentage']), train[['tip_percentage']]

# fit the scaler on train and wrap the scaled array back into a labelled frame
scaler = MinMaxScaler(copy=True)
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train),
    columns=X_train.columns.values,
    index=X_train.index.values,
)
X_train_scaled.head()

Unnamed: 0,total_bill,tip,size,price_per_person,sex_Female,smoker_No,day_Fri,day_Sat,day_Sun,time_Dinner
18,0.307114,0.3125,0.4,0.150344,1.0,1.0,0.0,0.0,1.0,1.0
172,0.092355,0.51875,0.2,0.032258,0.0,0.0,0.0,0.0,1.0,1.0
118,0.206805,0.1,0.2,0.182796,1.0,1.0,0.0,0.0,0.0,0.0
28,0.411622,0.4125,0.2,0.452194,0.0,1.0,0.0,1.0,0.0,1.0
237,0.657534,0.02125,0.2,0.775647,0.0,0.0,0.0,1.0,0.0,1.0


In [45]:
print(f'Top 2 features to predict tip percentage are: {rfe(X_train_scaled, y_train, 2)}')

Top 2 features to predict tip percentage are: ['total_bill', 'tip']


### 4. Load the `swiss` dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).

In [53]:
swiss = data('swiss')
swiss.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [54]:
swiss.info()

<class 'pandas.core.frame.DataFrame'>
Index: 47 entries, Courtelary to Rive Gauche
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Fertility         47 non-null     float64
 1   Agriculture       47 non-null     float64
 2   Examination       47 non-null     int64  
 3   Education         47 non-null     int64  
 4   Catholic          47 non-null     float64
 5   Infant.Mortality  47 non-null     float64
dtypes: float64(4), int64(2)
memory usage: 2.6+ KB


In [55]:
train, validate, test = split(swiss)

train.shape, validate.shape, test.shape

((25, 6), (12, 6), (10, 6))

In [56]:
train.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Rolle,60.5,60.8,16,10,7.72,16.3
Lavaux,65.1,73.0,19,9,2.84,20.0
Nyone,56.6,50.9,22,12,15.14,16.7
Conthey,75.5,85.9,3,2,99.71,15.1
Yverdon,65.4,49.5,15,8,6.1,22.5


In [57]:
# set up all variables except Fertility
X_train = train.drop(columns=['Fertility'])
X_validate = validate.drop(columns=['Fertility'])
X_test = test.drop(columns=['Fertility'])

# Setting up just Fertility as my target variable
y_train = train[['Fertility']]
y_validate = validate[['Fertility']]
y_test = test[['Fertility']]

X_train.head()

Unnamed: 0,Agriculture,Examination,Education,Catholic,Infant.Mortality
Rolle,60.8,16,10,7.72,16.3
Lavaux,73.0,19,9,2.84,20.0
Nyone,50.9,22,12,15.14,16.7
Conthey,85.9,3,2,99.71,15.1
Yverdon,49.5,15,8,6.1,22.5


In [62]:
# fit the scaler on the training predictors only, then apply it to every split
scaler = MinMaxScaler(copy=True).fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_validate_scaled = scaler.transform(X_validate)
X_test_scaled = scaler.transform(X_test)

#turn X_train_scaled into a dataframe to be able to run select_kbest function
# (reuse X_train's own column labels and row index directly)
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)

In [63]:
features = select_kbest(X_train_scaled, y_train, 3)
print(f'Top 3 features to predict Fertility using Select K Best are: {features}')


Top 3 features to predict Fertility using Select K Best are: ['Examination', 'Catholic', 'Infant.Mortality']


In [64]:
features = rfe(X_train_scaled, y_train, 3)
print(f'Top 3 features to predict Fertility using RFE are: {features}')

Top 3 features to predict Fertility using RFE are: ['Agriculture', 'Examination', 'Infant.Mortality']
