In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_selection import SelectKBest, f_regression, RFE
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression


from pydataset import data

## 1. Load the tips dataset.

In [2]:
df = data('tips')

### a. Create a column named price_per_person. This should be the total bill divided by the party size.

In [3]:
df['price_per_person'] = df.total_bill / df.size

In [4]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person
1,16.99,1.01,Female,No,Sun,Dinner,2,0.009947
2,10.34,1.66,Male,No,Sun,Dinner,3,0.006054
3,21.01,3.5,Male,No,Sun,Dinner,3,0.012301
4,23.68,3.31,Male,No,Sun,Dinner,2,0.013864
5,24.59,3.61,Female,No,Sun,Dinner,4,0.014397


### b. Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount?

I think that total bill, time, and day will be the most important features.

### c. Use select k best to select the top 2 features for predicting tip amount. What are they?

In [5]:
df[['sex', 'smoker', 'day', 'time']] = df[['sex', 'smoker', 'day', 'time']].apply(LabelEncoder().fit_transform)

In [6]:
train_validate, test = train_test_split(df, test_size = .2, random_state=311)

train, validate = train_test_split(train_validate, test_size = .25, random_state=311)

In [7]:
scaler = MinMaxScaler().set_output(transform="pandas")

scaler.fit(train)

train = scaler.transform(train)
validate = scaler.transform(validate)
test = scaler.transform(test)


In [8]:
X_train = train.drop(columns='tip')
y_train = train.tip
X_validate = validate.drop(columns='tip')
y_validate = validate.tip
X_test = test.drop(columns='tip')
y_test = test.tip

In [9]:
X_train.head()

Unnamed: 0,total_bill,sex,smoker,day,time,size,price_per_person
85,0.200413,1.0,0.0,1.0,1.0,0.2,0.200413
129,0.094812,0.0,0.0,1.0,1.0,0.2,0.094812
196,0.007117,1.0,0.0,1.0,1.0,0.2,0.007117
87,0.132691,1.0,0.0,1.0,1.0,0.2,0.132691
77,0.244949,1.0,1.0,0.333333,0.0,0.2,0.244949


In [10]:
f_selector = SelectKBest(f_regression, k=2)

In [11]:
f_selector.fit(X_train, y_train)

In [12]:
X_reduced = f_selector.transform(X_train)

print(X_train.shape)
print(X_reduced.shape)

(146, 7)
(146, 2)


In [13]:
f_support = f_selector.get_support()

In [14]:
f_feature = X_train.loc[:,f_support].columns.tolist()

# you could also get the list this way (among many others)
# f_feature = [X_train_scaled.columns.values[i] for i in range(len(feature_mask)) if feature_mask[i]==True]

print(str(len(f_feature)), 'selected features')
print(f_feature)


2 selected features
['total_bill', 'price_per_person']


### d. Use recursive feature elimination to select the top 2 features for tip amount. What are they?

In [15]:
lm = LinearRegression()

In [16]:
rfe = RFE(lm, n_features_to_select=2)

In [17]:
# Transforming data using RFE
X_rfe = rfe.fit_transform(X_train, y_train)  

In [18]:
mask = rfe.support_

rfe_features = X_train.loc[:,mask].columns.tolist()

print(str(len(rfe_features)), 'selected features')
print(rfe_features)

2 selected features
['total_bill', 'price_per_person']


## 2. Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [19]:
def select_kbest(X_train, y_train, n_features):
    
    f_selector = SelectKBest(f_regression, k=n_features)
    
    f_selector.fit(X_train, y_train)
    
    X_reduced = f_selector.transform(X_train)
    
    f_support = f_selector.get_support()
    
    f_feature = X_train.loc[:,f_support].columns.tolist()
    
    print(str(len(rfe_features)), 'selected features')
    print(rfe_features)

In [20]:
select_kbest(X_train, y_train, 2)

2 selected features
['total_bill', 'price_per_person']


## 3. Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [21]:
def rfe(X_train, y_train, n_features):
    
    lm = LinearRegression()
    
    rfe = RFE(lm, n_features_to_select=n_features)
    
    # Transforming data using RFE
    X_rfe = rfe.fit_transform(X_train, y_train) 
    
    mask = rfe.support_

    rfe_features = X_train.loc[:,mask].columns.tolist()

    print(str(len(rfe_features)), 'selected features')
    print(rfe_features)

In [22]:
rfe(X_train, y_train, 2)

2 selected features
['total_bill', 'price_per_person']


## 4. Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).

In [23]:
df = data('swiss')

In [24]:
df.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [25]:
train_validate, test = train_test_split(df, test_size = .2, random_state=311)

train, validate = train_test_split(train_validate, test_size = .25, random_state=311)

In [26]:
scaler = MinMaxScaler().set_output(transform="pandas")

scaler.fit(train)

train = scaler.transform(train)
validate = scaler.transform(validate)
test = scaler.transform(test)


In [27]:
X_train = train.drop(columns='Fertility')
y_train = train.Fertility
X_validate = validate.drop(columns='Fertility')
y_validate = validate.Fertility
X_test = test.drop(columns='Fertility')
y_test = test.Fertility

In [31]:
select_kbest(X_train, y_train, 3)

2 selected features
['total_bill', 'price_per_person']


In [29]:
rfe(X_train, y_train, 3)

3 selected features
['Examination', 'Catholic', 'Infant.Mortality']
