# Feature Engineering Exercises

## 1. Load `tips` dataset

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import warnings

from pydataset import data

tips = data("tips")
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4


### a. Create a column named `tip_percentage`. This should be the tip amount divided by the total bill.

In [2]:
tips["tip_percentage"] = tips.tip / tips.total_bill

In [3]:
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_percentage
1,16.99,1.01,Female,No,Sun,Dinner,2,0.059447
2,10.34,1.66,Male,No,Sun,Dinner,3,0.160542
3,21.01,3.5,Male,No,Sun,Dinner,3,0.166587
4,23.68,3.31,Male,No,Sun,Dinner,2,0.13978
5,24.59,3.61,Female,No,Sun,Dinner,4,0.146808


### b. Create a column named price_per_person. This should be the total bill divided by the party size.

In [4]:
tips["price_per_person"] = tips.total_bill / tips["size"]

In [5]:
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_percentage,price_per_person
1,16.99,1.01,Female,No,Sun,Dinner,2,0.059447,8.495
2,10.34,1.66,Male,No,Sun,Dinner,3,0.160542,3.446667
3,21.01,3.5,Male,No,Sun,Dinner,3,0.166587,7.003333
4,23.68,3.31,Male,No,Sun,Dinner,2,0.13978,11.84
5,24.59,3.61,Female,No,Sun,Dinner,4,0.146808,6.1475


### c. Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount? The tip percentage?

The most important variable to predict tip amount is probably going to be total_bill or the party size.

### c. Use all the other numeric features to predict tip amount. Use select k best and recursive feature elimination to select the top 2 features. What are they?

In [6]:
# Select K best

new_tips = tips

new_tips= tips.select_dtypes(exclude=['object'])

In [7]:
import split_scale                                

from sklearn.model_selection import train_test_split

train, test = train_test_split(new_tips, train_size = .8, random_state=123)

In [8]:
X_train = train.drop(columns="tip")
y_train = train[["tip"]]
X_test = test.drop(columns="tip")
y_test = test[["tip"]]

In [9]:
from sklearn.feature_selection import SelectKBest, f_regression

In [10]:
f_selector = SelectKBest(f_regression, k=2)

In [11]:
f_selector.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


SelectKBest(k=2, score_func=<function f_regression at 0x129f6d200>)

In [12]:
X_reduced = f_selector.transform(X_train)

In [13]:
f_support = f_selector.get_support()
print(f_support)

[ True  True False False]


In [14]:
f_feature = X_train.loc[:, f_support].columns.tolist()

print(str(len(f_feature)), 'selected features')
print(f_feature)

2 selected features
['total_bill', 'size']


In [15]:
# Recursive Feature Elimination

from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE

In [16]:
lm = LinearRegression()

In [17]:
rfe = RFE(lm, 2)

In [18]:
X_rfe = rfe.fit_transform(X_train, y_train)

  y = column_or_1d(y, warn=True)


In [19]:
lm.fit(X_rfe, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [20]:
mask = rfe.support_

rfe_features = X_train.loc[:,mask].columns.tolist()


print(str(len(rfe_features)), 'selected features')
print(rfe_features)

2 selected features
['total_bill', 'tip_percentage']


### e. Use all the other numeric features to predict tip percentage. Use select k best and recursive feature elimination to select the top 2 features. What are they?

In [21]:
X_train = train.drop(columns="tip_percentage")
y_train = train[["tip_percentage"]]
X_test = test.drop(columns="tip_percentage")
y_test = test[["tip_percentage"]]

In [22]:
X_reduced2 = SelectKBest(f_regression, k=2).fit_transform(X_train, y_train)
f_support = f_selector.get_support()
f_feature = X_train.loc[:,f_support].columns.tolist()

print(str(len(f_feature)), 'selected features')
print(f_feature)

2 selected features
['total_bill', 'tip']


  y = column_or_1d(y, warn=True)


In [23]:
# RFE

lm = LinearRegression()
rfe = RFE(lm, 2)
X_rfe = rfe.fit_transform(X_train, y_train)
lm.fit(X_rfe, y_train)
mask = rfe.support_
rfe_features = X_train.loc[:,mask].columns.tolist()
print(str(len(rfe_features)), 'selected features')
print(rfe_features)

2 selected features
['tip', 'size']


  y = column_or_1d(y, warn=True)


## 2. Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [24]:
def select_kbest(X, y, k):
    X_reduced2 = SelectKBest(f_regression, k=k).fit_transform(X, y)
    f_support = f_selector.get_support()
    f_feature = X.loc[:,f_support].columns.tolist()
    print(str(len(f_feature)), 'selected features')
    print(f_feature)
    

In [25]:
select_kbest(X_train, y_train, 2)

2 selected features
['total_bill', 'tip']


  y = column_or_1d(y, warn=True)


## 3. Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [26]:
def rfe(X, y, k):
    lm = LinearRegression()
    rfe = RFE(lm, k)
    X_rfe = rfe.fit_transform(X, y)
    lm.fit(X_rfe, y)
    mask = rfe.support_
    rfe_features = X.loc[:,mask].columns.tolist()
    print(str(len(rfe_features)), 'selected features')
    print(rfe_features)

In [27]:
rfe(X_train, y_train, 2)

2 selected features
['tip', 'size']


  y = column_or_1d(y, warn=True)


## 4.Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).

In [28]:
swiss = data("swiss")
swiss.info()

<class 'pandas.core.frame.DataFrame'>
Index: 47 entries, Courtelary to Rive Gauche
Data columns (total 6 columns):
Fertility           47 non-null float64
Agriculture         47 non-null float64
Examination         47 non-null int64
Education           47 non-null int64
Catholic            47 non-null float64
Infant.Mortality    47 non-null float64
dtypes: float64(4), int64(2)
memory usage: 2.6+ KB


In [29]:
swiss.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [30]:
import feature_engineering

In [31]:
train, test = train_test_split(swiss, train_size = .8, random_state=123)

In [32]:
X_train = train.drop(columns="Fertility")
y_train = train[["Fertility"]]
X_test = test.drop(columns="Fertility")
y_test = test[["Fertility"]]

In [35]:
feature_engineering.select_kbest(X_train, y_train, 3)

3 selected features
['Examination', 'Education', 'Catholic']


  y = column_or_1d(y, warn=True)


In [36]:
feature_engineering.rfe(X_train, y_train, 3)

3 selected features
['Examination', 'Education', 'Infant.Mortality']


  y = column_or_1d(y, warn=True)
