# Feature Engineering

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import warnings
warnings.filterwarnings("ignore")

from pydataset import data

from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectKBest, f_regression

### 1. Load the tips dataset.

In [2]:
# Acquire data
df = data("tips")

In [3]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4


In [4]:
df.shape

(244, 7)

In [5]:
df.describe() 

Unnamed: 0,total_bill,tip,size
count,244.0,244.0,244.0
mean,19.785943,2.998279,2.569672
std,8.902412,1.383638,0.9511
min,3.07,1.0,1.0
25%,13.3475,2.0,2.0
50%,17.795,2.9,2.0
75%,24.1275,3.5625,3.0
max,50.81,10.0,6.0


In [6]:
df.info() 

<class 'pandas.core.frame.DataFrame'>
Int64Index: 244 entries, 1 to 244
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   total_bill  244 non-null    float64
 1   tip         244 non-null    float64
 2   sex         244 non-null    object 
 3   smoker      244 non-null    object 
 4   day         244 non-null    object 
 5   time        244 non-null    object 
 6   size        244 non-null    int64  
dtypes: float64(2), int64(1), object(4)
memory usage: 15.2+ KB


In [7]:
df.isnull().sum() 

total_bill    0
tip           0
sex           0
smoker        0
day           0
time          0
size          0
dtype: int64

In [8]:
dummy_df = pd.get_dummies(df[['sex', 'smoker', 'day', 'time']], dummy_na=False, drop_first=True) 

In [9]:
df = pd.concat([df, dummy_df], axis=1)

In [10]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,sex_Male,smoker_Yes,day_Sat,day_Sun,day_Thur,time_Lunch
1,16.99,1.01,Female,No,Sun,Dinner,2,0,0,0,1,0,0
2,10.34,1.66,Male,No,Sun,Dinner,3,1,0,0,1,0,0
3,21.01,3.5,Male,No,Sun,Dinner,3,1,0,0,1,0,0
4,23.68,3.31,Male,No,Sun,Dinner,2,1,0,0,1,0,0
5,24.59,3.61,Female,No,Sun,Dinner,4,0,0,0,1,0,0


In [11]:
df.columns

Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size', 'sex_Male',
       'smoker_Yes', 'day_Sat', 'day_Sun', 'day_Thur', 'time_Lunch'],
      dtype='object')

In [12]:
df.drop(columns=['sex', 'smoker', 'day', 'time'], inplace=True)

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 244 entries, 1 to 244
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   total_bill  244 non-null    float64
 1   tip         244 non-null    float64
 2   size        244 non-null    int64  
 3   sex_Male    244 non-null    uint8  
 4   smoker_Yes  244 non-null    uint8  
 5   day_Sat     244 non-null    uint8  
 6   day_Sun     244 non-null    uint8  
 7   day_Thur    244 non-null    uint8  
 8   time_Lunch  244 non-null    uint8  
dtypes: float64(2), int64(1), uint8(6)
memory usage: 9.1 KB


In [14]:
df.head()

Unnamed: 0,total_bill,tip,size,sex_Male,smoker_Yes,day_Sat,day_Sun,day_Thur,time_Lunch
1,16.99,1.01,2,0,0,0,1,0,0
2,10.34,1.66,3,1,0,0,1,0,0
3,21.01,3.5,3,1,0,0,1,0,0
4,23.68,3.31,2,1,0,0,1,0,0
5,24.59,3.61,4,0,0,0,1,0,0


#### a. Create a column named tip_percentage. This should be the tip amount divided by the total bill.

In [15]:
df['tip_percentage'] = df['tip'] / df['total_bill']

df.head()

Unnamed: 0,total_bill,tip,size,sex_Male,smoker_Yes,day_Sat,day_Sun,day_Thur,time_Lunch,tip_percentage
1,16.99,1.01,2,0,0,0,1,0,0,0.059447
2,10.34,1.66,3,1,0,0,1,0,0,0.160542
3,21.01,3.5,3,1,0,0,1,0,0,0.166587
4,23.68,3.31,2,1,0,0,1,0,0,0.13978
5,24.59,3.61,4,0,0,0,1,0,0,0.146808


#### b. Create a column named price_per_person. This should be the total bill divided by the party size.

In [16]:
df['price_per_person'] = df['total_bill'] / df['size']

df.head()

Unnamed: 0,total_bill,tip,size,sex_Male,smoker_Yes,day_Sat,day_Sun,day_Thur,time_Lunch,tip_percentage,price_per_person
1,16.99,1.01,2,0,0,0,1,0,0,0.059447,8.495
2,10.34,1.66,3,1,0,0,1,0,0,0.160542,3.446667
3,21.01,3.5,3,1,0,0,1,0,0,0.166587,7.003333
4,23.68,3.31,2,1,0,0,1,0,0,0.13978,11.84
5,24.59,3.61,4,0,0,0,1,0,0,0.146808,6.1475


#### c. Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount? The tip percentage?

- I believe that we might see tip percentage goes down with larger party sizes. This is based off of real world experience. 

- Perhaps total bill will be the most important for predicting tip amount and percentage. 

#### d. Use all the other numeric features to predict (tip amount). Use select k best and recursive feature elimination to select the top 2 features. What are they?


In [17]:
# Split into train and test
train_and_validate, test = train_test_split(df, train_size=0.8, random_state=123)
train, validate = train_test_split(train_and_validate, train_size=0.8, random_state=123)

In [18]:
# Create the scaler and fit it to the training dataset
scaler = MinMaxScaler(copy=True).fit(train)

# Scale the training data
train_scaled = pd.DataFrame(scaler.transform(train), columns=train.columns.values).set_index([train.index.values])

# Set aside the validate and test datasets for our "out of sample" data
validate_scaled = pd.DataFrame(scaler.transform(validate), columns=validate.columns.values).set_index([validate.index.values])
test_scaled = pd.DataFrame(scaler.transform(test), columns=test.columns.values).set_index([test.index.values])

X_train = train.drop(columns= ['tip'])
y_train = train[['tip']]

X_validate = validate.drop(columns= ['tip'])
y_validate = validate[['tip']]

#### Use K Best

In [36]:
# Initialize the f_selector objec
f_selector = SelectKBest(f_regression, k=2)

# Fit object to our data
f_selector.fit(X_train, y_train)

X_reduced = f_selector.transform(X_train)

f_mask = f_selector.get_support()

f_feature = X_train.iloc[:,f_mask].columns.tolist()

print(X_train.shape)
print(X_reduced.shape)
print(f_mask) 

print(str(len(f_feature)), 'selected features')
print(f_feature)

(156, 10)
(156, 2)
[ True  True False False False False False False False False]
2 selected features
['total_bill', 'tip']


#### Use Recursive Feature Elimination

In [20]:
# Initialize the linear regression object
lm = LinearRegression()

In [21]:
# Initialize the RFE object, the number of features to return to be 2
rfe = RFE(lm, 2)

In [22]:
# Transforming data using RFE
X_rfe = rfe.fit_transform(X_train,y_train)  

In [23]:
#Fitting the data to model
lm.fit(X_rfe,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [37]:
# If we want a list of the features that remain, we can use .support_ similar to how we used .get_support() with SelectKBest.
mask = rfe.support_
# filter dataframe 
rfe_features = X_train.iloc[:,mask].columns.tolist()
print(str(len(rfe_features)), 'selected features')
print(rfe_features)

2 selected features
['tip', 'size']


In [25]:
var_ranks = rfe.ranking_
var_names = X_train.columns.tolist()

pd.DataFrame({'Var': var_names, 'Rank': var_ranks})

Unnamed: 0,Var,Rank
0,total_bill,5
1,size,6
2,sex_Male,9
3,smoker_Yes,4
4,day_Sat,2
5,day_Sun,1
6,day_Thur,3
7,time_Lunch,8
8,tip_percentage,1
9,price_per_person,7


#### e. Use all the other numeric features to predict (tip percentage). Use select k best and recursive feature elimination to select the top 2 features. What are they?

In [26]:
# Split into train and test
train_and_validate, test = train_test_split(df, train_size=0.8, random_state=123)
train, validate = train_test_split(train_and_validate, train_size=0.8, random_state=123)

In [27]:
# Create the scaler and fit it to the training dataset
scaler = MinMaxScaler(copy=True).fit(train)

# Scale the training data
train_scaled = pd.DataFrame(scaler.transform(train), columns=train.columns.values).set_index([train.index.values])

# Set aside the validate and test datasets for our "out of sample" data
validate_scaled = pd.DataFrame(scaler.transform(validate), columns=validate.columns.values).set_index([validate.index.values])
test_scaled = pd.DataFrame(scaler.transform(test), columns=test.columns.values).set_index([test.index.values])

X_train = train.drop(columns= ['tip_percentage'])
y_train = train[['tip_percentage']]

X_validate = validate.drop(columns= ['tip_percentage'])
y_validate = validate[['tip_percentage']]

#### Use K Best

In [28]:
# Initialize the f_selector objec
f_selector = SelectKBest(f_regression, k=2)

# Fit object to our data
f_selector.fit(X_train, y_train)

X_reduced = f_selector.transform(X_train)

f_support = f_selector.get_support()

f_feature = X_train.loc[:,f_support].columns.tolist()

print(X_train.shape)
print(X_reduced.shape)
print(f_support) 

print(str(len(f_feature)), 'selected features')
print(f_feature)

(156, 10)
(156, 2)
[ True  True False False False False False False False False]
2 selected features
['total_bill', 'tip']


#### Use Recursive Feature Elimination

In [29]:
# Initialize the linear regression object
lm = LinearRegression()

In [30]:
# Initialize the RFE object, the number of features to return to be 2
rfe = RFE(lm, 2)

In [31]:
# Transforming data using RFE
X_rfe = rfe.fit_transform(X_train,y_train)  

In [32]:
#Fitting the data to model
lm.fit(X_rfe,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [33]:
# If we want a list of the features that remain, we can use .support_ similar to how we used .get_support() with SelectKBest.
mask = rfe.support_
rfe_features = X_train.loc[:,mask].columns.tolist()
print(str(len(rfe_features)), 'selected features')
print(rfe_features)

2 selected features
['tip', 'size']


In [34]:
var_ranks = rfe.ranking_
var_names = X_train.columns.tolist()

pd.DataFrame({'Var': var_names, 'Rank': var_ranks})

Unnamed: 0,Var,Rank
0,total_bill,7
1,tip,1
2,size,1
3,sex_Male,8
4,smoker_Yes,2
5,day_Sat,6
6,day_Sun,4
7,day_Thur,5
8,time_Lunch,9
9,price_per_person,3


#### f. Why do you think select k best and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features your are selecting?

#### K Best:
    Scores the features using a function and then removes all but the k highest scoring features.
    
#### RFE: 
    Works by starting with all features in the training dataset and successfully removing features until the desired number remains  
    

- K-best isn’t running any models, but RFE is
- Yes, it does change as you change the number of features you are selecting

### 2. Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [35]:
def select_kbest(x, y, kf):
    
    f_selector = SelectKBest(f_regression, k = kf)
    
    f_selector.fit(x, y)

    f_support = f_selector.get_support()

    f_feature = x.loc[:,f_support].columns.tolist()
    
    return f_feature

select_kbest()

TypeError: select_kbest() missing 3 required positional arguments: 'x', 'y', and 'kf'

### 3. Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [None]:
def rfe():
    return

select_kbest()

### 4. Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).

In [None]:
# Acquire data
df = data("swiss")

In [None]:
df.head()