# Feature Engineering Exercises

In [None]:
from pydataset import data
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import sklearn.feature_selection
import sklearn.preprocessing

In [None]:
def split(df, test_size=.2, validate_size=.3, random_state=123):
    '''
    Split a DataFrame into train, validate, and test DataFrames.

    The test set is carved off first; the validate set is then taken from
    the rows that remain. With the default fractions (.2 and .3) this gives
    roughly 56% train, 24% validate, and 20% test.

    Parameters:
        df: the DataFrame to split.
        test_size: fraction of df held out as the test set.
        validate_size: fraction of the non-test rows held out as validate.
        random_state: seed passed to both train_test_split calls so the
            split is reproducible.

    Returns:
        train, validate, test DataFrames (in that order).
    '''
    train_validate, test = train_test_split(df,
                                            test_size=test_size,
                                            random_state=random_state)
    train, validate = train_test_split(train_validate,
                                       test_size=validate_size,
                                       random_state=random_state)
    return train, validate, test

In [None]:
# Exercise 1: load the tips dataset and preview the first rows
df = data("tips")
df.head(5)

In [None]:
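# Note the difference between df.size and df["size"]:
# df.size is the DataFrame attribute (total number of elements), while df["size"] is the party-size column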

In [None]:
# df.size resolves to the built-in Pandas attribute, not the column,
# so give the party-size column an unambiguous name
df = df.rename({"size": "number_of_people"}, axis="columns")

In [None]:
# Exercise 1
# a. Create a column named tip_percentage. This should be the tip amount divided by the total bill.
# b. Create a column named price_per_person. This should be the total bill divided by the party size.
# c. Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount? The tip percentage?

# New columns have to be created with bracket notation
# a. tip as a fraction of the total bill
df["tip_percentage"] = df["tip"] / df["total_bill"]
# b. total bill divided by the party size
df["price_per_person"] = df["total_bill"] / df["number_of_people"]

In [None]:
# This exercise only uses the numeric columns, plus a True/False flag
# marking whether the meal was a dinner
df["dinner_time"] = df["time"] == "Dinner"
df = df[[
    "total_bill",
    "tip",
    "number_of_people",
    "tip_percentage",
    "price_per_person",
    "dinner_time",
]]

In [None]:
# Partition the tips data into train, validate, and test sets
(train, validate, test) = split(df)

In [None]:
# Separate the predictors (X) from the target (y) in each split
target = "tip"

X_train, y_train = train.drop(columns=[target]), train[target]
X_validate, y_validate = validate.drop(columns=[target]), validate[target]
X_test, y_test = test.drop(columns=[target]), test[target]

In [None]:
# Quick look at the training predictors
X_train.head(n=5)

In [None]:
# Make the scaler, then fit it on the training predictors only
scaler = sklearn.preprocessing.MinMaxScaler()
scaler.fit(X_train)

# Apply the train-fitted scaler to all three splits
X_train_scaled, X_validate_scaled, X_test_scaled = (
    scaler.transform(features) for features in (X_train, X_validate, X_test)
)

In [None]:
# Exercise 1c
# Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount? The tip percentage?
# Perhaps tip percentage? NB: tip_percentage is computed from the target (tip / total_bill), so using it as a predictor leaks the target!

In [None]:
# Exercise 1d
# Use numeric features to predict tip_amount
# Use select-K-best and RFE to select the top 2 features
k = 2

# Make the thing: score every feature with f_regression and keep the k best
# (pass k through instead of repeating the literal, so changing k above takes effect)
kbest = sklearn.feature_selection.SelectKBest(sklearn.feature_selection.f_regression, k=k)
# Fit the thing
kbest.fit(X_train_scaled, y_train)
# Use the thing:
# get_support() produces an array of booleans, so we can filter out the column names that matter the most.
# X_train_scaled is a plain array, so the names are taken from X_train.
X_train.columns[kbest.get_support()]

In [None]:
# Recursive feature elimination (RFE) driven by a linear regression

# Create the estimator and the selector that keeps 2 features
lm = sklearn.linear_model.LinearRegression()
rfe = sklearn.feature_selection.RFE(estimator=lm, n_features_to_select=2)
# Fit on the scaled training predictors
rfe.fit(X_train_scaled, y_train)
# Map the boolean support mask back to the original column names
rfe_columns = X_train.columns[rfe.get_support()].tolist()
rfe_columns

In [None]:
# Exercise 2
# Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

def select_kbest(X, y, k):
    '''
    Return the names of the top k features chosen by SelectKBest.

    X is the DataFrame of predictors (its .columns supply the names),
    y is the target, and k is the number of features to keep.
    Features are scored with f_regression.
    '''
    selector = sklearn.feature_selection.SelectKBest(
        score_func=sklearn.feature_selection.f_regression,
        k=k,
    )
    selector.fit(X, y)

    # Boolean mask over the columns: True where the feature was kept
    chosen_mask = selector.get_support()
    return X.columns[chosen_mask].tolist()

In [None]:
# Try the function on the tips training data, keeping the top 2 features
select_kbest(X_train, y_train, k=2)

In [None]:
# Exercise 3
# Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

# I use rfe as a variable name (perhaps more than I should), so we'll name this function select_rfe instead
def select_rfe(X, y, k, return_rankings=False, model=None):
    '''
    Return the names of the top k features chosen by recursive feature
    elimination (RFE).

    Parameters:
        X: DataFrame of predictors (its .columns supply the feature names).
        y: the target.
        k: number of features to select.
        return_rankings: when True, also return a Series of RFE rankings
            keyed by column name.
        model: estimator handed to RFE; a new LinearRegression is used
            when None.

    Returns:
        A list of feature names, or (features, rankings) when
        return_rankings is True.
    '''
    # Build the default estimator per call; a default written in the
    # signature would be created once and shared by every call.
    if model is None:
        model = LinearRegression()

    rfe = sklearn.feature_selection.RFE(model, n_features_to_select=k)
    rfe.fit(X, y)
    features = X.columns[rfe.support_].tolist()
    if not return_rankings:
        return features

    rankings = pd.Series(dict(zip(X.columns, rfe.ranking_)))
    return features, rankings

In [None]:
# Use the same inputs as the manual RFE run (scaled predictors, 2 features)
# so the two results can be compared. The scaled array is wrapped in a
# DataFrame because select_rfe reads the feature names from X.columns.
select_rfe(pd.DataFrame(X_train_scaled, columns=X_train.columns), y_train, 2)

In [None]:
# Exercise 4
# Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).

# Load the swiss dataset and preview the first rows
swiss = data("swiss")
swiss.head(5)

In [None]:
# Partition the swiss data into train, validate, and test sets
train, validate, test = split(swiss)

# Fertility is the target; every other column is a predictor
X_train, y_train = train.drop(columns="Fertility"), train["Fertility"]
X_validate, y_validate = validate.drop(columns="Fertility"), validate["Fertility"]
X_test, y_test = test.drop(columns="Fertility"), test["Fertility"]

In [None]:
# Scale the data
scaler = sklearn.preprocessing.MinMaxScaler()

# Fit the scaler on the training predictors only
scaler.fit(X_train)

# Use the scaler to transform train, validate, test
X_train_scaled = scaler.transform(X_train)
X_validate_scaled = scaler.transform(X_validate)
X_test_scaled = scaler.transform(X_test)


# Turn everything into a dataframe.
# Each split keeps its own row index so the scaled predictors still line up
# with y_train / y_validate / y_test; without index= the rows would be
# relabelled 0..n-1.
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_validate_scaled = pd.DataFrame(X_validate_scaled, columns=X_train.columns, index=X_validate.index)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_train.columns, index=X_test.index)