In [None]:
# set up the environment

from math import sqrt
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.feature_selection import SelectKBest, f_regression, RFE
import split_scale

from pydataset import data

import warnings
warnings.filterwarnings('ignore')

# Feature Engineering Exercises

Do your work for this exercise in a jupyter notebook named feature_engineering within the regression directory.

1. Load the `tips` dataset.
    - Create a column named `tip_percentage`. This should be the tip amount divided by the total bill.
    - Create a column named `price_per_person`. This should be the total bill divided by the party size.
    - Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount? The tip percentage?
    - Use all the other numeric features to predict tip amount. Use select k best and recursive feature elimination to select the top 2 features. What are they?
    - Use all the other numeric features to predict tip percentage. Use select k best and recursive feature elimination to select the top 2 features. What are they?
    - Why do you think select k best and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features you are selecting?

In [None]:
tips = data("tips")

In [None]:
tips.head()

In [None]:
train, test = split_scale.split_my_data(tips, .8)

In [None]:
train.head()

In [None]:
scaler, train_standard, test_standard = split_scale.standard_scaler(train[['total_bill', 'tip', 'size']], test[['total_bill', 'tip', 'size']])

In [None]:
train_standard.head()

In [None]:
train['total_bill'] = train_standard['total_bill']

In [None]:
train['tip'] = train_standard['tip']

In [None]:
train['size'] = train_standard['size']

In [None]:
train

In [None]:
# NOTE(review): train.tip and train.total_bill were overwritten a few cells up
# with the outputs of split_scale.standard_scaler, so this is a ratio of scaled
# values (presumably z-scores, which can be ~0 or negative) rather than
# tip / total bill in the original units the exercise asks for -- confirm, and
# consider deriving this column from the raw values before scaling.
train['tip_percentage'] = train.tip / train.total_bill

In [None]:
# NOTE(review): train.total_bill and train['size'] hold the outputs of
# split_scale.standard_scaler at this point (assigned a few cells up), so this
# divides one scaled column by another (presumably z-scores) instead of dividing
# the bill by the party size -- confirm, and consider computing it before scaling.
train['price_per_person'] = train.total_bill / train['size']

In [None]:
train.head()

In [None]:
# Predictors: everything left after dropping the target (tip) and the
# sex / smoker / day / time columns. The target is kept as a one-column frame.
X_train_tip_guess = train.drop(['tip','sex','smoker','day','time'], axis=1)
y_train_tip_guess = train[['tip']]
# NOTE(review): in the cells above, `test` is never given the scaled columns or
# the tip_percentage / price_per_person columns that were added to `train`, so
# the test matrix will not line up with the training matrix -- confirm before
# using it for evaluation.
X_test_tip_guess = test.drop(['tip','sex','smoker','day','time'], axis=1)
y_test_tip_guess = test[['tip']]

In [None]:
X_train_tip_guess.head()

In [None]:
y_train_tip_guess.head()

## Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount? The tip percentage?

### I think size will be the most important

In [None]:
f_selector = SelectKBest(f_regression, k = 2)

In [None]:
# running correlation test between each x and y and returning the score, f-statistic

f_selector.fit(X_train_tip_guess, y_train_tip_guess)

In [None]:
# select the k best features
X2 = f_selector.transform(X_train_tip_guess)

# Compare the reduced matrix with the matrix the selector was fitted on.
# (X_train is not defined until the swiss section further down, so referring
# to it here fails on a fresh kernel.)
print(X2.shape)
print(X_train_tip_guess.shape)

In [None]:
f_support = f_selector.get_support()
f_support

In [None]:
f_feature = X_train_tip_guess.loc[:,f_support].columns.tolist()
f_feature

In [None]:
lm = LinearRegression()

In [None]:
# Keep the 2 strongest features. n_features_to_select has to be passed by
# keyword: current scikit-learn releases reject it as a positional argument.
rfe = RFE(lm, n_features_to_select=2)

In [None]:
rfe.fit(X_train_tip_guess, y_train_tip_guess)

In [None]:
X_rfe = rfe.transform(X_train_tip_guess)
X_rfe[0:2]

In [None]:
lm.fit(X_rfe, y_train_tip_guess)

In [None]:
mask = rfe.support_
rfe_features = X_train_tip_guess.loc[:,mask].columns.tolist()
rfe_features

In [None]:
# Line up each candidate column with the rank the fitted RFE assigned to it
var_names = X_train_tip_guess.columns.tolist()
var_ranks = rfe.ranking_

pd.DataFrame(dict(Feature=var_names, Rank=var_ranks))

## Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [None]:
def select_kbest(X, y, k):
    """Return the names of the k features SelectKBest scores highest.

    Features are scored with f_regression.

    Takes:
          X - df of features
          y - df of target
          k - int: number of features to keep
    Returns:
          list of column names of highest scoring features
    """
    # k is passed by keyword: current scikit-learn releases do not accept it
    # as a positional argument.
    selector = SelectKBest(f_regression, k=k).fit(X, y)
    # get_support() yields a mask aligned with X's columns; use it to pick names
    support_mask = selector.get_support()
    return X.loc[:, support_mask].columns.tolist()

In [None]:
select_kbest(X_train_tip_guess, y_train_tip_guess, 2)

## Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [None]:
def select_rfe(X, y, k):
    """Return the names of the k features kept by recursive feature elimination.

    A LinearRegression model is used as the estimator inside RFE.

    Takes:
          X - df of features
          y - df of target
          k - int: number of features to keep
    Returns:
          list of column names of the features RFE retained
    """
    # n_features_to_select is passed by keyword: current scikit-learn releases
    # do not accept it as a positional argument.
    eliminator = RFE(LinearRegression(), n_features_to_select=k)
    # Only the fitted support mask is needed, so fit() replaces fit_transform()
    eliminator.fit(X, y)
    return X.loc[:, eliminator.support_].columns.tolist()

In [None]:
# Call the helper defined above; `rfe` is the fitted RFE object from the manual
# walkthrough and is not callable.
select_rfe(X_train_tip_guess, y_train_tip_guess, 2)

## Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).

In [None]:
swiss = data('swiss')

In [None]:
# Show the documentation for the dataset this section works with
data('swiss', show_doc=True)

In [None]:
swiss.head()

In [None]:
swiss.info()

In [None]:
train, test = split_scale.split_my_data(swiss, .8)

In [None]:
train.head()

In [None]:
# Predictors are every column except Fertility; the target stays a one-column frame
X_train, X_test = (frame.drop(columns='Fertility') for frame in (train, test))
y_train, y_test = (frame[['Fertility']] for frame in (train, test))

In [None]:
select_kbest(X_train, y_train, 3)

In [None]:
select_rfe(X_train, y_train, 3)

In [None]:
# Refit RFE by hand on the swiss predictors so ranking_ can be inspected below.
lm = LinearRegression()
# n_features_to_select must be a keyword argument in current scikit-learn
rfe = RFE(lm, n_features_to_select=3)
X_rfe = rfe.fit_transform(X_train, y_train)

In [None]:
var_ranks = rfe.ranking_
var_names = X_train.columns.tolist()

pd.DataFrame({'Feature': var_names, 'Rank': var_ranks})

In [None]:
## Class version

In [None]:
tips = data('tips')

In [None]:
tips.head()

In [None]:
tips = tips.rename(columns={'size': 'party_size'})

In [None]:
tips['tip_percentage'] = tips.tip / tips.total_bill
tips['price_per_person'] = tips.total_bill / tips.party_size

In [None]:
tips.head()

In [None]:
# prep our data

train, test = train_test_split(tips, random_state=123, train_size=.80)

In [None]:
x_cols = ['total_bill', 'party_size', 'tip_percentage', 'price_per_person']
X_train = train[x_cols]
y_train = train.tip

X_test = test[x_cols]
y_test = test.tip

In [None]:
import sklearn.linear_model
import sklearn.feature_selection

# number of features each selector should keep
k = 2

# initialize the sklearn objects
# (k / n_features_to_select are passed by keyword -- current scikit-learn
# releases do not accept them positionally -- and reuse k instead of a literal)
lm = sklearn.linear_model.LinearRegression()
kbest = sklearn.feature_selection.SelectKBest(sklearn.feature_selection.f_regression, k=k)
rfe = sklearn.feature_selection.RFE(lm, n_features_to_select=k)

In [None]:
# prep our data
# NOTE: this repeats the split from a few cells up with the same arguments
# (random_state=123, train_size=.80) and the same x_cols, so it rebuilds the
# same train/test pieces rather than producing a new split.
import sklearn.model_selection

train, test = sklearn.model_selection.train_test_split(tips, random_state=123, train_size=.80)

# predictors used to model tip; the target is the tip column itself
x_cols = ['total_bill', 'party_size', 'tip_percentage', 'price_per_person']
X_train = train[x_cols]
y_train = train.tip

X_test = test[x_cols]
y_test = test.tip

In [None]:
kbest.fit(X_train, y_train)
X_train.columns[kbest.get_support()]

In [None]:
rfe.fit(X_train, y_train)
X_train.columns[rfe.support_]

In [None]:
rfe.support_

In [None]:
X_train.columns[rfe.support_]

In [None]:
# exploring what .transform does: it reduces X_train to the columns the fitted
# selector chose. Show the first few rows so the cell actually displays a result.
X_train_kbest_features = kbest.transform(X_train)
X_train_kbest_features[:5]

In [None]:
def select_kbest(X, y, k):
    """Return the labels of the k columns of X that SelectKBest keeps.

    Features are scored with f_regression.

    X: DataFrame of predictors
    y: target values
    k: number of features to keep
    Returns the selected column labels (X.columns indexed by the support mask).
    """
    # k is passed by keyword: current scikit-learn releases do not accept it
    # as a positional argument.
    kbest = sklearn.feature_selection.SelectKBest(sklearn.feature_selection.f_regression, k=k)
    kbest.fit(X, y)
    return X.columns[kbest.get_support()]

In [None]:
select_kbest(X_train, y_train, 2)

In [None]:
def select_rfe(X, y, k):
    """Return the labels of the k columns of X kept by recursive feature elimination.

    A LinearRegression model is used as the estimator inside RFE.

    X: DataFrame of predictors
    y: target values
    k: number of features to keep
    Returns the selected column labels (X.columns indexed by rfe.support_).
    """
    lm = sklearn.linear_model.LinearRegression()
    # n_features_to_select is passed by keyword: current scikit-learn releases
    # do not accept it as a positional argument.
    rfe = sklearn.feature_selection.RFE(lm, n_features_to_select=k)
    rfe.fit(X, y)
    return X.columns[rfe.support_]

In [None]:
select_rfe(X_train, y_train, 2)