In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pydataset import data
import wrangle
import prepare

from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from sklearn.preprocessing import MinMaxScaler

# Exercises

**1. Load the tips dataset.**

In [None]:
df = data ('tips')

In [None]:
df.head()


In [None]:
df.info()

**a. Create a column named tip_percentage. This should be the tip amount divided by the total bill.**

In [None]:
df['tip_percentage'] = round((df['tip'] / df['total_bill'])*100 , 2)

In [None]:
df.head()

**b. Create a column named price_per_person. This should be the total bill divided by the party size.**

In [None]:
df ['price_per_person']=  df['total_bill'] / df['size']

In [None]:
df.head()

**c. Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount? The tip percentage?**

total_bill and size

**d. Use all the other numeric features to predict tip amount. Use select k best and recursive feature elimination to select the top 2 features. What are they?**

In [None]:
#split data in train, validate and split
train, validate, test = wrangle.split_data(df)

In [None]:
#split the target and the features
X_train = train.drop(columns = ['tip'])
y_train = train['tip']


In [None]:
X_validate = validate.drop(columns = ['tip'])
X_test = test.drop(columns = ['tip'])

In [None]:
#get all numerics columns
cols = X_train.select_dtypes(exclude='object').columns.to_list()

In [None]:
#scaled the columns
X_train_scaled , X_validate_scaled , X_test_scaled = prepare.scaled_mimmax(cols, X_train , X_validate, X_test)

In [None]:
X_train_scaled.head()

**SelectKBest**

Uses an F Test to compare how well each feature predicts the target variable.

In [None]:
f_selector = SelectKBest(score_func=f_regression, k=2)
f_selector.fit(X_train_scaled, y_train)

In [None]:
#get the top 2 features
mask = f_selector.get_support()
X_train_scaled.columns[mask]

**Recursive Feature Elimination (RFE)**

- Fits a model and recursively eliminates the worst performing features.

- Only works for models that can rank features.

In [None]:
lm = LinearRegression()
rfe = RFE(estimator=lm, n_features_to_select=2)
rfe.fit(X_train_scaled, y_train)

In [None]:
rfe.support_

In [None]:
#get the top 2 features

X_train_scaled.columns[rfe.support_]

In [None]:

pd.Series(dict(zip(X_train_scaled.columns, rfe.ranking_))).sort_values()

**takeaways**
- the top 2 features for SelectKBest are: total_bill', 'size'
- the top 2 features for Recursive Feature Elimination (RFE) are: 'total_bill', 'tip_percentage'

In [None]:
train.corr()

**e. Use all the other numeric features to predict tip percentage. Use select k best and recursive feature elimination to select the top 2 features. What are they?**

In [None]:
train.head()

In [None]:
#split the target and the features
X_train = train.drop(columns = ['tip_percentage'])
y_train = train['tip_percentage']

In [None]:
X_validate = validate.drop(columns = ['tip_percentage'])
X_test = test.drop(columns = ['tip_percentage'])

In [None]:
cols = X_train.select_dtypes(exclude='object').columns.to_list()
cols

In [None]:
#scaled the columns
X_train_scaled , X_validate_scaled , X_test_scaled = prepare.scaled_mimmax(cols, X_train , X_validate, X_test)

In [None]:
X_train_scaled.head()

**SelectKBest**

In [None]:
f_selector = SelectKBest(score_func=f_regression, k=2)
f_selector.fit(X_train_scaled, y_train)

In [None]:
mask = f_selector.get_support()
X_train_scaled.columns[mask]

**Recursive Feature Elimination (RFE)**

In [None]:
lm = LinearRegression()
rfe = RFE(estimator=lm, n_features_to_select=2)
rfe.fit(X_train_scaled, y_train)

In [None]:
rfe.support_

In [None]:
X_train_scaled.columns[rfe.support_]

In [None]:
#let's see the ranks 
pd.Series(dict(zip(X_train_scaled.columns, rfe.ranking_))).sort_values()

**takeaways**

the top 2 features for SelectKBest are: ''tip_minmax', 'price_per_person_minmax'
the top 2 features for Recursive Feature Elimination (RFE) are: 'total_bill_minmax', 'tip_minmax'

**f. Why do you think select k best and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features your are selecting?**

In [None]:
f_selector = SelectKBest(score_func=f_regression, k=2)
f_selector.fit(X_train_scaled, y_train)
mask = f_selector.get_support()
X_train_scaled.columns[mask]

In [None]:
lm = LinearRegression()
rfe = RFE(estimator=lm, n_features_to_select= 2)
rfe.fit(X_train_scaled, y_train)
rfe.support_
X_train_scaled.columns[rfe.support_]

**2. Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.**

In [7]:
def select_kbest  (X_df, y_df, n_features):
    '''
    Takes in the predictors, the target, and the number of features to select (k) ,
    and returns the names of the top k selected features based on the SelectKBest class
    
    X_df : the predictors
    y_df : the target
    n_features : he number of features to select (k)
    Example
    select_kbest(X_train_scaled, y_train, 2)
    '''
    
    f_selector = SelectKBest(score_func=f_regression, k= n_features)
    f_selector.fit(X_df, y_df)
    mask = f_selector.get_support()
    X_df.columns[mask]
    top = list(X_df.columns[mask])
    
    return print(f'The top {n_features} selected feautures based on the SelectKBest class are: {top}' )

In [None]:
select_kbest (X_train_scaled, y_train, 2)

**3. Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.**

In [8]:
def rfe (X_df, y_df, n_features):
    lm = LinearRegression()
    rfe = RFE(estimator=lm, n_features_to_select= n_features)
    rfe.fit(X_df, y_df)
    rfe.support_
    top = list(X_df.columns[rfe.support_])
    return print(f'The top {n_features} selected feautures based on the the RFE class class are: {top}' )

In [None]:
rfe (X_train_scaled, y_train, 2)

**4. Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).**

In [2]:
swiss_df = data('swiss')

In [3]:
swiss_df.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [4]:
swiss_df.shape

(47, 6)

In [5]:
#split data in train, validate and split
train, validate, test = wrangle.split_data(swiss_df)

train -> (25, 6)
validate -> (12, 6)
test -> (10, 6)


In [None]:
#split X, y
def split_Xy (train, validate, test, target):
    '''
    This function takes in three dataframe (train, validate, test) and a target  and splits each of the 3 samples
    into a dataframe with independent variables and a series with the dependent, or target variable.
    The function returns 3 dataframes and 3 series:
    X_train (df) & y_train (series), X_validate & y_validate, X_test & y_test.
    '''
    
    #split train
    X_train = train.drop(columns= [target])
    y_train= train[target]
    #split validate
    X_validate = validate.drop(columns= [target])
    y_validate= validate[target]
    #split validate
    X_test = test.drop(columns= [target])
    y_test= test[target]
    return  X_train, y_train, X_validate, y_validate, X_test, y_test
    

In [6]:
#split Xy using my function
X_train, y_train, X_validate, y_validate, X_test, y_test = wrangle.split_Xy (train, validate, test, 'Fertility' )

X_train -> (25, 5)               y_train->(25,)
X_validate -> (12, 5)         y_validate->(12,) 
X_test -> (10, 5)                  y_test>(10,)


In [16]:
columns = list(X_train.select_dtypes(exclude='object').columns)
columns

['Agriculture', 'Examination', 'Education', 'Catholic', 'Infant.Mortality']

In [20]:
#scaled
X_train_scaled_df, validate_scaled_df, test_scaled_df = prepare.scaled_mimmax(columns, X_train, X_validate, X_test)

In [21]:
#kbest
select_kbest(X_train_scaled_df, y_train, 3)

The top 3 selected feautures based on the SelectKBest class are: ['Examination', 'Catholic', 'Infant.Mortality']


In [23]:
#rfe
rfe(X_train_scaled_df, y_train, 3)

The top 3 selected feautures based on the the RFE class class are: ['Agriculture', 'Examination', 'Infant.Mortality']
