In [37]:
# Standard imports
import numpy as np
import pandas as pd

# To interact with the operating system
import os

# For statistical modeling
import scipy.stats as stats

# To acquire MYSQL Data
import acquire
from env import username, password, host
import wrangle

# For data visualization
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

# 
#import evaluate

# For running modeling
import sklearn.metrics as mtc
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, QuantileTransformer
from sklearn.feature_selection import SelectKBest, RFE, f_regression, SequentialFeatureSelector
from sklearn.linear_model import LinearRegression

#### 1. Load the tips dataset.
    a. Create a column named price_per_person. This should be the total bill divided by the party size.
    b. Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount?
    c. Use select k best to select the top 2 features for predicting tip amount. What are they?
    d. Use recursive feature elimination to select the top 2 features for tip amount. What are they?
    e. Why do you think select k best and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features you are selecting?
#### 2. Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

#### 3. Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

#### 4. Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).

In [5]:
tips = pd.read_csv('tips.csv')
tips.head(3)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3


    a. Create a column named price_per_person. This should be the total bill divided by the party size.

In [12]:
tips['ppp'] = tips.total_bill / tips['size']

In [194]:
tips.head(3)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,ppp
0,16.99,1.01,Female,No,Sun,Dinner,2,8.495
1,10.34,1.66,Male,No,Sun,Dinner,3,3.446667
2,21.01,3.5,Male,No,Sun,Dinner,3,7.003333


    b. Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount?

**total bill and time**

    c. Use select k best to select the top 2 features for predicting tip amount. What are they?

In [195]:
train, validate, test = wrangle.split(tips)

df shape: (244, 8)
Train shape: (146, 8)
Validate shape: (49, 8)
Test shape: (49, 8)


In [196]:
train.head(3)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,ppp
20,17.92,4.08,Male,No,Sat,Dinner,2,8.96
67,3.07,1.0,Female,Yes,Sat,Dinner,1,3.07
63,18.29,3.76,Male,Yes,Sat,Dinner,4,4.5725


In [216]:
t = pd.get_dummies(data = tips, columns={'sex', 'smoker', 'day', 'time'})

In [217]:
t.head(3)

Unnamed: 0,total_bill,tip,size,ppp,day_Fri,day_Sat,day_Sun,day_Thur,time_Dinner,time_Lunch,smoker_No,smoker_Yes,sex_Female,sex_Male
0,16.99,1.01,2,8.495,0,0,1,0,1,0,1,0,1,0
1,10.34,1.66,3,3.446667,0,0,1,0,1,0,1,0,0,1
2,21.01,3.5,3,7.003333,0,0,1,0,1,0,1,0,0,1


In [218]:
train = t[['total_bill','tip','ppp','smoker_No','smoker_Yes', 'time_Dinner', 'sex_Female', 'size']]

In [219]:
X_train, y_train = train[['total_bill','ppp','smoker_No','smoker_Yes', 'time_Dinner', 'sex_Female', 'size']], train.tip
X_validate, y_validate = validate[['total_bill', 'ppp', 'size']], validate.tip
X_test, y_test = test[['total_bill', 'ppp', 'size']], test.tip

In [220]:
# Make the model
kbest = SelectKBest(f_regression, k = 2)
# fit model
_ = kbest.fit(X_train, y_train)

In [221]:
kbest_results = pd.DataFrame(
    dict(p=kbest.pvalues_, f=kbest.scores_),
                             index = X_train.columns)

In [222]:
kbest_results

Unnamed: 0,p,f
total_bill,6.692471e-34,203.357723
ppp,2.502102e-08,33.213257
smoker_No,0.9265932,0.008506
smoker_Yes,0.9265932,0.008506
time_Dinner,0.05780153,3.633815
sex_Female,0.1664562,1.926155
size,4.300543e-16,76.175426


In [223]:
kbest.transform(X_train)[:5]

array([[16.99,  2.  ],
       [10.34,  3.  ],
       [21.01,  3.  ],
       [23.68,  2.  ],
       [24.59,  4.  ]])

In [224]:
X_train_transformed = pd.DataFrame(
    kbest.transform(X_train),
    columns=X_train.columns[kbest.get_support()],
    index=X_train.index
)

In [225]:
X_train_transformed.head(3)

Unnamed: 0,total_bill,size
0,16.99,2.0
1,10.34,3.0
2,21.01,3.0


## Testing Predictors for the new df that has dummy variables

In [197]:
t = pd.get_dummies(data = tips, columns={'sex', 'smoker', 'day', 'time', 'size'})

In [25]:
t.columns

Index(['total_bill', 'tip', 'ppp', 'smoker_No', 'smoker_Yes', 'day_Fri',
       'day_Sat', 'day_Sun', 'day_Thur', 'time_Dinner', 'time_Lunch', 'size_1',
       'size_2', 'size_3', 'size_4', 'size_5', 'size_6', 'sex_Female',
       'sex_Male'],
      dtype='object')

In [27]:
train, validate, test = wrangle.split(t)

df shape: (244, 19)
Train shape: (146, 19)
Validate shape: (49, 19)
Test shape: (49, 19)


In [28]:
X_train, y_train = train[['ppp', 'smoker_No', 'smoker_Yes', 'day_Fri',
       'day_Sat', 'day_Sun', 'day_Thur', 'time_Dinner', 'time_Lunch', 'size_1',
       'size_2', 'size_3', 'size_4', 'size_5', 'size_6', 'sex_Female',
       'sex_Male']], train.tip
X_validate, y_validate = validate[['ppp', 'smoker_No', 'smoker_Yes', 'day_Fri',
       'day_Sat', 'day_Sun', 'day_Thur', 'time_Dinner', 'time_Lunch', 'size_1',
       'size_2', 'size_3', 'size_4', 'size_5', 'size_6', 'sex_Female',
       'sex_Male']], validate.tip
X_test, y_test = test[['ppp', 'smoker_No', 'smoker_Yes', 'day_Fri',
       'day_Sat', 'day_Sun', 'day_Thur', 'time_Dinner', 'time_Lunch', 'size_1',
       'size_2', 'size_3', 'size_4', 'size_5', 'size_6', 'sex_Female',
       'sex_Male']], test.tip

In [29]:
# Make the model
kbest = SelectKBest(f_regression, k = 2)
# fit model
_ = kbest.fit(X_train, y_train)

In [30]:
# statistical f-value:
kbest.scores_
#p value: 
kbest.pvalues_

array([3.38104833e-04, 6.23024953e-01, 6.23024953e-01, 2.98304767e-01,
       1.65700505e-01, 1.56169975e-02, 7.17697281e-01, 4.73731314e-01,
       4.73731314e-01, 2.19499864e-02, 2.44503881e-07, 1.26636780e-01,
       1.59603700e-04, 1.14552634e-02, 6.27204157e-05, 3.07739009e-01,
       3.07739009e-01])

In [31]:
kbest_results = pd.DataFrame(
    dict(p=kbest.pvalues_, f=kbest.scores_),
                             index = X_train.columns)

In [32]:
kbest_results

Unnamed: 0,p,f
ppp,0.0003381048,13.485489
smoker_No,0.623025,0.242683
smoker_Yes,0.623025,0.242683
day_Fri,0.2983048,1.089619
day_Sat,0.1657005,1.941059
day_Sun,0.015617,5.986864
day_Thur,0.7176973,0.131224
time_Dinner,0.4737313,0.515966
time_Lunch,0.4737313,0.515966
size_1,0.02194999,5.365493


In [33]:
kbest.transform(X_train)[:5]

array([[1., 0.],
       [0., 0.],
       [0., 0.],
       [1., 0.],
       [0., 0.]])

In [34]:
X_train_transformed = pd.DataFrame(
    kbest.transform(X_train),
    columns=X_train.columns[kbest.get_support()],
    index=X_train.index
)

In [35]:
X_train_transformed.head(3)

Unnamed: 0,size_2,size_6
20,1.0,0.0
67,0.0,0.0
63,0.0,0.0


    d. Use recursive feature elimination to select the top 2 features for tip amount. What are they?

In [38]:
model = LinearRegression()

In [39]:
rfe = RFE(model, n_features_to_select=2)
# fit the model
rfe.fit(X_train, y_train)

RFE(estimator=LinearRegression(), n_features_to_select=2)

In [40]:
rfe.ranking_

array([ 8, 15, 16,  7,  9,  6, 10, 11, 12,  5,  4,  3,  2,  1,  1, 14, 13])

In [41]:
pd.DataFrame(
{
    'rfe_ranking': rfe.ranking_
}, index = X_train.columns
)

Unnamed: 0,rfe_ranking
ppp,8
smoker_No,15
smoker_Yes,16
day_Fri,7
day_Sat,9
day_Sun,6
day_Thur,10
time_Dinner,11
time_Lunch,12
size_1,5


In [42]:
rfe.get_support()

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False,  True,  True, False, False])

In [43]:
X_train_transformed = pd.DataFrame(rfe.transform(X_train), index= X_train.index, columns=X_train.columns[rfe.support_])

In [44]:
X_train_transformed.head(3)

Unnamed: 0,size_5,size_6
20,0.0,0.0
67,0.0,0.0
63,0.0,0.0


    e. Why do you think select k best and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features you are selecting?

### 2. Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [188]:


def select_kbest(X_train, y_train, k):
    kbest = SelectKBest(f_regression, k=k)
    
    kbest.fit(X_train, y_train)
    
    X_train_transformed = pd.DataFrame(kbest.transform(X_train),
                                       columns=X_train.columns[kbest.get_support()],
                                       index=X_train.index)
    
    return X_train.columns[kbest.get_support()],X_train_transformed.head(3)

In [189]:
select_kbest(X_train, y_train, 4)

(Index(['Agriculture', 'Examination', 'Education', 'Catholic'], dtype='object'),
            Agriculture  Examination  Education  Catholic
 La Vallee         15.2         31.0       20.0      2.15
 Sierre            84.6          3.0        3.0     99.46
 Moudon            55.1         14.0        3.0      4.52)

In [None]:
kbest = SelectKBest(f_regression, k=2)
# fit model
_ = kbest.fit(X_train, y_train)

# statistical f-value:
kbest.scores_
#p value: 
kbest.pvalues_

kbest_results = pd.DataFrame(
    dict(p=kbest.pvalues_, f=kbest.scores_),
                             index = X_train.columns)

kbest_results

X_train_transformed = pd.DataFrame(
    kbest.transform(X_train),
    columns=X_train.columns[kbest.get_support()],
    index=X_train.index
)

X_train_transformed.head(3)

### 3. Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [236]:
'''RFE: takes in the predictors, the target, and the number of features to select and returns 
the top features based on the RFE class.'''

def rfe(X_train, y_train, k):
    model = LinearRegression()
    # Make the model
    rfe = RFE(model, n_features_to_select=k)
    # Fit the model
    rfe.fit(X_train, y_train)
    
    X_train_transformed = pd.DataFrame(rfe.transform(X_train), 
                                       index= X_train.index, 
                                       columns=X_train.columns[rfe.support_])
    
    return X_train.columns[rfe.support_], X_train_transformed.head(3)
    

In [237]:
rfe(X_train, y_train, 3)

(Index(['total_bill', 'ppp', 'smoker_No'], dtype='object'),
    total_bill       ppp  smoker_No
 0       16.99  8.495000        1.0
 1       10.34  3.446667        1.0
 2       21.01  7.003333        1.0)

In [232]:
model = LinearRegression()

rfe = RFE(model, n_features_to_select=2)
# fit the model
rfe.fit(X_train, y_train)

pd.DataFrame(
{
    'rfe_ranking': rfe.ranking_
}, index = X_train.columns
)

rfe.get_support()

X_train_transformed = pd.DataFrame(rfe.transform(X_train), index= X_train.index, columns=X_train.columns[rfe.support_])

### 4. Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).

In [59]:
#pydataset.load_dataset('swiss')

In [47]:
#<iframe src="https://r-data.pmagunia.com/iframe/r-dataset-package-datasets-swiss.html" frameBorder="0" width="100%" height="307px" />
import requests

In [123]:
#url = "https://github.com/cs109/2014_data/blob/master/countries.csv"
#url = "https://r-data.pmagunia.com/iframe/r-dataset-package-datasets-swiss.html"
#url = "https://rstudio-pubs-static.s3.amazonaws.com/534472_4d89ba1506b24e22acf23d0482bd6a53.html"
#c = pd.read_csv('Sample-Data-Swiss-Fertility.csv')

In [125]:
#c.head(3)

In [126]:
#replace('Quantity[' : '')

In [127]:
#type(c.Fertility)

In [128]:
#c.Fertility = c.Fertility.str.replace('"Percent"','%')

In [129]:
#c.head(3)

In [117]:
#c.replace(to_replace('"Percent]"','%'))

In [118]:
#c.Fertility.str.replace('Quantity[','n')

In [119]:
#c.values.remove('Quantity[')

In [121]:
from pydataset import data

In [238]:
swiss = data('swiss')
swiss.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [239]:
train, validate, test = wrangle.split(swiss)

df shape: (47, 6)
Train shape: (27, 6)
Validate shape: (10, 6)
Test shape: (10, 6)


In [240]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 27 entries, La Vallee to Martigwy
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Fertility         27 non-null     float64
 1   Agriculture       27 non-null     float64
 2   Examination       27 non-null     int64  
 3   Education         27 non-null     int64  
 4   Catholic          27 non-null     float64
 5   Infant.Mortality  27 non-null     float64
dtypes: float64(4), int64(2)
memory usage: 1.5+ KB


In [241]:
X_train, y_train = train[['Agriculture','Examination','Education','Catholic','Infant.Mortality']], train.Fertility
X_validate, y_validate = validate[['Agriculture','Examination','Education','Catholic','Infant.Mortality']], validate.Fertility
X_test, y_test = test[['Agriculture','Examination','Education','Catholic','Infant.Mortality']], test.Fertility

In [243]:
rfe(X_train, y_train, 3)

(Index(['Examination', 'Education', 'Infant.Mortality'], dtype='object'),
            Examination  Education  Infant.Mortality
 La Vallee         31.0       20.0              10.8
 Sierre             3.0        3.0              16.3
 Moudon            14.0        3.0              22.4)

In [244]:
select_kbest(X_train, y_train, 3)

(Index(['Agriculture', 'Examination', 'Education'], dtype='object'),
            Agriculture  Examination  Education
 La Vallee         15.2         31.0       20.0
 Sierre            84.6          3.0        3.0
 Moudon            55.1         14.0        3.0)

In [142]:
kbest = SelectKBest(f_regression, k=3)
# fit model
_ = kbest.fit(X_train, y_train)

In [143]:
# statistical f-value:
kbest.scores_
#p value: 
kbest.pvalues_

array([8.80334021e-03, 2.45509799e-05, 1.31475704e-05, 1.83966651e-02,
       8.97779012e-02])

In [144]:
kbest.scores_

array([ 8.07501955, 26.64154471, 29.20451537,  6.36201886,  3.11523751])

In [145]:
kbest_results = pd.DataFrame(
    dict(p=kbest.pvalues_, f=kbest.scores_),
                             index = X_train.columns)

In [146]:
kbest_results

Unnamed: 0,p,f
Agriculture,0.008803,8.07502
Examination,2.5e-05,26.641545
Education,1.3e-05,29.204515
Catholic,0.018397,6.362019
Infant.Mortality,0.089778,3.115238


In [147]:
X_train_transformed = pd.DataFrame(
    kbest.transform(X_train),
    columns=X_train.columns[kbest.get_support()],
    index=X_train.index
)


In [148]:
X_train_transformed.head(3)

Unnamed: 0,Agriculture,Examination,Education
La Vallee,15.2,31.0,20.0
Sierre,84.6,3.0,3.0
Moudon,55.1,14.0,3.0
