In [34]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest, RFE, f_regression, SequentialFeatureSelector
from pydataset import data
import prepare
from sklearn.linear_model import LinearRegression

In [6]:
# Load the 'tips' dataset through pydataset's data() helper.
tips = data('tips')

In [57]:
tips.head()

Unnamed: 0,total_bill,tip,size,price_per_person,sex_Male,smoker_Yes,day_Sat,day_Sun,day_Thur,time_Lunch
1,16.99,1.01,2,8.495,0,0,0,1,0,0
2,10.34,1.66,3,3.446667,1,0,0,1,0,0
3,21.01,3.5,3,7.003333,1,0,0,1,0,0
4,23.68,3.31,2,11.84,1,0,0,1,0,0
5,24.59,3.61,4,6.1475,0,0,0,1,0,0


In [48]:
tips.dtypes

total_bill          float64
tip                 float64
size                  int64
price_per_person    float64
sex_Male              uint8
smoker_Yes            uint8
day_Sat               uint8
day_Sun               uint8
day_Thur              uint8
time_Lunch            uint8
dtype: object

In [18]:
# Replace `tips` with the output of the project-local prepare.dem_dummies helper.
# NOTE(review): presumably this one-hot encodes the categorical columns (the
# sex_Male / smoker_Yes / day_* / time_Lunch columns shown by tips.head() would
# come from it) — confirm in prepare.py.
# NOTE(review): this cell carries execution count In [18], while the tips.head()
# and tips.dtypes cells placed before it carry In [57] and In [48], so cell order
# does not match execution order. Because the cell rebinds its own input,
# re-running it on an already-encoded frame may not be idempotent — verify with
# Restart Kernel -> Run All.
tips = prepare.dem_dummies(tips)

In [36]:
# Derived feature: bill amount divided by party size.
# Bracket access is required for 'size' because DataFrame.size is an attribute.
tips['price_per_person'] = tips['total_bill'].div(tips['size'])

In [10]:
# The total bill and the party size will likely influence the tip the most.

In [37]:
# Split the frame into the target (tip) and the candidate feature columns.
y_train = tips['tip']
X_train = tips.drop(columns=['tip'])

In [38]:
# Score every feature against the target with a univariate F-test and keep the
# single best one. fit() returns the fitted selector, so chaining it removes the
# need for `_ = ...`: rebinding `_` in IPython stops the kernel from updating
# its automatic last-output variables.
kbest = SelectKBest(f_regression, k=1).fit(X_train, y_train)

In [39]:
kbest.scores_

array([2.03357723e+02, 7.61754260e+01, 3.32132566e+01, 1.92615456e+00,
       8.50601355e-03, 1.88381326e-03, 3.84838990e+00, 2.24530209e+00,
       3.63381548e+00])

In [40]:
kbest.pvalues_

array([6.69247065e-34, 4.30054333e-16, 2.50210155e-08, 1.66456235e-01,
       9.26593152e-01, 9.65416142e-01, 5.09401215e-02, 1.35324008e-01,
       5.78015348e-02])

In [41]:
X_train.columns[kbest.get_support()]

Index(['total_bill'], dtype='object')

In [49]:
def select_kbest(X, y, k):
    """
    Return the names of the k features in X that score highest against y.

    Each feature is scored on its own with the univariate linear-regression
    F-test (``f_regression``) and the top k are kept via ``SelectKBest``.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate features; must expose ``.columns`` so names can be returned.
    y : array-like
        Target values, one per row of X.
    k : int or "all"
        Number of features to keep; passed straight through to ``SelectKBest``.

    Returns
    -------
    list
        Column labels of the selected features, in X's original column order.
    """
    selector = SelectKBest(f_regression, k=k).fit(X, y)
    # get_support() gives one boolean per column; pairing it with the column
    # labels keeps the result in column order without shadowing the builtin
    # `bool` as a loop variable.
    return [feature for feature, keep in zip(X.columns, selector.get_support()) if keep]

In [50]:
def rfe(X, y, k):
    """
    Return the names of the k features in X kept by Recursive Feature Elimination.

    A ``LinearRegression`` model is refit repeatedly; on each round the feature
    the fitted model weights least (by its coefficients, not by univariate
    correlation with y) is dropped, until only k features remain.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate features; must expose ``.columns`` so names can be returned.
    y : array-like
        Target values, one per row of X.
    k : int
        Number of features to keep (``n_features_to_select``).

    Returns
    -------
    list
        Column labels of the retained features, in X's original column order.

    Notes
    -----
    NOTE(review): linear-regression coefficient magnitudes depend on each
    feature's scale, so unscaled inputs can change which features survive —
    consider scaling X before calling.
    """
    selector = RFE(LinearRegression(), n_features_to_select=k).fit(X, y)
    # support_ holds one boolean per column; pairing it with the column labels
    # keeps the result in column order without shadowing the builtin `bool`.
    return [feature for feature, keep in zip(X.columns, selector.support_) if keep]

In [42]:
# Load the 'swiss' dataset through pydataset's data() helper.
swiss = data('swiss')

In [46]:
swiss.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [53]:
# Top 3 predictors of Fertility according to the univariate F-test.
select_kbest(X=swiss.drop(columns=['Fertility']), y=swiss['Fertility'], k=3)

['Examination', 'Education', 'Catholic']

In [56]:
# Top 3 predictors of Fertility according to recursive feature elimination.
rfe(X=swiss.drop(columns=['Fertility']), y=swiss['Fertility'], k=3)

['Examination', 'Education', 'Infant.Mortality']

In [None]:
tip