In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

# Show every column when a wide DataFrame is displayed (None = no column limit).
# Uses the documented top-level pd.set_option rather than reaching it through
# the incidental `pd.pandas` attribute.
pd.set_option('display.max_columns', None)

In [4]:
# Read the train and test tables from the local ./data directory.
train_csv = './data/X_train_scaled.csv'
test_csv = './data/X_test_scaled.csv'

X_train_scaled = pd.read_csv(train_csv)
X_test_scaled = pd.read_csv(test_csv)

# Preview the first rows of the training table.
X_train_scaled.head()

Unnamed: 0,id,tip_amount,VendorID,passenger_count,trip_distance,RatecodeID_one,RatecodeID_five,RatecodeID_two,RatecodeID_three,RatecodeID_four,store_and_fwd_flag_N,store_and_fwd_flag_Y,PULocationID_OTHER,PULocationID_162,PULocationID_161,PULocationID_236,PULocationID_170,PULocationID_186,PULocationID_230,PULocationID_142,PULocationID_48,PULocationID_234,PULocationID_237,DOLocationID_OTHER,DOLocationID_237,DOLocationID_161,DOLocationID_186,DOLocationID_236,DOLocationID_234,DOLocationID_48,DOLocationID_142,DOLocationID_230,DOLocationID_170,DOLocationID_162,payment_type_one,payment_type_two,payment_type_three,payment_type_four,fare_amount,extra,mta_tax,tolls_amount,improvement_surcharge,total_amount,trip_seconds
0,25324,1.36,1.0,0.166667,0.013209,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.373393,0.111111,1.0,0.0,1.0,0.242886,0.040803
1,25134,2.58,1.0,0.333333,0.014172,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.405735,0.0,1.0,0.0,1.0,0.276784,0.046823
2,27635,3.7,0.0,0.166667,0.111447,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.476668,0.222222,1.0,0.0,1.0,0.355153,0.063545
3,9501,0.0,0.0,0.166667,0.004128,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.354548,0.0,1.0,0.0,1.0,0.203477,0.039465
4,21993,3.8,0.0,0.166667,0.052284,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.428681,0.111111,1.0,0.0,1.0,0.306046,0.046823


In [8]:
# Split each table into the regression target and the feature matrix.
TARGET_COL = 'tip_amount'

# Columns that must not reach the model:
#   - 'Unnamed: 0': a leftover row counter (0, 1, 2, ... in the preview above);
#     pandas gives this label to a CSV column that has no header. It carries no
#     signal and was previously being fed to the Lasso as a feature.
#   - 'id': row identifier.
#   - the target itself.
NON_FEATURE_COLS = ['Unnamed: 0', 'id', TARGET_COL]

y_train = X_train_scaled[TARGET_COL]
y_test = X_test_scaled[TARGET_COL]

# Reassign instead of mutating in place; errors='ignore' tolerates CSVs that
# were written without the extra index column.
X_train_scaled = X_train_scaled.drop(columns=NON_FEATURE_COLS, errors='ignore')
X_test_scaled = X_test_scaled.drop(columns=NON_FEATURE_COLS, errors='ignore')

# NOTE(review): 'total_amount' remains a feature. If it includes the tip, it
# leaks the target into the model -- confirm against the data dictionary.

### Feature selection

In [9]:
# Feature selection via an L1-penalised linear model:
#   1. Lasso is the estimator; `alpha` is the penalty strength -- the larger
#      it is, the fewer features end up selected.
#   2. SelectFromModel wraps the estimator and keeps the features whose
#      fitted coefficients are non-zero.

lasso = Lasso(alpha=0.005, random_state=1)
sel_ = SelectFromModel(estimator=lasso)

# Fitting the selector fits the wrapped Lasso on the training data.
# Left as the final bare expression so the fitted selector is displayed.
sel_.fit(X_train_scaled, y_train)

SelectFromModel(estimator=Lasso(alpha=0.005, copy_X=True, fit_intercept=True,
                                max_iter=1000, normalize=False, positive=False,
                                precompute=False, random_state=1,
                                selection='cyclic', tol=0.0001,
                                warm_start=False),
                max_features=None, norm_order=1, prefit=False, threshold=None)

In [11]:
# Boolean mask from the fitted selector, one entry per training column;
# indexing the column labels with it yields the names of the kept features.
support_mask = sel_.get_support()
selected_features = X_train_scaled.columns[support_mask]

selected_features

Index(['VendorID', 'trip_distance', 'RatecodeID_one', 'PULocationID_161',
       'DOLocationID_OTHER', 'payment_type_one', 'tolls_amount',
       'total_amount'],
      dtype='object')

In [12]:
# Report how many columns went into the selector and how many it kept.
print(
    f'Total features: {X_train_scaled.shape[1]}',
    f'Selected features: {len(selected_features)}',
    sep='\n',
)

Total features: 43
Selected features: 8


In [14]:
# Persist only the names of the selected features (not the data itself).
# The Series is built once and written without its index.
selected_features_series = pd.Series(selected_features)
selected_features_series.to_csv('selected_features.csv', index=False)