In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

# Show every column when a DataFrame is displayed (None removes the column limit).
# Uses the public top-level pandas API rather than the `pd.pandas` self-reference.
pd.set_option('display.max_columns', None)

In [2]:
# Locations of the training and validation CSVs, relative to the working directory.
train_csv = '../data/train_scaled.csv'
val_csv = '../data/val_scaled.csv'

# Each file is read into its own DataFrame; at this point the frames still
# contain the 'id' and 'tip_amount' columns alongside the features.
X_train_scaled = pd.read_csv(train_csv)
X_val_scaled = pd.read_csv(val_csv)
# test_scaled = pd.read_csv('./data/X_test_scaled.csv')  # test split load left disabled

# Preview the first rows of the training frame.
X_train_scaled.head()

Unnamed: 0,id,tip_amount,VendorID_2,VendorID_1,passenger_count,trip_distance,RatecodeID_one,RatecodeID_two,RatecodeID_five,RatecodeID_three,RatecodeID_four,store_and_fwd_flag_N,store_and_fwd_flag_Y,PULocationID_OTHER,PULocationID_236,PULocationID_186,PULocationID_161,PULocationID_230,PULocationID_237,PULocationID_234,PULocationID_162,PULocationID_48,PULocationID_170,DOLocationID_236,DOLocationID_OTHER,DOLocationID_186,DOLocationID_48,DOLocationID_230,DOLocationID_170,DOLocationID_161,DOLocationID_234,DOLocationID_237,DOLocationID_162,payment_type_one,payment_type_two,payment_type_three,payment_type_four,fare_amount,extra,mta_tax,tolls_amount,improvement_surcharge,total_amount,trip_seconds,pickup_hour,pickup_min,pickup_sec,dropoff_hour,dropoff_min,dropoff_sec,trip_day
0,1,1.96,1.0,0.0,0.2,0.04938,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.680321,0.0,1.0,0.0,0.0,0.537155,0.006259,0.521739,0.169492,0.0,0.521739,0.338983,0.0,0.166667
1,11,1.0,1.0,0.0,0.0,0.072002,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.700391,0.222222,1.0,0.0,0.0,0.560787,0.006259,0.782609,0.20339,0.0,0.782609,0.372881,0.0,0.166667
2,13,0.0,0.0,1.0,0.0,0.024325,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.690859,0.222222,1.0,0.0,0.0,0.537656,0.009736,0.782609,0.474576,0.0,0.782609,0.728814,0.0,0.3
3,14,0.0,1.0,0.0,0.0,0.075894,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.720873,0.0,1.0,0.0,0.0,0.566045,0.009736,0.478261,0.067797,0.0,0.478261,0.322034,0.0,0.333333
4,15,2.36,1.0,0.0,0.0,0.051569,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.690859,0.222222,1.0,0.0,0.0,0.564592,0.00765,0.695652,0.745763,0.0,0.695652,0.949153,0.0,0.633333


In [3]:
# Split each frame into a target vector (y_*) and a feature matrix (X_*).
target_col = 'tip_amount'
non_feature_cols = ['id', target_col]

y_train = X_train_scaled[target_col]
y_val = X_val_scaled[target_col]

# Rebind to new frames instead of dropping in place, so the frames produced by
# the load cell are not mutated behind the reader's back.
# Re-running this cell without re-running the load cell still raises KeyError,
# because 'tip_amount' is no longer a column of the rebound frames.
# NOTE(review): 'total_amount' stays in the feature set; if it includes the tip
# it leaks the target into the features - confirm against the data dictionary.
X_train_scaled = X_train_scaled.drop(columns=non_feature_cols)
X_val_scaled = X_val_scaled.drop(columns=non_feature_cols)

### Feature selection

In [4]:
# Fit an L1-penalised linear model and use it to pick features.
#   alpha - penalty strength: the larger it is, the fewer features survive.
#   sel_  - keeps the features whose fitted Lasso coefficient is non-zero.
lasso = Lasso(random_state=1, alpha=0.005)

# fit() returns the selector itself, so construction and fitting can be chained.
sel_ = SelectFromModel(estimator=lasso).fit(X_train_scaled, y_train)

# Display the fitted selector and its configuration.
sel_

SelectFromModel(estimator=Lasso(alpha=0.005, copy_X=True, fit_intercept=True,
                                max_iter=1000, normalize=False, positive=False,
                                precompute=False, random_state=1,
                                selection='cyclic', tol=0.0001,
                                warm_start=False),
                max_features=None, norm_order=1, prefit=False, threshold=None)

In [5]:
# Boolean mask aligned with the feature columns: True where the selector kept the feature.
support_mask = sel_.get_support()

# Index of the column names retained by the selector.
selected_features = X_train_scaled.columns[support_mask]

selected_features

Index(['trip_distance', 'RatecodeID_one', 'DOLocationID_OTHER',
       'payment_type_one', 'tolls_amount', 'total_amount', 'pickup_hour',
       'trip_day'],
      dtype='object')

In [6]:
# Report how many feature columns were available and how many were kept.
print(
    f'Total features: {X_train_scaled.shape[1]}',
    f'Selected features: {len(selected_features)}',
    sep='\n',
)

Total features: 49
Selected features: 8


In [7]:
# Persist only the names of the selected features (not the data itself).
# The Series is built once; the earlier stand-alone `pd.Series(...)` expression
# was discarded without being displayed or used.
selected_features_series = pd.Series(selected_features)

# index=False omits the positional index so the file holds just the feature names.
# The file is written relative to the current working directory.
selected_features_series.to_csv('selected_features.csv', index=False)