# Feature Selection

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

from sklearn.preprocessing import MinMaxScaler

In [2]:
# Read the train and test splits from disk. Both frames still carry the
# 'Id' and 'SalePrice' columns at this point.
train_csv = 'data/xtrain.csv'
test_csv = 'data/xtest.csv'
X_train = pd.read_csv(train_csv)
X_test = pd.read_csv(test_csv)

# Preview the first rows of the training frame.
X_train.head()

Unnamed: 0,Id,SalePrice,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,...,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,LotFrontage_na,MasVnrArea_na,GarageYrBlt_na
0,931,12.21106,0.0,0.75,0.461171,0.377048,1.0,1.0,0.333333,1.0,...,0.75,1.0,0.0,0.545455,0.75,0.666667,0.75,0.0,0.0,0.0
1,657,11.887931,0.0,0.75,0.456066,0.399443,1.0,1.0,0.333333,0.333333,...,0.5,1.0,0.0,0.636364,0.5,0.666667,0.75,0.0,0.0,0.0
2,46,12.675764,0.588235,0.75,0.394699,0.347082,1.0,1.0,0.0,0.333333,...,0.75,1.0,0.0,0.090909,1.0,0.666667,0.75,0.0,0.0,0.0
3,1349,12.278393,0.0,0.75,0.388581,0.493677,1.0,1.0,0.666667,0.666667,...,0.75,1.0,0.0,0.636364,0.25,0.666667,0.75,1.0,0.0,0.0
4,56,12.103486,0.0,0.75,0.577658,0.402702,1.0,1.0,0.333333,0.333333,...,0.75,1.0,0.0,0.545455,0.5,0.666667,0.75,0.0,0.0,0.0


In [3]:
# Pull the target out of each split before it is removed from the features.
y_train, y_test = X_train['SalePrice'], X_test['SalePrice']

# Strip the identifier and the target from both feature frames. The frames
# are modified in place, so X_train / X_test keep pointing at the same objects.
# NOTE: because of the in-place drop this cell cannot be re-run without
# reloading the data first ('SalePrice' is gone after the first run).
for frame in (X_train, X_test):
    frame.drop(columns=['Id', 'SalePrice'], inplace=True)

In [4]:
# Feature selection with Lasso + SelectFromModel:
#   1. Lasso's L1 penalty drives the coefficients of some variables to zero.
#   2. SelectFromModel then prunes the variables whose coefficient is zero,
#      keeping only the features the fitted model actually relies on.
lasso = Lasso(alpha=0.005, random_state=0)
sel = SelectFromModel(estimator=lasso)

# Fit on the training split only; the fitted selector is echoed as cell output.
sel.fit(X_train, y_train)

SelectFromModel(estimator=Lasso(alpha=0.005, copy_X=True, fit_intercept=True,
                                max_iter=1000, normalize=False, positive=False,
                                precompute=False, random_state=0,
                                selection='cyclic', tol=0.0001,
                                warm_start=False),
                max_features=None, norm_order=1, prefit=False, threshold=None)

In [5]:
# Boolean mask with one entry per column of X_train, answering
# "should this particular column be kept?" (True = keep, False = prune).
keep_mask = sel.get_support()
keep_mask

array([ True,  True, False, False, False, False, False, False, False,
       False, False,  True, False, False, False, False,  True,  True,
       False,  True,  True, False, False, False,  True, False, False,
       False, False,  True, False,  True, False, False, False, False,
       False, False, False,  True,  True, False,  True, False, False,
        True,  True, False, False, False, False, False,  True, False,
       False,  True,  True,  True, False,  True,  True, False, False,
       False,  True, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False])

In [9]:
# Names of the columns the fitted selector keeps: index the column Index
# with the boolean support mask.
selected_feat = X_train.columns[sel.get_support()]

# Only the last expression of a cell is echoed by the notebook, so the
# before/after feature counts are reported explicitly with print().
print("The total number of features in the beginning was {}".format(X_train.shape[1]))
print("The total number of features selected by our pruner was {}".format(len(selected_feat)))

The total number of features in the beginning was 82
The total number of features selected by our pruner was 22


In [13]:
# The retained columns can be obtained in two ways:
#   * from the selector's mask:  X_train.columns[sel.get_support()]
#   * from the fitted Lasso's non-zero coefficients (shown below).
# NOTE: the two agree only as long as no non-zero coefficient is smaller in
# magnitude than the threshold SelectFromModel applies; per the scikit-learn
# docs that default is 1e-5 for L1-penalised estimators when threshold=None.
X_train.columns[sel.estimator_.coef_ != 0]

Index(['MSSubClass', 'MSZoning', 'Neighborhood', 'OverallQual', 'OverallCond',
       'YearRemodAdd', 'RoofStyle', 'MasVnrType', 'BsmtQual', 'BsmtExposure',
       'HeatingQC', 'CentralAir', '1stFlrSF', 'GrLivArea', 'BsmtFullBath',
       'KitchenQual', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageFinish', 'GarageCars', 'PavedDrive'],
      dtype='object')

In [15]:
# Persist the selected feature names, one per row and without the index
# column, so they can be reloaded outside this notebook.
selected_series = pd.Series(selected_feat)
selected_series.to_csv('data/selected_features.csv', index=False)