# Feature Selection Strategy

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
df = pd.read_csv("../data/numeric_feature_table.csv")
df.head()

Unnamed: 0,id,item_nbr,store_nbr,Year,month,prev_month_sales,prev_3month_sales,avg_prev_3months_sales,min_prev_3months_sales,max_prev_3months_sales,...,cluster9,cluster10,cluster11,cluster12,cluster13,cluster14,cluster15,cluster16,cluster17,total_unit_sales
0,1,1003679,1,2013,4,3.850148,3.988984,4.071872,3.78419,4.454347,...,0,0,0,0,1,0,0,0,0,3.806662
1,2,1003679,1,2013,5,3.806662,4.465908,3.799228,3.78419,3.828641,...,0,0,0,0,1,0,0,0,0,3.806662
2,3,1003679,1,2013,6,3.806662,3.850148,3.806662,3.78419,3.850148,...,0,0,0,0,1,0,0,0,0,3.871201
3,4,1003679,1,2013,7,3.871201,3.806662,3.688879,3.367296,3.850148,...,0,0,0,0,1,0,0,0,0,3.401197
4,5,1003679,1,2013,8,3.401197,3.806662,3.637586,3.367296,3.850148,...,0,0,0,0,1,0,0,0,0,3.663562


In [3]:
#df['rejected'] = df['rejected'].astype('category')
#df['condition'] = df['condition'].astype('category')

df.dtypes

id                                 int64
item_nbr                           int64
store_nbr                          int64
Year                               int64
month                              int64
prev_month_sales                 float64
prev_3month_sales                float64
avg_prev_3months_sales           float64
min_prev_3months_sales           float64
max_prev_3months_sales           float64
monthly_oilprice_avg             float64
total_month_holidays               int64
total_month_transactions           int64
prev_month_transactions            int64
prev_3month_transactions           int64
avg_prev_3months_transactions    float64
min_prev_3months_transactions      int64
max_prev_3months_transactions      int64
BREAD.BAKERY                       int64
BEVERAGES                          int64
DAIRY                              int64
PERSONAL.CARE                      int64
HOME.CARE                          int64
FROZEN.FOODS                       int64
POULTRY         

In [4]:
df.shape

(1734047, 78)

In [5]:
varSel = pd.DataFrame({'Variable': df.columns[1:77]})
varSel

Unnamed: 0,Variable
0,item_nbr
1,store_nbr
2,Year
3,month
4,prev_month_sales
5,prev_3month_sales
6,avg_prev_3months_sales
7,min_prev_3months_sales
8,max_prev_3months_sales
9,monthly_oilprice_avg


## Multivariable Analysis

In [7]:
### drop na
##df = df.dropna()

In [6]:
## remove unnecessary vars
X = df.loc[:,df.columns[1:77]]
X.head()

Unnamed: 0,item_nbr,store_nbr,Year,month,prev_month_sales,prev_3month_sales,avg_prev_3months_sales,min_prev_3months_sales,max_prev_3months_sales,monthly_oilprice_avg,...,cluster8,cluster9,cluster10,cluster11,cluster12,cluster13,cluster14,cluster15,cluster16,cluster17
0,1003679,1,2013,4,3.850148,3.988984,4.071872,3.78419,4.454347,92.021364,...,0,0,0,0,0,1,0,0,0,0
1,1003679,1,2013,5,3.806662,4.465908,3.799228,3.78419,3.828641,94.509545,...,0,0,0,0,0,1,0,0,0,0
2,1003679,1,2013,6,3.806662,3.850148,3.806662,3.78419,3.850148,95.7725,...,0,0,0,0,0,1,0,0,0,0
3,1003679,1,2013,7,3.871201,3.806662,3.688879,3.367296,3.850148,104.670909,...,0,0,0,0,0,1,0,0,0,0
4,1003679,1,2013,8,3.401197,3.806662,3.637586,3.367296,3.850148,106.572727,...,0,0,0,0,0,1,0,0,0,0


In [7]:
 y = df.loc[:,df.columns[77]]
print([X.shape,y.shape])

[(1734047, 76), (1734047,)]


In [8]:
y

0          3.806662
1          3.806662
2          3.871201
3          3.401197
4          3.663562
5          3.912023
6          3.637586
7          4.043051
8          5.298317
9          3.828641
10         4.343805
11         4.262680
12         4.158883
13         3.663562
14         3.637586
15         3.713572
16         3.555348
17         3.828641
18         3.610918
19         5.913503
20         3.784190
21         4.262680
22         4.143135
23         4.343805
24         4.634729
25         4.454347
26         3.828641
27         3.784190
28         3.931826
29         3.761200
             ...   
1734017    5.666427
1734018    5.710427
1734019    5.755742
1734020    5.332719
1734021    5.634790
1734022    5.886104
1734023    6.001415
1734024    5.438079
1734025    5.823046
1734026    5.659482
1734027    5.398163
1734028    5.476464
1734029    5.407172
1734030    5.627621
1734031    5.393628
1734032    5.513429
1734033    4.574711
1734034    4.584967
1734035    4.795791


### Variable Selection using LASSO (L1 penalization)

In [9]:
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

lassomod = Lasso(alpha=0.05).fit(X, y)

  positive)


In [10]:
model = SelectFromModel(lassomod, prefit=True)
model.get_support()

array([False, False, False, False, False, False,  True,  True,  True,
        True, False,  True, False, False,  True, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False])

In [11]:
varSel['Lasso'] = model.get_support().astype('int64')
varSel

Unnamed: 0,Variable,Lasso
0,item_nbr,0
1,store_nbr,0
2,Year,0
3,month,0
4,prev_month_sales,0
5,prev_3month_sales,0
6,avg_prev_3months_sales,1
7,min_prev_3months_sales,1
8,max_prev_3months_sales,1
9,monthly_oilprice_avg,1


### Variable Selection using Random Forest

In [12]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel

rfmod = RandomForestRegressor().fit(X, y)
#rfmod.feature_importances_ 



In [13]:
model = SelectFromModel(rfmod, prefit=True)
model.get_support()

array([False, False, False, False,  True, False,  True,  True, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False])

In [14]:
varSel['RandomForest'] = model.get_support().astype('int64')
varSel

Unnamed: 0,Variable,Lasso,RandomForest
0,item_nbr,0,0
1,store_nbr,0,0
2,Year,0,0
3,month,0,0
4,prev_month_sales,0,1
5,prev_3month_sales,0,0
6,avg_prev_3months_sales,1,1
7,min_prev_3months_sales,1,1
8,max_prev_3months_sales,1,0
9,monthly_oilprice_avg,1,0


### Variable Selection using Gradient Boosting classification

In [15]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.feature_selection import SelectFromModel

gbmod = GradientBoostingRegressor().fit(X,y)

In [16]:
model = SelectFromModel(gbmod, prefit=True)
model.get_support()

array([False, False, False, False, False, False,  True,  True, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False])

In [17]:
varSel['GradientBoost'] = model.get_support().astype('int64')
varSel

Unnamed: 0,Variable,Lasso,RandomForest,GradientBoost
0,item_nbr,0,0,0
1,store_nbr,0,0,0
2,Year,0,0,0
3,month,0,0,0
4,prev_month_sales,0,1,0
5,prev_3month_sales,0,0,0
6,avg_prev_3months_sales,1,1,1
7,min_prev_3months_sales,1,1,1
8,max_prev_3months_sales,1,0,0
9,monthly_oilprice_avg,1,0,0


### Variable Selection using SVM classification

In [None]:
from sklearn import svm
from sklearn.feature_selection import SelectFromModel

svmmod = svm.SVR( C=1, gamma=0.1).fit(X, y)

In [None]:
model = SelectFromModel(svmmod, prefit=True)
model.get_support()

In [None]:
varSel['SVM'] = model.get_support().astype('int64')
varSel

# Ada Boost

In [16]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.feature_selection import SelectFromModel

ABRmod = AdaBoostRegressor(random_state=1).fit(X,y)

In [18]:
model = SelectFromModel(ABRmod, prefit=True)
model.get_support()

array([False,  True,  True,  True,  True,  True,  True, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False])

In [19]:
varSel['Ada Boost'] = model.get_support().astype('int64')
varSel

Unnamed: 0,Variable,Lasso,RandomForest,GradientBoost,Ada Boost
0,Year,1,0,0,0
1,month,1,0,0,1
2,prev_month_sales,1,1,0,1
3,prev_3month_sales,1,0,0,1
4,avg_prev_3months_sales,1,1,1,1
5,min_prev_3months_sales,1,1,1,1
6,max_prev_3months_sales,1,1,1,1
7,monthly_oilprice_avg,1,0,0,0
8,total_month_holidays,1,0,0,0
9,total_month_transactions,1,0,0,0


## KNN

In [34]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.feature_selection import SelectFromModel
neigh = KNeighborsRegressor(n_neighbors=5).fit(X, y)

In [35]:
from sklearn.feature_selection import SelectFromModel
model = SelectFromModel(neigh, prefit=True)
model.get_support()

ValueError: The underlying estimator KNeighborsRegressor has no `coef_` or `feature_importances_` attribute. Either pass a fitted estimator to SelectFromModel or call fit before calling transform.

In [None]:
varSel['KNN'] = model.get_support().astype('int64')
varSel

## Ridge L2 regularization

In [36]:
from sklearn.linear_model import Ridge
from sklearn.feature_selection import SelectFromModel
Rdg = Ridge(alpha=0.5).fit(X,y)

  overwrite_a=True).T


In [37]:
model = SelectFromModel(Rdg, prefit=True)
model.get_support()

array([False, False, False, False,  True, False, False, False, False,
       False, False, False, False, False, False, False,  True, False,
        True, False, False,  True,  True,  True, False,  True, False,
       False, False, False, False, False, False, False,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False,  True,  True, False,  True,  True, False,  True,
       False, False, False, False, False, False, False, False, False,
        True, False, False, False, False, False, False, False, False,
       False, False])

In [38]:
varSel['Ridge'] = model.get_support().astype('int64')
varSel

Unnamed: 0,Variable,Lasso,RandomForest,GradientBoost,Ada Boost,Ridge
0,Year,1,0,0,0,0
1,month,1,0,0,1,0
2,prev_month_sales,1,1,0,1,0
3,prev_3month_sales,1,0,0,1,0
4,avg_prev_3months_sales,1,1,1,1,1
5,min_prev_3months_sales,1,1,1,1,0
6,max_prev_3months_sales,1,1,1,1,0
7,monthly_oilprice_avg,1,0,0,0,0
8,total_month_holidays,1,0,0,0,0
9,total_month_transactions,1,0,0,0,0


## Decision tree

In [39]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.feature_selection import SelectFromModel
dtree= DecisionTreeRegressor(random_state=2).fit(X,y)

In [40]:
model = SelectFromModel(dtree, prefit=True)
model.get_support()

array([False, False,  True, False,  True,  True,  True, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False])

In [41]:
varSel['dtree'] = model.get_support().astype('int64')
varSel

Unnamed: 0,Variable,Lasso,RandomForest,GradientBoost,Ada Boost,Ridge,dtree
0,Year,1,0,0,0,0,0
1,month,1,0,0,1,0,0
2,prev_month_sales,1,1,0,1,0,1
3,prev_3month_sales,1,0,0,1,0,0
4,avg_prev_3months_sales,1,1,1,1,1,1
5,min_prev_3months_sales,1,1,1,1,0,1
6,max_prev_3months_sales,1,1,1,1,0,1
7,monthly_oilprice_avg,1,0,0,0,0,0
8,total_month_holidays,1,0,0,0,0,0
9,total_month_transactions,1,0,0,0,0,0


## XG boost

In [None]:
from sklearn import svm
from sklearn.feature_selection import SelectFromModel
clf = svm.SVR(C=1, gamma=0.1).fit(X,y)

### Summarization and Selection of Variables 

In [1]:
varSel['Sum'] =  np.sum(varSel,axis=1)
varSel

NameError: name 'np' is not defined

In [133]:
varSel.groupby('Sum')['Variable'].count()


Sum
0     3
1     4
2     4
3     8
4    10
5     7
Name: Variable, dtype: int64

We can now decide a threshold for selecting our variables!