In [22]:
import pandas as pd             
import numpy as np

import seaborn as sns
# Global seaborn theme for every figure in the notebook: white style, color codes enabled.
sns.set(style="white", color_codes=True)
# Request 24pt base font, axes-title and axes-label sizes plus a sans font family.
# NOTE(review): set_context may only honor size/width scaling keys, in which case
# "font.family" is silently dropped here -- verify it actually takes effect.
sns.set_context(rc={"font.family":'sans',"font.size":24,"axes.titlesize":24,"axes.labelsize":24})

import matplotlib.pyplot as plt
# IPython magic: render matplotlib figures inline in the cell output.
%matplotlib inline

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings from the
# libraries imported in this cell; consider narrowing it by category or module.
warnings.filterwarnings("ignore")

import sklearn
from sklearn import linear_model
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_regression
from sklearn.model_selection import cross_val_score

In [2]:
# Load the imputed training set from the notebook-relative data directory.
df = pd.read_csv(filepath_or_buffer='./data/train_imputed.csv')

In [3]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(523021, 45)

In [4]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,StoreID,Date,IsHoliday,IsOpen,HasPromotions,NearestCompetitor,Region,NumberOfCustomers,NumberOfSales,Region_AreaKM2,...,Max_VisibilityKm,Min_VisibilitykM,Mean_VisibilityKm,Hyper_Market,Shopping_Center,Standard_Market,Super_Market,General,With_Fish_Department,With_Non-Food_Department
0,1000,2016-01-03,0,1,0,326,7,495,5676,9643,...,19.0,6.0,11.0,1,0,0,0,1,0,0
1,1000,2016-02-03,0,1,0,326,7,608,8111,9643,...,23.0,10.0,13.0,1,0,0,0,1,0,0
2,1000,2016-04-03,0,1,0,326,7,665,8300,9643,...,31.0,8.0,11.0,1,0,0,0,1,0,0
3,1000,2016-05-03,0,1,0,326,7,630,7154,9643,...,31.0,10.0,15.0,1,0,0,0,1,0,0
4,1000,2016-06-03,0,0,0,326,7,0,0,9643,...,31.0,5.0,12.0,1,0,0,0,1,0,0


In [5]:
# Column labels of the loaded frame.
df.columns

Index(['StoreID', 'Date', 'IsHoliday', 'IsOpen', 'HasPromotions',
       'NearestCompetitor', 'Region', 'NumberOfCustomers', 'NumberOfSales',
       'Region_AreaKM2', 'Region_GDP', 'Region_PopulationK', 'Max_Dew_PointC',
       'Max_Gust_SpeedKm_h', 'Max_Humidity', 'Max_Sea_Level_PressurehPa',
       'Max_TemperatureC', 'Max_Wind_SpeedKm_h', 'Mean_Dew_PointC',
       'Mean_Humidity', 'Mean_Sea_Level_PressurehPa', 'Mean_TemperatureC',
       'Mean_Wind_SpeedKm_h', 'Min_Dew_PointC', 'Min_Humidity',
       'Min_Sea_Level_PressurehPa', 'Min_TemperatureC', 'Precipitationmm',
       'WindDirDegrees', 'CloudCover', 'Fog', 'Hail', 'Rain', 'Snow',
       'Thunderstorm', 'Max_VisibilityKm', 'Min_VisibilitykM',
       'Mean_VisibilityKm', 'Hyper_Market', 'Shopping_Center',
       'Standard_Market', 'Super_Market', 'General', 'With_Fish_Department',
       'With_Non-Food_Department'],
      dtype='object')

In [6]:
# Set the sales column aside as the regression target while it is still part of df.
target = df.loc[:, 'NumberOfSales']

In [7]:
# Drop the Date column, the target (NumberOfSales) and WindDirDegrees so that the
# remaining columns of df form the feature matrix.
# The labels are passed as a plain list: indexing df with them first would copy
# three full columns only to read their names back.
# NOTE: this rebinds `df`, so re-running the cell without reloading the CSV
# raises KeyError because the columns are already gone.
df = df.drop(['Date', 'NumberOfSales', 'WindDirDegrees'], axis=1)

# Feature Selection

In [68]:
# Feature Extraction with RFE
# Inputs for recursive feature elimination: X is the feature frame (df after the
# column drop), Y is the NumberOfSales target saved earlier.
# RFE itself is imported in the notebook's top import cell together with the
# other sklearn.feature_selection names.
# NOTE(review): `array` is not read by the RFE cells below; it is kept in case
# later cells rely on the raw values.
array = df.values
X = df
Y = target

In [69]:
# Recursive feature elimination around ordinary least squares, keeping 10 features.
# Both arguments are passed by keyword: scikit-learn >= 1.0 makes
# n_features_to_select keyword-only, so the positional form raises TypeError there.
# NOTE(review): with a linear model RFE ranks features by the magnitude of the
# fitted coefficients, which depends on each column's scale. X is not
# standardized before this call -- confirm the ranking is meaningful or scale
# the features first.
rfe = RFE(estimator=linear_model.LinearRegression(), n_features_to_select=10)
fit = rfe.fit(X, Y)  # fit() returns the fitted selector, so `fit` and `rfe` are the same object

In [70]:
# Boolean mask over the columns of X: True where RFE kept the feature.
feature_mask = fit.get_support(indices=False)
feature_mask

array([False,  True,  True,  True, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False,  True,
        True,  True,  True,  True,  True,  True], dtype=bool)

In [71]:
# Translate the boolean mask into the labels of the retained columns.
feature_selected = df.columns[feature_mask]
feature_selected

Index(['IsHoliday', 'IsOpen', 'HasPromotions', 'Hyper_Market',
       'Shopping_Center', 'Standard_Market', 'Super_Market', 'General',
       'With_Fish_Department', 'With_Non-Food_Department'],
      dtype='object')

In [72]:
# Preview the selected feature columns. head() bounds the rendered output
# instead of asking the notebook to display every row of the frame.
df[feature_selected].head(10)

Unnamed: 0,IsHoliday,IsOpen,HasPromotions,Hyper_Market,Shopping_Center,Standard_Market,Super_Market,General,With_Fish_Department,With_Non-Food_Department
0,0,1,0,1,0,0,0,1,0,0
1,0,1,0,1,0,0,0,1,0,0
2,0,1,0,1,0,0,0,1,0,0
3,0,1,0,1,0,0,0,1,0,0
4,0,0,0,1,0,0,0,1,0,0
5,0,1,1,1,0,0,0,1,0,0
6,0,1,1,1,0,0,0,1,0,0
7,0,1,1,1,0,0,0,1,0,0
8,0,1,1,1,0,0,0,1,0,0
9,0,1,1,1,0,0,0,1,0,0
