In [1]:
import pandas as pd             
import numpy as np
from sklearn.preprocessing import Imputer

import seaborn as sns
# Global seaborn theme: white background; color_codes=True is passed through to seaborn.
sns.set(style="white", color_codes=True)

# Font/size overrides handed to seaborn's plotting context.
font_overrides = {
    "font.family": 'sans',
    "font.size": 24,
    "axes.titlesize": 24,
    "axes.labelsize": 24,
}
sns.set_context(rc=font_overrides)

import matplotlib.pyplot as plt
%matplotlib inline

import warnings

# Blanket filter: every warning raised after this point is suppressed.
warnings.filterwarnings(action="ignore")

# Raw training set; the following cells read it through `train_data`.
train_data = pd.read_csv(filepath_or_buffer='./data/train.csv')

In [2]:
# Preview the first five rows of the raw training data.
train_data.head(n=5)

Unnamed: 0,StoreID,Date,IsHoliday,IsOpen,HasPromotions,StoreType,AssortmentType,NearestCompetitor,Region,NumberOfCustomers,...,Mean_TemperatureC,Mean_VisibilityKm,Mean_Wind_SpeedKm_h,Min_Dew_PointC,Min_Humidity,Min_Sea_Level_PressurehPa,Min_TemperatureC,Min_VisibilitykM,Precipitationmm,WindDirDegrees
0,1000,01/03/2016,0,1,0,Hyper Market,General,326,7,495,...,1,11.0,16,-2,70,1029,1,6.0,0.0,23
1,1000,02/03/2016,0,1,0,Hyper Market,General,326,7,608,...,3,13.0,10,-2,58,1025,1,10.0,0.0,56
2,1000,04/03/2016,0,1,0,Hyper Market,General,326,7,665,...,3,11.0,10,-3,55,1023,2,8.0,0.0,22
3,1000,05/03/2016,0,1,0,Hyper Market,General,326,7,630,...,3,15.0,10,-6,25,1022,-1,10.0,0.0,108
4,1000,06/03/2016,0,0,0,Hyper Market,General,326,7,0,...,1,12.0,5,-6,48,1022,-5,5.0,0.0,46


Percentage of rows where CloudCover is null

In [3]:
# Count the rows whose CloudCover value is missing ...
nCloudCover = int(train_data["CloudCover"].isnull().sum())
# ... and express that count as a percentage of all rows.
n = train_data.shape[0]
nCloudCover / n * 100

7.8736800243202465

# Imputation

We copy the dataframe so that the transformations below do not modify the original data.

In [5]:
# Work on an independent (deep) copy so `train_data` itself is left untouched.
train_data_imputed = train_data.copy(deep=True)

## Events

Let's transform the Events variable into dummy columns. The values that the attribute can take are:

In [6]:
# Frequency of each distinct Events string; missing values are not counted (dropna=True).
train_data_imputed["Events"].value_counts(dropna=True)

Rain                           204733
Fog                             67797
Fog-Rain                        50908
Rain-Thunderstorm               21988
Rain-Snow                       18176
Snow                            15433
Fog-Rain-Thunderstorm            8567
Fog-Rain-Snow                    3225
Fog-Snow                         2857
Thunderstorm                     1485
Rain-Hail                        1331
Rain-Snow-Hail                    629
Rain-Hail-Thunderstorm            283
Fog-Rain-Snow-Hail                231
Rain-Snow-Thunderstorm            220
Snow-Hail                         194
Rain-Snow-Hail-Thunderstorm       193
Fog-Rain-Hail-Thunderstorm        178
Fog-Snow-Hail                     169
Fog-Rain-Hail                     169
Fog-Thunderstorm                  157
Name: Events, dtype: int64

In [7]:
# Column count before the Events column is expanded into indicator columns.
n_attributes_before = train_data_imputed.shape[1]
print("Number of attributes before making the dummies: " + str(n_attributes_before))

Number of attributes before making the dummies: 36


In [8]:
# Split each Events string on '-' and build one 0/1 indicator column per distinct token.
events_column = train_data_imputed['Events']
dummies_event = events_column.str.get_dummies(sep='-')

Let's replace the Events column with the new columns Fog, Hail, Rain, Snow and Thunderstorm

In [9]:
# The raw Events column is no longer needed once its indicator columns exist.
train_data_imputed = train_data_imputed.drop(columns=['Events'])

In [10]:
# Append the Events indicator columns to the right of the existing columns.
train_data_imputed = pd.concat(
    [train_data_imputed, dummies_event],
    axis='columns',
)

In [11]:
# Column count after Events has been replaced by its indicator columns.
n_attributes_after = train_data_imputed.shape[1]
print("Number of attributes after making the dummies: " + str(n_attributes_after))

Number of attributes after making the dummies: 40


## Date

In [13]:
# Dates in train.csv are written day-first ('06/03/2016' is 6 March 2016: the
# preview rows run 01/03, 02/03, 04/03, ... on consecutive lines). Without an
# explicit format pandas reads ambiguous values month-first, which swaps day and
# month for every day <= 12, so the format is spelled out here.
train_data_imputed['Date'] = pd.to_datetime(train_data_imputed['Date'], format='%d/%m/%Y')

In [14]:
# Build the datetime index once and read the calendar parts from it.
date_index = pd.DatetimeIndex(train_data_imputed['Date'])
for column_name, part in (('Day', 'day'), ('Month', 'month'), ('Year', 'year')):
    train_data_imputed[column_name] = getattr(date_index, part)

# The original Date column is replaced by its Day/Month/Year parts.
train_data_imputed = train_data_imputed.drop(columns=['Date'])

## StoreType

In [15]:
# One 0/1 indicator column per StoreType value (the `dummies_event` name is reused here).
dummies_event = train_data_imputed['StoreType'].str.get_dummies()

# Swap the categorical column for its indicator columns.
train_data_imputed = pd.concat(
    [train_data_imputed.drop(columns=['StoreType']), dummies_event],
    axis=1,
)

In [16]:
# Collapse every run of whitespace in a column name into a single underscore
# (e.g. a dummy column created from 'Hyper Market' becomes 'Hyper_Market').
# The pattern is a raw string and regex=True is explicit: newer pandas versions
# treat the pattern as a literal string by default, which would silently leave
# the names unchanged.
train_data_imputed.columns = train_data_imputed.columns.str.replace(r'\s+', '_', regex=True)

## AssortmentType

In [17]:
# One 0/1 indicator column per AssortmentType value (the `dummies_event` name is reused here).
dummies_event = train_data_imputed['AssortmentType'].str.get_dummies()

# Swap the categorical column for its indicator columns.
train_data_imputed = pd.concat(
    [train_data_imputed.drop(columns=['AssortmentType']), dummies_event],
    axis=1,
)

# Replacing spaces with underscores

In [18]:
# Collapse every run of whitespace in a column name into a single underscore,
# covering the indicator columns just created from AssortmentType.
# The pattern is a raw string and regex=True is explicit: newer pandas versions
# treat the pattern as a literal string by default, which would silently leave
# the names unchanged.
train_data_imputed.columns = train_data_imputed.columns.str.replace(r'\s+', '_', regex=True)

## Converting temperature attributes from Celsius to Kelvin

In [19]:
# Temperature and dew-point columns (Celsius) that get shifted in the next cell.
toChange = [
    'Max_Dew_PointC',
    'Max_TemperatureC',
    'Mean_Dew_PointC',
    'Mean_TemperatureC',
    'Min_Dew_PointC',
    'Min_TemperatureC',
]

In [20]:
# Shift the selected Celsius columns by a whole-number offset of 273
# (note: not the exact 273.15 Celsius-to-Kelvin offset).
train_data_imputed[toChange] = train_data_imputed[toChange] + 273

# Mission Accomplished

In [21]:
# Persist the transformed frame; index=False keeps the row index out of the file.
train_data_imputed.to_csv(
    path_or_buf='./data/train_one_hot.csv',
    index=False,
)