In [1]:
import pandas as pd             
import numpy as np
from sklearn.preprocessing import Imputer

import seaborn as sns
# Global seaborn styling: "white" style with colour codes enabled, plus
# size-24 fonts for body text, axes titles and axes labels.
sns.set(color_codes=True, style="white")
sns.set_context(
    rc={
        "font.family": 'sans',
        "font.size": 24,
        "axes.titlesize": 24,
        "axes.labelsize": 24,
    }
)

import matplotlib.pyplot as plt
%matplotlib inline

import warnings 
# Suppress all warnings for the rest of the session, then load the raw
# training set from the local data directory.
warnings.filterwarnings(action="ignore")
train_data = pd.read_csv(filepath_or_buffer='./data/train.csv')

In [2]:
# Preview the first five rows of the raw data.
train_data.head(n=5)

Unnamed: 0,StoreID,Date,IsHoliday,IsOpen,HasPromotions,StoreType,AssortmentType,NearestCompetitor,Region,NumberOfCustomers,...,Mean_TemperatureC,Mean_VisibilityKm,Mean_Wind_SpeedKm_h,Min_Dew_PointC,Min_Humidity,Min_Sea_Level_PressurehPa,Min_TemperatureC,Min_VisibilitykM,Precipitationmm,WindDirDegrees
0,1000,01/03/2016,0,1,0,Hyper Market,General,326,7,495,...,1,11.0,16,-2,70,1029,1,6.0,0.0,23
1,1000,02/03/2016,0,1,0,Hyper Market,General,326,7,608,...,3,13.0,10,-2,58,1025,1,10.0,0.0,56
2,1000,04/03/2016,0,1,0,Hyper Market,General,326,7,665,...,3,11.0,10,-3,55,1023,2,8.0,0.0,22
3,1000,05/03/2016,0,1,0,Hyper Market,General,326,7,630,...,3,15.0,10,-6,25,1022,-1,10.0,0.0,108
4,1000,06/03/2016,0,0,0,Hyper Market,General,326,7,0,...,1,12.0,5,-6,48,1022,-5,5.0,0.0,46


Percentage of rows where CloudCover is null

In [3]:
# Percentage of rows whose CloudCover value is missing.
n = train_data.shape[0]
# Count the nulls directly on the boolean mask; int() keeps the result a plain
# Python int so the displayed percentage is a plain Python float.
nCloudCover = int(train_data["CloudCover"].isnull().sum())
nCloudCover / n * 100

7.8736800243202465

# Imputation

We work on a copy of the dataframe so that the imputation does not modify the original data.

In [4]:
# Independent (deep) copy: all imputation below is applied to this frame,
# leaving train_data untouched.
train_data_imputed = train_data.copy(deep=True)

## CloudCover

In [5]:
# Fill missing CloudCover values with the median of the observed values.
# Done with pandas rather than sklearn.preprocessing.Imputer, which was
# deprecated in scikit-learn 0.20 and removed in 0.22.
# Series.median() skips NaN by default, so only observed values contribute.
cloud_cover_median = train_data_imputed['CloudCover'].median()
# astype(float) keeps the float dtype the previous imputer output had.
cloud_cover_filled = train_data_imputed['CloudCover'].fillna(cloud_cover_median).astype(float)
# Let's drop the column that we don't need anymore
train_data_imputed = train_data_imputed.drop('CloudCover', axis=1)
# Let's add the new imputed column (re-appended last, so the column order of
# the frame stays the same as before this change)
train_data_imputed['CloudCover'] = cloud_cover_filled

Let's check that there's no null value in CloudCover

In [6]:
# Number of CloudCover values still missing after imputation.
train_data_imputed['CloudCover'].isna().sum()

0

## Events

Let's transform the Events variable into dummy (indicator) columns. The values that the attribute can take, with their frequencies, are:

In [7]:
# Frequency of each distinct Events value, most frequent first.
train_data_imputed["Events"].value_counts(sort=True, ascending=False)

Rain                           204733
Fog                             67797
Fog-Rain                        50908
Rain-Thunderstorm               21988
Rain-Snow                       18176
Snow                            15433
Fog-Rain-Thunderstorm            8567
Fog-Rain-Snow                    3225
Fog-Snow                         2857
Thunderstorm                     1485
Rain-Hail                        1331
Rain-Snow-Hail                    629
Rain-Hail-Thunderstorm            283
Fog-Rain-Snow-Hail                231
Rain-Snow-Thunderstorm            220
Snow-Hail                         194
Rain-Snow-Hail-Thunderstorm       193
Fog-Rain-Hail-Thunderstorm        178
Fog-Snow-Hail                     169
Fog-Rain-Hail                     169
Fog-Thunderstorm                  157
Name: Events, dtype: int64

In [8]:
# Column count before the Events column is expanded into dummies.
print("Number of attributes before making the dummies: " + str(len(train_data_imputed.columns)))

Number of attributes before making the dummies: 36


In [9]:
# Split each Events string on '-' and build one indicator column per
# distinct event name.
events_column = train_data_imputed['Events']
dummies_event = events_column.str.get_dummies(sep='-')

Let's replace the Events column with the new indicator columns: Fog, Hail, Rain, Snow, Thunderstorm.

In [10]:
# Remove the original Events column from the working frame.
train_data_imputed = train_data_imputed.drop(labels=['Events'], axis=1)

In [11]:
# Append the event indicator columns to the right of the existing columns.
train_data_imputed = pd.concat(
    [train_data_imputed, dummies_event],
    axis="columns",
)

In [12]:
# Column count after the dummies have been appended.
print("Number of attributes after making the dummies: " + str(len(train_data_imputed.columns)))

Number of attributes after making the dummies: 40


## Max_Gust_SpeedKm_h

We assume that when Max_Gust_SpeedKm_h is null, no gust happened. The number of null values for Max_Gust_SpeedKm_h is:

In [13]:
# Number of missing Max_Gust_SpeedKm_h values before filling.
train_data_imputed["Max_Gust_SpeedKm_h"].isna().sum()

409947

In [14]:
# A missing gust reading is treated as "no gust": replace nulls with 0.
# assign() swaps the column in place, so the column order is unchanged.
train_data_imputed = train_data_imputed.assign(
    Max_Gust_SpeedKm_h=train_data_imputed["Max_Gust_SpeedKm_h"].fillna(0)
)

Checking that the fill was applied correctly (no null values should remain):

In [15]:
# Number of Max_Gust_SpeedKm_h values still missing after the fill.
train_data_imputed["Max_Gust_SpeedKm_h"].isna().sum()

0

## VisibilitykM

In [16]:
# The three visibility columns to impute together. The names are spelled
# exactly as in the dataset header, including the 'kM' casing of the Min column.
visibilitykM = [
    'Max_VisibilityKm',
    'Min_VisibilitykM',
    'Mean_VisibilityKm',
]

In [17]:
# Fill missing visibility readings with each column's own mean.
# Done with pandas rather than sklearn.preprocessing.Imputer, which was
# deprecated in scikit-learn 0.20 and removed in 0.22.
# DataFrame.mean() skips NaN by default, so only observed values contribute.
visibility_values = train_data_imputed[visibilitykM].astype(float)
# Keep the result as a 2-D float array named imputer_array: the next cell
# rebuilds the columns from it.
imputer_array = visibility_values.fillna(visibility_values.mean()).values
# Let's drop the columns that we don't need anymore
train_data_imputed = train_data_imputed.drop(visibilitykM, axis=1)

In [18]:
# Rebuild the visibility columns from the imputed array and append them.
# The new frame reuses train_data_imputed's row labels: pd.concat(axis=1)
# aligns on index labels, so a fresh 0..n-1 index would misalign rows (or
# add NaN rows) whenever the working frame's index is not the default range.
df_imputed = pd.DataFrame(
    imputer_array,
    columns=visibilitykM,
    index=train_data_imputed.index,
)
train_data_imputed = pd.concat([train_data_imputed, df_imputed], axis=1)

Checking that all values are imputed:

In [19]:
# Null count per column; every entry should be 0 once imputation is complete.
unknown_per_columns = train_data_imputed.isna().sum()
unknown_per_columns

StoreID                       0
Date                          0
IsHoliday                     0
IsOpen                        0
HasPromotions                 0
StoreType                     0
AssortmentType                0
NearestCompetitor             0
Region                        0
NumberOfCustomers             0
NumberOfSales                 0
Region_AreaKM2                0
Region_GDP                    0
Region_PopulationK            0
Max_Dew_PointC                0
Max_Gust_SpeedKm_h            0
Max_Humidity                  0
Max_Sea_Level_PressurehPa     0
Max_TemperatureC              0
Max_Wind_SpeedKm_h            0
Mean_Dew_PointC               0
Mean_Humidity                 0
Mean_Sea_Level_PressurehPa    0
Mean_TemperatureC             0
Mean_Wind_SpeedKm_h           0
Min_Dew_PointC                0
Min_Humidity                  0
Min_Sea_Level_PressurehPa     0
Min_TemperatureC              0
Precipitationmm               0
WindDirDegrees                0
CloudCov

In [20]:
# Persist the imputed frame. index=True is the to_csv default and is spelled
# out here because it writes the row index as an extra, unnamed first column.
train_data_imputed.to_csv(path_or_buf='./data/train_imputed.csv', index=True)

# Mission Accomplished

In [None]:
# Show the first 100 rows of the raw (un-imputed) frame.
# NOTE(review): this cell was never executed and displays train_data rather
# than train_data_imputed — looks like leftover scratch; confirm it is needed.
train_data.head(n=100)