# Imputation of missing values

In [1]:
# Core data-handling libraries.
import pandas as pd             
import numpy as np

# Plotting setup: seaborn theme and font sizes are configured once here.
import seaborn as sns
sns.set(style="white", color_codes=True)
sns.set_context(rc={"font.family":'sans',"font.size":24,"axes.titlesize":24,"axes.labelsize":24})   

import matplotlib.pyplot as plt
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings from pandas/seaborn — confirm that is intended.
import warnings 
warnings.filterwarnings("ignore")

In [2]:
# Load the raw training set; the path is relative to the notebook's working directory.
train_data = pd.read_csv('./data/train.csv')

In [3]:
# Dimensions of the raw frame as (rows, columns).
train_data.shape

(523021, 36)

# Finding the variables that contain NaNs (these will need to be imputed)

In [4]:
# Per-column flag: True where the column holds at least one missing value.
#   isnull() marks every missing cell,
#   any()    collapses each column to a single boolean,
#   .values  (an attribute, not a call) exposes the result as a NumPy array.
areasWithNansMask = train_data.isnull().any().values

# Pick the matching labels out of the column-title array via boolean indexing.
areasWithNans = train_data.columns.values[areasWithNansMask]
print("Variable that will need to be imputed", areasWithNans)

Variable that will need to be imputed ['CloudCover' 'Events' 'Max_Gust_SpeedKm_h' 'Max_VisibilityKm'
 'Mean_VisibilityKm' 'Min_VisibilitykM']


# Counting the NaN and non-NaN values

In [5]:
def countNullLabels(column):
    """Count missing vs. present entries in a single column.

    Parameters
    ----------
    column : pandas.Series
        The column to inspect.

    Returns
    -------
    pandas.Series
        value_counts() of the column's null mask, with the index labelled
        "Null" / "Not Null" instead of True / False so the printout is
        easier to read.
    """
    return column.isnull().map({True: "Null", False: "Not Null"}).value_counts()


print("The variables and the count of null and not null \n")

# Columns to report on, listed explicitly so this cell only needs train_data.
columnsToCount = ['CloudCover', 'Events', 'Max_Gust_SpeedKm_h',
                  'Max_VisibilityKm', 'Mean_VisibilityKm', 'Min_VisibilitykM']

# One helper call per column replaces six copy-pasted statement pairs.
nullCountsByColumn = {}
for columnName in columnsToCount:
    nullCountsByColumn[columnName] = countNullLabels(train_data[columnName])
    print(columnName + " Null values:\n", nullCountsByColumn[columnName], "\n")

# Keep the original per-column result names bound, in case other cells refer to them.
(cloudCoverPrintList, eventsPrintList, maxGustPrintList,
 maxVisPrintList, meanVisPrintList, minVisPrintList) = (
    nullCountsByColumn[columnName] for columnName in columnsToCount
)

The variables and the count of null and not null 

CloudCover Null values:
 Not Null    481840
Null         41181
Name: CloudCover, dtype: int64 

Events Null values:
 Not Null    398923
Null        124098
Name: Events, dtype: int64 

Max_Gust_SpeedKm_h Null values:
 Null        409947
Not Null    113074
Name: Max_Gust_SpeedKm_h, dtype: int64 

Max_VisibilityKm Null values:
 Not Null    511683
Null         11338
Name: Max_VisibilityKm, dtype: int64 

Mean_VisibilityKm Null values:
 Not Null    511683
Null         11338
Name: Mean_VisibilityKm, dtype: int64 

Min_VisibilitykM Null values:
 Not Null    511683
Null         11338
Name: Min_VisibilitykM, dtype: int64 



# Imputation by dropping rows

 
We start by dropping the `Max_Gust_SpeedKm_h` column, since most of its values are missing (409,947 of 523,021 rows, as seen in the null counts above).

## Dropping the Max_Gust_SpeedKm_h column

In [None]:

# Drop Max_Gust_SpeedKm_h regardless, because it has too many missing values.
# The frame is built straight from train_data: no cell above this one defines
# droppingRows, and reassigning it from itself would make a second run of this
# cell fail once the column is already gone.
# NOTE(review): the "Dropping the Events column" cell further down reassigns
# droppingRows from train_data, so this drop does not carry past that cell —
# confirm the intended cell order.
droppingRows = train_data.drop(columns="Max_Gust_SpeedKm_h")
print("Without Max gust speed now: \n")
print(droppingRows.shape)




# Fixing the Events column
Here we fix the `Events` column by replacing its NaNs with the placeholder label "Clear", so that those rows are not treated as missing later on.

## Dropping the Events column


In [9]:
print("droping rows with nans after fixing the events column")

# Keep a copy of Events with every missing entry replaced by the label "Clear"
# (presumably rows with no recorded weather event — confirm against the data source).
Events = train_data['Events'].fillna("Clear")

# Shape of the untouched frame, for comparison with the result below.
print("The orginial data: \n")
print(train_data.shape)

# Remove only the raw Events column; every other column is kept.
droppingRows = train_data.drop(columns=["Events"])
print("Without Events now: \n")
print(droppingRows.shape)


droping rows with nans after fixing the events column
The orginial data: 

(523021, 36)
Without Events now: 

(523021, 35)


## Adding the Events column back in

In [None]:
# Re-attach Events, now with its NaNs filled. The Series is assigned directly so
# pandas aligns it on the row index; unlike a positional .values assignment this
# still works if the cell is re-run after rows have been dropped.
droppingRows = droppingRows.assign(Events=Events)
print("Events had been added back in: \n")
print(droppingRows.shape)

# Max_Gust_SpeedKm_h has to be gone before dropna(): it is missing in 409,947 of
# the 523,021 rows, so leaving it in would throw away most of the data.
# errors="ignore" makes this a no-op when the column was already removed.
droppingRows = droppingRows.drop(columns="Max_Gust_SpeedKm_h", errors="ignore")

# Drop every row that still contains a NaN in any remaining column.
droppingRows = droppingRows.dropna()
print("After dropping nan values: \n")
print(droppingRows.shape)

# Imputation by dropping columns

In [8]:
print("droping all columns with nans after fixing the events column")

# Events with every missing entry replaced by the label "Clear".
Events = train_data['Events'].fillna("Clear")

# Shape of the untouched frame, for comparison with the results below.
print("The orginial data: \n")
print(train_data.shape)

# Discard every column that contains at least one NaN.
droppedColumns = train_data.dropna(axis="columns")
print("After dropping nan columns: \n")
print(droppedColumns.shape)

# Put the filled Events column back; .values assigns it positionally.
droppedColumns = droppedColumns.assign(Events=Events.values)
print("Events had been added back in: \n")
print(droppedColumns.shape)

droping all columns with nans after fixing the events column
The orginial data: 

(523021, 36)
After dropping nan columns: 

(523021, 30)
Events had been added back in: 

(523021, 31)
