Import Libraries

In [1]:
#Standard library: filesystem helpers and warning control
import os
import warnings
#Remove any warnings
warnings.filterwarnings('ignore')

#Numpy is used for linear algebra
import numpy as np
from numpy import nan
#Pandas is used for data processing, reading the CSV File
import pandas as pd

#Matplotlib and Seaborn are used for plotting
import matplotlib.pyplot as plt
import seaborn as sns

#joblib is used to persist the trained model. Newer scikit-learn releases
#no longer ship it under sklearn.externals, so prefer the standalone package
#and fall back to the bundled copy only when it is not installed.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

from sklearn import ensemble
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

Import the CSV file into our project

In [2]:
# Read the raw weather observations from the CSV file into a DataFrame
dataset_path = 'data//weatherAUS.csv'
df = pd.read_csv(dataset_path)

In [3]:
# Columns that are excluded from the model; each one is removed from df in place
excluded_columns = ('RISK_MM', 'Date', 'Cloud3pm', 'Cloud9am', 'Location')
for column in excluded_columns:
    del df[column]

In [4]:
# Numerical variables: every column whose dtype is not object ('O')
numerical = [name for name, dtype in df.dtypes.items() if dtype != 'O']

numerical_count = len(numerical)
print('There are {} numerical variables\n'.format(numerical_count))

print('The numerical variables are :', numerical)

There are 14 numerical variables

The numerical variables are : ['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine', 'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Temp9am', 'Temp3pm']


In [5]:
# Impute the missing values of each numerical column with that column's median.
#
# The filled column is assigned back to df rather than calling
# df[col].fillna(..., inplace=True): the in-place call operates on the object
# returned by df[col], and with pandas Copy-on-Write that object is not linked
# to df, so the frame would silently keep its missing values.
median_imputed_columns = [
    'Sunshine',
    'Evaporation',
    'MinTemp',
    'MaxTemp',
    'Rainfall',
    'WindGustSpeed',
    'WindSpeed9am',
    'WindSpeed3pm',
    'Humidity9am',
    'Humidity3pm',
    'Pressure9am',
    'Pressure3pm',
    'Temp9am',
    'Temp3pm',
]
for column in median_imputed_columns:
    df[column] = df[column].fillna(df[column].median())

In [6]:
# Categorical variables: every column stored with the object ('O') dtype
categorical = [name for name, dtype in df.dtypes.items() if dtype == 'O']

categorical_count = len(categorical)
print('There are {} categorical variables\n'.format(categorical_count))
print('The categorical variables are :', categorical)

There are 5 categorical variables

The categorical variables are : ['WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday', 'RainTomorrow']


In [7]:
# Impute the missing values of each categorical column with that column's mode
# (mode() can return several values; the first one is used).
#
# The filled column is assigned back to df instead of using
# df[col].fillna(..., inplace=True), which does not update df when pandas
# Copy-on-Write is active.
mode_imputed_columns = ['WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday']
for column in mode_imputed_columns:
    df[column] = df[column].fillna(df[column].mode()[0])
#We don't have to impute RainTomorrow because there is no missing data inside

## Removal of outliers

In [8]:
def max_value(df, variable, top):
    """Return the values of ``df[variable]`` with everything above ``top`` replaced by ``top``.

    Parameters
    ----------
    df : DataFrame holding the column to cap.
    variable : name of the column to read.
    top : upper limit; entries strictly greater than it are replaced by it.

    Returns
    -------
    A NumPy array of the capped values. ``df`` itself is not modified.
    """
    column = df[variable]
    exceeds_cap = column > top
    return np.where(exceeds_cap, top, column)

# Upper limit applied to each column; values above the limit are replaced by it.
# NOTE(review): the limits are hard-coded and their derivation is not shown
# in this notebook - confirm where they come from.
outlier_caps = {
    'Rainfall': 3.2,
    'Evaporation': 21.8,
    'WindSpeed9am': 55,
    'WindSpeed3pm': 57,
}
for column, cap in outlier_caps.items():
    df[column] = max_value(df, column, cap)

## Converting RainToday & RainTomorrow into 1(Yes) or 0(No)

In [10]:
# First of all, we should deal with the Rain Today and Rain Tomorrow variables
# Changing the Yes/No to 1/0 for RainToday and RainTomorrow
#
# The converted column is assigned back to df instead of calling
# df[col].replace(..., inplace=True), which does not update df when pandas
# Copy-on-Write is active. infer_objects() then turns a column that ended up
# holding only integers into a proper integer dtype.
yes_no_to_int = {'No': 0, 'Yes': 1}
for column in ('RainToday', 'RainTomorrow'):
    df[column] = df[column].replace(yes_no_to_int).infer_objects()

In [11]:
# Show the (rows, columns) dimensions of df before the one-hot encoding step
df.shape

(142193, 19)

### One-hot encode the categorical data

In [12]:
#Replace the categorical wind-direction columns with one-hot encoded columns
wind_direction_columns = ['WindGustDir', 'WindDir9am', 'WindDir3pm']
features_df = pd.get_dummies(df, columns=wind_direction_columns)

# Preview the first rows of the encoded frame
features_df.head()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,...,WindDir3pm_NNW,WindDir3pm_NW,WindDir3pm_S,WindDir3pm_SE,WindDir3pm_SSE,WindDir3pm_SSW,WindDir3pm_SW,WindDir3pm_W,WindDir3pm_WNW,WindDir3pm_WSW
0,13.4,22.9,0.6,4.8,8.5,44.0,20.0,24.0,71.0,22.0,...,0,0,0,0,0,0,0,0,1,0
1,7.4,25.1,0.0,4.8,8.5,44.0,4.0,22.0,44.0,25.0,...,0,0,0,0,0,0,0,0,0,1
2,12.9,25.7,0.0,4.8,8.5,46.0,19.0,26.0,38.0,30.0,...,0,0,0,0,0,0,0,0,0,1
3,9.2,28.0,0.0,4.8,8.5,24.0,11.0,9.0,45.0,16.0,...,0,0,0,0,0,0,0,0,0,0
4,17.5,32.3,1.0,4.8,8.5,41.0,7.0,20.0,82.0,33.0,...,0,1,0,0,0,0,0,0,0,0


## Splitting data into train and test

In [13]:
from sklearn import model_selection

# Remove the label column so that features_df holds predictors only
del features_df['RainTomorrow']

# Convert predictors and label to NumPy arrays for training and testing;
# the label is read from df, which still contains RainTomorrow
X = features_df.to_numpy()
y = df['RainTomorrow'].to_numpy()

# Split the data set in a training set (70%) and a test set (30%);
# the fixed random_state makes the split repeatable
train_test_parts = model_selection.train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)
X_train, X_test, y_train, y_test = train_test_parts

In [14]:
# X_train / X_test / y_train / y_test come from the split in the previous cell;
# the identical second train_test_split call that used to sit here was removed.

#Fit Classifier model
# random_state is fixed so that re-running the notebook rebuilds the same model.
model = GradientBoostingClassifier(
    learning_rate=0.05,
    loss='exponential',
    max_depth=6,
    min_samples_leaf=2,
    n_estimators=1000,
    random_state=0,
)
model.fit(X_train, y_train)

#Save the trained model to a file so we can use it in other programs
model_path = 'models//rain_prediction_model.pkl'
# Create the target directory first; joblib.dump does not create it
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(model, model_path)

['models//rain_prediction_model.pkl']

In [15]:
#Find the error rate on the training set, then on the test set
# NOTE: the variable is called `mse` but holds the mean absolute error; the
# name is kept in case later cells read it. After the loop it holds the
# test-set value.
evaluation_sets = (
    ("Training Set Mean Absolute Error: %.4f", X_train, y_train),
    ("Test Set Mean Absolute Error: %.4f", X_test, y_test),
)
for report_format, features, labels in evaluation_sets:
    mse = mean_absolute_error(labels, model.predict(features))
    print (report_format % mse)

#Mean Absolute Error closer to 0 would be better.

Training Set Mean Absolute Error: 0.1051
Test Set Mean Absolute Error: 0.1428
