In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier, export_graphviz
from sklearn import svm
from sklearn.metrics import roc_auc_score, roc_curve, accuracy_score, confusion_matrix, log_loss, plot_roc_curve, auc, precision_recall_curve
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import seaborn as sns
import missingno
from PIL import Image
%matplotlib inline

In [19]:
# Read the CSV file into a pandas DataFrame.
data = pd.read_csv("weatherAUS.csv")

In [20]:
# Preview the first rows; the rendered table elides the middle rows rather than
# listing every requested row.
data.head(10000)

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,2011-08-26,CoffsHarbour,9.2,21.7,0.0,3.0,10.8,E,17.0,NNE,...,71.0,51.0,1025.4,1022.6,0.0,0.0,18.2,20.9,No,No
9996,2011-08-27,CoffsHarbour,12.0,17.4,0.2,3.4,0.7,ENE,30.0,NE,...,83.0,96.0,1022.6,1018.2,8.0,8.0,17.1,15.6,No,Yes
9997,2011-08-28,CoffsHarbour,12.2,20.9,25.0,0.6,10.7,SSW,35.0,SW,...,70.0,62.0,1019.7,1017.2,1.0,1.0,17.3,20.0,Yes,No
9998,2011-08-29,CoffsHarbour,11.1,22.7,0.0,2.4,9.6,SW,43.0,SE,...,54.0,79.0,1018.6,1016.0,5.0,7.0,19.8,20.3,No,Yes


In [21]:
# Drop the Date column. drop() returns a new frame, and errors="ignore" makes
# the cell safe to re-run (a second `del data['Date']` would raise KeyError).
data = data.drop(columns=["Date"], errors="ignore")

In [22]:
# Column dtypes and non-null counts after dropping Date.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145460 entries, 0 to 145459
Data columns (total 22 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Location       145460 non-null  object 
 1   MinTemp        143975 non-null  float64
 2   MaxTemp        144199 non-null  float64
 3   Rainfall       142199 non-null  float64
 4   Evaporation    82670 non-null   float64
 5   Sunshine       75625 non-null   float64
 6   WindGustDir    135134 non-null  object 
 7   WindGustSpeed  135197 non-null  float64
 8   WindDir9am     134894 non-null  object 
 9   WindDir3pm     141232 non-null  object 
 10  WindSpeed9am   143693 non-null  float64
 11  WindSpeed3pm   142398 non-null  float64
 12  Humidity9am    142806 non-null  float64
 13  Humidity3pm    140953 non-null  float64
 14  Pressure9am    130395 non-null  float64
 15  Pressure3pm    130432 non-null  float64
 16  Cloud9am       89572 non-null   float64
 17  Cloud3pm       86102 non-null

## Encoding categorical data

In [23]:
# Map every unique location name to a unique integer code.
location_encoder = LabelEncoder()
# Assign through data[...] rather than data.loc[:, ...]: pandas >= 2.0 makes the
# .loc form write into the existing object-dtype column in place, so the encoded
# values would stay dtype=object instead of becoming an integer column.
data['Location'] = location_encoder.fit_transform(data['Location'])

In [24]:
# Missing wind directions become the literal label "NONE", so the label
# encoding below treats "unknown direction" as a category of its own.
for wind_col in ("WindGustDir", "WindDir9am", "WindDir3pm"):
    data.loc[:, wind_col] = data[wind_col].fillna("NONE")

In [25]:
# Encode the three wind-direction columns with one shared LabelEncoder.
# The encoder is fitted once on the values of all three columns, so a given
# direction gets the same integer code in every column and
# windgustdir_encoder.inverse_transform() stays valid for each of them
# (refitting per column leaves the encoder describing only the last column).
wind_dir_cols = ['WindGustDir', 'WindDir9am', 'WindDir3pm']
windgustdir_encoder = LabelEncoder()
windgustdir_encoder.fit(data[wind_dir_cols].to_numpy().ravel())
for wind_col in wind_dir_cols:
    # data[...] assignment replaces the object column so the result is an
    # integer column; data.loc[:, ...] would keep dtype=object on pandas >= 2.0.
    data[wind_col] = windgustdir_encoder.transform(data[wind_col])

In [27]:
# Binary-encode RainToday: 'No' -> 0, 'Yes' -> 1. Values that are not keys of
# the map (including missing ones) come out of Series.map as NaN.
RainToday_map = {
    'No' : 0,
    'Yes' : 1
}
# Plain column assignment lets the column take the numeric dtype of the mapped
# values; data.loc[:, ...] would write into the object column in place on
# pandas >= 2.0 and leave it dtype=object.
data['RainToday'] = data['RainToday'].map(RainToday_map)

In [31]:
# Split the label off the feature frame: pop() hands back the RainTomorrow
# column and removes it from `data` in the same step.
data_target = data.pop('RainTomorrow')

In [33]:
# Re-check dtypes and non-null counts now that the categorical columns are
# encoded and the target column has been removed.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145460 entries, 0 to 145459
Data columns (total 21 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Location       145460 non-null  int32  
 1   MinTemp        143975 non-null  float64
 2   MaxTemp        144199 non-null  float64
 3   Rainfall       142199 non-null  float64
 4   Evaporation    82670 non-null   float64
 5   Sunshine       75625 non-null   float64
 6   WindGustDir    145460 non-null  int32  
 7   WindGustSpeed  135197 non-null  float64
 8   WindDir9am     145460 non-null  int32  
 9   WindDir3pm     145460 non-null  int32  
 10  WindSpeed9am   143693 non-null  float64
 11  WindSpeed3pm   142398 non-null  float64
 12  Humidity9am    142806 non-null  float64
 13  Humidity3pm    140953 non-null  float64
 14  Pressure9am    130395 non-null  float64
 15  Pressure3pm    130432 non-null  float64
 16  Cloud9am       89572 non-null   float64
 17  Cloud3pm       86102 non-null