# Imports

In [1]:
import pandas
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from dateutil.parser import parse
from sklearn.metrics import accuracy_score as acc

# Metadata

**Date**
- The date of observation

**Location**
- The common name of the location of the weather station

**MinTemp**
- The minimum temperature in degrees celsius

**MaxTemp**
- The maximum temperature in degrees celsius

**Rainfall**
- The amount of rainfall recorded for the day in mm

**Evaporation**
- The so-called Class A pan evaporation (mm) in the 24 hours to 9am

**Sunshine**
- The number of hours of bright sunshine in the day.

**WindGustDir**
- The direction of the strongest wind gust in the 24 hours to midnight

**WindGustSpeed**
- The speed (km/h) of the strongest wind gust in the 24 hours to midnight

**WindDir9am**
- Direction of the wind at 9am

**WindDir3pm**
- Direction of the wind at 3pm

**WindSpeed9am**
- Wind speed (km/hr) averaged over 10 minutes prior to 9am

**WindSpeed3pm**
- Wind speed (km/hr) averaged over 10 minutes prior to 3pm

**Humidity9am**
- Humidity (percent) at 9am

**Humidity3pm**
- Humidity (percent) at 3pm

**Pressure9am**
- Atmospheric pressure (hpa) reduced to mean sea level at 9am

**Pressure3pm**
- Atmospheric pressure (hpa) reduced to mean sea level at 3pm

**Cloud9am**
- Fraction of sky obscured by cloud at 9am. This is measured in "oktas", which are a unit of eigths. It records how many eigths of the sky are obscured by cloud. A 0 measure indicates completely clear sky whilst an 8 indicates that it is completely overcast.

**Cloud3pm**
- Fraction of sky obscured by cloud (in "oktas": eighths) at 3pm. See Cload9am for a description of the values

**Temp9am**
- Temperature (degrees C) at 9am

**Temp3pm**
- Temperature (degrees C) at 3pm

**RainToday**
- Boolean: 1 if precipitation (mm) in the 24 hours to 9am exceeds 1mm, otherwise 0

**RISK_MM**
- The amount of rain. A kind of measure of the "risk".

**RainTomorrow**
- The target variable. Did it rain tomorrow?

# Read in data

In [2]:
data = pandas.read_csv('./data/weatherAUS.csv')
data.shape

(142193, 24)

In [3]:
data.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RISK_MM,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,0.0,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,25.0,1010.6,1007.8,,,17.2,24.3,No,0.0,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,0.0,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,16.0,1017.6,1012.8,,,18.1,26.5,No,1.0,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,0.2,No


# Check null composition

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142193 entries, 0 to 142192
Data columns (total 24 columns):
Date             142193 non-null object
Location         142193 non-null object
MinTemp          141556 non-null float64
MaxTemp          141871 non-null float64
Rainfall         140787 non-null float64
Evaporation      81350 non-null float64
Sunshine         74377 non-null float64
WindGustDir      132863 non-null object
WindGustSpeed    132923 non-null float64
WindDir9am       132180 non-null object
WindDir3pm       138415 non-null object
WindSpeed9am     140845 non-null float64
WindSpeed3pm     139563 non-null float64
Humidity9am      140419 non-null float64
Humidity3pm      138583 non-null float64
Pressure9am      128179 non-null float64
Pressure3pm      128212 non-null float64
Cloud9am         88536 non-null float64
Cloud3pm         85099 non-null float64
Temp9am          141289 non-null float64
Temp3pm          139467 non-null float64
RainToday        140787 non-null obje

# Drop nulls on row axis

In [5]:
data.dropna(axis = 0, inplace = True)
data.shape

(56420, 24)

# Inspect uniques

In [6]:
for col in data.columns:
    print(col, data[col].nunique(), data[col].dtype, sep = ' :: ')

Date :: 3416 :: object
Location :: 26 :: object
MinTemp :: 348 :: float64
MaxTemp :: 395 :: float64
Rainfall :: 410 :: float64
Evaporation :: 259 :: float64
Sunshine :: 145 :: float64
WindGustDir :: 16 :: object
WindGustSpeed :: 61 :: float64
WindDir9am :: 16 :: object
WindDir3pm :: 16 :: object
WindSpeed9am :: 36 :: float64
WindSpeed3pm :: 37 :: float64
Humidity9am :: 101 :: float64
Humidity3pm :: 101 :: float64
Pressure9am :: 504 :: float64
Pressure3pm :: 507 :: float64
Cloud9am :: 9 :: float64
Cloud3pm :: 10 :: float64
Temp9am :: 383 :: float64
Temp3pm :: 393 :: float64
RainToday :: 2 :: object
RISK_MM :: 498 :: float64
RainTomorrow :: 2 :: object


# Convert Dates to binary seasons

In [7]:
def return_season(date):
    if '-01-' in date or '-12-' in date or '-11-' in date:
        return 'summer'
    elif '-02-' in date or '-03-' in date or '-04-' in date:
        return 'autumn'
    elif '-05-' in date or '-06-' in date or '-07-' in date:
        return 'winter'
    elif '-09-' in date or '-10-' in date or '-08-' in date:
        return 'spring'

In [8]:
data['Season'] = data['Date'].apply(return_season)

In [9]:
data.reset_index(drop=True, inplace = True)

# Convert Wind directions to binary

In [10]:
for column in ['WindGustDir', 'WindDir9am', 'WindDir3pm']:
    for index in range(data.shape[0]):
        item = data.loc[index, column]
        
        if 'S' in item:
            data.loc[index, column + '_S'] = 1
        if 'N' in item:
            data.loc[index, column + '_N'] = 1
        if 'E' in item:
            data.loc[index, column + '_E'] = 1
        if 'W' in item:
            data.loc[index, column + '_W'] = 1

In [11]:
data.fillna(0, inplace = True)

In [12]:
for col in ['RainToday', 'Season']:
    dummies = pandas.get_dummies(data[col], prefix=col +'_', drop_first=True)
    data = pandas.concat([data, dummies], axis = 1)

# Drop unwanted features

In [13]:
print(data.columns)

Index(['Date', 'Location', 'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation',
       'Sunshine', 'WindGustDir', 'WindGustSpeed', 'WindDir9am', 'WindDir3pm',
       'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm',
       'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am',
       'Temp3pm', 'RainToday', 'RISK_MM', 'RainTomorrow', 'Season',
       'WindGustDir_S', 'WindGustDir_W', 'WindGustDir_N', 'WindGustDir_E',
       'WindDir9am_N', 'WindDir9am_E', 'WindDir9am_S', 'WindDir9am_W',
       'WindDir3pm_S', 'WindDir3pm_W', 'WindDir3pm_E', 'WindDir3pm_N',
       'RainToday__Yes', 'Season__spring', 'Season__summer', 'Season__winter'],
      dtype='object')


In [14]:
data.drop(['Location', 'Date', 'RISK_MM', 'Season', 'RainToday', 'WindGustDir', 'WindDir9am', 'WindDir3pm'], axis = 1, inplace = True)

# Create feature matrix and target vector

In [15]:
X = data.drop('RainTomorrow', axis = 1)
y = data['RainTomorrow']

# Train test split

In [16]:
x_train, x_test, y_train, y_test = train_test_split(X, y)

# Model

In [25]:
model = RandomForestClassifier(n_jobs = -1)

In [26]:
param_grid = { 'n_estimators': [200,300,400, 500],
                'max_features': ['auto', 'sqrt', 'log2'],
                'max_depth' : [6,8, 10,12],
                'criterion' :['gini', 'entropy']}

In [27]:
model_gs = GridSearchCV(model, param_grid=param_grid, cv=5, verbose=3, n_jobs=-1)

In [None]:
model_gs.fit(x_train, y_train)

Fitting 5 folds for each of 96 candidates, totalling 480 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   52.4s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:  5.2min
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed: 15.1min


In [21]:
prediction = model_gs.predict(x_test)

In [22]:
model_gs.best_params_

{'criterion': 'gini',
 'max_depth': 12,
 'max_features': 'sqrt',
 'n_estimators': 400}

In [23]:
model_gs.best_score_

0.8598133049745953

In [24]:
acc(prediction, y_test)

0.8562211981566821