# 02 Rainy Day

![](https://images.unsplash.com/photo-1558920778-a82b686f0521?ixlib=rb-1.2.1&ixid=eyJhcHBfaWQiOjEyMDd9&auto=format&fit=crop&w=967&q=80)

Photo by [Ren zo](https://unsplash.com/photos/rsilYJQOoVo)

In this exercise, we will try to use a neural network on a typical prediction task: predicting whether tomorrow will be a rainy day.

The dataset is in `weatherAUS.csv`. Load it and explore it. The target value is the column `'RainTomorrow'`.

In [None]:
# TODO: Data exploration
### STRIP_START ###
import pandas as pd
import numpy as np

# Read the weather observations from the CSV file into a DataFrame
df = pd.read_csv('weatherAUS.csv')
# Preview the first rows to get a feel for the columns and their values
df.head()
### STRIP_END ###

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No


In [None]:
# Dimensions of the loaded frame as (rows, columns)
df.shape

(142193, 23)

In [None]:
# Count the missing values in each column
df.isna().sum()

Date                 0
Location             0
MinTemp            637
MaxTemp            322
Rainfall          1406
Evaporation      60843
Sunshine         67816
WindGustDir       9330
WindGustSpeed     9270
WindDir9am       10013
WindDir3pm        3778
WindSpeed9am      1348
WindSpeed3pm      2630
Humidity9am       1774
Humidity3pm       3610
Pressure9am      14014
Pressure3pm      13981
Cloud9am         53657
Cloud3pm         57094
Temp9am            904
Temp3pm           2726
RainToday         1406
RainTomorrow         0
dtype: int64

Prepare the data: handle the missing values, scale the numerical features and encode the categorical ones.

In [None]:
# TODO: Data preparation
### STRIP_START ###
# Sunshine, Evaporation and the two Cloud columns have by far the most missing
# values (see the per-column counts above); Date is not used as a feature.
columns_to_drop = ['Sunshine', 'Evaporation', 'Cloud3pm', 'Cloud9am', 'Date']

# Remove those columns first, then discard every row that still has a missing value
df = df.drop(columns=columns_to_drop).dropna()
df.shape
### STRIP_END ###

(112925, 18)

In [None]:
# Preview the frame after the column drop and the removal of incomplete rows
df.head()

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,Albury,13.4,22.9,0.6,W,44.0,W,WNW,20.0,24.0,71.0,22.0,1007.7,1007.1,16.9,21.8,No,No
1,Albury,7.4,25.1,0.0,WNW,44.0,NNW,WSW,4.0,22.0,44.0,25.0,1010.6,1007.8,17.2,24.3,No,No
2,Albury,12.9,25.7,0.0,WSW,46.0,W,WSW,19.0,26.0,38.0,30.0,1007.6,1008.7,21.0,23.2,No,No
3,Albury,9.2,28.0,0.0,NE,24.0,SE,E,11.0,9.0,45.0,16.0,1017.6,1012.8,18.1,26.5,No,No
4,Albury,17.5,32.3,1.0,W,41.0,ENE,NW,7.0,20.0,82.0,33.0,1010.8,1006.0,17.8,29.7,No,No


In [None]:
from sklearn.preprocessing import StandardScaler

# Numerical features to standardize. The list is defined once so that the columns
# selected for scaling and the column labels of the result cannot drift apart.
numeric_columns = ['MinTemp', 'MaxTemp', 'Rainfall', 'WindGustSpeed', 'WindSpeed9am',
                   'WindSpeed3pm', 'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm',
                   'Temp9am', 'Temp3pm']

# NOTE: the scaler is fitted on all rows, i.e. before the train/test split done further down.
scaler = StandardScaler()

# fit_transform returns a bare array: wrap it back into a DataFrame that keeps the
# original index, so it lines up row by row with the dummies below.
scaled = pd.DataFrame(scaler.fit_transform(df[numeric_columns]),
                      columns=numeric_columns, index=df.index)

# One-hot encode the categorical features listed here (Location is not among them).
# drop_first=True drops one level per feature; dtype=float keeps the dummies numeric
# so that, combined with the scaled columns, the features form a plain float matrix.
dummies = pd.get_dummies(df[['WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday']],
                         drop_first=True, dtype=float)

scaled.shape, dummies.shape

((112925, 12), (112925, 46))

In [None]:
# Feature matrix: the one-hot encoded columns followed by the scaled numerical ones
feature_blocks = [dummies, scaled]
X = pd.concat(feature_blocks, axis=1)

# Target: one-hot encode 'RainTomorrow' and drop the first level, leaving a single indicator column
target = df['RainTomorrow']
y = pd.get_dummies(target, drop_first=True)

X.shape, y.shape

((112925, 58), (112925, 1))

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

Now build an MLP model. Begin with, for example, 2 hidden layers of 20 units each.

In [None]:
# TODO: Build a model
### STRIP_START ###
import tensorflow as tf

# Define a function
def model(input_dim):
    """Build an MLP with two hidden layers of 20 ReLU units and one sigmoid output unit.

    Parameters
    ----------
    input_dim : int
        Number of input features, forwarded to the first Dense layer.

    Returns
    -------
    tf.keras.models.Sequential
        The assembled model; it is not compiled here.
    """
    net = tf.keras.models.Sequential([
        # First hidden layer: 20 units; this is the only layer that is given the input size
        tf.keras.layers.Dense(20, input_dim=input_dim, activation='relu'),
        # Second hidden layer: 20 units; its input size is known from the previous layer
        tf.keras.layers.Dense(20, activation='relu'),
        # Output layer: a single sigmoid unit for the binary target
        tf.keras.layers.Dense(1, activation='sigmoid'),
    ])
    return net
### STRIP_END ###

Now compile and fit your model.

In [None]:
# TODO: Compile and fit the model
### STRIP_START ###
# `model` starts out as the builder function and is rebound to the network below.
# Keep a handle on the builder so that re-running this cell builds a fresh network
# instead of trying to call the already-built one with `input_dim`.
if not isinstance(model, tf.keras.Model):
    build_model = model
model = build_model(input_dim=X_train.shape[1])

# Binary classification: sigmoid output paired with binary cross-entropy, tracking accuracy
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.fit(X_train, y_train, epochs=10, batch_size=128)
### STRIP_END ###

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f880d0e5da0>

Now check the accuracy on the test dataset.

In [None]:
# TODO: Compute the accuracy
### STRIP_START ###
# evaluate() returns the loss followed by the compiled metrics, so index 1 is the accuracy
train_metrics = model.evaluate(X_train, y_train)
print('accuracy on train with NN:', train_metrics[1])

test_metrics = model.evaluate(X_test, y_test)
print('accuracy on test with NN:', test_metrics[1])
### STRIP_END ###

accuracy on train with NN: 0.8601838
accuracy on test with NN: 0.8561435


---

Now try a classical machine learning classification method (of your choice). Fit it on the training data and compute its accuracy.

In [None]:
# TODO: Redo the classification with the model of your choice
### STRIP_START ###
from sklearn.ensemble import RandomForestClassifier

# Fixed seed so that the forest, and therefore the scores below, are reproducible
rf = RandomForestClassifier(random_state=0)

# scikit-learn expects a 1-D target: flatten the single-column y frames
# (passing the (n, 1) frame directly triggers a DataConversionWarning)
y_train_1d = y_train.values.ravel()
y_test_1d = y_test.values.ravel()

rf.fit(X_train, y_train_1d)

print('score with RF on train', rf.score(X_train, y_train_1d))
print('score with RF on test', rf.score(X_test, y_test_1d))
### STRIP_END ###

  import sys


score with RF on train 0.9865508080584459
score with RF on train 0.8438344033650653
