In [0]:
!python --version

Python 3.6.9


In [0]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

seed = 0

In [3]:
data = pd.read_csv('train_preprocess.csv', sep=';')
data.head()

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainTomorrow,Year,Month,Day,WindSpeedDelta,WindDirChange,HumidityDelta,PressureDelta,CloudDelta,TempDelta
0,Albury,13.4,22.9,0.6,,,W,44.0,W,WNW,20.0,24.0,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,False,2008,12,1,4.0,True,-49.0,-0.6,,4.9
1,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,WSW,4.0,22.0,44.0,25.0,1010.6,1007.8,,,17.2,24.3,False,2008,12,2,18.0,True,-19.0,-2.8,,7.1
2,Albury,12.9,25.7,0.0,,,WSW,46.0,W,WSW,19.0,26.0,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,False,2008,12,3,7.0,True,-8.0,1.1,,2.2
3,Albury,9.2,28.0,0.0,,,NE,24.0,SE,E,11.0,9.0,45.0,16.0,1017.6,1012.8,,,18.1,26.5,False,2008,12,4,-2.0,True,-29.0,-4.8,,8.4
4,Albury,17.5,32.3,1.0,,,W,41.0,ENE,NW,7.0,20.0,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,False,2008,12,5,13.0,True,-49.0,-4.8,1.0,11.9


In [0]:
y = data.RainTomorrow
data.drop('RainTomorrow', axis=1, inplace=True)

In [0]:
categorical_cols = [cat for cat in data.columns if data[cat].dtype == 'O']
numerical_cols = list(set(data.columns.values.tolist()) - set(categorical_cols))

In [0]:
numeric_means = data[numerical_cols].mean()
X_real_mean = data[numerical_cols].fillna(numeric_means)
X_cat = data[categorical_cols].fillna('NaN')

In [0]:
from sklearn.feature_extraction import DictVectorizer as DV

encoder = DV(sparse = False)
X_cat_oh = encoder.fit_transform(X_cat.astype('str').T.to_dict().values())

In [13]:
X_cat_oh.shape

(70240, 81)

In [14]:
X = np.hstack((X_real_mean, X_cat_oh))
X.shape

(70240, 106)

In [0]:
from sklearn.model_selection import train_test_split

(X_train, 
 X_test, 
 y_train, y_test) = train_test_split(X, y, 
                                     test_size=0.2, 
                                     random_state=seed,
                                    stratify=y)

In [0]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [0]:
y_train = y_train.values
y_test = y_test.values

In [39]:
y_test

array([False, False, False, ..., False, False, False])

In [0]:
import keras
from keras.layers import Input, Dense, Dropout, BatchNormalization, ReLU
from keras.models import Sequential
from keras import backend as K
from keras.callbacks import ModelCheckpoint

In [0]:
def f1(y_true, y_pred):
    def recall(y_true, y_pred):
        """Recall metric.
        Only computes a batch-wise average of recall.
        Computes the recall, a metric for multi-label classification of
        how many relevant items are selected.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

    def precision(y_true, y_pred):
        """Precision metric.
        Only computes a batch-wise average of precision.
        Computes the precision, a metric for multi-label classification of
        how many selected items are relevant.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision

    precision = precision(y_true, y_pred)
    recall = recall(y_true, y_pred)
    return 2 * ((precision * recall) / (precision + recall + K.epsilon()))

In [0]:
model = Sequential()
model.add(Dense(64, input_shape=(X_train.shape[1],)))
model.add(BatchNormalization())
model.add(ReLU())
model.add(Dropout(0.3))
model.add(Dense(32))
model.add(BatchNormalization())
model.add(ReLU())
model.add(Dropout(0.3))
model.add(Dense(16))
model.add(BatchNormalization())
model.add(ReLU())
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy', f1])
checkpoint = ModelCheckpoint("./nn-{epoch:02d}-{val_f1:.2f}.hdf5",
                             monitor='val_f1', save_best_only=True, mode='max', period=1)

In [69]:
model.summary()

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_17 (Dense)             (None, 64)                6848      
_________________________________________________________________
batch_normalization_1 (Batch (None, 64)                256       
_________________________________________________________________
re_lu_1 (ReLU)               (None, 64)                0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_18 (Dense)             (None, 32)                2080      
_________________________________________________________________
batch_normalization_2 (Batch (None, 32)                128       
_________________________________________________________________
re_lu_2 (ReLU)               (None, 32)               

In [70]:
model_history = model.fit(X_train, y_train, batch_size=128, epochs=50, validation_split=0.1, callbacks=[checkpoint], shuffle=True)

Train on 50572 samples, validate on 5620 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [71]:
model.evaluate(X_test, y_test)



[0.3382171496986528, 0.8554954528808594, 0.6135863661766052]

In [49]:
f1(y_test, y_pred)

TypeError: ignored

In [61]:
sum(keras.metrics.accuracy(y_test, y_pred)[:, 0]) / len(y_test)

<tf.Tensor: shape=(), dtype=float32, numpy=0.78694475>