# Using Keras to Predict the Next Winner
### Model summary
1. Train a categorical (two-class: lose / win) neural network built with Keras's `Sequential` class

### Features summary
1. Chef's elo rating in the group
2. Chef's elo rating against the competition
3. Demographic difference between the chef and the guest the chef is cooking for

In [2]:
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.utils import np_utils
import pickle
import numpy as np
import pandas as pd
import copy
from collections import Counter
from helper import *
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from collections import defaultdict

# Step 1: Load dataset
The features have already been engineered, so here I simply load them from the pickle file.

In [3]:
# Location of the pre-engineered feature table, relative to the notebook's working directory.
FEATURE_PICKLE_PATH = "data/_feature_match_data.pkl"

# SECURITY: pickle.load can execute arbitrary code embedded in the file, so only
# load pickles that come from a trusted source.
with open(FEATURE_PICKLE_PATH, "rb") as fp:
    match_data = pickle.load(fp)

In [4]:
# Peek at the first three rows of the loaded table.
match_data.head(3)

Unnamed: 0,date_,chef_name,against,guest_name,nID,win_prob,result,c_age,g_age,age_delta,c_gender,g_gender,gender_delta,against_specific_win_p,chef_id,against_id
0,1/19/2015,최현석,박준우,소유진,-1.0,0.5,1,46,37,9,1,0,1,0.5,4,12
1,1/19/2015,홍석천,샘킴,소유진,-1.0,0.5,1,40,37,3,1,0,1,0.5,8,1
2,1/26/2015,미카엘,김풍,가희,-1.0,0.5,1,36,38,2,1,0,1,0.5,3,0


### About this table
* nID: A unique ID related to each recipe (or cook-off) where there is a recipe.
* result: 0 - lose, 1 - win
* c_age: chef age (as of 2018 Mar)
* g_age: guest age (as of 2018 Mar)
* age_delta: absolute difference in age (in years) between the chef and the guest
* c_gender: gender of the chef
* g_gender: gender of the guest
* gender_delta: whether the chef's and the guest's genders differ (0 - same, 1 - different); the gender columns themselves are coded 0 - female, 1 - male

In [5]:
# Shuffle the rows before any positional slicing of the table.
# A fixed random_state makes the row order identical on every
# "Restart & Run All"; without it each run produces a different ordering.
match_data = match_data.sample(frac=1, random_state=42).reset_index(drop=True)
match_data[:3]

Unnamed: 0,date_,chef_name,against,guest_name,nID,win_prob,result,c_age,g_age,age_delta,c_gender,g_gender,gender_delta,against_specific_win_p,chef_id,against_id
0,10/30/2017,이재훈,샘킴,진_(가수),6524.0,0.545922,1,38,26,12,1,1,0,0.661441,13,1
1,10/17/2016,김민준,김풍,김흥국,6318.0,0.479863,0,42,59,17,1,1,0,0.5,17,0
2,12/19/2016,미카엘,최현석,거미_(가수),6353.0,0.724538,0,36,37,1,1,0,1,0.488165,3,4


# Step 2: Prepare x (input) and y (output)

## Y

In [6]:
# Target column as a plain list, then one-hot encoded into two columns
# (one column per value of "result").
y_ = match_data["result"].tolist()
y_categorical = np_utils.to_categorical(y_, 2)

# Sanity-check the encoded array: overall shape, per-row shape, first three rows.
for summary in (y_categorical.shape, y_categorical[0].shape, y_categorical[:3]):
    print(summary)

(630, 2)
(2,)
[[0. 1.]
 [1. 0.]
 [1. 0.]]


## X

In [7]:
# Feature columns fed to the network, in the order they appear in each input row.
# NOTE(review): chef_id / against_id look like identifiers yet are passed as plain
# numbers -- confirm that treating them as magnitudes is intended.
feature_columns = ["chef_id", "against_id", "win_prob", "age_delta",
                   "gender_delta", "against_specific_win_p"]
x_df = match_data[feature_columns]

# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
# .values yields the same NumPy array and exists in every pandas version.
x_ = x_df.values

# Number of features per sample, read from the 2-D shape so that it does not
# depend on row 0 existing.
shp = x_.shape[1]
print(x_.shape)
print(x_.shape[1:])

(630, 6)
(6,)


### Divide into train & test groups

In [8]:
# Fraction of the samples that goes into the training group; the rest is the test group.
ratio = 0.80

# Positional split of the inputs: the first `ratio` share is train, the remainder is test.
n_train_x = int(len(x_) * ratio)
x_train, x_test = np.array(x_[:n_train_x]), np.array(x_[n_train_x:])

# Same positional split for the one-hot targets.
n_train_y = int(len(y_categorical) * ratio)
y_train, y_test = np.array(y_categorical[:n_train_y]), np.array(y_categorical[n_train_y:])

# Step 3: Define model architecture
Use Keras Sequential class which is a linear stack of layers

In [9]:
# Declare the Sequential model: an empty linear stack of layers.
# Layers are appended to it with model.add(...).
model = Sequential()

In [10]:
# Input block: a fully connected layer of 500 units over the `shp` input features,
# followed by a ReLU activation.
for layer in (Dense(500, input_shape=(shp,)), Activation('relu')):
    model.add(layer)

# Printing the model's current output shape shows what the stack produces so far.
print(model.output_shape)

(None, 500)


In [11]:
# Remaining layers, appended in the order listed.
# The Dropout layers regularize the model in order to prevent overfitting.
remaining_layers = [
    Dropout(0.2),
    Dense(500),
    Activation('relu'),
    Dropout(0.2),
    Dense(2),  # Corresponds to the final output size of 2.
    Activation('softmax'),
]
for layer in remaining_layers:
    model.add(layer)

# Step 4: Compile model and fit model on training data

In [18]:
# Configure training: Adam optimizer minimising categorical cross-entropy.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
)

In [19]:
# Train for 4 epochs in mini-batches of 32; the test split is passed as
# validation data so its loss is reported after every epoch.
fit_options = dict(
    batch_size=32,
    epochs=4,
    verbose=1,
    validation_data=(x_test, y_test),
)
model.fit(x_train, y_train, **fit_options)

Train on 504 samples, validate on 126 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x1204c5cf8>

### Not so difficult, huh?

# Step 5: Evaluate model on test data
* Score: mean of the loss for each test sample
* Accuracy: the predict_classes function outputs the highest probability class according to the trained classifier for each input example

In [20]:
# Score: the value returned by evaluate() on the test split.
score = model.evaluate(x=x_test, y=y_test, verbose=1)
score



0.7836137745115492

In [21]:
# Accuracy
# Class probabilities for every test sample (one row per sample, one column per class).
prediction = model.predict(x_test)

# Index of the highest-probability class for each sample. This is what
# Sequential.predict_classes computes for a multi-unit softmax output, but it
# reuses the probabilities from above (no second forward pass) and keeps working
# on Keras/TensorFlow releases where predict_classes has been removed.
predicted_classes = np.argmax(prediction, axis=-1)

def categorical_accuracy(y_true, y_pred):
    """Print and return the fraction of samples whose predicted class is correct.

    Parameters
    ----------
    y_true : sequence of one-hot rows, shape (n_samples, n_classes)
        True labels. The true class of a row is the index of its largest entry,
        so any number of classes is supported (not only two).
    y_pred : sequence of int, one entry per sample
        Predicted class indices. A column vector of shape (n_samples, 1) is
        flattened before comparison.

    Returns
    -------
    float
        Number of correct predictions divided by the number of samples;
        0.0 when there are no samples (instead of dividing by zero).

    Raises
    ------
    ValueError
        If `y_true` and `y_pred` do not contain the same number of samples.
    """
    n_samples = len(y_true)
    if n_samples == 0:
        # Nothing to score: report 0.0 rather than raising ZeroDivisionError.
        accuracy = 0.0
    else:
        predicted = np.asarray(y_pred).reshape(-1)
        if len(predicted) != n_samples:
            raise ValueError(
                "y_true has %d samples but y_pred has %d entries"
                % (n_samples, len(predicted))
            )
        # Decode each one-hot row into its class index.
        true_classes = np.argmax(np.asarray(y_true), axis=1)
        n_correct = int(np.count_nonzero(true_classes == predicted))
        accuracy = n_correct / n_samples
    print("Accuracy: ", accuracy)
    return accuracy

In [22]:
# Score the predicted class indices against the one-hot test labels.
accuracy = categorical_accuracy(y_true=y_test, y_pred=predicted_classes)

Accuracy:  0.5317460317460317


### Okay, so I am doing only slightly better than a monkey.
Obviously, the dataset I have is way too small for a neural network.