In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import vapeplot 
from scipy import interp
import scipy.stats
%matplotlib inline

#### Goal
--------------------

The goal of this project is to make a classifier that predicts the final rankings for bakers.
The idea is to make a model for each episode and to use data from previous episodes in the model.
Therefore, a classifier for episode 1 will likely be bad at predicting the final outcome, but a classifier for episode 5 might accurately predict who will be in the top 3 and who might be eliminated in the next episode



In [4]:
from datetime import datetime
def timestamp(): return datetime.today().strftime('%Y%m%d')

from sklearn.preprocessing import QuantileTransformer
def quantile_scale(df,feats):
    qua = df
    scaler = QuantileTransformer(
        n_quantiles=10,
        random_state=42,
        ignore_implicit_zeros=True, #sparse matrix
    )
    # fit the scaler
    scaler.fit(qua[feats])
    # transform values
    qua[feats] = scaler.transform(qua[feats])
    return qua
def tiered(classes):
    trans = []
    for x in classes:
        if x==1: c=0
        if x==2: c=1
        if x>=3 and x<=4: c=2
        if x>=5 and x<=7: c=3
        if x>=8: c=4
        trans.append(c)
    return trans

In [5]:
tech = pd.read_csv("../RESULTS/gbbo.techinical.data.s10.20190907.tsv",sep="\t")
star = pd.read_csv("../RESULTS/gbbo.starbaker.data.s10.e2.20190909.tsv",sep="\t")
feats = ['tech_mean','tech','mean_star','star','mean_good','good','mean_bad','bad']
merge_col = ['season','baker','index','episode','place']
gbbo = pd.merge(tech, star,  how='left', left_on=merge_col, right_on =merge_col)
gbbo = gbbo[['season','baker','episode','place','tech_mean','tech','mean_star','star','mean_good','good','mean_bad','bad']]
gbbo.to_csv("../RESULTS/gbbo.features.s10.e2.{}.tsv".format(timestamp()),sep="\t",index=False)
gbbo.head()

Unnamed: 0,season,baker,episode,place,tech_mean,tech,mean_star,star,mean_good,good,mean_bad,bad
0,10,Alice,1,0,5.0,5,0.0,0,0.0,0,0.0,0
1,10,Alice,2,0,3.0,1,0.5,1,0.5,1,0.0,0
2,10,Amelia,1,0,4.0,4,0.0,0,0.0,0,0.0,0
3,10,Amelia,2,0,6.5,9,0.0,0,0.0,0,0.5,1
4,10,Dan,1,13,9.0,9,0.0,0,0.0,0,1.0,1


In [6]:
max_epi = max(gbbo['episode'])
gbbo = gbbo.loc[gbbo['episode']==max_epi]
gbbo = quantile_scale(gbbo,feats)
gbbo.head()



Unnamed: 0,season,baker,episode,place,tech_mean,tech,mean_star,star,mean_good,good,mean_bad,bad
1,10,Alice,2,0,1e-07,0.08333333,0.9999999,0.9999999,0.9999999,0.9999999,1e-07,1e-07
3,10,Amelia,2,0,0.5555556,0.75,1e-07,1e-07,1e-07,1e-07,0.8888889,0.9999999
5,10,Dan,2,13,0.25,1e-07,1e-07,1e-07,1e-07,1e-07,0.8888889,1e-07
7,10,David,2,0,0.4166667,0.1666667,1e-07,1e-07,0.9999999,0.9999999,1e-07,1e-07
9,10,Helena,2,0,0.9999999,0.9999999,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07


In [7]:
# classifiers
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(hidden_layer_sizes=(100),max_iter=1000)
tech = pd.read_csv("../RESULTS/gbbo.features.20190909.tsv",sep='\t')
tech = tech.loc[tech['episode']==max_epi]
qua = quantile_scale(tech,feats)
qua['place']=tiered(qua['place'])
X, y = np.matrix(qua[feats]), np.array(qua['place'])
clf.fit(X,y)



MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=100, learning_rate='constant',
       learning_rate_init=0.001, max_iter=1000, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [9]:
test = np.matrix(gbbo[feats])
preds = clf.predict(test)
probs = clf.predict_proba(test)
gbbo['preds']=preds
# probability baker is a finalist
top = probs[:,0]
top3 = probs[:,0]+probs[:,1]
bot = probs[:,-1]
gbbo['bottom']=np.round(bot*100,decimals=2)
gbbo['finalist']=np.round(top*100,decimals=2) 
gbbo['top3'] = np.round(top3*100,decimals=2)
gbbo.to_csv("../RESULTS/gbbo.techinical.s10.e2.preditions.txt",sep="\t",index=False)
sub = ['baker','preds','finalist','top3','bottom']
gbbo[sub].sort_values(by=['finalist'],ascending=False)

Unnamed: 0,baker,preds,finalist,top3,bottom
1,Alice,0,92.81,98.57,0.0
15,Michael,1,37.31,90.88,1.43
23,Rosie,2,18.76,48.85,0.0
25,Steph,2,14.94,44.48,0.07
7,David,1,10.38,87.98,2.76
19,Phil,3,0.29,3.36,16.0
21,Priya,3,0.06,12.11,22.67
17,Michelle,3,0.04,12.63,22.47
5,Dan,4,0.03,0.15,99.72
11,Henry,3,0.03,1.75,1.03


In [11]:
# bottom probability is the last two ranks
gbbo[sub].sort_values(by=['bottom'],ascending=False)

Unnamed: 0,baker,preds,finalist,top3,bottom
5,Dan,4,0.03,0.15,99.72
9,Helena,4,0.02,10.11,60.48
3,Amelia,4,0.0,6.19,56.7
13,Jamie,4,0.0,0.33,54.5
21,Priya,3,0.06,12.11,22.67
17,Michelle,3,0.04,12.63,22.47
19,Phil,3,0.29,3.36,16.0
7,David,1,10.38,87.98,2.76
15,Michael,1,37.31,90.88,1.43
11,Henry,3,0.03,1.75,1.03


In [12]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.activations import relu
reg = keras.regularizers.l1_l2(l1=0.0001, l2=0.0001)
opt = keras.optimizers.Adam(lr=0.0001, beta_1=0.9, beta_2=0.999,  decay=1e-09)
nn = Sequential([
    Dense(800, input_shape=(8, ), activation='relu',kernel_regularizer=reg),
    Dropout(0.1),
    Dense(800, activation='relu',kernel_regularizer=reg),
    Dropout(0.1),
    Dense(300, activation='relu',kernel_regularizer=reg),
    Dropout(0.1),
    Dense(5, activation='softmax')
])
nn.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
nn.fit(X,y,validation_split=0., batch_size=30, epochs=10,verbose=0)

Using TensorFlow backend.


Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.


<keras.callbacks.History at 0x1a304b1320>

In [14]:
test = np.matrix(gbbo[feats])
preds = nn.predict_classes(test)
probs = nn.predict_proba(test)
gbbo['preds']=preds
# probability baker is a finalist
top = probs[:,0]
top3 = probs[:,0]+probs[:,1]
bot = probs[:,-1]
gbbo['bottom']=np.round(bot*100,decimals=2)
gbbo['finalist']=np.round(top*100,decimals=2) 
gbbo['top3'] = np.round(top3*100,decimals=2)
gbbo.to_csv("../RESULTS/gbbo.techinical.s10.e2.keras.preditions.txt",sep="\t",index=False)
sub = ['baker','preds','finalist','top3','bottom']
gbbo[sub].sort_values(by=['finalist'],ascending=False)

Unnamed: 0,baker,preds,finalist,top3,bottom
1,Alice,1,18.030001,57.470001,14.75
7,David,1,17.309999,48.639999,19.809999
23,Rosie,1,17.17,41.029999,21.91
19,Phil,4,16.09,34.639999,24.780001
11,Henry,4,15.74,34.310001,23.99
15,Michael,1,14.81,41.130001,25.35
25,Steph,4,14.32,36.439999,25.59
5,Dan,4,13.47,29.629999,35.139999
21,Priya,4,13.2,30.48,28.91
17,Michelle,4,12.83,29.91,29.17


In [15]:
gbbo[sub].sort_values(by=['bottom'],ascending=False)

Unnamed: 0,baker,preds,finalist,top3,bottom
13,Jamie,4,6.62,16.959999,53.91
3,Amelia,4,8.46,20.33,47.369999
5,Dan,4,13.47,29.629999,35.139999
9,Helena,4,10.37,26.049999,33.580002
17,Michelle,4,12.83,29.91,29.17
21,Priya,4,13.2,30.48,28.91
25,Steph,4,14.32,36.439999,25.59
15,Michael,1,14.81,41.130001,25.35
19,Phil,4,16.09,34.639999,24.780001
11,Henry,4,15.74,34.310001,23.99


In [16]:
gbbo[sub].sort_values(by=['top3'],ascending=False)

Unnamed: 0,baker,preds,finalist,top3,bottom
1,Alice,1,18.030001,57.470001,14.75
7,David,1,17.309999,48.639999,19.809999
15,Michael,1,14.81,41.130001,25.35
23,Rosie,1,17.17,41.029999,21.91
25,Steph,4,14.32,36.439999,25.59
19,Phil,4,16.09,34.639999,24.780001
11,Henry,4,15.74,34.310001,23.99
21,Priya,4,13.2,30.48,28.91
17,Michelle,4,12.83,29.91,29.17
5,Dan,4,13.47,29.629999,35.139999
