In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import vapeplot 
from scipy import interp
import scipy.stats
%matplotlib inline

#### Goal
--------------------

The goal of this project is to make a classifier that predicts the final rankings for bakers.
The idea is to make a model for each episode and to use data from previous episodes in the model.
Therefore, a classifier for episode 1 will likely be bad at predicting the final outcome, but a classifier for episode 5 might accurately predict who will be in the top 3 and who might be eliminated in the next episode



#### Technical Challenge Rankings
--------------------------
* tech_med : median technical challenge ranking over each episode
* tech_mean : same as `tech_med` but the mean
* tech : technical challenge ranking for that episode

In [13]:
from sklearn.preprocessing import QuantileTransformer
def quantile_scale(df,feats):
    qua = df
    scaler = QuantileTransformer(
        n_quantiles=10,
        random_state=42,
        ignore_implicit_zeros=True, #sparse matrix
    )
    # fit the scaler
    scaler.fit(qua[feats])
    # transform values
    qua[feats] = scaler.transform(qua[feats])
    return qua
def tiered(classes):
    trans = []
    for x in classes:
        if x==1: c=0
        if x==2: c=1
        if x>=3 and x<=4: c=2
        if x>=5 and x<=7: c=3
        if x>=8: c=4
        trans.append(c)
    return trans

In [14]:
df = pd.read_csv("../RESULTS/gbbo.techinical.data.s10.20190907.tsv",sep="\t")
feats = ['tech_mean','tech']
max_epi = max(df['episode'])
df = df.loc[df['episode']==max_epi]
df = quantile_scale(df,feats)
df.head()



Unnamed: 0,season,baker,index,episode,tech_mean,tech_med,tech,place
1,10,Alice,0,2,0.0,3.0,0.083333,0
3,10,Amelia,1,2,0.555556,6.5,0.75,0
5,10,Dan,12,2,0.25,4.5,0.0,13
7,10,David,2,2,0.416667,6.0,0.166667,0
9,10,Helena,3,2,1.0,12.0,1.0,0


In [15]:
# classifiers
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(hidden_layer_sizes=(100),max_iter=1000)
tech = pd.read_csv("../RESULTS/gbbo.techinical.data.20190907.tsv",sep='\t')
tech = tech.loc[tech['episode']==max_epi]
qua = quantile_scale(tech,feats)
qua['place']=tiered(qua['place'])
X, y = np.matrix(qua[feats]), np.array(qua['place'])
clf.fit(X,y)



MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=100, learning_rate='constant',
              learning_rate_init=0.001, max_iter=1000, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [18]:
test = np.matrix(df[feats])
preds = clf.predict(test)
probs = clf.predict_proba(test)
df['preds']=preds
# probability baker is a finalist
top = probs[:,0]
top3 = probs[:,0]+probs[:,1]
bot = probs[:,-1]
df['bottom']=np.round(bot*100,decimals=2)
df['finalist']=np.round(top*100,decimals=2) 
df['top3'] = np.round(top3*100,decimals=2)
df.to_csv("../RESULTS/gbbo.techinical.s10.preditions.txt",sep="\t",index=False)
sub = ['baker','preds','finalist','top3','bottom']
df[sub].sort_values(by=['finalist'],ascending=False)

Unnamed: 0,baker,preds,finalist,top3,bottom
7,David,4,19.34,43.61,32.15
1,Alice,4,18.22,23.12,37.13
5,Dan,4,17.62,22.29,64.25
19,Phil,3,15.76,44.49,14.06
23,Rosie,3,6.89,29.38,2.15
21,Priya,4,6.74,31.65,29.12
17,Michelle,4,6.29,24.31,29.82
11,Henry,3,6.15,27.98,2.37
15,Michael,1,5.74,54.18,30.3
3,Amelia,3,5.27,23.02,20.77


In [19]:
# bottom probability is the last two ranks
df[sub].sort_values(by=['bottom'],ascending=False)

Unnamed: 0,baker,preds,finalist,top3,bottom
9,Helena,4,2.34,4.73,69.94
13,Jamie,4,2.84,6.05,69.51
5,Dan,4,17.62,22.29,64.25
1,Alice,4,18.22,23.12,37.13
7,David,4,19.34,43.61,32.15
15,Michael,1,5.74,54.18,30.3
17,Michelle,4,6.29,24.31,29.82
21,Priya,4,6.74,31.65,29.12
25,Steph,2,4.54,18.97,21.8
3,Amelia,3,5.27,23.02,20.77


In [20]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.activations import relu
reg = keras.regularizers.l1_l2(l1=0.0001, l2=0.0001)
opt = keras.optimizers.Adam(lr=0.0001, beta_1=0.9, beta_2=0.999,  decay=1e-09)
nn = Sequential([
    Dense(800, input_shape=(2, ), activation='relu',kernel_regularizer=reg),
    Dropout(0.1),
    Dense(800, activation='relu',kernel_regularizer=reg),
    Dropout(0.1),
    Dense(300, activation='relu',kernel_regularizer=reg),
    Dropout(0.1),
    Dense(5, activation='softmax')
])
nn.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
nn.fit(X,y,validation_split=0., batch_size=30, epochs=10,verbose=0)

<keras.callbacks.History at 0x7f56a2090450>

In [21]:
test = np.matrix(df[feats])
preds = nn.predict(test)
probs = nn.predict_proba(test)
df['preds']=preds
# probability baker is a finalist
top = probs[:,0]
top3 = probs[:,0]+probs[:,1]
bot = probs[:,-1]
df['bottom']=np.round(bot*100,decimals=2)
df['finalist']=np.round(top*100,decimals=2) 
df['top3'] = np.round(top3*100,decimals=2)
df.to_csv("../RESULTS/gbbo.techinical.s10.keras.preditions.txt",sep="\t",index=False)
sub = ['baker','preds','finalist','top3','bottom']
df[sub].sort_values(by=['finalist'],ascending=False)

Unnamed: 0,baker,preds,finalist,top3,bottom
1,Alice,0.183971,18.4,38.029999,21.32
5,Dan,0.176299,17.629999,36.75,23.530001
7,David,0.158133,15.81,34.220001,25.82
19,Phil,0.157167,15.72,34.169998,25.219999
23,Rosie,0.155281,15.53,34.310001,23.719999
11,Henry,0.14968,14.97,33.599998,24.1
15,Michael,0.130967,13.1,30.1,30.59
21,Priya,0.123082,12.31,29.08,30.129999
17,Michelle,0.118499,11.85,28.42,30.459999
3,Amelia,0.118006,11.8,28.52,29.549999


In [22]:
preds

array([[0.18397146, 0.19634429, 0.19915363, 0.20729835, 0.21323232],
       [0.11800599, 0.16722098, 0.19295484, 0.22629493, 0.2955233 ],
       [0.17629908, 0.19122763, 0.19365919, 0.2035376 , 0.2352765 ],
       [0.15813303, 0.18403514, 0.19161406, 0.20802481, 0.25819293],
       [0.09093494, 0.14751214, 0.180244  , 0.22438861, 0.3569203 ],
       [0.14968033, 0.18630698, 0.20073783, 0.22223   , 0.24104486],
       [0.09465252, 0.14967437, 0.18015046, 0.22208019, 0.3534425 ],
       [0.13096729, 0.17000616, 0.18344513, 0.20969233, 0.3058891 ],
       [0.11849929, 0.16567315, 0.18962266, 0.22156917, 0.3046358 ],
       [0.15716724, 0.18448992, 0.19455227, 0.21154112, 0.2522495 ],
       [0.1230818 , 0.1677616 , 0.18902123, 0.21883515, 0.30130017],
       [0.15528052, 0.18779117, 0.20030771, 0.21944326, 0.23717746],
       [0.11360196, 0.16533859, 0.19325468, 0.22886805, 0.29893672]],
      dtype=float32)