In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import vapeplot 
from scipy import interp
import scipy.stats
import warnings
warnings.simplefilter("ignore")
%matplotlib inline

#### Goal
--------------------

The goal of this project is to build a classifier that predicts each baker's final ranking.
The idea is to train a separate model for each episode, using that episode's results together with data from the previous episodes.
Therefore, a classifier for episode 1 will likely be bad at predicting the final outcome, but a classifier for episode 5 might accurately predict who will be in the top 3 and who might be eliminated in the next episode.



In [2]:
from datetime import datetime
def timestamp():
    """Return today's local date formatted as a compact ``YYYYMMDD`` string."""
    today = datetime.today()
    return format(today, '%Y%m%d')

from sklearn.preprocessing import QuantileTransformer
def quantile_scale(df,feats):
    """Quantile-transform the ``feats`` columns of ``df`` and return the result.

    A ``QuantileTransformer`` (10 quantiles, fixed ``random_state``) is fit on
    ``df[feats]`` and then applied to those same columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the feature columns. It is left unmodified.
    feats : list of str
        Names of the columns to scale.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` whose ``feats`` columns hold the transformed values;
        all other columns are carried over unchanged.
    """
    # Work on a copy: callers pass `.loc[...]` slices, and assigning columns
    # through an alias of the argument would silently rewrite the caller's
    # frame (and trigger pandas' SettingWithCopyWarning).
    qua = df.copy()
    scaler = QuantileTransformer(
        n_quantiles=10,
        random_state=42,
        ignore_implicit_zeros=True, # only relevant for sparse-matrix input
    )
    # fit the scaler on the feature columns, then replace them with the
    # transformed values
    scaler.fit(qua[feats])
    qua[feats] = scaler.transform(qua[feats])
    return qua
def tiered(classes):
    """Collapse final placements into five ordinal tiers.

    Mapping (place -> tier)::

        1     -> 0
        2     -> 1
        3 - 4 -> 2
        5 - 7 -> 3
        8+    -> 4

    Parameters
    ----------
    classes : iterable of numbers
        Final placements, one per row.

    Returns
    -------
    list of int
        The tier for each placement, in input order.

    Raises
    ------
    ValueError
        If a placement matches none of the tiers (for example 0, a negative
        number, NaN, or a fractional value that falls between two tiers).
    """
    trans = []
    for x in classes:
        # A single elif chain guarantees every row gets its own tier; with
        # independent `if`s an unmatched value would silently reuse the tier
        # computed for the previous row.
        if x == 1:
            c = 0
        elif x == 2:
            c = 1
        elif 3 <= x <= 4:
            c = 2
        elif 5 <= x <= 7:
            c = 3
        elif x >= 8:
            c = 4
        else:
            raise ValueError("cannot assign a tier to place {!r}".format(x))
        trans.append(c)
    return trans

In [8]:
# Join the season-10 technical-challenge table with the star-baker table and
# save the combined per-episode feature table.
merge_col = ['season','baker','index','episode','place']
tech = pd.read_csv("../RESULTS/gbbo.techinical.data.s10.20191104.tsv",sep="\t")
star = pd.read_csv("../RESULTS/gbbo.starbaker.data.s10.final.tsv",sep="\t")
# Both frames use the same key column names, so a single `on=` expresses the join.
gbbo = pd.merge(tech, star, how='left', on=merge_col)
gbbo = gbbo[['season','baker','episode','place','tech_mean','tech','mean_star','star','mean_good','good','mean_bad','bad']]
# The output name is fixed (no date stamp): this exact path is read back later
# in the notebook, and the string has no `{}` placeholder to fill.
gbbo.to_csv("../RESULTS/gbbo.features.s10.final.tsv",sep="\t",index=False)
gbbo.head()

Unnamed: 0,season,baker,episode,place,tech_mean,tech,mean_star,star,mean_good,good,mean_bad,bad
0,10,Alice,1,0,5.0,5,0.0,0,0.0,0,0.0,0
1,10,Alice,2,0,3.0,1,0.5,1,0.5,1,0.0,0
2,10,Alice,3,0,4.33,7,0.33,0,0.33,0,0.33,1
3,10,Alice,4,0,5.25,8,0.25,0,0.25,0,0.25,0
4,10,Alice,5,0,5.4,6,0.2,0,0.2,0,0.2,0


In [8]:
# Reload the saved season-10 feature table, keep only the rows of its highest
# episode number, and quantile-scale the feature columns.
feats = ['tech_mean','tech','mean_star','star','mean_good','good','mean_bad','bad']
gbbo = pd.read_csv("../RESULTS/gbbo.features.s10.final.tsv", sep="\t")
max_epi = max(gbbo['episode'])
gbbo = quantile_scale(gbbo[gbbo['episode'] == max_epi], feats)
gbbo.head()

Unnamed: 0,season,baker,episode,place,tech_mean,tech,mean_star,star,mean_good,good,mean_bad,bad
9,10,Alice,10,0,0.8974359,0.9166667,0.9047619,1e-07,0.8148148,1e-07,0.333333,1e-07
19,10,Amelia,10,10,0.1111111,1e-07,1e-07,1e-07,1e-07,1e-07,0.333333,1e-07
29,10,Dan,10,12,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,0.333333,1e-07
39,10,David,10,0,0.5185185,0.8333333,1e-07,1e-07,0.9090909,1e-07,0.333333,1e-07
49,10,Helena,10,8,0.8666667,1e-07,1e-07,1e-07,1e-07,1e-07,0.333333,1e-07


In [9]:
# Build the training set for episode `max_epi`: scaled feature matrix X and
# tiered placement labels y.
tech = pd.read_csv("../RESULTS/gbbo.features.20190909.tsv",sep='\t')
# .copy() so the scaling and the label assignment below act on an independent
# frame rather than on a slice of the table that was just read.
tech = tech.loc[tech['episode']==max_epi].copy()
qua = quantile_scale(tech,feats)
qua['place']=tiered(qua['place'])
# Plain ndarrays instead of np.matrix, which NumPy no longer recommends.
X, y = np.asarray(qua[feats]), np.array(qua['place'])

In [9]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.activations import relu

def create_model( nl1=1, nl2=1,  nl3=1, 
                 nn1=1000, nn2=500, nn3 = 200, lr=0.01, decay=0., l1=0.01, l2=0.01,
                act = 'relu', dropout=0,input_shape=None,output_shape=None):    
    '''This is a model generating function so that we can search over neural net 
    parameters and architecture
    https://www.kaggle.com/arrogantlymodest/randomised-cv-search-over-keras-neural-network

    The network is a Sequential stack of three consecutive groups of Dense
    layers followed by a softmax output layer, compiled with the Adam
    optimizer, sparse categorical cross-entropy loss and an accuracy metric.

    Parameters
    ----------
    nl1, nl2, nl3 : int
        Number of Dense layers in the first, second and third group.
    nn1, nn2, nn3 : int
        Units per Dense layer in the first, second and third group.
    lr, decay : float
        Passed to the Adam optimizer.
    l1, l2 : float
        Strengths of the l1_l2 kernel regularizer applied to every hidden
        Dense layer.
    act : str
        Activation of the hidden Dense layers.
    dropout : float
        Rate of the Dropout layer added after each hidden Dense layer;
        0 adds no Dropout layers.
    input_shape : int, optional
        Passed as ``input_dim`` of the first hidden layer. When omitted, the
        module-level ``input_shape`` variable is looked up at call time.
    output_shape : int, optional
        Number of units of the softmax output layer. When omitted, the
        module-level ``output_shape`` variable is looked up at call time.

    Returns
    -------
    The compiled Sequential model.

    Raises
    ------
    ValueError
        If a size is neither passed in nor available as a module-level
        variable.
    '''
    # Default values are evaluated when the `def` statement runs, so naming the
    # module-level variables directly in the signature fails on a fresh kernel
    # where they are assigned only after this definition. Resolve them here.
    if input_shape is None:
        input_shape = globals().get('input_shape')
    if output_shape is None:
        output_shape = globals().get('output_shape')
    if input_shape is None or output_shape is None:
        raise ValueError("create_model needs input_shape and output_shape")

    opt = keras.optimizers.Adam(lr=lr, beta_1=0.9, beta_2=0.999,  decay=decay)
    reg = keras.regularizers.l1_l2(l1=l1, l2=l2)
    model = Sequential()
    first=True
    # One pass per layer group; only the very first Dense layer of the whole
    # stack is given the input dimension.
    for n_layers, n_units in ((nl1, nn1), (nl2, nn2), (nl3, nn3)):
        for _ in range(n_layers):
            if first:
                model.add(Dense(n_units, input_dim=input_shape, activation=act, kernel_regularizer=reg))
                first=False
            else:
                model.add(Dense(n_units, activation=act, kernel_regularizer=reg))
            if dropout!=0:
                model.add(Dropout(dropout))
    model.add(Dense(output_shape, activation='softmax'))
    model.compile(loss='sparse_categorical_crossentropy', optimizer=opt, metrics=['accuracy'],)
    return model
##################################
# Feature columns, the training table with tiered placements, and the
# resulting network input / output sizes.
feats = ['tech_mean','tech','mean_star','star','mean_good','good','mean_bad','bad']
tech = pd.read_csv("../RESULTS/gbbo.features.20190909.tsv", sep='\t')
tech['place'] = tiered(tech['place'])
input_shape = n_dims = len(feats)
output_shape = n_classes = len(set(tech['place']))

# Regularisation strengths and optimiser settings.
l1, l2 = 0.0001, 0.0001
lr, decay = 0.0001, 1e-09

# Hidden layers: layer count (nl*) and units per layer (nn*) for each of the
# three layer groups, plus activation and dropout rate.
nl1, nl2, nl3 = 1, 1, 1
nn1, nn2, nn3 = 800, 800, 300
act, dropout = 'relu', 0.1

# Training schedule.
BATCH, EPOCHS = 12, 25
####
# For every episode: train a network on the historical table for that episode
# and score the season-10 bakers, writing one prediction file per episode.
GBBO = pd.read_csv("../RESULTS/gbbo.features.s10.final.tsv",sep="\t")
# Read the training table once instead of once per episode.
TECH = pd.read_csv("../RESULTS/gbbo.features.20190909.tsv",sep='\t')
# Columns of the summary view; defined up front because it does not depend on
# the episode.
sub = ['baker','preds','finalist','top3','bottom','fifthseventh','thirdforth']

# sorted(): set iteration order is not guaranteed, and `gbbo` from the final
# iteration is what remains available after the loop.
for e in sorted(set(GBBO['episode'])):
    # .copy() so scaling and the new prediction columns act on independent
    # frames rather than on slices of GBBO / TECH.
    # NOTE(review): the season-10 rows are scaled with a transformer fit on
    # themselves, not with the one fit on the training rows - confirm intended.
    gbbo = GBBO.loc[GBBO['episode']==e].copy()
    gbbo = quantile_scale(gbbo,feats)
    # Plain ndarrays instead of np.matrix, which NumPy no longer recommends.
    test = np.asarray(gbbo[feats])

    tech = TECH.loc[TECH['episode']==e].copy()
    qua = quantile_scale(tech,feats)
    qua['place']=tiered(qua['place'])
    
    X, y = np.asarray(qua[feats]), np.array(qua['place'])
    
    nn = create_model( nl1=nl1, nl2=nl2,  nl3=nl3, 
                     nn1=nn1, nn2=nn2, nn3 = nn3, 
                     lr=lr, decay=decay, l1=l1, l2=l2,
                     act = act, dropout=dropout,
                     input_shape=n_dims,
                     output_shape=n_classes)
    
    nn.fit(X,y,validation_split=0., batch_size=BATCH, epochs=EPOCHS,verbose=0)

    # One forward pass: the softmax outputs are the class probabilities and
    # the predicted class is their argmax (this is what the deprecated
    # predict_proba / predict_classes helpers returned for a multi-class
    # softmax output).
    probs = nn.predict(test)
    preds = probs.argmax(axis=-1)
    gbbo['preds']=preds
    # tier 0 (place 1 under tiered())
    top = probs[:,0]
    # tiers 0 and 1 combined (places 1 and 2 under tiered())
    top3 = probs[:,0]+probs[:,1]
    # last tier (8th and below)
    bot = probs[:,-1]
    # second-to-last tier (5th - 7th)
    nextbot = probs[:,-2]
    # third-to-last tier (3rd - 4th)
    third = probs[:,-3]

    # Store the probabilities as percentages rounded to two decimals.
    gbbo['bottom']=np.round(bot*100,decimals=2)
    gbbo['finalist']=np.round(top*100,decimals=2) 
    gbbo['top3'] = np.round(top3*100,decimals=2)
    gbbo['fifthseventh'] = np.round(nextbot*100,decimals=2)
    gbbo['thirdforth'] = np.round(third*100,decimals=2)

    gbbo.to_csv("../RESULTS/gbbo.techinical.s10.week{}.final2.keras.preditions.txt".format(e),sep="\t",index=False)
    print(e)

1
2
3
4
5
6
7
8
9
10


In [6]:
# Keep only the summary columns of the frame left by the prediction loop and
# list the bakers by their 'finalist' percentage, highest first.
gbbo = gbbo.loc[:, sub].sort_values('finalist', ascending=False)
print(gbbo)

        baker  preds  finalist       top3     bottom  fifthseventh  thirdforth
39      David      1     34.73  86.440002   3.400000      4.880000    5.280000
9       Alice      1     25.65  97.830002   0.270000      0.870000    1.030000
129     Steph      1     24.16  98.550003   0.220000      0.610000    0.630000
89   Michelle      2      8.64  16.940001  11.310000     33.340000   38.410000
29        Dan      4      2.09   4.150000  69.760002     18.910000    7.170000
19     Amelia      4      1.84   3.650000  68.760002     20.230000    7.360000
59      Henry      2      1.75   3.140000  20.250000     36.880001   39.730000
99       Phil      4      1.62   3.210000  60.009998     27.270000    9.500000
49     Helena      3      1.14   2.400000  33.660000     47.930000   16.010000
79    Michael      2      0.94   1.860000   5.060000     40.279999   52.799999
69      Jamie      4      0.48   0.940000  79.230003     15.760000    4.070000
119     Rosie      3      0.33   0.610000  35.540001