In [1]:
import os
import wget
import zipfile
import shutil
import glob
import re
import csv
import pandas as pd
import numpy as np
import pymysql
import sqlalchemy
from sklearn.preprocessing import scale
import matplotlib.pyplot as plt
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm
from sklearn.ensemble import RandomForestClassifier
import smtplib


In [2]:
conn = pymysql.connect(
    host = '127.0.0.1',
    port = 3306,
    user = 'root',
    passwd = 'root',
    db = 'f1test'
)

cur = conn.cursor()

e = sqlalchemy.create_engine("mysql+pymysql://root:root@localhost/f1test")


# Querying DF to start predictive stuff
#### Things to Try:
1. Logistic
2. Recode Categorical Vars -> SVM/Tree/RF (NN? Probably not big enough)
3. Other Stuff 

Asside: We don't have enough data to do any TS stuff

In [3]:
query = '''with 
base as (
	select
		re.race_id + 1 as upcoming_race
		, ra.year as prior_race_year, re.race_id as prior_race
		, concat(re.race_id, '-', re.driver_id) as comp_key
		, re.driver_id, re.constructor_id
		, re.points as prior_points_driver
		, re.position as prior_position_driver
		, cr.points as prior_points_constructor
	from results re
	  join races ra
		on re.race_id = ra.race_id
	  join constructor_results cr
		on re.race_id = cr.race_id and re.constructor_id = cr.constructor_id
	  join driver_standings ds
		on re.race_id = ds.race_id and re.driver_id = ds.driver_id
	  where ra.year = 2019
),
step1_zscore as (
	  select 
        lt.race_id
        , lt.driver_id
        , lt.milliseconds
		, avg(lt.milliseconds) OVER (partition by lt.race_id) as ms_avg_race
		, stddev(lt.milliseconds) OVER (partition by lt.race_id) as sd_ms
		, concat(lt.race_id, '-', lt.driver_id) as fk
	  from lap_times lt
		join races ra
		  on lt.race_id = ra.race_id
		where ra.year = 2019
),
z_score as (
	  select  
		avg((milliseconds - ms_avg_race) / sd_ms) as scaled_performance, fk
	  from step1_zscore
	  group by fk
),
quali as (
	  select 
		race_id, driver_id
		, substring_index(q1, ':', 1)*60*1000 as q1_min_ms
		, substring_index(substring_index(q1, '.', 1), ':', -1) * 1000 as q1_sec_ms
		, substring_index(q1, '.', -1) as q1_ms
		, substring_index(q2, ':', 1)*60*1000 as q2_min_ms
		, substring_index(substring_index(q2, '.', 1), ':', -1) * 1000 as q2_sec_ms
		, substring_index(q2, '.', -1) as q2_ms
		, substring_index(q3, ':', 1)*60*1000 as q3_min_ms
		, substring_index(substring_index(q3, '.', 1), ':', -1) * 1000 as q3_sec_ms
		, substring_index(q3, '.', -1) as q3_ms
		, position as prior_pole_position_quali
      from qualifying
),
quali_step_1 as (
	  select
		race_id, driver_id
        , prior_pole_position_quali
		, q1_min_ms + q1_sec_ms + q1_ms as q1_ms_tot
        , q2_min_ms + q2_sec_ms + q2_ms as q2_ms_tot
        , q3_min_ms + q3_sec_ms + q3_ms as q3_ms_tot
	  from quali		
),
quali_avgs as (
	  select
		race_id, driver_id
        , prior_pole_position_quali
        , q1_ms_tot
        , q2_ms_tot
        , q3_ms_tot
        , avg(q1_ms_tot) over (partition by race_id) as q1_avg
        , avg(q2_ms_tot) over (partition by race_id) as q2_avg
        , avg(q3_ms_tot) over (partition by race_id) as q3_avg
        , stddev(q1_ms_tot) over (partition by race_id) as q1_sd
        , stddev(q2_ms_tot) over (partition by race_id) as q2_sd
        , stddev(q3_ms_tot) over (partition by race_id) as q3_sd
        from quali_step_1
),
quali_z as (
	  select
		concat(race_id, '-', driver_id) as fk
        , prior_pole_position_quali as prior_pole_position
        , race_id, driver_id
        , (q1_ms_tot - q1_avg)/q1_sd as q1_z
        , (q2_ms_tot - q2_avg)/q2_sd as q2_z
        , (q3_ms_tot - q3_avg)/q3_sd as q3_z
        from quali_avgs
)
select
	distinct d.surname, c.constructor_ref as constructor_name
    , ra.year as upcoming_race_year -- Just to double check for backtesting (cant predict the first race of the next year with the last race of the prior year)
    , b.*
    , z.scaled_performance as prior_race_scaled_performance
    , q.q1_z, q.q2_z, q.q3_z
    , q.prior_pole_position
    , re.position as upcoming_race_result
from base b
  left join driver d
    on b.driver_id = d.driver_id
  left join constructors c
    on b.constructor_id = c.constructor_id
  left join races ra
    on b.upcoming_race = ra.race_id
  left join z_score z
    on b.comp_key = z.fk
  left join quali_z q
	on b.comp_key = q.fk
  left join results re
	on b.upcoming_race = re.race_id and b.driver_id = re.driver_id
order by surname, prior_race asc;'''



In [4]:
base_query = pd.read_sql_query(query, e)
base_query.head()


Unnamed: 0,surname,constructor_name,upcoming_race_year,upcoming_race,prior_race_year,prior_race,comp_key,driver_id,constructor_id,prior_points_driver,prior_position_driver,prior_points_constructor,prior_race_scaled_performance,q1_z,q2_z,q3_z,prior_pole_position,upcoming_race_result
0,Albon,toro_rosso,2019,1011,2019,1010,1010-848,848,5,0.0,14,1.0,0.092288,-0.222269,0.591081,-0.999922,13.0,9
1,Albon,toro_rosso,2019,1012,2019,1011,1011-848,848,5,2.0,9,2.0,0.008085,-0.406297,0.586967,-0.999979,12.0,10
2,Albon,toro_rosso,2019,1013,2019,1012,1012-848,848,5,1.0,10,1.0,0.038021,,,,,11
3,Albon,toro_rosso,2019,1014,2019,1013,1013-848,848,5,0.0,11,0.0,0.043757,-0.421294,1.048179,,12.0,11
4,Albon,toro_rosso,2019,1015,2019,1014,1014-848,848,5,0.0,11,2.0,0.008905,-0.326546,0.588753,-0.999911,12.0,8


In [5]:
base_query_1 = base_query

# DNF for all \N values
base_query_1 = base_query_1.replace(to_replace = '\\N', value = "DNF")

#21 = DNF
base_query_1["prior_position_driver"] = base_query_1["prior_position_driver"].replace('DNF', 21)

base_query_1.head()


Unnamed: 0,surname,constructor_name,upcoming_race_year,upcoming_race,prior_race_year,prior_race,comp_key,driver_id,constructor_id,prior_points_driver,prior_position_driver,prior_points_constructor,prior_race_scaled_performance,q1_z,q2_z,q3_z,prior_pole_position,upcoming_race_result
0,Albon,toro_rosso,2019,1011,2019,1010,1010-848,848,5,0.0,14,1.0,0.092288,-0.222269,0.591081,-0.999922,13.0,9
1,Albon,toro_rosso,2019,1012,2019,1011,1011-848,848,5,2.0,9,2.0,0.008085,-0.406297,0.586967,-0.999979,12.0,10
2,Albon,toro_rosso,2019,1013,2019,1012,1012-848,848,5,1.0,10,1.0,0.038021,,,,,11
3,Albon,toro_rosso,2019,1014,2019,1013,1013-848,848,5,0.0,11,0.0,0.043757,-0.421294,1.048179,,12.0,11
4,Albon,toro_rosso,2019,1015,2019,1014,1014-848,848,5,0.0,11,2.0,0.008905,-0.326546,0.588753,-0.999911,12.0,8


In [6]:
base_query_1.isnull().sum()

surname                           0
constructor_name                  0
upcoming_race_year                0
upcoming_race                     0
prior_race_year                   0
prior_race                        0
comp_key                          0
driver_id                         0
constructor_id                    0
prior_points_driver               0
prior_position_driver             0
prior_points_constructor          0
prior_race_scaled_performance     0
q1_z                              4
q2_z                             12
q3_z                             24
prior_pole_position               2
upcoming_race_result             20
dtype: int64

In [25]:
# Fill missing Values for Quali data
base_query_1.q1_z.max() 

q1_z_max = 4

base_query_1['q1_z'] = base_query_1['q1_z'].fillna(q1_z_max)
base_query_1['q2_z'] = base_query_1['q2_z'].fillna(base_query_1.q1_z)
base_query_1['q3_z'] = base_query_1['q3_z'].fillna((base_query_1.q1_z + base_query_1.q1_z)/2)
base_query_1['q_z_avg'] = (base_query_1.q1_z + base_query_1.q2_z + base_query_1.q3_z)/3

# Creating bool column indicating win
base_query_1['pred_win'] = np.where(base_query_1.upcoming_race_result == '1', 1, 0)

base_query_1.head()


Unnamed: 0,surname,constructor_name,upcoming_race_year,upcoming_race,prior_race_year,prior_race,comp_key,driver_id,constructor_id,prior_points_driver,...,upcoming_race_result,q_z_avg,pred_win,rank,exp_points_driver,exp_posistion_driver,exp_points_constructor,exp_race_scaled_performance,exp_pole_position,exp_q
0,Albon,toro_rosso,2019,1011,2019,1010,1010-848,848,5,0.0,...,9,-0.21037,0,1.0,0.0,14.0,1.0,0.092288,13.0,-0.21037
1,Albon,toro_rosso,2019,1012,2019,1011,1011-848,848,5,2.0,...,10,-0.273103,0,2.0,1.176471,11.058824,1.588235,0.042757,12.411765,-0.247272
2,Albon,toro_rosso,2019,1013,2019,1012,1012-848,848,5,1.0,...,11,4.0,0,3.0,1.09589,10.575342,1.319635,0.040594,15.876712,1.692122
3,Albon,toro_rosso,2019,1014,2019,1013,1013-848,848,5,0.0,...,11,0.06853,0,4.0,0.663245,10.742992,0.798658,0.041843,14.34623,1.051146
4,Albon,toro_rosso,2019,1015,2019,1014,1014-848,848,5,0.0,...,8,-0.245901,0,5.0,0.424074,10.835671,1.23187,0.029965,13.500162,0.583421


In [26]:
# Drop DNF, will not use for train or test sets
base_query_1 = base_query_1[base_query_1.upcoming_race_result != 'DNF']
base_query_1.head()


Unnamed: 0,surname,constructor_name,upcoming_race_year,upcoming_race,prior_race_year,prior_race,comp_key,driver_id,constructor_id,prior_points_driver,...,upcoming_race_result,q_z_avg,pred_win,rank,exp_points_driver,exp_posistion_driver,exp_points_constructor,exp_race_scaled_performance,exp_pole_position,exp_q
0,Albon,toro_rosso,2019,1011,2019,1010,1010-848,848,5,0.0,...,9,-0.21037,0,1.0,0.0,14.0,1.0,0.092288,13.0,-0.21037
1,Albon,toro_rosso,2019,1012,2019,1011,1011-848,848,5,2.0,...,10,-0.273103,0,2.0,1.176471,11.058824,1.588235,0.042757,12.411765,-0.247272
2,Albon,toro_rosso,2019,1013,2019,1012,1012-848,848,5,1.0,...,11,4.0,0,3.0,1.09589,10.575342,1.319635,0.040594,15.876712,1.692122
3,Albon,toro_rosso,2019,1014,2019,1013,1013-848,848,5,0.0,...,11,0.06853,0,4.0,0.663245,10.742992,0.798658,0.041843,14.34623,1.051146
4,Albon,toro_rosso,2019,1015,2019,1014,1014-848,848,5,0.0,...,8,-0.245901,0,5.0,0.424074,10.835671,1.23187,0.029965,13.500162,0.583421


In [27]:
base_query_1.isnull().sum()

surname                           0
constructor_name                  0
upcoming_race_year                0
upcoming_race                     0
prior_race_year                   0
prior_race                        0
comp_key                          0
driver_id                         0
constructor_id                    0
prior_points_driver               0
prior_position_driver             0
prior_points_constructor          0
prior_race_scaled_performance     0
q1_z                              0
q2_z                              0
q3_z                              0
prior_pole_position               0
upcoming_race_result             20
q_z_avg                           0
pred_win                          0
rank                              0
exp_points_driver                 0
exp_posistion_driver              0
exp_points_constructor            0
exp_race_scaled_performance       0
exp_pole_position                 0
exp_q                             0
dtype: int64

In [11]:
f1_2019 = base_query_1

f1_2019.head()


Unnamed: 0,surname,constructor_name,upcoming_race_year,upcoming_race,prior_race_year,prior_race,comp_key,driver_id,constructor_id,prior_points_driver,prior_position_driver,prior_points_constructor,prior_race_scaled_performance,q1_z,q2_z,q3_z,prior_pole_position,upcoming_race_result,q_z_avg,pred_win
0,Albon,toro_rosso,2019,1011,2019,1010,1010-848,848,5,0.0,14,1.0,0.092288,-0.222269,0.591081,-0.999922,13.0,9,-0.21037,0
1,Albon,toro_rosso,2019,1012,2019,1011,1011-848,848,5,2.0,9,2.0,0.008085,-0.406297,0.586967,-0.999979,12.0,10,-0.273103,0
2,Albon,toro_rosso,2019,1013,2019,1012,1012-848,848,5,1.0,10,1.0,0.038021,4.0,4.0,4.0,20.0,11,4.0,0
3,Albon,toro_rosso,2019,1014,2019,1013,1013-848,848,5,0.0,11,0.0,0.043757,-0.421294,1.048179,-0.421294,12.0,11,0.06853,0
4,Albon,toro_rosso,2019,1015,2019,1014,1014-848,848,5,0.0,11,2.0,0.008905,-0.326546,0.588753,-0.999911,12.0,8,-0.245901,0


In [12]:
#### WONT BE NEEDING THIS, GOOD FOR FUTURE REF THOUGH

# Rank group by driver for future weighted average
f1_2019["rank"] = f1_2019.groupby("surname")["prior_race"].rank("dense", ascending=True)

f1_2019.head()


Unnamed: 0,surname,constructor_name,upcoming_race_year,upcoming_race,prior_race_year,prior_race,comp_key,driver_id,constructor_id,prior_points_driver,...,prior_points_constructor,prior_race_scaled_performance,q1_z,q2_z,q3_z,prior_pole_position,upcoming_race_result,q_z_avg,pred_win,rank
0,Albon,toro_rosso,2019,1011,2019,1010,1010-848,848,5,0.0,...,1.0,0.092288,-0.222269,0.591081,-0.999922,13.0,9,-0.21037,0,1.0
1,Albon,toro_rosso,2019,1012,2019,1011,1011-848,848,5,2.0,...,2.0,0.008085,-0.406297,0.586967,-0.999979,12.0,10,-0.273103,0,2.0
2,Albon,toro_rosso,2019,1013,2019,1012,1012-848,848,5,1.0,...,1.0,0.038021,4.0,4.0,4.0,20.0,11,4.0,0,3.0
3,Albon,toro_rosso,2019,1014,2019,1013,1013-848,848,5,0.0,...,0.0,0.043757,-0.421294,1.048179,-0.421294,12.0,11,0.06853,0,4.0
4,Albon,toro_rosso,2019,1015,2019,1014,1014-848,848,5,0.0,...,2.0,0.008905,-0.326546,0.588753,-0.999911,12.0,8,-0.245901,0,5.0


In [13]:
# Make Position Numerical
f1_2019['prior_position_driver'] = pd.to_numeric(f1_2019.prior_position_driver)
f1_2019['prior_pole_position'] = pd.to_numeric(f1_2019.prior_pole_position)


### Actually make columns

In [14]:
f1_2019.columns

Index(['surname', 'constructor_name', 'upcoming_race_year', 'upcoming_race',
       'prior_race_year', 'prior_race', 'comp_key', 'driver_id',
       'constructor_id', 'prior_points_driver', 'prior_position_driver',
       'prior_points_constructor', 'prior_race_scaled_performance', 'q1_z',
       'q2_z', 'q3_z', 'prior_pole_position', 'upcoming_race_result',
       'q_z_avg', 'pred_win', 'rank'],
      dtype='object')

In [15]:
cols_for_exp = ['prior_points_driver', 'prior_position_driver', 'prior_points_constructor' 
               , 'prior_race_scaled_performance', 'prior_pole_position', 'q_z_avg']

exp_cols = ['exp_points_driver', 'exp_posistion_driver', 'exp_points_constructor', 
         'exp_race_scaled_performance', 'exp_pole_position', 'exp_q']


# Create exp smoothed covariates
f1_2019[exp_cols] = f1_2019.groupby('surname').apply(lambda x: x[cols_for_exp].ewm(alpha = .3).mean())

# Remove exp_q since it is already scaled, dont want to rescale
exp_cols.remove('exp_q')

f1_2019.head()



Unnamed: 0,surname,constructor_name,upcoming_race_year,upcoming_race,prior_race_year,prior_race,comp_key,driver_id,constructor_id,prior_points_driver,...,upcoming_race_result,q_z_avg,pred_win,rank,exp_points_driver,exp_posistion_driver,exp_points_constructor,exp_race_scaled_performance,exp_pole_position,exp_q
0,Albon,toro_rosso,2019,1011,2019,1010,1010-848,848,5,0.0,...,9,-0.21037,0,1.0,0.0,14.0,1.0,0.092288,13.0,-0.21037
1,Albon,toro_rosso,2019,1012,2019,1011,1011-848,848,5,2.0,...,10,-0.273103,0,2.0,1.176471,11.058824,1.588235,0.042757,12.411765,-0.247272
2,Albon,toro_rosso,2019,1013,2019,1012,1012-848,848,5,1.0,...,11,4.0,0,3.0,1.09589,10.575342,1.319635,0.040594,15.876712,1.692122
3,Albon,toro_rosso,2019,1014,2019,1013,1013-848,848,5,0.0,...,11,0.06853,0,4.0,0.663245,10.742992,0.798658,0.041843,14.34623,1.051146
4,Albon,toro_rosso,2019,1015,2019,1014,1014-848,848,5,0.0,...,8,-0.245901,0,5.0,0.424074,10.835671,1.23187,0.029965,13.500162,0.583421


In [16]:
# Trim the DF
keep_cols = ['prior_race', 'surname', 'constructor_name', 'exp_points_driver', 'exp_posistion_driver',
             'exp_points_constructor', 'exp_race_scaled_performance', 'exp_pole_position', 'exp_q',
             'pred_win', 'upcoming_race_result']

f1_before_dummies = f1_2019[keep_cols].copy()

In [17]:
# Dummy the cat vars driver and constructor
f1_before_dummies['surname'] = pd.Categorical(f1_before_dummies.surname)
f1_before_dummies['constructor_name'] = pd.Categorical(f1_before_dummies.constructor_name)

f1_dummies = pd.get_dummies(f1_before_dummies, columns = ['surname','constructor_name'])
f1_dummies['surname'] = f1_before_dummies['surname']
f1_dummies.head()


Unnamed: 0,prior_race,exp_points_driver,exp_posistion_driver,exp_points_constructor,exp_race_scaled_performance,exp_pole_position,exp_q,pred_win,upcoming_race_result,surname_Albon,...,constructor_name_ferrari,constructor_name_haas,constructor_name_mclaren,constructor_name_mercedes,constructor_name_racing_point,constructor_name_red_bull,constructor_name_renault,constructor_name_toro_rosso,constructor_name_williams,surname
0,1010,0.0,14.0,1.0,0.092288,13.0,-0.21037,0,9,1,...,0,0,0,0,0,0,0,1,0,Albon
1,1011,1.176471,11.058824,1.588235,0.042757,12.411765,-0.247272,0,10,1,...,0,0,0,0,0,0,0,1,0,Albon
2,1012,1.09589,10.575342,1.319635,0.040594,15.876712,1.692122,0,11,1,...,0,0,0,0,0,0,0,1,0,Albon
3,1013,0.663245,10.742992,0.798658,0.041843,14.34623,1.051146,0,11,1,...,0,0,0,0,0,0,0,1,0,Albon
4,1014,0.424074,10.835671,1.23187,0.029965,13.500162,0.583421,0,8,1,...,0,0,0,0,0,0,0,1,0,Albon


In [19]:
max_race = max(f1_dummies.prior_race)

train = f1_dummies[f1_dummies['prior_race'] != max_race]
test = f1_dummies[f1_dummies['prior_race'] == max_race]

train.sort_values('prior_race', ascending = True).tail()
    

Unnamed: 0,prior_race,exp_points_driver,exp_posistion_driver,exp_points_constructor,exp_race_scaled_performance,exp_pole_position,exp_q,pred_win,upcoming_race_result,surname_Albon,...,constructor_name_ferrari,constructor_name_haas,constructor_name_mclaren,constructor_name_mercedes,constructor_name_racing_point,constructor_name_red_bull,constructor_name_renault,constructor_name_toro_rosso,constructor_name_williams,surname
118,1018,3.51265,11.944155,7.099655,0.113085,7.65537,0.348072,0,11,0,...,0,0,1,0,0,0,0,0,0,Norris
148,1018,1.50581,11.314268,3.382351,-0.014623,9.64585,0.228691,0,7,0,...,0,0,0,0,0,0,1,0,0,Ricciardo
128,1018,0.59319,11.523957,1.040679,0.02112,14.709818,-0.506061,0,17,0,...,0,0,0,0,1,0,0,0,0,Pérez
28,1018,4.719382,8.421108,21.172292,-0.062713,8.658058,0.583302,0,4,0,...,0,0,0,0,0,1,0,0,0,Gasly
198,1018,13.56009,3.634439,27.055814,-0.287123,5.838747,0.030782,0,16,0,...,1,0,0,0,0,0,0,0,0,Vettel


In [20]:
test.prior_race

9      1019
19     1019
29     1019
39     1019
49     1019
59     1019
69     1019
79     1019
89     1019
99     1019
109    1019
119    1019
129    1019
139    1019
149    1019
159    1019
169    1019
179    1019
189    1019
199    1019
Name: prior_race, dtype: int64

# PREDICTIVE STUFF


### Naive Bayes, Random Forest, Logistic -> Model Average


In [21]:
all_features = f1_dummies.columns.tolist()
dont_use = ['prior_race', 'pred_win', 'upcoming_race_result', 'surname']

used_features = [i for i in all_features if i not in dont_use] 

gnb = GaussianNB()

results_nb = []

gnb.fit(
    train[used_features].values,
    train["pred_win"]
)

y_pred = gnb.predict_proba(test[used_features])

results = pd.DataFrame(test['surname'])
results['pred_nb'] = y_pred[:,1].tolist()
results


Unnamed: 0,surname,pred_nb
9,Albon,0.0
19,Bottas,1.0
29,Gasly,0.0
39,Giovinazzi,0.0
49,Grosjean,0.0
59,Hamilton,1.0
69,Hülkenberg,0.0
79,Kubica,0.0
89,Kvyat,0.0
99,Leclerc,0.0


## Logistic

In [22]:
results_log = []

logreg = LogisticRegression(solver = 'lbfgs', max_iter = 1000)
logreg.fit(
    train[used_features].values,
    train["pred_win"]
)

y_pred = logreg.predict_proba(test[used_features])

results['pred_log'] = y_pred[:,1].tolist()
results



Unnamed: 0,surname,pred_nb,pred_log
9,Albon,0.0,0.000142
19,Bottas,1.0,0.166947
29,Gasly,0.0,0.01896
39,Giovinazzi,0.0,3.7e-05
49,Grosjean,0.0,1.9e-05
59,Hamilton,1.0,0.553267
69,Hülkenberg,0.0,0.000652
79,Kubica,0.0,8e-06
89,Kvyat,0.0,0.000191
99,Leclerc,0.0,0.02461


## Random Forest

In [23]:
results_rf = []

clf = RandomForestClassifier(n_estimators=100, max_leaf_nodes = 20)
clf.fit(
    train[used_features].values,
    train["pred_win"]
)

y_pred = clf.predict_proba(test[used_features])

results['pred_rf'] = y_pred[:,1].tolist()
results



Unnamed: 0,surname,pred_nb,pred_log,pred_rf
9,Albon,0.0,0.000142,0.0
19,Bottas,1.0,0.166947,0.06
29,Gasly,0.0,0.01896,0.01
39,Giovinazzi,0.0,3.7e-05,0.0
49,Grosjean,0.0,1.9e-05,0.0
59,Hamilton,1.0,0.553267,0.84
69,Hülkenberg,0.0,0.000652,0.0
79,Kubica,0.0,8e-06,0.0
89,Kvyat,0.0,0.000191,0.0
99,Leclerc,0.0,0.02461,0.0


In [24]:
results['with_bayes'] = (results.pred_nb + results.pred_log + results.pred_rf ) / sum(results.pred_nb + results.pred_log + results.pred_rf)
results['no_bayes'] = (results.pred_log + results.pred_rf) / sum(results.pred_log + results.pred_rf)


probs_bayes = results.with_bayes.tolist()
probs_no = results.no_bayes.tolist()

money_line_bayes = []
money_line_no = []

for i in range(0, len(probs_bayes)):
    if probs_bayes[i] >= .5:
        money_line_bayes.append((probs_bayes[i]/(1 - probs_bayes[i]))*-100)
    else:
        money_line_bayes.append((1- probs_bayes[i])/(probs_bayes[i])*100)
        
for i in range(0, len(probs_no)):
    if probs_no[i] >= .5:
        money_line_no.append((probs_no[i]/(1 - probs_no[i]))*-100)
    else:
        money_line_no.append((1- probs_no[i])/(probs_no[i])*100)
        
results['money_line_bayes'] = money_line_bayes
results['money_line_no'] = money_line_no

results['money_line_bayes'] = results['money_line_bayes']
results['money_line_no'] = results['money_line_no']

results['money_line_bayes'] = results['money_line_bayes'].astype(float)
results['money_line_no'] = results['money_line_no'].astype(float)

results['money_line_bayes'] = results['money_line_bayes']
results['money_line_no'] = results['money_line_no']

results['avg_money'] = (results['money_line_bayes'] + results['money_line_no'])/2

results['money_line_bayes'] = results['money_line_bayes'].map(lambda x: '{:.0f}'.format(x))
results['money_line_no'] = results['money_line_no'].map(lambda x: '{:.0f}'.format(x))
results['avg_money'] = results['avg_money'].map(lambda x: '{:.0f}'.format(x))

results

Unnamed: 0,surname,pred_nb,pred_log,pred_rf,with_bayes,no_bayes,money_line_bayes,money_line_no,avg_money
9,Albon,0.0,0.000142,0.0,2.9e-05,7.6e-05,3424108,1311998,2368053
19,Bottas,1.0,0.166947,0.06,0.252267,0.121773,296,721,509
29,Gasly,0.0,0.01896,0.01,0.005954,0.015539,16695,6335,11515
39,Giovinazzi,0.0,3.7e-05,0.0,8e-06,2e-05,13215673,5063957,9139815
49,Grosjean,0.0,1.9e-05,0.0,4e-06,1e-05,25441672,9748750,17595211
59,Hamilton,1.0,0.553267,0.84,0.492069,0.74759,103,-296,-96
69,Hülkenberg,0.0,0.000652,0.0,0.000134,0.00035,746093,285828,515961
79,Kubica,0.0,8e-06,0.0,2e-06,4e-06,59633671,22850536,41242104
89,Kvyat,0.0,0.000191,0.0,3.9e-05,0.000103,2540424,973385,1756905
99,Leclerc,0.0,0.02461,0.0,0.00506,0.013205,19663,7473,13568
