In [1]:
import csv
import glob
import os
import re
import shutil
import zipfile
from urllib.parse import quote_plus

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymysql
import sqlalchemy
import wget

In [2]:
# Database settings. Each value can be overridden through an environment
# variable so credentials do not have to live in the notebook; the defaults
# reproduce the local development database this notebook was written against.
DB_HOST = os.environ.get("F1_DB_HOST", "127.0.0.1")
DB_PORT = int(os.environ.get("F1_DB_PORT", "3306"))
DB_USER = os.environ.get("F1_DB_USER", "root")
DB_PASSWORD = os.environ.get("F1_DB_PASSWORD", "root")
DB_NAME = os.environ.get("F1_DB_NAME", "f1test")

# Raw PyMySQL connection and cursor for ad-hoc statements.
conn = pymysql.connect(
    host = DB_HOST,
    port = DB_PORT,
    user = DB_USER,
    passwd = DB_PASSWORD,
    db = DB_NAME
)

cur = conn.cursor()

# SQLAlchemy engine used by pandas. It is built from the same settings as
# `conn` so both always point at the same server; user and password are
# URL-quoted so special characters cannot break the connection URL.
e = sqlalchemy.create_engine(
    f"mysql+pymysql://{quote_plus(DB_USER)}:{quote_plus(DB_PASSWORD)}"
    f"@{DB_HOST}:{DB_PORT}/{DB_NAME}"
)


# Querying DF to start predictive stuff
#### Things to Try:
1. Logistic
2. Recode categorical variables -> SVM / decision tree / random forest (neural net? The dataset is probably not big enough)
3. Other Stuff 

Aside: We don't have enough data to do any time-series (TS) modelling.

In [13]:
# Feature query: one row per (prior race, driver) for the 2019 season, pairing
# information known after the prior race with the driver's result in the
# following race.
#
# CTEs, in order:
#   base          - 2019 race results joined to races, constructor_standings and
#                   driver_standings for the same race. `upcoming_race` is
#                   computed as race_id + 1.
#                   NOTE(review): this assumes race ids are consecutive in
#                   calendar order - confirm against the races table.
#   step1_zscore  - every 2019 lap time with the per-race mean and standard
#                   deviation of lap times attached via window functions.
#   z_score       - average lap-time z-score per race/driver key (`fk`).
#   quali         - splits the q1/q2/q3 text columns on ':' and '.' into
#                   minute, second and trailing components expressed in ms.
#                   NOTE(review): presumably the times look like 'm:ss.mmm';
#                   any non-time placeholder in these columns would be parsed
#                   as a number here - verify how missing sessions are stored.
#   quali_step_1  - sums the components into one total per session.
#   quali_avgs    - per-race mean and standard deviation of each session total.
#   quali_z       - per-race z-score of each session total, keyed by `fk`.
#
# The final select left-joins driver and constructor names, the year of the
# upcoming race, the lap-time and qualifying z-scores, and the driver's position
# in the upcoming race (`upcoming_race_result`) onto `base`.
# Note that the quali CTEs are not filtered by year; they are restricted to 2019
# only through the join on `comp_key`.
query = '''with 
base as (
	select
		re.race_id + 1 as upcoming_race, ra.year as prior_race_year, re.race_id as prior_race
		, concat(re.race_id, '-', re.driver_id) as comp_key
		, re.driver_id, re.constructor_id
		, re.points as prior_points_driver
		, re.position as prior_position_driver
		, ds.points as prior_total_points_driver
		, ds.wins as prior_total_wins_driver
		, cs.points as prior_total_points_constructor
		, cs.wins as prior_total_wins_constructor
	from results re
	  join races ra
		on re.race_id = ra.race_id
	  join constructor_standings cs
		on re.race_id = cs.race_id and re.constructor_id = cs.constructor_id
	  join driver_standings ds
		on re.race_id = ds.race_id and re.driver_id = ds.driver_id
	  where ra.year = 2019
),
step1_zscore as (
	  select 
        lt.race_id, lt.driver_id, lt.milliseconds
		, avg(lt.milliseconds) OVER (partition by lt.race_id) as ms_avg_race
		, stddev(lt.milliseconds) OVER (partition by lt.race_id) as sd_ms
		, concat(lt.race_id, '-', lt.driver_id) as fk
	  from lap_times lt
		join races ra
		  on lt.race_id = ra.race_id
		where ra.year = 2019
),
z_score as (
	  select  
		avg((milliseconds - ms_avg_race) / sd_ms) as scaled_performance, fk
	  from step1_zscore
	  group by fk
),
quali as (
	  select 
		race_id, driver_id
		, substring_index(q1, ':', 1)*60*1000 as q1_min_ms
		, substring_index(substring_index(q1, '.', 1), ':', -1) * 1000 as q1_sec_ms
		, substring_index(q1, '.', -1) as q1_ms
		, substring_index(q2, ':', 1)*60*1000 as q2_min_ms
		, substring_index(substring_index(q2, '.', 1), ':', -1) * 1000 as q2_sec_ms
		, substring_index(q2, '.', -1) as q2_ms
		, substring_index(q3, ':', 1)*60*1000 as q3_min_ms
		, substring_index(substring_index(q3, '.', 1), ':', -1) * 1000 as q3_sec_ms
		, substring_index(q3, '.', -1) as q3_ms
		, position as prior_pole_position_quali
      from qualifying
),
quali_step_1 as (
	  select
		race_id, driver_id
        , prior_pole_position_quali
		, q1_min_ms + q1_sec_ms + q1_ms as q1_ms_tot
        , q2_min_ms + q2_sec_ms + q2_ms as q2_ms_tot
        , q3_min_ms + q3_sec_ms + q3_ms as q3_ms_tot
	  from quali		
),
quali_avgs as (
	  select
		race_id, driver_id
        , prior_pole_position_quali
        , q1_ms_tot
        , q2_ms_tot
        , q3_ms_tot
        , avg(q1_ms_tot) over (partition by race_id) as q1_avg
        , avg(q2_ms_tot) over (partition by race_id) as q2_avg
        , avg(q3_ms_tot) over (partition by race_id) as q3_avg
        , stddev(q1_ms_tot) over (partition by race_id) as q1_sd
        , stddev(q2_ms_tot) over (partition by race_id) as q2_sd
        , stddev(q3_ms_tot) over (partition by race_id) as q3_sd
        from quali_step_1
),
quali_z as (
	  select
		concat(race_id, '-', driver_id) as fk
        , prior_pole_position_quali as prior_pole_position
        , race_id, driver_id
        , (q1_ms_tot - q1_avg)/q1_sd as q1_z
        , (q2_ms_tot - q2_avg)/q2_sd as q2_z
        , (q3_ms_tot - q3_avg)/q3_sd as q3_z
        from quali_avgs
)
select
	d.surname, c.name as constructor_name
    , ra.year as upcoming_race_year -- Just to double check for backtesting (cant predict the first race of the next year with the last race of the prior year)
    , b.*
    , z.scaled_performance as prior_race_scaled_performance
    , q.q1_z, q.q2_z, q.q3_z
    , q.prior_pole_position
    , re.position as upcoming_race_result
from base b
  left join driver d
    on b.driver_id = d.driver_id
  left join constructors c
    on b.constructor_id = c.constructor_id
  left join races ra
    on b.upcoming_race = ra.race_id
  left join z_score z
    on b.comp_key = z.fk
  left join quali_z q
	on b.comp_key = q.fk
  left join results re
	on b.upcoming_race = re.race_id and b.driver_id = re.driver_id
order by prior_race asc, prior_points_driver desc;'''

In [15]:
# Run the feature query through the SQLAlchemy engine and preview the first rows.
base_query = pd.read_sql_query(sql=query, con=e)
base_query.head(n=20)

Unnamed: 0,surname,constructor_name,upcoming_race_year,upcoming_race,prior_race_year,prior_race,comp_key,driver_id,constructor_id,prior_points_driver,...,prior_total_points_driver,prior_total_wins_driver,prior_total_points_constructor,prior_total_wins_constructor,prior_race_scaled_performance,q1_z,q2_z,q3_z,prior_pole_position,upcoming_race_result
0,Bottas,Mercedes,2019,1011,2019,1010,1010-822,822,131,26.0,...,26.0,1,44.0,1,-0.471972,-0.677308,0.550517,0.974494,2.0,2
1,Hamilton,Mercedes,2019,1011,2019,1010,1010-1,1,131,18.0,...,18.0,0,44.0,1,-0.387785,-1.05534,0.545485,0.97175,1.0,1
2,Verstappen,Red Bull,2019,1011,2019,1010,1010-830,830,9,15.0,...,15.0,0,15.0,0,-0.381199,-0.083424,0.564151,0.992181,4.0,4
3,Vettel,Ferrari,2019,1011,2019,1010,1010-20,20,6,12.0,...,12.0,0,22.0,0,-0.241778,-0.072923,0.570729,0.988996,3.0,5
4,Leclerc,Ferrari,2019,1011,2019,1010,1010-844,844,6,10.0,...,10.0,0,22.0,0,-0.23726,-1.085676,0.565866,0.995169,5.0,3
5,Magnussen,Haas F1 Team,2019,1011,2019,1010,1010-825,825,210,8.0,...,8.0,0,8.0,0,-0.120665,-0.499959,0.579415,1.011264,7.0,13
6,Hülkenberg,Renault,2019,1011,2019,1010,1010-807,807,4,6.0,...,6.0,0,6.0,0,-0.088067,-0.475457,0.589001,-0.999922,11.0,\N
7,Räikkönen,Alfa Romeo,2019,1011,2019,1010,1010-8,8,51,4.0,...,4.0,0,4.0,0,-0.083207,0.021585,0.583014,1.016531,9.0,7
8,Stroll,Racing Point,2019,1011,2019,1010,1010-840,840,211,2.0,...,2.0,0,2.0,0,-0.080123,0.08109,-1.731904,-0.999922,16.0,14
9,Kvyat,Toro Rosso,2019,1011,2019,1010,1010-826,826,5,1.0,...,1.0,0,1.0,0,-0.077256,-0.509293,0.594961,-0.999922,15.0,12


In [16]:
# Position assigned to a driver who did not finish the prior race
# (one place behind the 20-car field).
DNF_POSITION = 21

# Relabel the literal '\N' placeholder as "DNF" across the whole frame.
# `replace` returns a new frame, so `base_query` itself is left untouched.
base_query_1 = base_query.replace(to_replace = '\\N', value = "DNF")

# Recode DNF in the prior-race position, then coerce the column to numeric:
# without the coercion it would hold a mix of numeric strings and the
# integer 21, which is unusable as a model feature.
base_query_1["prior_position_driver"] = pd.to_numeric(
    base_query_1["prior_position_driver"].replace('DNF', DNF_POSITION)
)
base_query_1.head()


Unnamed: 0,surname,constructor_name,upcoming_race_year,upcoming_race,prior_race_year,prior_race,comp_key,driver_id,constructor_id,prior_points_driver,...,prior_total_points_driver,prior_total_wins_driver,prior_total_points_constructor,prior_total_wins_constructor,prior_race_scaled_performance,q1_z,q2_z,q3_z,prior_pole_position,upcoming_race_result
0,Bottas,Mercedes,2019,1011,2019,1010,1010-822,822,131,26.0,...,26.0,1,44.0,1,-0.471972,-0.677308,0.550517,0.974494,2.0,2
1,Hamilton,Mercedes,2019,1011,2019,1010,1010-1,1,131,18.0,...,18.0,0,44.0,1,-0.387785,-1.05534,0.545485,0.97175,1.0,1
2,Verstappen,Red Bull,2019,1011,2019,1010,1010-830,830,9,15.0,...,15.0,0,15.0,0,-0.381199,-0.083424,0.564151,0.992181,4.0,4
3,Vettel,Ferrari,2019,1011,2019,1010,1010-20,20,6,12.0,...,12.0,0,22.0,0,-0.241778,-0.072923,0.570729,0.988996,3.0,5
4,Leclerc,Ferrari,2019,1011,2019,1010,1010-844,844,6,10.0,...,10.0,0,22.0,0,-0.23726,-1.085676,0.565866,0.995169,5.0,3


In [18]:
# Fill missing values for quali data.
# The largest observed q1_z was 3.6397266253351455 (base_query_1.q1_z.max()),
# so a missing Q1 z-score is filled with a value just above it, i.e. worse
# than every driver who set a time.
q1_z_max = 4

base_query_1['q1_z'] = base_query_1['q1_z'].fillna(q1_z_max)
# A missing Q2 z-score falls back to the driver's (already filled) Q1 z-score.
base_query_1['q2_z'] = base_query_1['q2_z'].fillna(base_query_1.q1_z)
# A missing Q3 z-score falls back to the mean of Q1 and Q2. The previous
# version averaged q1_z with itself, which silently ignored Q2.
base_query_1['q3_z'] = base_query_1['q3_z'].fillna((base_query_1.q1_z + base_query_1.q2_z)/2)
base_query_1['q_z_avg'] = (base_query_1.q1_z + base_query_1.q2_z + base_query_1.q3_z)/3

# Binary target: 1 when the driver's upcoming-race result is the string '1'
# (results are compared as text), otherwise 0.
base_query_1['pred_win'] = np.where(base_query_1.upcoming_race_result == '1', 1, 0)

base_query_1.head()


Unnamed: 0,surname,constructor_name,upcoming_race_year,upcoming_race,prior_race_year,prior_race,comp_key,driver_id,constructor_id,prior_points_driver,...,prior_total_points_constructor,prior_total_wins_constructor,prior_race_scaled_performance,q1_z,q2_z,q3_z,prior_pole_position,upcoming_race_result,q_z_avg,pred_win
0,Bottas,Mercedes,2019,1011,2019,1010,1010-822,822,131,26.0,...,44.0,1,-0.471972,-0.677308,0.550517,0.974494,2.0,2,0.282568,0
1,Hamilton,Mercedes,2019,1011,2019,1010,1010-1,1,131,18.0,...,44.0,1,-0.387785,-1.05534,0.545485,0.97175,1.0,1,0.153965,1
2,Verstappen,Red Bull,2019,1011,2019,1010,1010-830,830,9,15.0,...,15.0,0,-0.381199,-0.083424,0.564151,0.992181,4.0,4,0.490969,0
3,Vettel,Ferrari,2019,1011,2019,1010,1010-20,20,6,12.0,...,22.0,0,-0.241778,-0.072923,0.570729,0.988996,3.0,5,0.495601,0
4,Leclerc,Ferrari,2019,1011,2019,1010,1010-844,844,6,10.0,...,22.0,0,-0.23726,-1.085676,0.565866,0.995169,5.0,3,0.158453,0


In [19]:
# Rows whose upcoming-race result is "DNF" are excluded from both the train
# and the test sets.
finished_mask = base_query_1["upcoming_race_result"].ne("DNF")
base_query_1 = base_query_1.loc[finished_mask]
base_query_1.head()

Unnamed: 0,surname,constructor_name,upcoming_race_year,upcoming_race,prior_race_year,prior_race,comp_key,driver_id,constructor_id,prior_points_driver,...,prior_total_points_constructor,prior_total_wins_constructor,prior_race_scaled_performance,q1_z,q2_z,q3_z,prior_pole_position,upcoming_race_result,q_z_avg,pred_win
0,Bottas,Mercedes,2019,1011,2019,1010,1010-822,822,131,26.0,...,44.0,1,-0.471972,-0.677308,0.550517,0.974494,2.0,2,0.282568,0
1,Hamilton,Mercedes,2019,1011,2019,1010,1010-1,1,131,18.0,...,44.0,1,-0.387785,-1.05534,0.545485,0.97175,1.0,1,0.153965,1
2,Verstappen,Red Bull,2019,1011,2019,1010,1010-830,830,9,15.0,...,15.0,0,-0.381199,-0.083424,0.564151,0.992181,4.0,4,0.490969,0
3,Vettel,Ferrari,2019,1011,2019,1010,1010-20,20,6,12.0,...,22.0,0,-0.241778,-0.072923,0.570729,0.988996,3.0,5,0.495601,0
4,Leclerc,Ferrari,2019,1011,2019,1010,1010-844,844,6,10.0,...,22.0,0,-0.23726,-1.085676,0.565866,0.995169,5.0,3,0.158453,0


In [20]:
# Columns removed before modelling: race/driver/constructor identifiers, the
# join key, and the raw upcoming-race result that `pred_win` was derived from.
drop_cols = [
    'upcoming_race_year',
    'upcoming_race',
    'prior_race_year',
    'prior_race',
    'comp_key',
    'driver_id',
    'constructor_id',
    'upcoming_race_result',
]

f1_2019 = base_query_1.drop(columns=drop_cols)
f1_2019.head()

Unnamed: 0,surname,constructor_name,prior_points_driver,prior_position_driver,prior_total_points_driver,prior_total_wins_driver,prior_total_points_constructor,prior_total_wins_constructor,prior_race_scaled_performance,q1_z,q2_z,q3_z,prior_pole_position,q_z_avg,pred_win
0,Bottas,Mercedes,26.0,1,26.0,1,44.0,1,-0.471972,-0.677308,0.550517,0.974494,2.0,0.282568,0
1,Hamilton,Mercedes,18.0,2,18.0,0,44.0,1,-0.387785,-1.05534,0.545485,0.97175,1.0,0.153965,1
2,Verstappen,Red Bull,15.0,3,15.0,0,15.0,0,-0.381199,-0.083424,0.564151,0.992181,4.0,0.490969,0
3,Vettel,Ferrari,12.0,4,12.0,0,22.0,0,-0.241778,-0.072923,0.570729,0.988996,3.0,0.495601,0
4,Leclerc,Ferrari,10.0,5,10.0,0,22.0,0,-0.23726,-1.085676,0.565866,0.995169,5.0,0.158453,0


In [None]:
##### Rows with no recorded upcoming-race result
# NOTE(review): the original header read "Training Set", but the variables are
# named *_test and the rows selected have no known outcome - confirm that these
# are meant to be the rows to predict.
upcoming_race_test_index = base_query_1[base_query_1['upcoming_race_result'].isnull()].index.tolist()

# The list holds index *labels*. DNF rows were filtered out of base_query_1
# earlier without resetting the index, so labels no longer match row positions;
# select with label-based .loc (positional .iloc would pick the wrong rows or
# raise IndexError).
base_query_test = base_query_1.loc[upcoming_race_test_index]
base_query_test