In [1]:
import os
import wget
import zipfile
import shutil
import glob
import re
import csv
import pandas as pd
import numpy as np
import pymysql
import sqlalchemy
import matplotlib.pyplot as plt

In [2]:
conn = pymysql.connect(
    host = '127.0.0.1',
    port = 3306,
    user = 'root',
    passwd = 'root',
    db = 'f1test'
)

cur = conn.cursor()

e = sqlalchemy.create_engine("mysql+pymysql://root:root@localhost/f1test")


# Querying DF to start predictive stuff
#### Things to Try:
1. Logistic
2. Recode Categorical Vars -> SVM/Tree/RF (NN? Probably not big enough)
3. Other Stuff 

Asside: We don't have enough data to do any TS stuff

In [15]:
query = '''with 
base as (
	select
		re.race_id + 1 as upcoming_race, ra.year as prior_race_year, re.race_id as prior_race
		, concat(re.race_id, '-', re.driver_id) as comp_key
		, re.driver_id, re.constructor_id
		, re.points as prior_points_driver
		, re.position as prior_position_driver
		, ds.points as prior_total_points_driver
		, ds.wins as prior_total_wins_driver
		, cs.points as prior_total_points_constructor
		, cs.wins as prior_total_wins_constructor
	from results re
	  join races ra
		on re.race_id = ra.race_id
	  join constructor_standings cs
		on re.race_id = cs.race_id and re.constructor_id = cs.constructor_id
	  join driver_standings ds
		on re.race_id = ds.race_id and re.driver_id = ds.driver_id
	  where ra.year = 2018
),
step1_zscore as (
	  select 
        lt.race_id, lt.driver_id, lt.milliseconds
		, avg(lt.milliseconds) OVER (partition by lt.race_id) as ms_avg_race
		, stddev(lt.milliseconds) OVER (partition by lt.race_id) as sd_ms
		, concat(lt.race_id, '-', lt.driver_id) as fk
	  from lap_times lt
		join races ra
		  on lt.race_id = ra.race_id
		where ra.year = 2018
),
z_score as (
	  select  
		avg((milliseconds - ms_avg_race) / sd_ms) as scaled_performance, fk
	  from step1_zscore
	  group by fk
),
quali as (
	  select 
		race_id, driver_id
		, substring_index(q1, ':', 1)*60*1000 as q1_min_ms
		, substring_index(substring_index(q1, '.', 1), ':', -1) * 1000 as q1_sec_ms
		, substring_index(q1, '.', -1) as q1_ms
		, substring_index(q2, ':', 1)*60*1000 as q2_min_ms
		, substring_index(substring_index(q2, '.', 1), ':', -1) * 1000 as q2_sec_ms
		, substring_index(q2, '.', -1) as q2_ms
		, substring_index(q3, ':', 1)*60*1000 as q3_min_ms
		, substring_index(substring_index(q3, '.', 1), ':', -1) * 1000 as q3_sec_ms
		, substring_index(q3, '.', -1) as q3_ms
		, position as prior_pole_position_quali
      from qualifying
),
quali_step_1 as (
	  select
		race_id, driver_id
        , prior_pole_position_quali
		, q1_min_ms + q1_sec_ms + q1_ms as q1_ms_tot
        , q2_min_ms + q2_sec_ms + q2_ms as q2_ms_tot
        , q3_min_ms + q3_sec_ms + q3_ms as q3_ms_tot
	  from quali		
),
quali_avgs as (
	  select
		race_id, driver_id
        , prior_pole_position_quali
        , q1_ms_tot
        , q2_ms_tot
        , q3_ms_tot
        , avg(q1_ms_tot) over (partition by race_id) as q1_avg
        , avg(q2_ms_tot) over (partition by race_id) as q2_avg
        , avg(q3_ms_tot) over (partition by race_id) as q3_avg
        , stddev(q1_ms_tot) over (partition by race_id) as q1_sd
        , stddev(q2_ms_tot) over (partition by race_id) as q2_sd
        , stddev(q3_ms_tot) over (partition by race_id) as q3_sd
        from quali_step_1
),
quali_z as (
	  select
		concat(race_id, '-', driver_id) as fk
        , prior_pole_position_quali as prior_pole_position
        , race_id, driver_id
        , (q1_ms_tot - q1_avg)/q1_sd as q1_z
        , (q2_ms_tot - q2_avg)/q2_sd as q2_z
        , (q3_ms_tot - q3_avg)/q3_sd as q3_z
        from quali_avgs
)
select
	d.surname, c.constructor_ref as constructor_name
    , ra.year as upcoming_race_year -- Just to double check for backtesting (cant predict the first race of the next year with the last race of the prior year)
    , b.*
    , z.scaled_performance as prior_race_scaled_performance
    , q.q1_z, q.q2_z, q.q3_z
    , q.prior_pole_position
    , re.position as upcoming_race_result
from base b
  left join driver d
    on b.driver_id = d.driver_id
  left join constructors c
    on b.constructor_id = c.constructor_id
  left join races ra
    on b.upcoming_race = ra.race_id
  left join z_score z
    on b.comp_key = z.fk
  left join quali_z q
	on b.comp_key = q.fk
  left join results re
	on b.upcoming_race = re.race_id and b.driver_id = re.driver_id
order by prior_race asc, prior_points_driver desc;'''



In [16]:
base_query = pd.read_sql_query(query, e)
base_query.head()

Unnamed: 0,surname,constructor_name,upcoming_race_year,upcoming_race,prior_race_year,prior_race,comp_key,driver_id,constructor_id,prior_points_driver,...,prior_total_points_driver,prior_total_wins_driver,prior_total_points_constructor,prior_total_wins_constructor,prior_race_scaled_performance,q1_z,q2_z,q3_z,prior_pole_position,upcoming_race_result
0,Vettel,ferrari,2018,990,2018,989,989-20,20,6,25.0,...,25.0,1,40.0,1,-0.057987,-1.016528,-1.493727,-0.773568,3,1
1,Hamilton,mercedes,2018,990,2018,989,989-1,1,131,18.0,...,18.0,0,22.0,0,-0.049803,-1.849851,-1.365405,-1.561687,1,3
2,Räikkönen,ferrari,2018,990,2018,989,989-8,8,6,15.0,...,15.0,0,40.0,1,-0.047735,-1.417286,-0.81854,-0.785261,2,\N
3,Ricciardo,red_bull,2018,990,2018,989,989-817,817,9,12.0,...,12.0,0,20.0,0,-0.0465,-0.784342,-0.350826,-0.406402,5,\N
4,Alonso,mclaren,2018,990,2018,989,989-4,4,1,10.0,...,10.0,0,12.0,0,-0.012672,-0.62054,0.602592,,11,7


In [17]:
base_query_1 = base_query

# DNF for all \N values
base_query_1 = base_query_1.replace(to_replace = '\\N', value = "DNF")
base_query_1["prior_position_driver"] = base_query_1["prior_position_driver"].replace('DNF', 21) #21 = DNF

base_query_1.head()


Unnamed: 0,surname,constructor_name,upcoming_race_year,upcoming_race,prior_race_year,prior_race,comp_key,driver_id,constructor_id,prior_points_driver,...,prior_total_points_driver,prior_total_wins_driver,prior_total_points_constructor,prior_total_wins_constructor,prior_race_scaled_performance,q1_z,q2_z,q3_z,prior_pole_position,upcoming_race_result
0,Vettel,ferrari,2018,990,2018,989,989-20,20,6,25.0,...,25.0,1,40.0,1,-0.057987,-1.016528,-1.493727,-0.773568,3,1
1,Hamilton,mercedes,2018,990,2018,989,989-1,1,131,18.0,...,18.0,0,22.0,0,-0.049803,-1.849851,-1.365405,-1.561687,1,3
2,Räikkönen,ferrari,2018,990,2018,989,989-8,8,6,15.0,...,15.0,0,40.0,1,-0.047735,-1.417286,-0.81854,-0.785261,2,DNF
3,Ricciardo,red_bull,2018,990,2018,989,989-817,817,9,12.0,...,12.0,0,20.0,0,-0.0465,-0.784342,-0.350826,-0.406402,5,DNF
4,Alonso,mclaren,2018,990,2018,989,989-4,4,1,10.0,...,10.0,0,12.0,0,-0.012672,-0.62054,0.602592,,11,7


In [18]:
# Fill missing Values for Quali data
base_query_1.q1_z.max() # 2.6718442273878935

q1_z_max = 3

base_query_1['q1_z'] = base_query_1['q1_z'].fillna(q1_z_max)
base_query_1['q2_z'] = base_query_1['q2_z'].fillna(base_query_1.q1_z)
base_query_1['q3_z'] = base_query_1['q3_z'].fillna((base_query_1.q1_z + base_query_1.q1_z)/2)
base_query_1['q_z_avg'] = (base_query_1.q1_z + base_query_1.q2_z + base_query_1.q3_z)/3

# Creating bool column indicating win
base_query_1['pred_win'] = np.where(base_query_1.upcoming_race_result == '1', 1, 0)

base_query_1.head()


Unnamed: 0,surname,constructor_name,upcoming_race_year,upcoming_race,prior_race_year,prior_race,comp_key,driver_id,constructor_id,prior_points_driver,...,prior_total_points_constructor,prior_total_wins_constructor,prior_race_scaled_performance,q1_z,q2_z,q3_z,prior_pole_position,upcoming_race_result,q_z_avg,pred_win
0,Vettel,ferrari,2018,990,2018,989,989-20,20,6,25.0,...,40.0,1,-0.057987,-1.016528,-1.493727,-0.773568,3,1,-1.094608,1
1,Hamilton,mercedes,2018,990,2018,989,989-1,1,131,18.0,...,22.0,0,-0.049803,-1.849851,-1.365405,-1.561687,1,3,-1.592315,0
2,Räikkönen,ferrari,2018,990,2018,989,989-8,8,6,15.0,...,40.0,1,-0.047735,-1.417286,-0.81854,-0.785261,2,DNF,-1.007029,0
3,Ricciardo,red_bull,2018,990,2018,989,989-817,817,9,12.0,...,20.0,0,-0.0465,-0.784342,-0.350826,-0.406402,5,DNF,-0.513857,0
4,Alonso,mclaren,2018,990,2018,989,989-4,4,1,10.0,...,12.0,0,-0.012672,-0.62054,0.602592,-0.62054,11,7,-0.212829,0


In [19]:
# Drop DNF, will not use for train or test sets
base_query_1 = base_query_1[base_query_1.upcoming_race_result != 'DNF']
base_query_1.head()

Unnamed: 0,surname,constructor_name,upcoming_race_year,upcoming_race,prior_race_year,prior_race,comp_key,driver_id,constructor_id,prior_points_driver,...,prior_total_points_constructor,prior_total_wins_constructor,prior_race_scaled_performance,q1_z,q2_z,q3_z,prior_pole_position,upcoming_race_result,q_z_avg,pred_win
0,Vettel,ferrari,2018,990,2018,989,989-20,20,6,25.0,...,40.0,1,-0.057987,-1.016528,-1.493727,-0.773568,3,1,-1.094608,1
1,Hamilton,mercedes,2018,990,2018,989,989-1,1,131,18.0,...,22.0,0,-0.049803,-1.849851,-1.365405,-1.561687,1,3,-1.592315,0
4,Alonso,mclaren,2018,990,2018,989,989-4,4,1,10.0,...,12.0,0,-0.012672,-0.62054,0.602592,-0.62054,11,7,-0.212829,0
6,Hülkenberg,renault,2018,990,2018,989,989-807,807,4,6.0,...,7.0,0,-0.004897,-0.326332,0.4251,1.207254,8,6,0.435341,0
7,Bottas,mercedes,2018,990,2018,989,989-822,822,131,4.0,...,22.0,0,-0.002186,-0.479002,-1.319833,-0.479002,10,2,-0.759279,0


In [20]:
f1_2018 = base_query_1

f1_2018.prior_race_scaled_performance.corr(f1_2018.q_z_avg) # no major correlation

# fill missing scaled performance with 0 (average)
f1_2018['prior_race_scaled_performance'] = f1_2018['prior_race_scaled_performance'].fillna(0)

f1_2018.head()


Unnamed: 0,surname,constructor_name,upcoming_race_year,upcoming_race,prior_race_year,prior_race,comp_key,driver_id,constructor_id,prior_points_driver,...,prior_total_points_constructor,prior_total_wins_constructor,prior_race_scaled_performance,q1_z,q2_z,q3_z,prior_pole_position,upcoming_race_result,q_z_avg,pred_win
0,Vettel,ferrari,2018,990,2018,989,989-20,20,6,25.0,...,40.0,1,-0.057987,-1.016528,-1.493727,-0.773568,3,1,-1.094608,1
1,Hamilton,mercedes,2018,990,2018,989,989-1,1,131,18.0,...,22.0,0,-0.049803,-1.849851,-1.365405,-1.561687,1,3,-1.592315,0
4,Alonso,mclaren,2018,990,2018,989,989-4,4,1,10.0,...,12.0,0,-0.012672,-0.62054,0.602592,-0.62054,11,7,-0.212829,0
6,Hülkenberg,renault,2018,990,2018,989,989-807,807,4,6.0,...,7.0,0,-0.004897,-0.326332,0.4251,1.207254,8,6,0.435341,0
7,Bottas,mercedes,2018,990,2018,989,989-822,822,131,4.0,...,22.0,0,-0.002186,-0.479002,-1.319833,-0.479002,10,2,-0.759279,0


In [21]:
# Rank group by driver for future weighted average
f1_2018["rank"] = f1_2018.groupby("surname")["prior_race"].rank("dense", ascending=True)
f1_2018.head()


Unnamed: 0,surname,constructor_name,upcoming_race_year,upcoming_race,prior_race_year,prior_race,comp_key,driver_id,constructor_id,prior_points_driver,...,prior_total_wins_constructor,prior_race_scaled_performance,q1_z,q2_z,q3_z,prior_pole_position,upcoming_race_result,q_z_avg,pred_win,rank
0,Vettel,ferrari,2018,990,2018,989,989-20,20,6,25.0,...,1,-0.057987,-1.016528,-1.493727,-0.773568,3,1,-1.094608,1,1.0
1,Hamilton,mercedes,2018,990,2018,989,989-1,1,131,18.0,...,0,-0.049803,-1.849851,-1.365405,-1.561687,1,3,-1.592315,0,1.0
4,Alonso,mclaren,2018,990,2018,989,989-4,4,1,10.0,...,0,-0.012672,-0.62054,0.602592,-0.62054,11,7,-0.212829,0,1.0
6,Hülkenberg,renault,2018,990,2018,989,989-807,807,4,6.0,...,0,-0.004897,-0.326332,0.4251,1.207254,8,6,0.435341,0,1.0
7,Bottas,mercedes,2018,990,2018,989,989-822,822,131,4.0,...,0,-0.002186,-0.479002,-1.319833,-0.479002,10,2,-0.759279,0,1.0


In [29]:
test_1 = f1_2018[['surname', 'prior_race', 'prior_points_driver', 'rank']]
test_1.head()

Unnamed: 0,surname,prior_race,prior_points_driver,rank
0,Vettel,989,25.0,1.0
1,Hamilton,989,18.0,1.0
4,Alonso,989,10.0,1.0
6,Hülkenberg,989,6.0,1.0
7,Bottas,989,4.0,1.0


In [32]:
test = test_1.copy() # prevents setting with copy warning. Should probably use iloc but whatever
test
test['harmonic_base'] = test['rank']
test['geometric_base'] = test['rank'] ** 2

test['harmonic_divis'] = test['harmonic_base'].cumsum(axis = 0)
test['geometric_divis'] = test['geometric_base'].cumsum(axis = 0)

test['harmonic_w'] = test['prior_points_driver'] * test['harmonic_base']
test['geometric_w'] = test['prior_points_driver'] * test['geometric_base']

test['harmonic_sum'] = test['harmonic_w'].cumsum(axis = 0)
test['geometric_sum'] = test['geometric_w'].cumsum(axis = 0)

test['harmonic_final'] = test['harmonic_sum'] / test ['harmonic_divis']
test['geometric_final'] = test['geometric_sum'] / test ['geometric_divis']

test.sort_values(by = ['surname','prior_race'], ascending = [True, True]).head() # Just making sure it sorted right


Unnamed: 0,surname,prior_race,prior_points_driver,rank,harmonic_base,geometric_base,harmonic_divis,geometric_divis,harmonic_w,geometric_w,harmonic_sum,geometric_sum,harmonic_final,geometric_final
4,Alonso,989,10.0,1.0,1.0,1.0,3.0,3.0,10.0,10.0,53.0,53.0,17.666667,17.666667
26,Alonso,990,6.0,2.0,2.0,4.0,31.0,45.0,12.0,24.0,254.0,442.0,8.193548,9.822222
46,Alonso,991,6.0,3.0,3.0,9.0,62.0,110.0,18.0,54.0,352.0,692.0,5.677419,6.290909
66,Alonso,992,6.0,4.0,4.0,16.0,116.0,296.0,24.0,96.0,677.0,1971.0,5.836207,6.658784
134,Alonso,995,0.0,5.0,5.0,25.0,379.0,1619.0,0.0,0.0,2189.0,9949.0,5.775726,6.145151


In [12]:
# Dummy the cat vars driver and constructor
# f1_2018['surname'] = pd.Categorical(f1_2018.surname)
# f1_2018['constructor_name'] = pd.Categorical(f1_2018.constructor_name)

# f1_dummies = pd.get_dummies(f1_2018, columns = ['surname','constructor_name'])
# list(f1_test.columns) 


In [13]:
# train_sets = []
# test_sets = []

# for i in range(10):
#     train_sets.append(f1_dummies[f1_dummies.prior_race < (989 + 11 + i)])

# for i in range(10):
#     test_sets.append(f1_dummies[f1_dummies.prior_race == (989 + 11 + i)])
    
### Indexes will match, index[0] in train corresponds to the index[0] in test set
    

In [14]:
# training on 989 to 999
print(max(train_sets[0].prior_race))

# first race to predict on is 1000
print(max(test_sets[0].prior_race)) 


# triaing on 989 to 1008
print(max(train_sets[9].prior_race))
# testing on 1009
print(max(test_sets[9].prior_race)) 

NameError: name 'train_sets' is not defined

In [None]:
# train_sets[0]