In [1]:
import csv
import glob
import os
import re
import shutil
import zipfile
from datetime import datetime
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
import pymysql
import sqlalchemy
import wget

In [2]:
# Start from a clean slate: remove the archive and the extracted folder left
# behind by a previous run, if they exist.
if os.path.isfile('f1db_csv.zip'):
    os.remove('f1db_csv.zip')
if os.path.isdir('f1db_csv'):
    shutil.rmtree('f1db_csv')

# Every CSV in the working directory is deleted, not only the ones this
# notebook created. The loop variable is deliberately not named `csv`: that
# would rebind the name of the imported `csv` module for the rest of the session.
csvs = glob.glob('./*.csv')
for csv_path in csvs:
    os.remove(csv_path)

url = 'http://ergast.com/downloads/f1db_csv.zip'

wget.download(url)

# extractall() with no target path unpacks into the current working directory.
with zipfile.ZipFile("f1db_csv.zip") as f_in:
    f_in.extractall()

# The archive is no longer needed once its contents are extracted.
os.remove('f1db_csv.zip')


In [3]:
# CSV files currently present in the working directory.
csvs = glob.glob("*.csv")

# Column names for each header-less CSV, one list per table.
circuit_headers = [
    'circuit_id', 'circuit_ref', 'name', 'location', 'country',
    'lat', 'lng', 'alt', 'url',
]
status_headers = ['status_id', 'status']
lap_time_headers = [
    'race_id', 'driver_id', 'lap', 'position', 'time', 'milliseconds',
]
races_headers = [
    'race_id', 'year', 'round', 'circuit_id', 'name', 'date', 'time', 'url',
]
constructors_headers = [
    'constructor_id', 'constructor_ref', 'name', 'nationality', 'url',
]
constructor_standings_headers = [
    'constructor_standings_id', 'race_id', 'constructor_id', 'points',
    'position', 'position_text', 'wins',
]
driver_headers = [
    'driver_id', 'driver_ref', 'number', 'code', 'forename', 'surname',
    'dob', 'nationality', 'url',
]
qualifying_headers = [
    'qualify_id', 'race_id', 'driver_id', 'constructor_id', 'number',
    'position', 'q1', 'q2', 'q3',
]
driver_standings_headers = [
    'driver_standings_id', 'race_id', 'driver_id', 'points', 'position',
    'position_text', 'wins',
]
constructor_results_headers = [
    'constructor_results_id', 'race_id', 'constructor_id', 'points', 'status',
]
pit_stops_headers = [
    'race_id', 'driver_id', 'stop', 'lap', 'time', 'duration', 'milliseconds',
]
seasons_headers = ['year', 'url']
results_headers = [
    'result_id', 'race_id', 'driver_id', 'constructor_id', 'number', 'grid',
    'position', 'position_text', 'position_order', 'points', 'laps', 'time',
    'milliseconds', 'fastest_lap', 'rank', 'fastest_lap_time',
    'fastest_lap_speed', 'status_id',
]

# Header lists in the order in which they are matched, by position, to `csvs`.
headers_list = [
    circuit_headers,
    status_headers,
    lap_time_headers,
    races_headers,
    constructors_headers,
    constructor_standings_headers,
    driver_headers,
    qualifying_headers,
    driver_standings_headers,
    constructor_results_headers,
    pit_stops_headers,
    seasons_headers,
    results_headers,
]

def header_boi(in_csv, table_headers):
    """Rewrite a header-less CSV in place so that it starts with a header row.

    Parameters
    ----------
    in_csv : str
        Path of the CSV file to read and overwrite.
    table_headers : list of str
        Column names to assign, one per column of the file.

    Raises
    ------
    ValueError
        If the number of names does not match the number of columns read.
    """
    df = pd.read_csv(in_csv, header = None, index_col = False)
    df.columns = table_headers
    # index=False keeps the DataFrame's row index out of the file; otherwise it
    # is written as an extra, unnamed first column.
    df.to_csv(in_csv, index = False)
       

# Give each CSV the header list found at the same position in headers_list.
# NOTE(review): the pairing is purely positional, so it is only correct if glob
# returned the files in the order headers_list was written - confirm.
for position, csv_path in enumerate(csvs):
    header_boi(csv_path, headers_list[position])
    




In [4]:
# Connection settings are read from the environment so real credentials never
# have to be written into the notebook; the fallbacks are the values this
# notebook used before.
db_host = os.environ.get('F1_DB_HOST', '127.0.0.1')
db_port = int(os.environ.get('F1_DB_PORT', '3306'))
db_user = os.environ.get('F1_DB_USER', 'root')
db_password = os.environ.get('F1_DB_PASSWORD', 'root')
# The schema name stays fixed: the drop statements built later spell out
# "f1test." explicitly.
db_name = 'f1test'

# Raw DB-API connection and cursor, used for hand-written statements.
conn = pymysql.connect(
    host = db_host,
    port = db_port,
    user = db_user,
    password = db_password,
    database = db_name
)

cur = conn.cursor()

# SQLAlchemy engine used by pandas. User and password are URL-escaped so
# characters such as '@' or '/' cannot corrupt the connection URL.
e = sqlalchemy.create_engine(
    "mysql+pymysql://{}:{}@{}:{}/{}".format(
        quote_plus(db_user), quote_plus(db_password), db_host, db_port, db_name
    )
)


In [5]:
csvs = glob.glob("*.csv")

# Table name = file name without its extension. splitext removes only the
# trailing ".csv"; a regex such as '.csv' (unescaped dot, not anchored) could
# also strip a matching run of characters from the middle of a name.
tables = [os.path.splitext(csv_path)[0] for csv_path in csvs]

# One DROP per table so the load step can recreate each table from scratch.
del_statements = ['drop table if exists f1test.{}'.format(table) for table in tables]

for statement in del_statements:
    cur.execute(statement)
    
    

In [6]:
# Load every CSV into a table of the same name through the SQLAlchemy engine.
for table in tables:
    df = pd.read_csv('{}.csv'.format(table))
    # index=False: do not store the DataFrame's row index as an extra
    # "index" column in the database table.
    df.to_sql(table, con = e, index = False)
    

# Querying DF to start predictive stuff
#### Things to Try:
1. Logistic
2. Recode Categorical Vars -> SVM/Tree/RF (NN? Probably not big enough)
3. Other Stuff 

Aside: We don't have enough data to do any TS stuff
    

In [7]:
# SQL that assembles the modelling frame from the 2019 season.
#   base         - 2019 results joined to races and to the constructor/driver
#                  standings of the same race; upcoming_race is race_id + 1 and
#                  comp_key is the "race_id-driver_id" string used for later joins.
#   step1_zscore - every 2019 lap time together with the per-race mean and
#                  standard deviation of lap milliseconds (window functions).
#   z_score      - average of the standardised lap times per race-driver key.
#   quali        - q1/q2/q3 and qualifying position per race-driver key.
# The final select left-joins the driver surname, the year of the upcoming race,
# the z-score, the qualifying data and the driver's position in the upcoming
# race (upcoming_race_result), ordered by driver and prior race.
# NOTE(review): upcoming_race = race_id + 1 assumes race ids are consecutive
# within a season - confirm against the races table.
query = '''with 
base as (
select
    re.race_id + 1 as upcoming_race, ra.year as prior_race_year, re.race_id as prior_race
    , concat(re.race_id, '-', re.driver_id) as comp_key
    , re.driver_id, re.constructor_id
    , re.points as prior_points_driver
    , re.position as prior_position_driver
    , ds.points as prior_total_points_driver
    , ds.wins as prior_total_wins_driver
    , cs.points as prior_total_points_constructor
    , cs.wins as prior_total_wins_constructor
from results re
  join races ra
    on re.race_id = ra.race_id
  join constructor_standings cs
    on re.race_id = cs.race_id and re.constructor_id = cs.constructor_id
  join driver_standings ds
    on re.race_id = ds.race_id and re.driver_id = ds.driver_id
  where ra.year = 2019
),
step1_zscore as (
  select lt.race_id, lt.driver_id, lt.milliseconds
      , avg(lt.milliseconds) OVER (partition by lt.race_id) as ms_avg_race
      , stddev(lt.milliseconds) OVER (partition by lt.race_id) as sd_ms
      , concat(lt.race_id, '-', lt.driver_id) as fk
  from lap_times lt
    join races ra
      on lt.race_id = ra.race_id
    where ra.year = 2019
),
z_score as (
    select  avg((milliseconds - ms_avg_race) / sd_ms) as scaled_performance, fk
    from step1_zscore
    group by fk
),
quali as (
	select concat(race_id, '-', driver_id) as fk
    , q1, q2, q3
    , position as prior_pole_position
    from qualifying
)
select
    d.surname
    , ra.year as upcoming_race_year -- Just to double check for backtesting (cant predict the first race of the next year with the last race of the prior year)
    , b.*
    , z.scaled_performance as prior_race_scaled_performance
    , q.q1, q.q2, q.q3
    , q.prior_pole_position
    , re.position as upcoming_race_result
from base b
  left join driver d
    on b.driver_id = d.driver_id
  left join races ra
    on b.upcoming_race = ra.race_id
  left join z_score z
    on b.comp_key = z.fk
  left join quali q
	on b.comp_key = q.fk
  left join results re
	on b.upcoming_race = re.race_id and b.driver_id = re.driver_id
order by driver_id asc, prior_race asc;'''



In [8]:
# Run the feature query through the SQLAlchemy engine `e` and load the result
# set into a DataFrame.
base_query = pd.read_sql_query(query, e)

In [16]:
# Show the last 20 rows of the query result.
base_query.tail(20)

Unnamed: 0,surname,upcoming_race_year,upcoming_race,prior_race_year,prior_race,comp_key,driver_id,constructor_id,prior_points_driver,prior_position_driver,prior_total_points_driver,prior_total_wins_driver,prior_total_points_constructor,prior_total_wins_constructor,prior_race_scaled_performance,q1,q2,q3,prior_pole_position,upcoming_race_result
180,Russell,2019,1011,2019,1010,1010-847,847,3,0.0,16,0.0,0,0.0,0,0.47354,1:24.360,\N,\N,19.0,15
181,Russell,2019,1012,2019,1011,1011-847,847,3,0.0,15,0.0,0,0.0,0,0.224572,1:31.759,\N,\N,19.0,16
182,Russell,2019,1013,2019,1012,1012-847,847,3,0.0,16,0.0,0,0.0,0,0.350543,1:35.253,,,17.0,15
183,Russell,2019,1014,2019,1013,1013-847,847,3,0.0,15,0.0,0,0.0,0,0.353217,1:45.062,,,17.0,17
184,Russell,2019,1015,2019,1014,1014-847,847,3,0.0,17,0.0,0,0.0,0,0.151738,1:19.072,\N,\N,19.0,15
185,Russell,2019,1016,2019,1015,1015-847,847,3,0.0,15,0.0,0,0.0,0,0.053423,1:13.477,\N,\N,19.0,16
186,Russell,2019,1017,2019,1016,1016-847,847,3,0.0,16,0.0,0,0.0,0,0.42682,1:13.617,\N,\N,19.0,19
187,Russell,2019,1018,2019,1017,1017-847,847,3,0.0,19,0.0,0,0.0,0,0.600997,1:32.789,\N,\N,19.0,18
188,Russell,2019,1019,2019,1018,1018-847,847,3,0.0,18,0.0,0,0.0,0,0.350754,1:05.904,\N,\N,19.0,14
189,Russell,2019,1020,2019,1019,1019-847,847,3,0.0,14,0.0,0,0.0,0,0.060591,1:27.789,\N,\N,19.0,


## A little cleanup

In [14]:
def quali_time_to_ms(lap_times):
    """Convert qualifying lap strings such as '1:24.360' to total milliseconds.

    Parameters
    ----------
    lap_times : pandas.Series
        Lap times formatted as "<minutes>:<seconds>.<milliseconds>".

    Returns
    -------
    pandas.Series
        minutes * 60000 + seconds * 1000 + milliseconds for each entry; NaN
        for every entry that does not contain a lap time in that format
        (missing values, 'DNF', ...).
    """
    # astype(str) lets .str work even when the whole column is missing; the
    # literal dot is escaped so only a real decimal point separates seconds
    # from milliseconds.
    parts = lap_times.astype(str).str.extract(
        r'(?P<minutes>[0-9]+):(?P<seconds>[0-9]+)\.(?P<millis>[0-9][0-9]+)'
    )
    parts = parts.apply(pd.to_numeric)
    return parts['minutes'] * 60 * 1000 + parts['seconds'] * 1000 + parts['millis']


# DNF for all \N values. replace() returns a new frame, so base_query itself
# is left unchanged by the column assignments below.
base_query_1 = base_query.replace(to_replace = '\\N', value = "DNF")
base_query_1["prior_position_driver"] = base_query_1["prior_position_driver"].replace('DNF', 21) #21 = DNF

# Deal with q1 q2 q3 (Convert to MS, then impute missing)
for quali_col in ['q1', 'q2', 'q3']:
    base_query_1[quali_col] = quali_time_to_ms(base_query_1[quali_col])

base_query_1.head()

Unnamed: 0,surname,upcoming_race_year,upcoming_race,prior_race_year,prior_race,comp_key,driver_id,constructor_id,prior_points_driver,prior_position_driver,prior_total_points_driver,prior_total_wins_driver,prior_total_points_constructor,prior_total_wins_constructor,prior_race_scaled_performance,q1,q2,q3,prior_pole_position,upcoming_race_result
0,Hamilton,2019,1011,2019,1010,1010-1,1,131,18.0,2,18.0,0,44.0,1,-0.387785,82043.0,81014.0,80486.0,1.0,1
1,Hamilton,2019,1012,2019,1011,1011-1,1,131,25.0,1,43.0,1,87.0,2,-0.128763,89262.0,88578.0,88190.0,3.0,1
2,Hamilton,2019,1013,2019,1012,1012-1,1,131,25.0,1,68.0,2,130.0,3,-0.408917,93115.0,91637.0,91570.0,2.0,2
3,Hamilton,2019,1014,2019,1013,1013-1,1,131,18.0,2,86.0,2,173.0,4,-0.313299,101614.0,101580.0,100554.0,2.0,1
4,Hamilton,2019,1015,2019,1014,1014-1,1,131,26.0,1,112.0,3,217.0,5,-0.035611,77292.0,76038.0,76040.0,2.0,1


In [None]:
# Fill missing Values for Q1


##### Training Set

In [None]:
# Training rows: keep only complete rows (no missing value in any column).
complete_rows = base_query_1.dropna()

# Drop DNF Results
# NOTE(review): the cleanup cell already replaces 'DNF' with 21 in
# prior_position_driver, so this comparison looks like it never excludes a
# row - confirm whether filtering on 21 was intended.
finished_mask = complete_rows.prior_position_driver != 'DNF'
base_query_train = complete_rows[finished_mask]
base_query_train.tail(10)

In [None]:
# Index labels of the rows whose upcoming_race_result is null; these form the
# set to predict.
upcoming_race_test_index = base_query_1[base_query_1['upcoming_race_result'].isnull()].index.tolist()

# .index yields labels, so select with .loc (label-based). .iloc is positional
# and only matches labels while the frame still has its default 0..n-1 index.
base_query_test = base_query_1.loc[upcoming_race_test_index]
base_query_test


In [None]:
# Summary statistics of the cleaned frame, as produced by DataFrame.describe().
base_query_1.describe()

## Creating weighted average of points, position, and pole position
#### Also create vars for categorical data

In [None]:
# Build base_query_2 from base_query_1 (it was never defined, so the original
# assignment raised NameError on a fresh kernel). driver_race_order numbers
# each surname's rows by ascending prior_race; method='first' breaks ties by
# order of appearance.
base_query_2 = base_query_1.assign(
    driver_race_order = base_query_1.groupby('surname')['prior_race'].rank(method = 'first')
)
base_query_2.head(30)
