# Get Races from 2017 - 2022

In [None]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd

BASE_URL = 'https://racing-reference.info'
years = range(2017, 2023)
cup_results = [requests.get(BASE_URL + f'/season-stats/{year}/W') for year in years]
set([r.status_code for r in cup_results])

In [None]:
race_anchors = []
href_regex = re.compile('/race-results/.*/W')

for c in cup_results:
    race_anchors.extend(BeautifulSoup(c.text, 'lxml').find_all(href=href_regex))

In [None]:
races = [requests.get(a.attrs['href']) for a in race_anchors]

In [None]:
set([r.status_code for r in races])

In [None]:
# display first race
pd.read_html(races[0].text, match='Sponsor / Owner', header=0)[-1]

In [None]:
#create a dictionary containing a list of each year's tracks
trackdata = []
years = range(2017, 2023)
for x in cup_results:
    trackdata.append(BeautifulSoup(x.text, 'lxml').find_all(class_='track W'))

tracks = []
for group in trackdata:
    yeartracks = []
    for tag in group:
        yeartracks.append(tag.text.strip())
    tracks.append(yeartracks)
    
trackdict = dict(zip(years, tracks))

In [None]:
trackdict

In [None]:
#flatten track list
tracklst = [x for l in tracks for x in l]

#create empty dataframe to put race data into
racedata = pd.DataFrame(columns = [
 'Pos',
 'St',
 '#',
 'Driver',
 'Sponsor / Owner',
 'Car',
 'Laps',
 'Status',
 'Led',
 'Pts',
 'PPts',
 'Track'])

#create dataframe with data for all races from 2017-2021
for i in range(len(races)):
    race = pd.read_html(races[i].text, match='Sponsor / Owner', header=0)[-1]
    race['Track'] = tracklst[i]
    racedata = pd.concat([racedata, race])
    #print(race)
    
print(racedata)

In [None]:
type(racedata)

In [None]:
# dump racedata to csv
racedata.to_csv('data/racedata_2017-2022.csv')

# ETL

In [28]:
# Imports
import pandas as pd
import pickle
import numpy as np

# read in race data
racedata = pd.read_csv('data/racedata_2017-2022.csv')

In [29]:
racedata

Unnamed: 0.1,Unnamed: 0,Pos,St,#,Driver,Sponsor / Owner,Car,Laps,Status,Led,Pts,PPts,Track
0,0,1,8,41,Kurt Busch,Haas Automation / Monster Energy (Stewart Haas...,Ford,200,running,1,48,5,Daytona
1,1,2,36,21,Ryan Blaney,Motorcraft / Quick Lane Tire & Auto Center (Wo...,Ford,200,running,2,44,0,Daytona
2,2,3,38,47,A.J. Allmendinger,Kroger ClickList / Stouffer's / Cheez-It (JTG-...,Chevrolet,200,running,2,39,0,Daytona
3,3,4,13,43,Aric Almirola,Smithfield Foods (Richard Petty Motorsports),Ford,200,running,2,33,0,Daytona
4,4,5,33,27,Paul Menard,Menards / Peak (Richard Childress),Chevrolet,200,running,0,32,0,Daytona
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8307,31,32,29,47,"Ricky Stenhouse, Jr.",Fry's/ Nature Valley (JTG-Daugherty Racing),Chevrolet,307,running,0,5,0,Phoenix
8308,32,33,35,15,Garrett Smithley,Jacob Companies (Rick Ware),Ford,304,running,0,0,0,Phoenix
8309,33,34,24,48,Alex Bowman,Ally (Rick Hendrick),Chevrolet,304,running,1,3,0,Phoenix
8310,34,35,18,6,Brad Keselowski,Kohler Generators (Jack Roush),Ford,270,electrical,0,2,0,Phoenix


In [30]:
racedata['race_ID'] = np.nan

In [31]:
racedata

Unnamed: 0.1,Unnamed: 0,Pos,St,#,Driver,Sponsor / Owner,Car,Laps,Status,Led,Pts,PPts,Track,race_ID
0,0,1,8,41,Kurt Busch,Haas Automation / Monster Energy (Stewart Haas...,Ford,200,running,1,48,5,Daytona,
1,1,2,36,21,Ryan Blaney,Motorcraft / Quick Lane Tire & Auto Center (Wo...,Ford,200,running,2,44,0,Daytona,
2,2,3,38,47,A.J. Allmendinger,Kroger ClickList / Stouffer's / Cheez-It (JTG-...,Chevrolet,200,running,2,39,0,Daytona,
3,3,4,13,43,Aric Almirola,Smithfield Foods (Richard Petty Motorsports),Ford,200,running,2,33,0,Daytona,
4,4,5,33,27,Paul Menard,Menards / Peak (Richard Childress),Chevrolet,200,running,0,32,0,Daytona,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8307,31,32,29,47,"Ricky Stenhouse, Jr.",Fry's/ Nature Valley (JTG-Daugherty Racing),Chevrolet,307,running,0,5,0,Phoenix,
8308,32,33,35,15,Garrett Smithley,Jacob Companies (Rick Ware),Ford,304,running,0,0,0,Phoenix,
8309,33,34,24,48,Alex Bowman,Ally (Rick Hendrick),Chevrolet,304,running,1,3,0,Phoenix,
8310,34,35,18,6,Brad Keselowski,Kohler Generators (Jack Roush),Ford,270,electrical,0,2,0,Phoenix,


In [32]:
race_id = 1
for i in range(len(racedata)):
    # assign race ID of first entry
    if i == 0:
        racedata.iloc[i, 13] = int(race_id)
    elif i != 0:
        # check if prior entry's track is the same. if not, assign a new race ID
        if racedata.iloc[i, 12] != racedata.iloc[i-1, 12]:
            race_id += 1
            racedata.iloc[i, 13] = int(race_id)
        else:
            racedata.iloc[i, 13] = int(race_id)
            

In [33]:
racedata['race_ID'].unique()

array([  1.,   2.,   3.,   4.,   5.,   6.,   7.,   8.,   9.,  10.,  11.,
        12.,  13.,  14.,  15.,  16.,  17.,  18.,  19.,  20.,  21.,  22.,
        23.,  24.,  25.,  26.,  27.,  28.,  29.,  30.,  31.,  32.,  33.,
        34.,  35.,  36.,  37.,  38.,  39.,  40.,  41.,  42.,  43.,  44.,
        45.,  46.,  47.,  48.,  49.,  50.,  51.,  52.,  53.,  54.,  55.,
        56.,  57.,  58.,  59.,  60.,  61.,  62.,  63.,  64.,  65.,  66.,
        67.,  68.,  69.,  70.,  71.,  72.,  73.,  74.,  75.,  76.,  77.,
        78.,  79.,  80.,  81.,  82.,  83.,  84.,  85.,  86.,  87.,  88.,
        89.,  90.,  91.,  92.,  93.,  94.,  95.,  96.,  97.,  98.,  99.,
       100., 101., 102., 103., 104., 105., 106., 107., 108., 109., 110.,
       111., 112., 113., 114., 115., 116., 117., 118., 119., 120., 121.,
       122., 123., 124., 125., 126., 127., 128., 129., 130., 131., 132.,
       133., 134., 135., 136., 137., 138., 139., 140., 141., 142., 143.,
       144., 145., 146., 147., 148., 149., 150., 15

In [34]:
# for ID in list(racedata['race_ID'].unique()):
#     race_df = racedata[(racedata['race_ID'] == ID)]
#     display(race_df)

In [35]:
race_df = racedata[(racedata['race_ID'] == 1)]

In [36]:
for i in list(race_df['Pos']):
    print("'" + str(i) + "'", ':', ',')

'1' : ,
'2' : ,
'3' : ,
'4' : ,
'5' : ,
'6' : ,
'7' : ,
'8' : ,
'9' : ,
'10' : ,
'11' : ,
'12' : ,
'13' : ,
'14' : ,
'15' : ,
'16' : ,
'17' : ,
'18' : ,
'19' : ,
'20' : ,
'21' : ,
'22' : ,
'23' : ,
'24' : ,
'25' : ,
'26' : ,
'27' : ,
'28' : ,
'29' : ,
'30' : ,
'31' : ,
'32' : ,
'33' : ,
'34' : ,
'35' : ,
'36' : ,
'37' : ,
'38' : ,
'39' : ,
'40' : ,


In [37]:
# dictionary for FD point values assigned for each finishing position
FD_points_dict = {
    '1' : 43,
    '2' : 40,
    '3' : 38,
    '4' : 37,
    '5' : 36,
    '6' : 35,
    '7' : 34,
    '8' : 33,
    '9' : 32,
    '10' : 31,
    '11' : 30,
    '12' : 29,
    '13' : 28,
    '14' : 27,
    '15' : 26,
    '16' : 25,
    '17' : 24,
    '18' : 23,
    '19' : 22,
    '20' : 21,
    '21' : 20,
    '22' : 19,
    '23' : 18,
    '24' : 17,
    '25' : 16,
    '26' : 15,
    '27' : 14,
    '28' : 13,
    '29' : 12,
    '30' : 11,
    '31' : 10,
    '32' : 9,
    '33' : 8,
    '34' : 7,
    '35' : 6,
    '36' : 5,
    '37' : 4,
    '38' : 3,
    '39' : 2,
    '40' : 1,
    '41' : 1,
    '42' : 1,
    '43' : 1
}

In [38]:
# laps completed: 0.1 * LC
# laps led: 0.1 * LL
# place diff: +/- 0.5 * place diff

In [39]:
racedata['FP'] = np.nan

In [40]:
racedata

Unnamed: 0.1,Unnamed: 0,Pos,St,#,Driver,Sponsor / Owner,Car,Laps,Status,Led,Pts,PPts,Track,race_ID,FP
0,0,1,8,41,Kurt Busch,Haas Automation / Monster Energy (Stewart Haas...,Ford,200,running,1,48,5,Daytona,1.0,
1,1,2,36,21,Ryan Blaney,Motorcraft / Quick Lane Tire & Auto Center (Wo...,Ford,200,running,2,44,0,Daytona,1.0,
2,2,3,38,47,A.J. Allmendinger,Kroger ClickList / Stouffer's / Cheez-It (JTG-...,Chevrolet,200,running,2,39,0,Daytona,1.0,
3,3,4,13,43,Aric Almirola,Smithfield Foods (Richard Petty Motorsports),Ford,200,running,2,33,0,Daytona,1.0,
4,4,5,33,27,Paul Menard,Menards / Peak (Richard Childress),Chevrolet,200,running,0,32,0,Daytona,1.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8307,31,32,29,47,"Ricky Stenhouse, Jr.",Fry's/ Nature Valley (JTG-Daugherty Racing),Chevrolet,307,running,0,5,0,Phoenix,210.0,
8308,32,33,35,15,Garrett Smithley,Jacob Companies (Rick Ware),Ford,304,running,0,0,0,Phoenix,210.0,
8309,33,34,24,48,Alex Bowman,Ally (Rick Hendrick),Chevrolet,304,running,1,3,0,Phoenix,210.0,
8310,34,35,18,6,Brad Keselowski,Kohler Generators (Jack Roush),Ford,270,electrical,0,2,0,Phoenix,210.0,


In [41]:
for i in range(len(racedata)):
    
    # get start/finish position, laps completed, laps led, FP assigned on finish
    fin_pos = racedata.iloc[i, 1]
    strt_pos = racedata.iloc[i, 2]
    LC = racedata.iloc[i, 7]
    LL = racedata.iloc[i, 9]
    fin_pos_FP = FD_points_dict[str(racedata.iloc[i, 1])]
    place_diff = strt_pos - fin_pos
    
    # calculate fantasy points
    FP = fin_pos_FP + (0.1 * LC) + (0.1 * LL) + (0.5 * place_diff)
    
    # if fantasy points are negative, FP = 0
    if FP < 0:
        FP = 0
    
    racedata.iloc[i, 14] = FP

In [117]:
racedata_df = racedata.copy()

In [118]:
for col in racedata_df.columns:
    if 'Unnamed' in col:
        racedata_df.drop(col, axis = 1, inplace = True)
        
# drop unnecessary cols.
racedata_df.drop(['Sponsor / Owner', 'Car', 'Status'], axis = 1, inplace = True)

In [119]:
racedata_df

Unnamed: 0,Pos,St,#,Driver,Laps,Led,Pts,PPts,Track,race_ID,FP
0,1,8,41,Kurt Busch,200,1,48,5,Daytona,1.0,66.6
1,2,36,21,Ryan Blaney,200,2,44,0,Daytona,1.0,77.2
2,3,38,47,A.J. Allmendinger,200,2,39,0,Daytona,1.0,75.7
3,4,13,43,Aric Almirola,200,2,33,0,Daytona,1.0,61.7
4,5,33,27,Paul Menard,200,0,32,0,Daytona,1.0,70.0
...,...,...,...,...,...,...,...,...,...,...,...
8307,32,29,47,"Ricky Stenhouse, Jr.",307,0,5,0,Phoenix,210.0,38.2
8308,33,35,15,Garrett Smithley,304,0,0,0,Phoenix,210.0,39.4
8309,34,24,48,Alex Bowman,304,1,3,0,Phoenix,210.0,32.5
8310,35,18,6,Brad Keselowski,270,0,2,0,Phoenix,210.0,24.5


In [120]:
racedata_df['Track'].sort_values().unique().tolist()

['Atlanta',
 'Austin',
 'Bristol',
 'Bristol (Dirt)',
 'Charlotte',
 'Charlotte (Road)',
 'Chicago',
 'Darlington',
 'Daytona',
 'Daytona (Road)',
 'Dover',
 'Elkhart Lake',
 'Fontana',
 'Fort Worth',
 'Gateway',
 'Homestead',
 'Indianapolis',
 'Indianapolis G.P.',
 'Kansas',
 'Kentucky',
 'Las Vegas',
 'Loudon',
 'Martinsville',
 'Michigan',
 'Nashville',
 'Phoenix',
 'Pocono',
 'Richmond',
 'Sonoma',
 'Talladega',
 'Watkins Glen']

In [121]:
tracktype_dict = {
    'Atlanta': 'int',
    'Austin': 'roadcourse',
    'Bristol': 'short',
    'Bristol (Dirt)': 'short',
    'Charlotte': 'int',
    'Charlotte (Road)': 'roadcourse',
    'Chicago': 'int',
    'Darlington': 'int',
    'Daytona': 'ss',
    'Daytona (Road)': 'roadcourse',
    'Dover': 'int',
    'Elkhart Lake': 'roadcourse',
    'Fontana': 'ss',
    'Fort Worth': 'int',
    'Gateway': 'int',
    'Homestead': 'int',
    'Indianapolis': 'flat',
    'Indianapolis G.P.': 'roadcourse',
    'Kansas': 'int',
    'Kentucky': 'int',
    'Las Vegas': 'int',
    'Loudon': 'flat',
    'Martinsville': 'short',
    'Michigan': 'int',
    'Nashville': 'int',
    'Phoenix': 'flat',
    'Pocono': 'flat',
    'Richmond': 'short',
    'Sonoma': 'roadcourse',
    'Talladega': 'ss',
    'Watkins Glen': 'roadcourse',
}

In [122]:
# assign track types
racedata_df['track_type'] = racedata_df['Track'].map(tracktype_dict)

In [123]:
# map track types to integers
tt_mapper = {'int' : 1, 'short' : 2, 'roadcourse' : 3, 'ss' : 4, 'flat' : 5}
racedata_df['track_type'] = racedata_df['track_type'].replace(tt_mapper)

In [124]:
racedata_df['place_diff'] = racedata_df['St'] - racedata_df['Pos']

In [125]:
# Last 3 means/sums
racedata_df['FinPos_mean_L3'] = racedata_df.groupby(['Driver'])['Pos'].transform(lambda x: x.shift().rolling(3, min_periods = 1).mean())
racedata_df['StrtPos_mean_L3'] = racedata_df.groupby(['Driver'])['St'].transform(lambda x: x.shift().rolling(3, min_periods = 1).mean())
racedata_df['PlaceDiff_mean_L3'] = racedata_df.groupby(['Driver'])['place_diff'].transform(lambda x: x.shift().rolling(3, min_periods = 1).mean())
racedata_df['LL_sum_L3'] = racedata_df.groupby(['Driver'])['Led'].transform(lambda x: x.shift().rolling(3, min_periods = 1).sum())
racedata_df['LL_mean_L3'] = racedata_df.groupby(['Driver'])['Led'].transform(lambda x: x.shift().rolling(3, min_periods = 1).mean())
racedata_df['LC_sum_L3'] = racedata_df.groupby(['Driver'])['Laps'].transform(lambda x: x.shift().rolling(3, min_periods = 1).sum())
racedata_df['LC_mean_L3'] = racedata_df.groupby(['Driver'])['Laps'].transform(lambda x: x.shift().rolling(3, min_periods = 1).mean())
racedata_df['Pts_sum_L3'] = racedata_df.groupby(['Driver'])['Pts'].transform(lambda x: x.shift().rolling(3, min_periods = 1).sum())
racedata_df['Pts_mean_L3'] = racedata_df.groupby(['Driver'])['Pts'].transform(lambda x: x.shift().rolling(3, min_periods = 1).mean())
racedata_df['PPts_sum_L3'] = racedata_df.groupby(['Driver'])['PPts'].transform(lambda x: x.shift().rolling(3, min_periods = 1).sum())
racedata_df['PPts_mean_L3'] = racedata_df.groupby(['Driver'])['PPts'].transform(lambda x: x.shift().rolling(3, min_periods = 1).mean())
racedata_df['FP_sum_L3'] = racedata_df.groupby(['Driver'])['FP'].transform(lambda x: x.shift().rolling(3, min_periods = 1).sum())
racedata_df['FP_mean_L3'] = racedata_df.groupby(['Driver'])['FP'].transform(lambda x: x.shift().rolling(3, min_periods = 1).mean())

# Last 3 means/sums at each track
racedata_df['FinPos_mean_L3_track'] = racedata_df.groupby(['Driver', 'Track'])['Pos'].transform(lambda x: x.shift().rolling(3, min_periods = 1).mean())
racedata_df['StrtPos_mean_L3_track'] = racedata_df.groupby(['Driver', 'Track'])['St'].transform(lambda x: x.shift().rolling(3, min_periods = 1).mean())
racedata_df['PlaceDiff_mean_L3_track'] = racedata_df.groupby(['Driver', 'Track'])['place_diff'].transform(lambda x: x.shift().rolling(3, min_periods = 1).mean())
racedata_df['LL_sum_L3_track'] = racedata_df.groupby(['Driver', 'Track'])['Led'].transform(lambda x: x.shift().rolling(3, min_periods = 1).sum())
racedata_df['LL_mean_L3_track'] = racedata_df.groupby(['Driver', 'Track'])['Led'].transform(lambda x: x.shift().rolling(3, min_periods = 1).mean())
racedata_df['LC_sum_L3_track'] = racedata_df.groupby(['Driver', 'Track'])['Laps'].transform(lambda x: x.shift().rolling(3, min_periods = 1).sum())
racedata_df['LC_mean_L3_track'] = racedata_df.groupby(['Driver', 'Track'])['Laps'].transform(lambda x: x.shift().rolling(3, min_periods = 1).mean())
racedata_df['Pts_sum_L3_track'] = racedata_df.groupby(['Driver', 'Track'])['Pts'].transform(lambda x: x.shift().rolling(3, min_periods = 1).sum())
racedata_df['Pts_mean_L3_track'] = racedata_df.groupby(['Driver', 'Track'])['Pts'].transform(lambda x: x.shift().rolling(3, min_periods = 1).mean())
racedata_df['PPts_sum_L3_track'] = racedata_df.groupby(['Driver', 'Track'])['PPts'].transform(lambda x: x.shift().rolling(3, min_periods = 1).sum())
racedata_df['PPts_mean_L3_track'] = racedata_df.groupby(['Driver', 'Track'])['PPts'].transform(lambda x: x.shift().rolling(3, min_periods = 1).mean())
racedata_df['FP_sum_L3_track'] = racedata_df.groupby(['Driver', 'Track'])['FP'].transform(lambda x: x.shift().rolling(3, min_periods = 1).sum())
racedata_df['FP_mean_L3_track'] = racedata_df.groupby(['Driver', 'Track'])['FP'].transform(lambda x: x.shift().rolling(3, min_periods = 1).mean())

# Last 3 means/sums at each tracktype
racedata_df['FinPos_mean_L3_tt'] = racedata_df.groupby(['Driver', 'track_type'])['Pos'].transform(lambda x: x.shift().rolling(3, min_periods = 1).mean())
racedata_df['StrtPos_mean_L3_tt'] = racedata_df.groupby(['Driver', 'track_type'])['St'].transform(lambda x: x.shift().rolling(3, min_periods = 1).mean())
racedata_df['PlaceDiff_mean_L3_tt'] = racedata_df.groupby(['Driver', 'track_type'])['place_diff'].transform(lambda x: x.shift().rolling(3, min_periods = 1).mean())
racedata_df['LL_sum_L3_tt'] = racedata_df.groupby(['Driver', 'track_type'])['Led'].transform(lambda x: x.shift().rolling(3, min_periods = 1).sum())
racedata_df['LL_mean_L3_tt'] = racedata_df.groupby(['Driver', 'track_type'])['Led'].transform(lambda x: x.shift().rolling(3, min_periods = 1).mean())
racedata_df['LC_sum_L3_tt'] = racedata_df.groupby(['Driver', 'track_type'])['Laps'].transform(lambda x: x.shift().rolling(3, min_periods = 1).sum())
racedata_df['LC_mean_L3_tt'] = racedata_df.groupby(['Driver', 'track_type'])['Laps'].transform(lambda x: x.shift().rolling(3, min_periods = 1).mean())
racedata_df['Pts_sum_L3_tt'] = racedata_df.groupby(['Driver', 'track_type'])['Pts'].transform(lambda x: x.shift().rolling(3, min_periods = 1).sum())
racedata_df['Pts_mean_L3_tt'] = racedata_df.groupby(['Driver', 'track_type'])['Pts'].transform(lambda x: x.shift().rolling(3, min_periods = 1).mean())
racedata_df['PPts_sum_L3_tt'] = racedata_df.groupby(['Driver', 'track_type'])['PPts'].transform(lambda x: x.shift().rolling(3, min_periods = 1).sum())
racedata_df['PPts_mean_L3_tt'] = racedata_df.groupby(['Driver', 'track_type'])['PPts'].transform(lambda x: x.shift().rolling(3, min_periods = 1).mean())
racedata_df['FP_sum_L3_tt'] = racedata_df.groupby(['Driver', 'track_type'])['FP'].transform(lambda x: x.shift().rolling(3, min_periods = 1).sum())
racedata_df['FP_mean_L3_tt'] = racedata_df.groupby(['Driver', 'track_type'])['FP'].transform(lambda x: x.shift().rolling(3, min_periods = 1).mean())

# racedata_df.fillna(0, inplace = True)
racedata_df.dropna(inplace = True)
# ranks?

In [126]:
pd.set_option('display.max_columns', None)

In [128]:
racedata_df.head(n = 36)

Unnamed: 0,Pos,St,#,Driver,Laps,Led,Pts,PPts,Track,race_ID,FP,track_type,place_diff,FinPos_mean_L3,StrtPos_mean_L3,PlaceDiff_mean_L3,LL_sum_L3,LL_mean_L3,LC_sum_L3,LC_mean_L3,Pts_sum_L3,Pts_mean_L3,PPts_sum_L3,PPts_mean_L3,FP_sum_L3,FP_mean_L3,FinPos_mean_L3_track,StrtPos_mean_L3_track,PlaceDiff_mean_L3_track,LL_sum_L3_track,LL_mean_L3_track,LC_sum_L3_track,LC_mean_L3_track,Pts_sum_L3_track,Pts_mean_L3_track,PPts_sum_L3_track,PPts_mean_L3_track,FP_sum_L3_track,FP_mean_L3_track,FinPos_mean_L3_tt,StrtPos_mean_L3_tt,PlaceDiff_mean_L3_tt,LL_sum_L3_tt,LL_mean_L3_tt,LC_sum_L3_tt,LC_mean_L3_tt,Pts_sum_L3_tt,Pts_mean_L3_tt,PPts_sum_L3_tt,PPts_mean_L3_tt,FP_sum_L3_tt,FP_mean_L3_tt
624,1,6,17,"Ricky Stenhouse, Jr.",163,17,40,5,Daytona,17.0,63.5,4,5,19.0,22.0,3.0,3.0,1.0,390.0,130.0,56.0,18.666667,0.0,0.0,111.8,37.266667,31.0,23.0,-8.0,0.0,0.0,133.0,133.0,6.0,6.0,0.0,0.0,19.3,19.3,18.0,13.333333,-4.666667,14.0,4.666667,526.0,175.333333,70.0,23.333333,5.0,1.666667,119.0,39.666667
625,2,11,14,Clint Bowyer,163,4,42,0,Daytona,17.0,61.2,4,9,15.0,12.0,-3.0,0.0,0.0,470.0,156.666667,78.0,26.0,0.0,0.0,121.5,40.5,32.0,6.0,-26.0,0.0,0.0,128.0,128.0,9.0,9.0,0.0,0.0,8.8,8.8,16.333333,13.333333,-3.0,10.0,3.333333,521.0,173.666667,78.0,26.0,0.0,0.0,122.6,40.866667
626,3,24,27,Paul Menard,163,0,34,0,Daytona,17.0,64.8,4,21,17.666667,19.333333,1.666667,0.0,0.0,470.0,156.666667,58.0,19.333333,0.0,0.0,119.5,39.833333,5.0,33.0,28.0,0.0,0.0,200.0,200.0,32.0,32.0,0.0,0.0,70.0,70.0,14.0,21.666667,7.666667,3.0,1.0,591.0,197.0,73.0,24.333333,0.0,0.0,151.9,50.633333
627,4,23,95,Michael McDowell,163,0,38,0,Daytona,17.0,62.8,4,19,20.333333,16.333333,-4.0,0.0,0.0,470.0,156.666667,50.0,16.666667,0.0,0.0,103.0,34.333333,15.0,22.0,7.0,0.0,0.0,200.0,200.0,22.0,22.0,0.0,0.0,49.5,49.5,27.333333,23.666667,-3.666667,0.0,0.0,566.0,188.666667,29.0,9.666667,0.0,0.0,92.1,30.7
628,5,22,31,Ryan Newman,163,0,32,0,Daytona,17.0,60.8,4,17,14.666667,16.333333,1.666667,0.0,0.0,470.0,156.666667,68.0,22.666667,0.0,0.0,128.5,42.833333,21.0,14.0,-7.0,0.0,0.0,198.0,198.0,18.0,18.0,0.0,0.0,36.3,36.3,20.333333,13.0,-7.333333,2.0,0.666667,585.0,195.0,52.0,17.333333,0.0,0.0,109.7,36.566667
629,6,30,38,David Ragan,163,4,38,0,Daytona,17.0,63.7,4,24,28.333333,29.333333,1.0,0.0,0.0,466.0,155.333333,26.0,8.666667,0.0,0.0,86.1,28.7,25.0,20.0,-5.0,0.0,0.0,188.0,188.0,12.0,12.0,0.0,0.0,32.3,32.3,22.0,25.333333,3.333333,0.0,0.0,578.0,192.666667,45.0,15.0,0.0,0.0,119.8,39.933333
630,7,32,75,Brendan Gaughan,163,0,0,0,Daytona,17.0,62.8,4,25,18.5,35.5,17.0,0.0,0.0,383.0,191.5,0.0,0.0,0.0,0.0,100.3,50.15,11.0,39.0,28.0,0.0,0.0,200.0,200.0,0.0,0.0,0.0,0.0,64.0,64.0,18.5,35.5,17.0,0.0,0.0,383.0,191.5,0.0,0.0,0.0,0.0,100.3,50.15
631,8,27,47,A.J. Allmendinger,163,0,29,0,Daytona,17.0,58.8,4,19,25.0,17.0,-8.0,4.0,1.333333,464.0,154.666667,45.0,15.0,0.0,0.0,82.8,27.6,3.0,38.0,35.0,2.0,2.0,200.0,200.0,39.0,39.0,0.0,0.0,75.7,75.7,17.0,29.666667,12.666667,2.0,0.666667,570.0,190.0,65.0,21.666667,0.0,0.0,148.2,49.4
632,9,17,77,Erik Jones,163,9,28,0,Daytona,17.0,53.2,4,8,13.666667,19.666667,6.0,20.0,6.666667,469.0,156.333333,75.0,25.0,0.0,0.0,139.9,46.633333,39.0,34.0,-5.0,0.0,0.0,103.0,103.0,1.0,1.0,0.0,0.0,9.8,9.8,28.0,20.666667,-7.333333,0.0,0.0,473.0,157.666667,42.0,14.0,0.0,0.0,75.3,25.1
633,10,29,37,Chris Buescher,163,0,27,0,Daytona,17.0,56.8,4,19,24.666667,20.666667,-4.0,4.0,1.333333,464.0,154.666667,37.0,12.333333,0.0,0.0,89.8,29.933333,35.0,37.0,2.0,0.0,0.0,127.0,127.0,2.0,2.0,0.0,0.0,19.7,19.7,25.0,30.333333,5.333333,0.0,0.0,519.0,173.0,36.0,12.0,0.0,0.0,107.9,35.966667


In [129]:
len(racedata_df)

6297

In [130]:
racedata_df.loc[(racedata_df['Driver'] == 'Kyle Busch') & (racedata_df['track_type'] == 3)][['Pos', 'Track', 'FinPos_mean_L3_tt']].head()

Unnamed: 0,Pos,Track,FinPos_mean_L3_tt
1990,5,Sonoma,6.0
2221,3,Watkins Glen,5.666667
3382,2,Sonoma,13.333333
3618,11,Watkins Glen,12.333333
3914,37,Charlotte (Road),15.0


In [131]:
racedata_df.to_csv('data/racedata_2017-2022_ETL.csv')

# Model Development

In [132]:
ml_df = pd.read_csv('data/racedata_2017-2022_ETL.csv', index_col = 0)

In [133]:
ml_df = ml_df.dropna()

In [134]:
import os
import pickle
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from lightgbm import LGBMRegressor
import xgboost as xgb
from sklearn.metrics import mean_absolute_error

from sklearn.feature_selection import RFE

working_directory = 'D:/machine_learning/DFS/nascar/'
data_dir = 'data/'

# needs hyperparameters
def lgbm_mod(): return LGBMRegressor(random_state = 1, n_estimators = 1000, learning_rate = 0.01, n_jobs = -1)
def xgb_mod(): return xgb.XGBRegressor(random_state = 1)
def rf_mod(): return RandomForestRegressor(random_state = 1, n_jobs = -1, n_estimators = 1000)
def sgd_mod(): return SGDRegressor(max_iter=100, tol=1e-3, random_state = 1) # won't work with feature importance
def svm_mod(): return SVR(C=1.0, epsilon=0.2) # won't work with feature importance
def neigh_mod(): return KNeighborsRegressor()
def dt_mod(): return DecisionTreeRegressor(random_state=0)

dcols = [
    'Pos',
     'St',
     '#',
     'Driver',
     'Laps',
     'Led',
     'Pts',
     'PPts',
     'Track',
     'race_ID',
     'FP',
     'place_diff',
]

X = ml_df.drop(dcols, axis = 1)
Y = ml_df['FP']

from sklearn.model_selection import train_test_split

#Create Training and Testing DataSets
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.25, random_state=42)

X_train.reset_index(inplace = True, drop=True)
X_test.reset_index(inplace = True, drop=True)
Y_train.reset_index(inplace = True, drop=True)
Y_test.reset_index(inplace = True, drop=True)

print('Training set size:', len(X_train))
print('Testing set size:', len(X_test))

pred_df = pd.concat([X_test, Y_test], axis = 1)

print('\nNum Possible Features:',len(X_train.columns.tolist()))

""" MODEL SELECTION """

model = rf_mod()

"""                 """

#print possible features
print('possible features:', X_train.columns.tolist(), '\n')

# Fit model, make predictions with all features
model.fit(X_train, Y_train)

preds_all = model.predict(X_test)

pdf = pred_df[['FP']].copy()

pdf['pred_FP_all'] = preds_all

# calculate MAE
print('MAE:', "{:.2f}".format(mean_absolute_error(Y_test, preds_all)))

Training set size: 4722
Testing set size: 1575

Num Possible Features: 40
possible features: ['track_type', 'FinPos_mean_L3', 'StrtPos_mean_L3', 'PlaceDiff_mean_L3', 'LL_sum_L3', 'LL_mean_L3', 'LC_sum_L3', 'LC_mean_L3', 'Pts_sum_L3', 'Pts_mean_L3', 'PPts_sum_L3', 'PPts_mean_L3', 'FP_sum_L3', 'FP_mean_L3', 'FinPos_mean_L3_track', 'StrtPos_mean_L3_track', 'PlaceDiff_mean_L3_track', 'LL_sum_L3_track', 'LL_mean_L3_track', 'LC_sum_L3_track', 'LC_mean_L3_track', 'Pts_sum_L3_track', 'Pts_mean_L3_track', 'PPts_sum_L3_track', 'PPts_mean_L3_track', 'FP_sum_L3_track', 'FP_mean_L3_track', 'FinPos_mean_L3_tt', 'StrtPos_mean_L3_tt', 'PlaceDiff_mean_L3_tt', 'LL_sum_L3_tt', 'LL_mean_L3_tt', 'LC_sum_L3_tt', 'LC_mean_L3_tt', 'Pts_sum_L3_tt', 'Pts_mean_L3_tt', 'PPts_sum_L3_tt', 'PPts_mean_L3_tt', 'FP_sum_L3_tt', 'FP_mean_L3_tt'] 

MAE: 13.69


In [135]:
pdf

Unnamed: 0,FP,pred_FP_all
0,54.1,75.4512
1,104.5,85.7150
2,61.6,49.3923
3,27.8,33.5974
4,24.8,33.7492
...,...,...
1570,67.6,62.0676
1571,50.6,43.9437
1572,64.2,44.7629
1573,44.6,44.8518


---
**need to develop predictors based on track type**

* assign track type for each race, since it's known
* for each driver and track type, calculate means/sums for L3 of that track type