# Get Races from 2017 - 2022

In [22]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd

BASE_URL = 'https://racing-reference.info'
years = range(2017, 2023)
cup_results = [requests.get(BASE_URL + f'/season-stats/{year}/W') for year in years]
set([r.status_code for r in cup_results])

{200}

In [23]:
race_anchors = []
href_regex = re.compile('/race-results/.*/W')
race_dates = []

for c in cup_results:
    race_anchors.extend(BeautifulSoup(c.text, 'lxml').find_all(href=href_regex))
    result = BeautifulSoup(c.text, 'lxml').find_all("div", {"class": "date W"})
    for res in result:
        race_dates.append(res.text)

In [26]:
len(race_dates)

216

In [38]:
# convert race_dates to datetime objs
race_dates_dt = []
from datetime import datetime
for date in race_dates:
    race_dates_dt.append(datetime.strptime(date, '%m/%d/%y'))

In [27]:
len(race_anchors)

216

In [28]:
races = [requests.get(a.attrs['href']) for a in race_anchors]

In [29]:
set([r.status_code for r in races])

{200}

In [42]:
# display first race
pd.read_html(races[0].text, match='Sponsor / Owner', header=0)[-1]

Unnamed: 0,Pos,St,#,Driver,Sponsor / Owner,Car,Laps,Status,Led,Pts,PPts
0,1,8,41,Kurt Busch,Haas Automation / Monster Energy (Stewart Haas...,Ford,200,running,1,48,5
1,2,36,21,Ryan Blaney,Motorcraft / Quick Lane Tire & Auto Center (Wo...,Ford,200,running,2,44,0
2,3,38,47,A.J. Allmendinger,Kroger ClickList / Stouffer's / Cheez-It (JTG-...,Chevrolet,200,running,2,39,0
3,4,13,43,Aric Almirola,Smithfield Foods (Richard Petty Motorsports),Ford,200,running,2,33,0
4,5,33,27,Paul Menard,Menards / Peak (Richard Childress),Chevrolet,200,running,0,32,0
5,6,15,22,Joey Logano,Shell / Pennzoil (Roger Penske),Ford,200,running,16,40,0
6,7,26,5,Kasey Kahne,Farmers Insurance (Rick Hendrick),Chevrolet,200,running,7,30,0
7,8,30,15,Michael Waltrip,"Aaron's ""Thanks, Mikey!"" (Jay Robinson)",Toyota,200,running,0,29,0
8,9,25,32,Matt DiBenedetto,E.J. Wade Foundation (Archie St. Hilaire),Ford,200,running,0,28,0
9,10,11,6,Trevor Bayne,AdvoCare (Jack Roush),Ford,200,running,0,27,0


In [43]:
#create a dictionary containing a list of each year's tracks
trackdata = []
years = range(2017, 2023)
for x in cup_results:
    trackdata.append(BeautifulSoup(x.text, 'lxml').find_all(class_='track W'))

tracks = []
for group in trackdata:
    yeartracks = []
    for tag in group:
        yeartracks.append(tag.text.strip())
    tracks.append(yeartracks)
    
trackdict = dict(zip(years, tracks))

In [44]:
trackdict

{2017: ['Daytona',
  'Atlanta',
  'Las Vegas',
  'Phoenix',
  'Fontana',
  'Martinsville',
  'Fort Worth',
  'Bristol',
  'Richmond',
  'Talladega',
  'Kansas',
  'Charlotte',
  'Dover',
  'Pocono',
  'Michigan',
  'Sonoma',
  'Daytona',
  'Kentucky',
  'Loudon',
  'Indianapolis',
  'Pocono',
  'Watkins Glen',
  'Michigan',
  'Bristol',
  'Darlington',
  'Richmond',
  'Chicago',
  'Loudon',
  'Dover',
  'Charlotte',
  'Talladega',
  'Kansas',
  'Martinsville',
  'Fort Worth',
  'Phoenix',
  'Homestead'],
 2018: ['Daytona',
  'Atlanta',
  'Las Vegas',
  'Phoenix',
  'Fontana',
  'Martinsville',
  'Fort Worth',
  'Bristol',
  'Richmond',
  'Talladega',
  'Dover',
  'Kansas',
  'Charlotte',
  'Pocono',
  'Michigan',
  'Sonoma',
  'Chicago',
  'Daytona',
  'Kentucky',
  'Loudon',
  'Pocono',
  'Watkins Glen',
  'Michigan',
  'Bristol',
  'Darlington',
  'Indianapolis',
  'Las Vegas',
  'Richmond',
  'Charlotte (Road)',
  'Dover',
  'Talladega',
  'Kansas',
  'Martinsville',
  'Fort Worth',

In [45]:
len(races)

216

In [46]:
#flatten track list
tracklst = [x for l in tracks for x in l]

#create empty dataframe to put race data into
racedata = pd.DataFrame(columns = [
 'Pos',
 'St',
 '#',
 'Driver',
 'Sponsor / Owner',
 'Car',
 'Laps',
 'Status',
 'Led',
 'Pts',
 'PPts',
 'Track'])

#create dataframe with data for all races from 2017-2021
for i in range(len(races)):
    race = pd.read_html(races[i].text, match='Sponsor / Owner', header=0)[-1]
    race['Track'] = tracklst[i]
    race['Date'] = race_dates[i]
    race['Date_dtobj'] = race_dates_dt[i]
    racedata = pd.concat([racedata, race])
    #print(race)
    
print(racedata)

   Pos  St   #                Driver  \
0    1   8  41            Kurt Busch   
1    2  36  21           Ryan Blaney   
2    3  38  47     A.J. Allmendinger   
3    4  13  43         Aric Almirola   
4    5  33  27           Paul Menard   
..  ..  ..  ..                   ...   
31  32  29  47  Ricky Stenhouse, Jr.   
32  33  35  15      Garrett Smithley   
33  34  24  48           Alex Bowman   
34  35  18   6       Brad Keselowski   
35  36  36  77        Landon Cassill   

                                      Sponsor / Owner        Car Laps  \
0   Haas Automation / Monster Energy (Stewart Haas...       Ford  200   
1   Motorcraft / Quick Lane Tire & Auto Center (Wo...       Ford  200   
2   Kroger ClickList / Stouffer's / Cheez-It (JTG-...  Chevrolet  200   
3        Smithfield Foods (Richard Petty Motorsports)       Ford  200   
4                  Menards / Peak (Richard Childress)  Chevrolet  200   
..                                                ...        ...  ...   
31      

In [47]:
type(racedata)

pandas.core.frame.DataFrame

In [48]:
# dump racedata to csv
racedata.to_csv('data/racedata_2017-2022.csv')

# ETL

In [337]:
# Imports
import pandas as pd
import pickle
import numpy as np

# read in race data
racedata = pd.read_csv('data/racedata_2017-2022.csv')

In [338]:
racedata

Unnamed: 0.1,Unnamed: 0,Pos,St,#,Driver,Sponsor / Owner,Car,Laps,Status,Led,Pts,PPts,Track,Date,Date_dtobj
0,0,1,8,41,Kurt Busch,Haas Automation / Monster Energy (Stewart Haas...,Ford,200,running,1,48,5,Daytona,02/26/17,2017-02-26
1,1,2,36,21,Ryan Blaney,Motorcraft / Quick Lane Tire & Auto Center (Wo...,Ford,200,running,2,44,0,Daytona,02/26/17,2017-02-26
2,2,3,38,47,A.J. Allmendinger,Kroger ClickList / Stouffer's / Cheez-It (JTG-...,Chevrolet,200,running,2,39,0,Daytona,02/26/17,2017-02-26
3,3,4,13,43,Aric Almirola,Smithfield Foods (Richard Petty Motorsports),Ford,200,running,2,33,0,Daytona,02/26/17,2017-02-26
4,4,5,33,27,Paul Menard,Menards / Peak (Richard Childress),Chevrolet,200,running,0,32,0,Daytona,02/26/17,2017-02-26
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8307,31,32,29,47,"Ricky Stenhouse, Jr.",Fry's/ Nature Valley (JTG-Daugherty Racing),Chevrolet,307,running,0,5,0,Phoenix,11/06/22,2022-11-06
8308,32,33,35,15,Garrett Smithley,Jacob Companies (Rick Ware),Ford,304,running,0,0,0,Phoenix,11/06/22,2022-11-06
8309,33,34,24,48,Alex Bowman,Ally (Rick Hendrick),Chevrolet,304,running,1,3,0,Phoenix,11/06/22,2022-11-06
8310,34,35,18,6,Brad Keselowski,Kohler Generators (Jack Roush),Ford,270,electrical,0,2,0,Phoenix,11/06/22,2022-11-06


In [339]:
racedata['Year'] = pd.to_datetime(racedata['Date']).dt.year

In [340]:
racedata['race_ID'] = np.nan

In [341]:
list(racedata.columns).index('race_ID')

16

In [342]:
race_id = 1
for i in range(len(racedata)):
    # assign race ID of first entry
    if i == 0:
        racedata.iloc[i, 16] = int(race_id)
    elif i != 0:
        # check if prior entry's date is the same. if not, assign a new race ID
        if racedata.iloc[i, 13] != racedata.iloc[i-1, 13]:
            race_id += 1
            racedata.iloc[i, 16] = int(race_id)
        else:
            racedata.iloc[i, 16] = int(race_id)
            

In [343]:
len(racedata['Date'].unique().tolist())

216

In [344]:
# for ID in list(racedata['race_ID'].unique()):
#     race_df = racedata[(racedata['race_ID'] == ID)]
#     display(race_df)

In [345]:
race_df = racedata[(racedata['race_ID'] == 1)]

In [346]:
# dictionary for FD point values assigned for each finishing position
FD_points_dict = {
    '1' : 43,
    '2' : 40,
    '3' : 38,
    '4' : 37,
    '5' : 36,
    '6' : 35,
    '7' : 34,
    '8' : 33,
    '9' : 32,
    '10' : 31,
    '11' : 30,
    '12' : 29,
    '13' : 28,
    '14' : 27,
    '15' : 26,
    '16' : 25,
    '17' : 24,
    '18' : 23,
    '19' : 22,
    '20' : 21,
    '21' : 20,
    '22' : 19,
    '23' : 18,
    '24' : 17,
    '25' : 16,
    '26' : 15,
    '27' : 14,
    '28' : 13,
    '29' : 12,
    '30' : 11,
    '31' : 10,
    '32' : 9,
    '33' : 8,
    '34' : 7,
    '35' : 6,
    '36' : 5,
    '37' : 4,
    '38' : 3,
    '39' : 2,
    '40' : 1,
    '41' : 1,
    '42' : 1,
    '43' : 1
}

In [347]:
# laps completed: 0.1 * LC
# laps led: 0.1 * LL
# place diff: +/- 0.5 * place diff

In [348]:
racedata['FP'] = np.nan

In [349]:
list(racedata.columns).index('FP')

17

In [350]:
racedata

Unnamed: 0.1,Unnamed: 0,Pos,St,#,Driver,Sponsor / Owner,Car,Laps,Status,Led,Pts,PPts,Track,Date,Date_dtobj,Year,race_ID,FP
0,0,1,8,41,Kurt Busch,Haas Automation / Monster Energy (Stewart Haas...,Ford,200,running,1,48,5,Daytona,02/26/17,2017-02-26,2017,1.0,
1,1,2,36,21,Ryan Blaney,Motorcraft / Quick Lane Tire & Auto Center (Wo...,Ford,200,running,2,44,0,Daytona,02/26/17,2017-02-26,2017,1.0,
2,2,3,38,47,A.J. Allmendinger,Kroger ClickList / Stouffer's / Cheez-It (JTG-...,Chevrolet,200,running,2,39,0,Daytona,02/26/17,2017-02-26,2017,1.0,
3,3,4,13,43,Aric Almirola,Smithfield Foods (Richard Petty Motorsports),Ford,200,running,2,33,0,Daytona,02/26/17,2017-02-26,2017,1.0,
4,4,5,33,27,Paul Menard,Menards / Peak (Richard Childress),Chevrolet,200,running,0,32,0,Daytona,02/26/17,2017-02-26,2017,1.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8307,31,32,29,47,"Ricky Stenhouse, Jr.",Fry's/ Nature Valley (JTG-Daugherty Racing),Chevrolet,307,running,0,5,0,Phoenix,11/06/22,2022-11-06,2022,216.0,
8308,32,33,35,15,Garrett Smithley,Jacob Companies (Rick Ware),Ford,304,running,0,0,0,Phoenix,11/06/22,2022-11-06,2022,216.0,
8309,33,34,24,48,Alex Bowman,Ally (Rick Hendrick),Chevrolet,304,running,1,3,0,Phoenix,11/06/22,2022-11-06,2022,216.0,
8310,34,35,18,6,Brad Keselowski,Kohler Generators (Jack Roush),Ford,270,electrical,0,2,0,Phoenix,11/06/22,2022-11-06,2022,216.0,


In [351]:
for i in range(len(racedata)):
    
    # get start/finish position, laps completed, laps led, FP assigned on finish
    fin_pos = racedata.iloc[i, 1]
    strt_pos = racedata.iloc[i, 2]
    LC = racedata.iloc[i, 7]
    LL = racedata.iloc[i, 9]
    fin_pos_FP = FD_points_dict[str(racedata.iloc[i, 1])]
    place_diff = strt_pos - fin_pos
    
    # calculate fantasy points
    FP = fin_pos_FP + (0.1 * LC) + (0.1 * LL) + (0.5 * place_diff)
    
    # if fantasy points are negative, FP = 0
    if FP < 0:
        FP = 0
    
    racedata.iloc[i, 17] = FP

In [352]:
racedata_df = racedata.copy()

In [353]:
for col in racedata_df.columns:
    if 'Unnamed' in col:
        racedata_df.drop(col, axis = 1, inplace = True)
        
# drop unnecessary cols.
racedata_df.drop(['Sponsor / Owner', 'Car'], axis = 1, inplace = True)

In [354]:
racedata_df

Unnamed: 0,Pos,St,#,Driver,Laps,Status,Led,Pts,PPts,Track,Date,Date_dtobj,Year,race_ID,FP
0,1,8,41,Kurt Busch,200,running,1,48,5,Daytona,02/26/17,2017-02-26,2017,1.0,66.6
1,2,36,21,Ryan Blaney,200,running,2,44,0,Daytona,02/26/17,2017-02-26,2017,1.0,77.2
2,3,38,47,A.J. Allmendinger,200,running,2,39,0,Daytona,02/26/17,2017-02-26,2017,1.0,75.7
3,4,13,43,Aric Almirola,200,running,2,33,0,Daytona,02/26/17,2017-02-26,2017,1.0,61.7
4,5,33,27,Paul Menard,200,running,0,32,0,Daytona,02/26/17,2017-02-26,2017,1.0,70.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8307,32,29,47,"Ricky Stenhouse, Jr.",307,running,0,5,0,Phoenix,11/06/22,2022-11-06,2022,216.0,38.2
8308,33,35,15,Garrett Smithley,304,running,0,0,0,Phoenix,11/06/22,2022-11-06,2022,216.0,39.4
8309,34,24,48,Alex Bowman,304,running,1,3,0,Phoenix,11/06/22,2022-11-06,2022,216.0,32.5
8310,35,18,6,Brad Keselowski,270,electrical,0,2,0,Phoenix,11/06/22,2022-11-06,2022,216.0,24.5


In [355]:
# create column representing whether the driver finished the race
racedata_df['DNF'] = np.where(racedata_df['Status']!= 'running', 1, 0)

# create column representing whether the driver won the race
racedata_df['Win'] = np.where(racedata_df['Pos'] == 1, 1, 0)

# create column representing whether the driver finished in the top 5
racedata_df['T5'] = np.where(racedata_df['Pos'] <= 5, 1, 0)

# create column representing whether the driver finished in the top 5
racedata_df['T10'] = np.where(racedata_df['Pos'] <= 10, 1, 0)

# create column representing whether the driver finished in the top 5
racedata_df['T20'] = np.where(racedata_df['Pos'] <= 20, 1, 0)


In [356]:
racedata_df

Unnamed: 0,Pos,St,#,Driver,Laps,Status,Led,Pts,PPts,Track,Date,Date_dtobj,Year,race_ID,FP,DNF,Win,T5,T10,T20
0,1,8,41,Kurt Busch,200,running,1,48,5,Daytona,02/26/17,2017-02-26,2017,1.0,66.6,0,1,1,1,1
1,2,36,21,Ryan Blaney,200,running,2,44,0,Daytona,02/26/17,2017-02-26,2017,1.0,77.2,0,0,1,1,1
2,3,38,47,A.J. Allmendinger,200,running,2,39,0,Daytona,02/26/17,2017-02-26,2017,1.0,75.7,0,0,1,1,1
3,4,13,43,Aric Almirola,200,running,2,33,0,Daytona,02/26/17,2017-02-26,2017,1.0,61.7,0,0,1,1,1
4,5,33,27,Paul Menard,200,running,0,32,0,Daytona,02/26/17,2017-02-26,2017,1.0,70.0,0,0,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8307,32,29,47,"Ricky Stenhouse, Jr.",307,running,0,5,0,Phoenix,11/06/22,2022-11-06,2022,216.0,38.2,0,0,0,0,0
8308,33,35,15,Garrett Smithley,304,running,0,0,0,Phoenix,11/06/22,2022-11-06,2022,216.0,39.4,0,0,0,0,0
8309,34,24,48,Alex Bowman,304,running,1,3,0,Phoenix,11/06/22,2022-11-06,2022,216.0,32.5,0,0,0,0,0
8310,35,18,6,Brad Keselowski,270,electrical,0,2,0,Phoenix,11/06/22,2022-11-06,2022,216.0,24.5,1,0,0,0,0


In [357]:
tracktype_dict = {
    'Atlanta': 'int',
    'Austin': 'roadcourse',
    'Bristol': 'short',
    'Bristol (Dirt)': 'short',
    'Charlotte': 'int',
    'Charlotte (Road)': 'roadcourse',
    'Chicago': 'int',
    'Darlington': 'int',
    'Daytona': 'ss',
    'Daytona (Road)': 'roadcourse',
    'Dover': 'int',
    'Elkhart Lake': 'roadcourse',
    'Fontana': 'ss',
    'Fort Worth': 'int',
    'Gateway': 'int',
    'Homestead': 'int',
    'Indianapolis': 'flat',
    'Indianapolis G.P.': 'roadcourse',
    'Kansas': 'int',
    'Kentucky': 'int',
    'Las Vegas': 'int',
    'Loudon': 'flat',
    'Martinsville': 'short',
    'Michigan': 'int',
    'Nashville': 'int',
    'Phoenix': 'flat',
    'Pocono': 'flat',
    'Richmond': 'short',
    'Sonoma': 'roadcourse',
    'Talladega': 'ss',
    'Watkins Glen': 'roadcourse',
}

In [358]:
# assign track types
racedata_df['track_type'] = racedata_df['Track'].map(tracktype_dict)

In [359]:
# map track types to integers
tt_mapper = {'int' : 1, 'short' : 2, 'roadcourse' : 3, 'ss' : 4, 'flat' : 5}
racedata_df['track_type'] = racedata_df['track_type'].replace(tt_mapper)

In [360]:
racedata_df['place_diff'] = racedata_df['St'] - racedata_df['Pos']

In [361]:
# year to date sums for laps led, laps completed, DNFs, wins, T5, T10, T20
racedata_df['LL_yr_cumsum'] = racedata_df.groupby(['Driver', 'Year'])['Led'].transform(pd.Series.cumsum)
racedata_df['LC_yr_cumsum'] = racedata_df.groupby(['Driver', 'Year'])['Laps'].transform(pd.Series.cumsum)
racedata_df['DNF_yr_cumsum'] = racedata_df.groupby(['Driver', 'Year'])['DNF'].transform(pd.Series.cumsum)
racedata_df['Win_yr_cumsum'] = racedata_df.groupby(['Driver', 'Year'])['Win'].transform(pd.Series.cumsum)
racedata_df['T5_yr_cumsum'] = racedata_df.groupby(['Driver', 'Year'])['T5'].transform(pd.Series.cumsum)
racedata_df['T10_yr_cumsum'] = racedata_df.groupby(['Driver', 'Year'])['T10'].transform(pd.Series.cumsum)
racedata_df['T20_yr_cumsum'] = racedata_df.groupby(['Driver', 'Year'])['T20'].transform(pd.Series.cumsum)

# Last 3 means/sums
racedata_df['FinPos_mean_L3'] = racedata_df.groupby(['Driver'])['Pos'].transform(lambda x: x.shift().rolling(3, min_periods = 1).mean())
racedata_df['StrtPos_mean_L3'] = racedata_df.groupby(['Driver'])['St'].transform(lambda x: x.shift().rolling(3, min_periods = 1).mean())
racedata_df['PlaceDiff_mean_L3'] = racedata_df.groupby(['Driver'])['place_diff'].transform(lambda x: x.shift().rolling(3, min_periods = 1).mean())
racedata_df['LL_sum_L3'] = racedata_df.groupby(['Driver'])['Led'].transform(lambda x: x.shift().rolling(3, min_periods = 1).sum())
racedata_df['LL_mean_L3'] = racedata_df.groupby(['Driver'])['Led'].transform(lambda x: x.shift().rolling(3, min_periods = 1).mean())
racedata_df['LC_sum_L3'] = racedata_df.groupby(['Driver'])['Laps'].transform(lambda x: x.shift().rolling(3, min_periods = 1).sum())
racedata_df['LC_mean_L3'] = racedata_df.groupby(['Driver'])['Laps'].transform(lambda x: x.shift().rolling(3, min_periods = 1).mean())
racedata_df['Pts_sum_L3'] = racedata_df.groupby(['Driver'])['Pts'].transform(lambda x: x.shift().rolling(3, min_periods = 1).sum())
racedata_df['Pts_mean_L3'] = racedata_df.groupby(['Driver'])['Pts'].transform(lambda x: x.shift().rolling(3, min_periods = 1).mean())
racedata_df['PPts_sum_L3'] = racedata_df.groupby(['Driver'])['PPts'].transform(lambda x: x.shift().rolling(3, min_periods = 1).sum())
racedata_df['PPts_mean_L3'] = racedata_df.groupby(['Driver'])['PPts'].transform(lambda x: x.shift().rolling(3, min_periods = 1).mean())
racedata_df['FP_sum_L3'] = racedata_df.groupby(['Driver'])['FP'].transform(lambda x: x.shift().rolling(3, min_periods = 1).sum())
racedata_df['FP_mean_L3'] = racedata_df.groupby(['Driver'])['FP'].transform(lambda x: x.shift().rolling(3, min_periods = 1).mean())
racedata_df['DNF_sum_L3'] = racedata_df.groupby(['Driver'])['DNF'].transform(lambda x: x.shift().rolling(3, min_periods = 1).sum())
racedata_df['DNF_sum_L10'] = racedata_df.groupby(['Driver'])['DNF'].transform(lambda x: x.shift().rolling(10, min_periods = 1).sum())
racedata_df['Win_sum_L3'] = racedata_df.groupby(['Driver'])['Win'].transform(lambda x: x.shift().rolling(3, min_periods = 1).sum())
racedata_df['T5_sum_L3'] = racedata_df.groupby(['Driver'])['T5'].transform(lambda x: x.shift().rolling(3, min_periods = 1).sum())
racedata_df['T10_sum_L3'] = racedata_df.groupby(['Driver'])['T10'].transform(lambda x: x.shift().rolling(3, min_periods = 1).sum())
racedata_df['T20_sum_L3'] = racedata_df.groupby(['Driver'])['T20'].transform(lambda x: x.shift().rolling(3, min_periods = 1).sum())
racedata_df['Win_sum_L10'] = racedata_df.groupby(['Driver'])['Win'].transform(lambda x: x.shift().rolling(10, min_periods = 1).sum())
racedata_df['T5_sum_L10'] = racedata_df.groupby(['Driver'])['T5'].transform(lambda x: x.shift().rolling(10, min_periods = 1).sum())
racedata_df['T10_sum_L10'] = racedata_df.groupby(['Driver'])['T10'].transform(lambda x: x.shift().rolling(10, min_periods = 1).sum())
racedata_df['T20_sum_L10'] = racedata_df.groupby(['Driver'])['T10'].transform(lambda x: x.shift().rolling(3, min_periods = 1).sum())

# Last 3 means/sums at each track
racedata_df['FinPos_mean_L3_track'] = racedata_df.groupby(['Driver', 'Track'])['Pos'].transform(lambda x: x.shift().rolling(3, min_periods = 1).mean())
racedata_df['StrtPos_mean_L3_track'] = racedata_df.groupby(['Driver', 'Track'])['St'].transform(lambda x: x.shift().rolling(3, min_periods = 1).mean())
racedata_df['PlaceDiff_mean_L3_track'] = racedata_df.groupby(['Driver', 'Track'])['place_diff'].transform(lambda x: x.shift().rolling(3, min_periods = 1).mean())
racedata_df['LL_sum_L3_track'] = racedata_df.groupby(['Driver', 'Track'])['Led'].transform(lambda x: x.shift().rolling(3, min_periods = 1).sum())
racedata_df['LL_mean_L3_track'] = racedata_df.groupby(['Driver', 'Track'])['Led'].transform(lambda x: x.shift().rolling(3, min_periods = 1).mean())
racedata_df['LC_sum_L3_track'] = racedata_df.groupby(['Driver', 'Track'])['Laps'].transform(lambda x: x.shift().rolling(3, min_periods = 1).sum())
racedata_df['LC_mean_L3_track'] = racedata_df.groupby(['Driver', 'Track'])['Laps'].transform(lambda x: x.shift().rolling(3, min_periods = 1).mean())
racedata_df['Pts_sum_L3_track'] = racedata_df.groupby(['Driver', 'Track'])['Pts'].transform(lambda x: x.shift().rolling(3, min_periods = 1).sum())
racedata_df['Pts_mean_L3_track'] = racedata_df.groupby(['Driver', 'Track'])['Pts'].transform(lambda x: x.shift().rolling(3, min_periods = 1).mean())
racedata_df['PPts_sum_L3_track'] = racedata_df.groupby(['Driver', 'Track'])['PPts'].transform(lambda x: x.shift().rolling(3, min_periods = 1).sum())
racedata_df['PPts_mean_L3_track'] = racedata_df.groupby(['Driver', 'Track'])['PPts'].transform(lambda x: x.shift().rolling(3, min_periods = 1).mean())
racedata_df['FP_sum_L3_track'] = racedata_df.groupby(['Driver', 'Track'])['FP'].transform(lambda x: x.shift().rolling(3, min_periods = 1).sum())
racedata_df['FP_mean_L3_track'] = racedata_df.groupby(['Driver', 'Track'])['FP'].transform(lambda x: x.shift().rolling(3, min_periods = 1).mean())
racedata_df['DNF_sum_L3_track'] = racedata_df.groupby(['Driver', 'Track'])['DNF'].transform(lambda x: x.shift().rolling(3, min_periods = 1).sum())
racedata_df['Win_sum_L3_track'] = racedata_df.groupby(['Driver', 'Track'])['Win'].transform(lambda x: x.shift().rolling(3, min_periods = 1).sum())
racedata_df['T5_sum_L3_track'] = racedata_df.groupby(['Driver', 'Track'])['T5'].transform(lambda x: x.shift().rolling(3, min_periods = 1).sum())
racedata_df['T10_sum_L3_track'] = racedata_df.groupby(['Driver', 'Track'])['T10'].transform(lambda x: x.shift().rolling(3, min_periods = 1).sum())
racedata_df['T20_sum_L3_track'] = racedata_df.groupby(['Driver', 'Track'])['T20'].transform(lambda x: x.shift().rolling(3, min_periods = 1).sum())

                                                                              
# Last 3 means/sums at each tracktype
racedata_df['FinPos_mean_L3_tt'] = racedata_df.groupby(['Driver', 'track_type'])['Pos'].transform(lambda x: x.shift().rolling(3, min_periods = 1).mean())
racedata_df['StrtPos_mean_L3_tt'] = racedata_df.groupby(['Driver', 'track_type'])['St'].transform(lambda x: x.shift().rolling(3, min_periods = 1).mean())
racedata_df['PlaceDiff_mean_L3_tt'] = racedata_df.groupby(['Driver', 'track_type'])['place_diff'].transform(lambda x: x.shift().rolling(3, min_periods = 1).mean())
racedata_df['LL_sum_L3_tt'] = racedata_df.groupby(['Driver', 'track_type'])['Led'].transform(lambda x: x.shift().rolling(3, min_periods = 1).sum())
racedata_df['LL_mean_L3_tt'] = racedata_df.groupby(['Driver', 'track_type'])['Led'].transform(lambda x: x.shift().rolling(3, min_periods = 1).mean())
racedata_df['LC_sum_L3_tt'] = racedata_df.groupby(['Driver', 'track_type'])['Laps'].transform(lambda x: x.shift().rolling(3, min_periods = 1).sum())
racedata_df['LC_mean_L3_tt'] = racedata_df.groupby(['Driver', 'track_type'])['Laps'].transform(lambda x: x.shift().rolling(3, min_periods = 1).mean())
racedata_df['Pts_sum_L3_tt'] = racedata_df.groupby(['Driver', 'track_type'])['Pts'].transform(lambda x: x.shift().rolling(3, min_periods = 1).sum())
racedata_df['Pts_mean_L3_tt'] = racedata_df.groupby(['Driver', 'track_type'])['Pts'].transform(lambda x: x.shift().rolling(3, min_periods = 1).mean())
racedata_df['PPts_sum_L3_tt'] = racedata_df.groupby(['Driver', 'track_type'])['PPts'].transform(lambda x: x.shift().rolling(3, min_periods = 1).sum())
racedata_df['PPts_mean_L3_tt'] = racedata_df.groupby(['Driver', 'track_type'])['PPts'].transform(lambda x: x.shift().rolling(3, min_periods = 1).mean())
racedata_df['FP_sum_L3_tt'] = racedata_df.groupby(['Driver', 'track_type'])['FP'].transform(lambda x: x.shift().rolling(3, min_periods = 1).sum())
racedata_df['FP_mean_L3_tt'] = racedata_df.groupby(['Driver', 'track_type'])['FP'].transform(lambda x: x.shift().rolling(3, min_periods = 1).mean())
racedata_df['DNF_sum_L3_tt'] = racedata_df.groupby(['Driver', 'track_type'])['DNF'].transform(lambda x: x.shift().rolling(3, min_periods = 1).sum())
racedata_df['Win_sum_L3_tt'] = racedata_df.groupby(['Driver', 'track_type'])['Win'].transform(lambda x: x.shift().rolling(3, min_periods = 1).sum())
racedata_df['T5_sum_L3_tt'] = racedata_df.groupby(['Driver', 'track_type'])['T5'].transform(lambda x: x.shift().rolling(3, min_periods = 1).sum())
racedata_df['T10_sum_L3_tt'] = racedata_df.groupby(['Driver', 'track_type'])['T10'].transform(lambda x: x.shift().rolling(3, min_periods = 1).sum())
racedata_df['T20_sum_L3_tt'] = racedata_df.groupby(['Driver', 'track_type'])['T20'].transform(lambda x: x.shift().rolling(3, min_periods = 1).sum())

racedata_df.fillna(0, inplace = True)
# racedata_df.dropna(inplace = True)
# ranks?

In [362]:
pd.set_option('display.max_columns', None)

In [363]:
racedata_df.tail(n = 36)

Unnamed: 0,Pos,St,#,Driver,Laps,Status,Led,Pts,PPts,Track,Date,Date_dtobj,Year,race_ID,FP,DNF,Win,T5,T10,T20,track_type,place_diff,LL_yr_cumsum,LC_yr_cumsum,DNF_yr_cumsum,Win_yr_cumsum,T5_yr_cumsum,T10_yr_cumsum,T20_yr_cumsum,FinPos_mean_L3,StrtPos_mean_L3,PlaceDiff_mean_L3,LL_sum_L3,LL_mean_L3,LC_sum_L3,LC_mean_L3,Pts_sum_L3,Pts_mean_L3,PPts_sum_L3,PPts_mean_L3,FP_sum_L3,FP_mean_L3,DNF_sum_L3,DNF_sum_L10,Win_sum_L3,T5_sum_L3,T10_sum_L3,T20_sum_L3,Win_sum_L10,T5_sum_L10,T10_sum_L10,T20_sum_L10,FinPos_mean_L3_track,StrtPos_mean_L3_track,PlaceDiff_mean_L3_track,LL_sum_L3_track,LL_mean_L3_track,LC_sum_L3_track,LC_mean_L3_track,Pts_sum_L3_track,Pts_mean_L3_track,PPts_sum_L3_track,PPts_mean_L3_track,FP_sum_L3_track,FP_mean_L3_track,DNF_sum_L3_track,Win_sum_L3_track,T5_sum_L3_track,T10_sum_L3_track,T20_sum_L3_track,FinPos_mean_L3_tt,StrtPos_mean_L3_tt,PlaceDiff_mean_L3_tt,LL_sum_L3_tt,LL_mean_L3_tt,LC_sum_L3_tt,LC_mean_L3_tt,Pts_sum_L3_tt,Pts_mean_L3_tt,PPts_sum_L3_tt,PPts_mean_L3_tt,FP_sum_L3_tt,FP_mean_L3_tt,DNF_sum_L3_tt,Win_sum_L3_tt,T5_sum_L3_tt,T10_sum_L3_tt,T20_sum_L3_tt
8276,1,1,22,Joey Logano,312,running,187,40,6,Phoenix,11/06/22,2022-11-06,2022,216.0,92.9,0,1,1,1,1,5,0,784,9293,4,4,11,17,27,8.333333,11.333333,3.0,32.0,10.666667,1034.0,344.666667,115.0,38.333333,5.0,1.666667,212.1,70.7,0.0,1.0,1.0,1.0,2.0,3.0,1.0,3.0,4.0,2.0,7.0,9.666667,2.666667,147.0,49.0,936.0,312.0,114.0,38.0,1.0,0.333333,215.3,71.766667,0.0,0.0,1.0,2.0,3.0,17.333333,12.0,-5.333333,29.0,9.666667,772.0,257.333333,69.0,23.0,0.0,0.0,143.1,47.7,0.0,0.0,0.0,1.0,2.0
8277,2,2,12,Ryan Blaney,312,running,109,54,1,Phoenix,11/06/22,2022-11-06,2022,216.0,82.1,0,0,1,1,1,5,0,636,8891,3,0,12,17,27,16.0,7.0,-9.0,39.0,13.0,1027.0,342.333333,97.0,32.333333,1.0,0.333333,168.1,56.033333,0.0,0.0,0.0,1.0,1.0,2.0,0.0,3.0,4.0,1.0,6.0,4.333333,-1.666667,211.0,70.333333,936.0,312.0,128.0,42.666667,2.0,0.666667,217.2,72.4,0.0,0.0,2.0,3.0,3.0,18.333333,6.0,-12.333333,150.0,50.0,747.0,249.0,82.0,27.333333,1.0,0.333333,139.2,46.4,1.0,0.0,1.0,1.0,2.0
8278,3,25,1,Ross Chastain,312,running,0,34,0,Phoenix,11/06/22,2022-11-06,2022,216.0,80.2,0,0,1,1,1,5,22,692,9052,5,2,15,21,26,2.666667,13.333333,10.666667,68.0,22.666667,1034.0,344.666667,119.0,39.666667,0.0,0.0,243.2,81.066667,0.0,1.0,0.0,3.0,3.0,3.0,0.0,4.0,6.0,3.0,11.666667,18.333333,6.666667,0.0,0.0,936.0,312.0,80.0,26.666667,0.0,0.0,192.6,64.2,0.0,0.0,1.0,1.0,3.0,14.0,18.666667,4.666667,16.0,5.333333,755.0,251.666667,87.0,29.0,1.0,0.333333,166.1,55.366667,1.0,0.0,1.0,2.0,2.0
8279,4,3,14,Chase Briscoe,312,running,11,48,0,Phoenix,11/06/22,2022-11-06,2022,216.0,68.8,0,0,1,1,1,5,-1,280,9027,4,1,6,10,23,16.333333,12.666667,-3.666667,31.0,10.333333,927.0,309.0,68.0,22.666667,0.0,0.0,164.3,54.766667,1.0,2.0,0.0,1.0,2.0,2.0,0.0,2.0,5.0,2.0,19.333333,18.0,-1.333333,101.0,33.666667,777.0,259.0,69.0,23.0,5.0,1.666667,153.8,51.266667,1.0,1.0,1.0,1.0,1.0,10.333333,16.0,5.666667,107.0,35.666667,772.0,257.333333,96.0,32.0,5.0,1.666667,191.4,63.8,0.0,1.0,1.0,1.0,3.0
8280,5,7,4,Kevin Harvick,312,running,0,41,0,Phoenix,11/06/22,2022-11-06,2022,216.0,68.2,0,0,1,1,1,5,2,119,8942,7,2,9,17,27,12.0,12.333333,0.333333,0.0,0.0,1034.0,344.666667,79.0,26.333333,0.0,0.0,190.9,63.633333,0.0,3.0,0.0,0.0,1.0,3.0,0.0,1.0,3.0,1.0,6.666667,14.333333,7.666667,1.0,0.333333,936.0,312.0,125.0,41.666667,0.0,0.0,208.2,69.4,0.0,0.0,0.0,3.0,3.0,12.666667,16.666667,4.0,0.0,0.0,773.0,257.666667,109.0,36.333333,0.0,0.0,168.3,56.1,0.0,0.0,1.0,2.0,2.0
8281,6,8,24,William Byron,312,running,0,39,0,Phoenix,11/06/22,2022-11-06,2022,216.0,67.2,0,0,0,1,1,5,2,746,8878,6,2,5,11,27,10.666667,9.666667,-1.0,32.0,10.666667,1034.0,344.666667,102.0,34.0,0.0,0.0,196.1,65.366667,0.0,1.0,0.0,0.0,1.0,3.0,0.0,1.0,5.0,1.0,14.333333,5.333333,-9.0,12.0,4.0,936.0,312.0,98.0,32.666667,1.0,0.333333,161.3,53.766667,0.0,0.0,0.0,1.0,3.0,13.666667,9.0,-4.666667,12.0,4.0,773.0,257.666667,103.0,34.333333,1.0,0.333333,153.5,51.166667,0.0,0.0,0.0,0.0,3.0
8282,7,22,18,Kyle Busch,312,running,0,30,0,Phoenix,11/06/22,2022-11-06,2022,216.0,72.7,0,0,0,1,1,5,15,627,8479,7,1,8,17,22,13.666667,15.666667,2.0,0.0,0.0,1028.0,342.666667,71.0,23.666667,0.0,0.0,187.8,62.6,0.0,3.0,0.0,1.0,2.0,2.0,0.0,2.0,4.0,2.0,13.0,10.333333,-2.666667,0.0,0.0,935.0,311.666667,80.0,26.666667,0.0,0.0,173.5,57.833333,0.0,0.0,0.0,2.0,2.0,18.333333,10.0,-8.333333,63.0,21.0,773.0,257.666667,56.0,18.666667,0.0,0.0,139.1,46.366667,1.0,0.0,0.0,1.0,2.0
8283,8,21,11,Denny Hamlin,312,running,0,29,0,Phoenix,11/06/22,2022-11-06,2022,216.0,70.7,0,0,0,1,1,5,13,624,8978,6,2,10,16,24,5.666667,18.666667,13.0,212.0,70.666667,1034.0,344.666667,121.0,40.333333,2.0,0.666667,250.1,83.366667,0.0,1.0,0.0,2.0,3.0,3.0,0.0,5.0,8.0,3.0,6.333333,3.666667,-2.666667,33.0,11.0,936.0,312.0,107.0,35.666667,0.0,0.0,196.9,65.633333,0.0,0.0,2.0,2.0,3.0,18.0,5.666667,-12.333333,21.0,7.0,773.0,257.666667,58.0,19.333333,0.0,0.0,129.9,43.3,1.0,0.0,0.0,1.0,2.0
8284,9,4,5,Kyle Larson,312,running,0,38,0,Phoenix,11/06/22,2022-11-06,2022,216.0,60.7,0,0,0,1,1,5,-5,635,8750,7,3,13,19,27,12.666667,6.666667,-6.0,267.0,89.0,861.0,287.0,116.0,38.666667,7.0,2.333333,192.8,64.266667,1.0,2.0,1.0,2.0,2.0,2.0,1.0,3.0,5.0,2.0,14.0,3.333333,-10.666667,110.0,36.666667,863.0,287.666667,85.0,28.333333,6.0,2.0,165.3,55.1,1.0,1.0,1.0,2.0,2.0,17.666667,6.333333,-11.333333,20.0,6.666667,699.0,233.0,95.0,31.666667,1.0,0.333333,124.9,41.633333,1.0,0.0,1.0,1.0,2.0
8285,10,17,20,Christopher Bell,312,running,0,27,0,Phoenix,11/06/22,2022-11-06,2022,216.0,65.7,0,0,0,1,1,5,7,573,8816,6,3,12,20,26,15.333333,9.666667,-5.666667,154.0,51.333333,861.0,287.0,87.0,29.0,5.0,1.666667,173.0,57.666667,1.0,3.0,1.0,1.0,1.0,2.0,2.0,5.0,5.0,1.0,14.666667,5.333333,-9.333333,0.0,0.0,934.0,311.333333,73.0,24.333333,0.0,0.0,158.4,52.8,0.0,0.0,0.0,2.0,2.0,10.333333,6.666667,-3.666667,56.0,18.666667,771.0,257.0,98.0,32.666667,5.0,1.666667,172.2,57.4,0.0,1.0,2.0,2.0,2.0


In [364]:
len(racedata_df)

8312

In [365]:
racedata_df.loc[(racedata_df['Driver'] == 'Kyle Busch') & (racedata_df['track_type'] == 3)][['Pos', 'Track', 'FinPos_mean_L3_tt']].head()

Unnamed: 0,Pos,Track,FinPos_mean_L3_tt
590,5,Sonoma,0.0
827,7,Watkins Glen,5.0
1990,5,Sonoma,6.0
2221,3,Watkins Glen,5.666667
2527,32,Charlotte (Road),5.0


In [366]:
racedata_df.loc[(racedata_df['Driver'] == 'Kyle Busch') & (racedata_df['track_type'] == 3)][['Pos', 'Track', 'T5_sum_L3_tt']].head()

Unnamed: 0,Pos,Track,T5_sum_L3_tt
590,5,Sonoma,0.0
827,7,Watkins Glen,1.0
1990,5,Sonoma,1.0
2221,3,Watkins Glen,2.0
2527,32,Charlotte (Road),2.0


In [370]:
racedata_df.loc[(racedata_df['Driver'] == 'Kyle Busch') & (racedata_df['Year'] == 2018)][['Pos', 'Track', 'T5_yr_cumsum']].head(n=10)

Unnamed: 0,Pos,Track,T5_yr_cumsum
1438,25,Daytona,0
1460,7,Atlanta,0
1491,2,Las Vegas,1
1528,2,Phoenix,2
1566,3,Fontana,3
1602,2,Martinsville,4
1639,1,Fort Worth,5
1676,1,Bristol,6
1715,1,Richmond,7
1765,13,Talladega,7


In [371]:
racedata_df.to_csv('data/racedata_2017-2022_ETL.csv')

# Model Development

In [10]:
import pandas as pd
ml_df = pd.read_csv('data/racedata_2017-2022_ETL.csv', index_col = 0)

In [11]:
ml_df = ml_df.dropna()

In [12]:
import os
import pickle
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error

from sklearn.feature_selection import RFE

working_directory = 'D:/machine_learning/DFS/nascar/'
data_dir = 'data/'

# needs hyperparameters

dcols = [
    'Pos',
     'St',
     '#',
     'Driver',
     'Laps',
     'Led',
     'Pts',
     'PPts',
     'Track',
     'race_ID',
     'FP',
     'place_diff',
     'Date',
     'Date_dtobj',
     'Year',
     'DNF',
     'Win',
     'T5',
     'T10',
     'T20'
]

X = ml_df.drop(dcols, axis = 1)
Y = ml_df['FP']

from sklearn.model_selection import train_test_split

#Create Training and Testing DataSets
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.25, random_state=42)

X_train.reset_index(inplace = True, drop=True)
X_test.reset_index(inplace = True, drop=True)
Y_train.reset_index(inplace = True, drop=True)
Y_test.reset_index(inplace = True, drop=True)

print('Training set size:', len(X_train))
print('Testing set size:', len(X_test))

pred_df = pd.concat([X_test, Y_test], axis = 1)

print('\nNum Possible Features:',len(X_train.columns.tolist()))

Training set size: 6234
Testing set size: 2078

Num Possible Features: 68


In [13]:
# get statuses for comparison
train_status = X_train['Status']
test_status = X_test['Status']

X_train.drop(['Status'], axis = 1, inplace = True)
X_test.drop(['Status'], axis = 1, inplace = True)

pred_df['Status'] = test_status

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [14]:
# standardize data
from sklearn.preprocessing import StandardScaler, MinMaxScaler, Normalizer
from sklearn.compose import ColumnTransformer

# get numerical features from the features dataframe and store numerical columns in a variable
numerical_features = X_train.select_dtypes(include=['float64', 'int64'])
numerical_columns = numerical_features.columns
 
ct = ColumnTransformer([("only numeric", MinMaxScaler(), numerical_columns)], remainder='passthrough')

# fit columntransformer to the training data and transform it
X_train_scaled = ct.fit_transform(X_train)

# fit columntransformer to the testing data and transform it
X_test_scaled = ct.transform(X_test)

# convert to pd dataframes
X_train_scaled = pd.DataFrame(X_train_scaled, columns = X_train.columns)
X_test_scaled = pd.DataFrame(X_train_scaled, columns = X_test.columns)

In [15]:
ml_df

Unnamed: 0,Pos,St,#,Driver,Laps,Status,Led,Pts,PPts,Track,...,Pts_mean_L3_tt,PPts_sum_L3_tt,PPts_mean_L3_tt,FP_sum_L3_tt,FP_mean_L3_tt,DNF_sum_L3_tt,Win_sum_L3_tt,T5_sum_L3_tt,T10_sum_L3_tt,T20_sum_L3_tt
0,1,8,41,Kurt Busch,200,running,1,48,5,Daytona,...,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
1,2,36,21,Ryan Blaney,200,running,2,44,0,Daytona,...,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
2,3,38,47,A.J. Allmendinger,200,running,2,39,0,Daytona,...,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
3,4,13,43,Aric Almirola,200,running,2,33,0,Daytona,...,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
4,5,33,27,Paul Menard,200,running,0,32,0,Daytona,...,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8307,32,29,47,"Ricky Stenhouse, Jr.",307,running,0,5,0,Phoenix,...,14.333333,0.0,0.0,137.5,45.833333,0.0,0.0,0.0,0.0,1.0
8308,33,35,15,Garrett Smithley,304,running,0,0,0,Phoenix,...,1.666667,0.0,0.0,120.9,40.300000,0.0,0.0,0.0,0.0,0.0
8309,34,24,48,Alex Bowman,304,running,1,3,0,Phoenix,...,19.000000,0.0,0.0,107.1,35.700000,1.0,0.0,0.0,0.0,2.0
8310,35,18,6,Brad Keselowski,270,electrical,0,2,0,Phoenix,...,22.333333,0.0,0.0,160.7,53.566667,0.0,0.0,0.0,1.0,2.0


In [16]:
X_train_scaled

Unnamed: 0,track_type,LL_yr_cumsum,LC_yr_cumsum,DNF_yr_cumsum,Win_yr_cumsum,T5_yr_cumsum,T10_yr_cumsum,T20_yr_cumsum,FinPos_mean_L3,StrtPos_mean_L3,...,Pts_mean_L3_tt,PPts_sum_L3_tt,PPts_mean_L3_tt,FP_sum_L3_tt,FP_mean_L3_tt,DNF_sum_L3_tt,Win_sum_L3_tt,T5_sum_L3_tt,T10_sum_L3_tt,T20_sum_L3_tt
0,0.00,0.000387,0.338727,0.071429,0.0,0.000000,0.071429,0.171429,0.583333,0.508333,...,0.428571,0.344828,0.344828,0.333439,0.330506,0.333333,0.000000,0.000000,0.000000,0.333333
1,1.00,0.001550,0.488176,0.214286,0.0,0.000000,0.107143,0.314286,0.650000,0.408333,...,0.538776,0.344828,0.344828,0.381616,0.378260,0.000000,0.000000,0.000000,0.333333,0.666667
2,0.00,0.003100,0.583054,0.214286,0.0,0.000000,0.035714,0.314286,0.466667,0.591667,...,0.506122,0.344828,0.344828,0.437718,0.433867,0.000000,0.000000,0.000000,0.000000,0.666667
3,0.50,0.000000,0.029009,0.214286,0.0,0.000000,0.000000,0.000000,0.891667,0.866667,...,0.293878,0.344828,0.344828,0.087797,0.087025,0.666667,0.000000,0.000000,0.000000,0.000000
4,0.25,0.000775,0.250742,0.000000,0.0,0.045455,0.071429,0.114286,0.358333,0.583333,...,0.673469,0.344828,0.344828,0.307448,0.914232,0.000000,0.000000,0.000000,0.333333,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6229,0.00,0.000000,0.058880,0.071429,0.0,0.000000,0.035714,0.057143,0.666667,0.716667,...,0.526531,0.344828,0.344828,0.414580,0.410933,0.000000,0.000000,0.000000,0.000000,0.333333
6230,0.75,0.000387,0.163619,0.071429,0.0,0.000000,0.000000,0.085714,0.508333,0.641667,...,0.293878,0.344828,0.344828,0.448177,0.444235,0.333333,0.000000,0.000000,0.000000,0.666667
6231,0.75,0.000387,0.019052,0.000000,0.0,0.000000,0.000000,0.028571,0.875000,0.858333,...,0.355102,0.344828,0.344828,0.263391,0.261074,0.333333,0.000000,0.000000,0.000000,0.000000
6232,0.00,0.019372,0.510483,0.428571,0.0,0.090909,0.285714,0.428571,0.408333,0.200000,...,0.665306,0.344828,0.344828,0.562282,0.557336,0.333333,0.000000,0.000000,0.333333,1.000000


In [18]:
import os
import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from lightgbm import LGBMRegressor
import xgboost as xgb
from sklearn.metrics import mean_absolute_error

from sklearn.feature_selection import RFE

working_directory = 'D:/machine_learning/DFS/nascar/ML'
os.chdir(working_directory)

# needs hyperparameters
def lgbm_mod(): return LGBMRegressor(random_state = 1, n_estimators = 1000, learning_rate = 0.01, n_jobs = -1)
def xgb_mod(): return xgb.XGBRegressor(random_state = 1)
def rf_mod(): return RandomForestRegressor(random_state = 1, n_jobs = -1, n_estimators = 200, max_depth = 5)
def sgd_mod(): return SGDRegressor(max_iter=100, tol=1e-3, random_state = 1) # won't work with feature importance
def svm_mod(): return SVR(C=1.0, epsilon=0.2) # won't work with feature importance
def neigh_mod(): return KNeighborsRegressor()
def dt_mod(): return DecisionTreeRegressor(random_state=0)

""" MODEL SELECTION """

model = lgbm_mod()

"""                 """

#print possible features
print('possible features:', X_train_scaled.columns.tolist(), '\n')

# Fit model, make predictions with all features
model.fit(X_train_scaled, Y_train)

preds_all = model.predict(X_test)

pdf = pred_df[['Status', 'FP']].copy()
pdf['Pred_FP_all'] = preds_all

# save the initial model to disk
filename = 'models/nascar_model_allfeats.pkl'
joblib.dump(model, filename)

# get top 50 features
dset = pd.DataFrame({'attr':X_train.columns.tolist(),'importance':model.feature_importances_}).sort_values(by='importance', ascending=False).reset_index(drop=True)
attr50 = dset['attr'][0:50].tolist()

# Using Top 50 Features, Find Top 30 Features
model.fit(X_train[attr50], Y_train)
dset = pd.DataFrame({'attr':X_train[attr50].columns.tolist(),'importance':model.feature_importances_}).sort_values(by='importance', ascending=False).reset_index(drop=True)
attr30 = dset['attr'][0:30].tolist()

# Using Top 30 Features, Find Top 20 Features
model.fit(X_train[attr30], Y_train)
dset = pd.DataFrame({'attr':X_train[attr30].columns.tolist(),'importance':model.feature_importances_}).sort_values(by='importance', ascending=False).reset_index(drop=True)
attr20 = dset['attr'][0:20].tolist()

#Perform RFE (recursive feature elimination) using Top 20 Features, To Find Top 15
rfe_model = RFE(model, n_features_to_select = 15)
rfe_model.fit(X_train[attr20], Y_train)
dset = pd.DataFrame({'attr':X_train[attr20].columns.tolist(),'importance':rfe_model.ranking_}).sort_values(by='importance', ascending=False).reset_index(drop=True)
cols15 = dset[dset['importance']==1]['attr'].tolist()

#Perform RFE using Top 15 Features, To Find Top 10
rfe_model = RFE(model, n_features_to_select = 10)
rfe_model.fit(X_train[cols15], Y_train)
dset = pd.DataFrame({'attr':X_train[cols15].columns.tolist(),'importance':rfe_model.ranking_}).sort_values(by='importance', ascending=False).reset_index(drop=True)
cols10 = dset[dset['importance']==1]['attr'].tolist()

print('T50 features', attr50, '\n')
print('T30 features', attr30, '\n')
print('T20 features', attr20, '\n')
print('T15 features',cols15, '\n')
print('T10 features',cols10, '\n')

model.fit(X_train[attr50], Y_train)
preds50 = model.predict(X_test[attr50])
filename = 'models/nascar_model_50feats.pkl'
joblib.dump(model, filename)

model.fit(X_train[attr30], Y_train)
preds30 = model.predict(X_test[attr30])
filename = 'models/nascar_model_30feats.pkl'
joblib.dump(model, filename)

model.fit(X_train[attr20], Y_train)
preds20 = model.predict(X_test[attr20])
filename = 'models/nascar_model_20feats.pkl'
joblib.dump(model, filename)

model.fit(X_train[cols15], Y_train)
preds15 = model.predict(X_test[cols15])
filename = 'models/nascar_model_15feats.pkl'
joblib.dump(model, filename)

model.fit(X_train[cols10], Y_train)
preds10 = model.predict(X_test[cols10])
filename = 'models/nascar_model_10feats.pkl'
joblib.dump(model, filename)


pdf['Pred_FP_50'] = preds50
pdf['Pred_FP_30'] = preds30
pdf['Pred_FP_20'] = preds20
pdf['Pred_FP_15'] = preds15
pdf['Pred_FP_10'] = preds10

pdf.to_csv('predictions_50_30_20_15_10.csv')

feature_sets = ['all', '50', '30', '20', '15', '10']

mae_values = [
    "{:.2f}".format(mean_absolute_error(Y_test, preds_all)),
    "{:.2f}".format(mean_absolute_error(Y_test, preds50)),
    "{:.2f}".format(mean_absolute_error(Y_test, preds30)),
    "{:.2f}".format(mean_absolute_error(Y_test, preds20)),
    "{:.2f}".format(mean_absolute_error(Y_test, preds15)),
    "{:.2f}".format(mean_absolute_error(Y_test, preds10))
]

results_df = pd.DataFrame({'Features' : feature_sets, 'MAE' : mae_values})

results_df.style.hide_index()
results_df.to_clipboard()
display(results_df)

possible features: ['track_type', 'LL_yr_cumsum', 'LC_yr_cumsum', 'DNF_yr_cumsum', 'Win_yr_cumsum', 'T5_yr_cumsum', 'T10_yr_cumsum', 'T20_yr_cumsum', 'FinPos_mean_L3', 'StrtPos_mean_L3', 'PlaceDiff_mean_L3', 'LL_sum_L3', 'LL_mean_L3', 'LC_sum_L3', 'LC_mean_L3', 'Pts_sum_L3', 'Pts_mean_L3', 'PPts_sum_L3', 'PPts_mean_L3', 'FP_sum_L3', 'FP_mean_L3', 'DNF_sum_L3', 'DNF_sum_L10', 'Win_sum_L3', 'T5_sum_L3', 'T10_sum_L3', 'T20_sum_L3', 'Win_sum_L10', 'T5_sum_L10', 'T10_sum_L10', 'T20_sum_L10', 'FinPos_mean_L3_track', 'StrtPos_mean_L3_track', 'PlaceDiff_mean_L3_track', 'LL_sum_L3_track', 'LL_mean_L3_track', 'LC_sum_L3_track', 'LC_mean_L3_track', 'Pts_sum_L3_track', 'Pts_mean_L3_track', 'PPts_sum_L3_track', 'PPts_mean_L3_track', 'FP_sum_L3_track', 'FP_mean_L3_track', 'DNF_sum_L3_track', 'Win_sum_L3_track', 'T5_sum_L3_track', 'T10_sum_L3_track', 'T20_sum_L3_track', 'FinPos_mean_L3_tt', 'StrtPos_mean_L3_tt', 'PlaceDiff_mean_L3_tt', 'LL_sum_L3_tt', 'LL_mean_L3_tt', 'LC_sum_L3_tt', 'LC_mean_L3_tt',

Unnamed: 0,Features,MAE
0,all,25.85
1,50,12.06
2,30,12.18
3,20,12.14
4,15,12.92
5,10,13.15


In [380]:
test_preds = model.predict(X_test_scaled)
test_preds = [pred for [pred] in test_preds]
pdf = pd.DataFrame({'actual' : Y_test, 'predicted' : test_preds, 'abs. error': abs(test_preds-Y_test), 'status' : test_status})



In [381]:
pdf.loc[pdf['status'] == 'running'].describe()

Unnamed: 0,actual,predicted,abs. error
count,1797.0,1797.0,1797.0
mean,53.519421,49.623383,11.473874
std,18.939117,14.068583,9.130235
min,0.0,12.833677,0.002911
25%,39.7,39.377014,4.42169
50%,53.4,48.874165,9.301463
75%,66.0,58.573917,16.168793
max,138.6,108.717484,62.869211


In [382]:
pdf.loc[pdf['status'] != 'running'].describe()

Unnamed: 0,actual,predicted,abs. error
count,281.0,281.0,281.0
mean,18.48363,41.552753,24.41183
std,14.62147,12.253642,15.29526
min,0.0,13.525291,0.05869
25%,6.1,32.670452,12.266153
50%,15.9,39.381245,23.028477
75%,28.1,48.775669,34.328568
max,73.6,79.006569,70.062119


In [383]:
pdf.head(n=20)

Unnamed: 0,actual,predicted,abs. error,status
0,44.5,32.008884,12.491116,running
1,74.4,82.746796,8.346796,running
2,20.0,54.328568,34.328568,crash
3,77.0,57.456577,19.543423,running
4,72.7,52.579689,20.120311,running
5,56.8,56.631878,0.168122,running
6,37.2,38.757439,1.557439,running
7,44.9,35.598537,9.301463,running
8,40.0,39.028259,0.971741,running
9,39.7,34.268368,5.431632,electrical


In [384]:
pdf.describe()

Unnamed: 0,actual,predicted,abs. error
count,2078.0,2078.0,2078.0
mean,48.781665,48.532024,13.223424
std,21.968089,14.10709,11.09979
min,0.0,12.833677,0.002911
25%,33.9,38.12789,4.790457
50%,49.8,47.544008,10.416541
75%,63.575,57.595137,18.772615
max,138.6,108.717484,70.062119


---
**need to develop predictors based on track type**

* assign track type for each race, since it's known
* for each driver and track type, calculate means/sums for L3 of that track type