In [1]:
# Imports
import pandas as pd
import pickle
import numpy as np

# import pickled racedata object from NASCAR_dataimport.ipynb
racedata = pd.read_pickle('racedata_2017-2021.pkl')
racedata.to_csv('racedata_2017-2021.csv')

In [2]:
# get list of unique tracks
uniq_tracklst = list(racedata['Track'].unique())

In [3]:
# remove drivers appearing < 10 times
racedata = racedata[racedata.groupby('Driver').Driver.transform(len) >= 10]

In [4]:
#jr. name fix (ensures that these 2 drivers are represented in the figure due to a prior string mismatch)
racedata.replace('Ricky Stenhouse, Jr.', 'Ricky Stenhouse Jr.', inplace=True)
racedata.replace('Martin Truex, Jr.', 'Martin Truex Jr.', inplace=True)

In [5]:
# get list of unique drivers
uniq_drivers = list(racedata['Driver'].unique())

In [6]:
# display all rows
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# create dataframe with columns representing each driver and track
fp_df = pd.DataFrame(columns = ['Driver'] + uniq_tracklst)

# add unique drivers to Driver column
fp_df['Driver'] = sorted(uniq_drivers)

In [7]:
# fix incorrect sorting with jrs.

# replace jr. names w/ non-jr. names
fp_df.replace('Ricky Stenhouse Jr.', 'Ricky Stenhouse', inplace=True)
fp_df.replace('Martin Truex Jr.', 'Martin Truex', inplace=True)

# sort by last name
fp_df = fp_df.sort_values(by='Driver', key=lambda x: x.str.split('\s+').str[-1])

# replace non-jr. names w/ jr. names
fp_df.replace('Ricky Stenhouse', 'Ricky Stenhouse Jr.', inplace=True)
fp_df.replace('Martin Truex', 'Martin Truex Jr.', inplace=True)

In [8]:
# get list of active drivers with beautiful soup

# Importing the required libraries
import requests
import pandas as pd
from bs4 import BeautifulSoup

# Downloading contents of the web page
url = "https://www.espn.com/racing/drivers"
data = requests.get(url).text

# Creating BeautifulSoup object
soup = BeautifulSoup(data, 'html.parser')

# Verifying tables and their classes
print('Classes of each table:')
for table in soup.find_all('table'):
    print(table.get('class'))

# Creating list with all tables
tables = soup.find_all('table')

#  Looking for the table with the classes 'wikitable' and 'sortable'
table = soup.find('table')

# get active drivers from scraped data and store in active_drivers
rows = table.find_all('tr')
active_drivers = []
for row in rows:
    first_column = row.findAll('td')[0].contents
    for entry in first_column:
        active_drivers.append(entry.text.strip())
# drop unnecessary data
active_drivers = active_drivers[2:]

Classes of each table:
['tablehead']


In [9]:
active_drivers

['Ross Chastain',
 'Aric Almirola',
 'Denny Hamlin',
 'Ryan Blaney',
 'Chase Briscoe',
 'David Ragan',
 'J.J. Yeley',
 'Parker Kligerman',
 'Ryan Preece',
 'Garrett Smithley',
 'Joey Hand',
 'Joey Hand',
 'AJ Allmendinger',
 'Daniel Hemric',
 'Noah Gragson',
 'Chris Buescher',
 'Zane Smith',
 'Kyle Busch',
 'Martin Truex Jr.',
 'Austin Cindric',
 'Christopher Bell',
 'Harrison Burton',
 'Joey Logano',
 'Bubba Wallace',
 'William Byron',
 'Daniil Kvyat',
 'Jacques Villeneuve',
 'Loris Hezemans',
 'Austin Dillon',
 'Justin Haley',
 'Austin Hill',
 'Michael McDowell',
 'Todd Gilliland',
 'Kevin Harvick',
 'Cole Custer',
 'Ty Dillon',
 'Erik Jones',
 'Greg Biffle',
 'Kurt Busch',
 'Ty Gibbs',
 'Ricky Stenhouse Jr.',
 'Alex Bowman',
 'Kyle Larson',
 'Kaz Grala',
 'Cody Ware',
 'Brad Keselowski',
 'Boris Said',
 'Timmy Hill',
 'Corey LaJoie',
 'Justin Allgaier',
 'Landon Cassill',
 'Josh Bilicki',
 'Andy Lally',
 'B.J. McLeod',
 'Josh Williams',
 'Scott Heckert',
 'Kyle Tilley',
 'Tyler Redd

In [10]:
# drop inactive drivers from the dataframe
fp_df_copy = fp_df.copy()

# loop through drivers in the dataframe and remove them if they are not in active_drivers
for i in range(len(fp_df) - 1):
    #print(i)
    #print(tt_fp_df.iloc[i]['Driver'])
    driver = fp_df.iloc[i]['Driver']
    if driver not in active_drivers:
        fp_df_copy = fp_df_copy[fp_df_copy.Driver != driver]
        
fp_df = fp_df_copy
fp_df.reset_index(drop=True, inplace=True)

In [11]:
# get average finishing position for a driver at a track
def get_avg_fp(drivername, trackname):
    from statistics import mean
    temp_df = racedata.loc[(racedata['Driver'] == drivername) & (racedata['Track'] == trackname)]
    if not temp_df.empty:
        return mean(temp_df['Pos'])
    else:
        return np.nan

In [12]:
# for each driver, fill fp_df with average finishing position at each track
for colname in list(fp_df.columns):
    for i in range(len(fp_df['Driver'])):
        if colname != 'Driver':
            fp_df.at[i, colname] = format(float(get_avg_fp(fp_df.at[i, 'Driver'], colname)), '.1f')

In [13]:
#convert track columns to float
for track_col in uniq_tracklst:
    fp_df[track_col] = fp_df[track_col].astype(float)

In [14]:
fp_df

Unnamed: 0,Driver,Daytona,Atlanta,Las Vegas,Phoenix,Fontana,Martinsville,Fort Worth,Bristol,Richmond,Talladega,Kansas,Charlotte,Dover,Pocono,Michigan,Sonoma,Kentucky,Loudon,Indianapolis,Watkins Glen,Darlington,Chicago,Homestead,Charlotte (Road),Daytona (Road),Bristol (Dirt),Austin,Nashville,Elkhart Lake,Indianapolis G.P.
0,Aric Almirola,18.8,18.0,16.1,10.1,12.0,17.3,14.8,23.8,12.8,11.1,17.5,17.5,17.9,14.7,14.9,14.7,10.0,12.0,13.2,17.8,16.5,21.7,16.8,18.2,20.5,36.0,26.0,4.0,14.0,19.0
1,Christopher Bell,20.5,15.7,22.0,14.8,38.0,16.8,9.0,22.0,7.3,22.5,17.2,18.0,23.3,23.0,14.3,24.0,7.0,15.0,12.0,7.0,20.6,,14.0,16.0,11.0,34.0,38.0,9.0,2.0,36.0
2,Josh Bilicki,24.0,35.0,35.7,33.3,,30.0,29.0,35.0,36.5,33.5,33.6,35.5,32.7,34.0,32.3,32.5,32.0,35.0,27.0,33.7,31.8,33.0,35.3,33.0,36.0,30.0,30.0,26.0,23.0,18.0
3,Ryan Blaney,18.1,10.3,8.0,15.3,10.2,9.1,9.7,17.3,19.7,17.0,18.9,14.3,18.0,11.6,14.3,14.0,7.8,10.7,18.2,9.8,18.8,11.7,17.8,5.8,23.0,8.0,17.0,37.0,20.0,2.0
4,Alex Bowman,16.5,10.8,14.9,20.2,11.7,14.4,19.0,15.6,13.1,22.1,10.0,14.2,11.9,13.6,18.3,10.7,25.0,12.2,28.0,16.0,15.7,5.5,16.2,6.0,11.0,22.0,8.0,14.0,22.0,17.0
5,Chase Briscoe,20.0,19.0,17.5,28.5,,24.5,15.0,13.0,19.0,12.5,19.5,23.0,35.0,22.5,11.0,17.0,,27.0,,9.0,15.0,,18.0,22.0,32.0,20.0,6.0,31.0,6.0,26.0
6,Chris Buescher,19.2,17.2,16.8,22.3,21.8,17.4,21.6,23.8,25.7,16.9,17.1,16.1,22.7,21.6,19.0,15.8,17.2,22.5,20.0,15.2,17.6,22.3,20.2,14.5,8.0,14.0,13.0,36.0,18.0,12.0
7,Kyle Busch,24.7,6.3,8.9,6.2,3.5,6.6,9.1,9.9,6.7,20.2,10.5,10.1,13.0,7.0,5.7,4.2,8.0,16.3,21.2,6.2,10.6,12.7,4.6,25.8,36.0,17.0,10.0,11.0,3.0,20.0
8,Kurt Busch,22.8,10.7,20.3,15.5,11.8,13.6,9.1,11.7,14.8,16.1,12.4,16.1,17.2,14.0,9.0,8.0,10.5,17.3,19.5,9.5,10.4,16.3,15.6,13.5,9.0,16.0,27.0,8.0,4.0,6.0
9,William Byron,22.8,19.2,20.0,13.2,15.0,16.9,14.4,18.1,16.6,19.5,16.1,16.8,12.9,9.1,14.7,26.3,16.3,14.5,16.7,11.7,20.9,14.0,18.2,14.2,20.5,6.0,11.0,3.0,33.0,33.0


In [15]:
# function to sort drivers by their average finishing position at a track
def sort_avg_fp_track(trackname):
    track_df = fp_df[['Driver', trackname]]
    return track_df.sort_values(by=trackname)

In [16]:
sort_avg_fp_track('Watkins Glen')

Unnamed: 0,Driver,Watkins Glen
35,Martin Truex Jr.,2.0
15,Chase Elliott,4.2
17,Denny Hamlin,6.2
7,Kyle Busch,6.2
1,Christopher Bell,7.0
5,Chase Briscoe,9.0
25,Kyle Larson,9.5
8,Kurt Busch,9.5
3,Ryan Blaney,9.8
31,Tyler Reddick,10.0


In [17]:
list(fp_df.columns[1:])

['Daytona',
 'Atlanta',
 'Las Vegas',
 'Phoenix',
 'Fontana',
 'Martinsville',
 'Fort Worth',
 'Bristol',
 'Richmond',
 'Talladega',
 'Kansas',
 'Charlotte',
 'Dover',
 'Pocono',
 'Michigan',
 'Sonoma',
 'Kentucky',
 'Loudon',
 'Indianapolis',
 'Watkins Glen',
 'Darlington',
 'Chicago',
 'Homestead',
 'Charlotte (Road)',
 'Daytona (Road)',
 'Bristol (Dirt)',
 'Austin',
 'Nashville',
 'Elkhart Lake',
 'Indianapolis G.P.']

In [18]:
# assign tracks to their respective categories
tracks = list(fp_df.columns[1:])
plate_tracks = ['Daytona', 'Talladega']
flat_tracks = ['Indianapolis', 'Loudon', 'Phoenix', 'Pocono']
int_tracks = ['Atlanta', 'Darlington', 'Dover', 'Homestead', 'Kansas', 'Las Vegas', 'Michigan', 'Fontana', 'Fort Worth', 'Charlotte', 'Nashville']
short_tracks = ['Bristol', 'Bristol (Dirt)', 'Martinsville', 'Richmond']
road_courses = ['Watkins Glen', 'Sonoma', 'Daytona (Road)', 'Charlotte (Road)', 'Elkhart Lake', 'Austin', 'Indianapolis G.P.']
track_types = [plate_tracks, flat_tracks, int_tracks, short_tracks, road_courses]
trackset_names = ['plate_tracks', 'flat_tracks', 'int_tracks', 'short_tracks', 'road_courses']

In [19]:
# list of tracks in each group without apostrophes (for reddit post)
translation = {39: None}
for l in track_types:
    print(str(l).translate(translation))

[Daytona, Talladega]
[Indianapolis, Loudon, Phoenix, Pocono]
[Atlanta, Darlington, Dover, Homestead, Kansas, Las Vegas, Michigan, Fontana, Fort Worth, Charlotte, Nashville]
[Bristol, Bristol (Dirt), Martinsville, Richmond]
[Watkins Glen, Sonoma, Daytona (Road), Charlotte (Road), Elkhart Lake, Austin, Indianapolis G.P.]


In [20]:
# create new columns with each track type
for t_type in ['plate_tracks', 'flat_tracks', 'int_tracks', 'short_tracks', 'road_courses']:
    fp_df[t_type] = np.nan
    fp_df[t_type] = fp_df[t_type].astype(float)

In [21]:
# reset the index of fp_df
fp_df.reset_index(drop=True, inplace=True)

In [22]:
fp_df

Unnamed: 0,Driver,Daytona,Atlanta,Las Vegas,Phoenix,Fontana,Martinsville,Fort Worth,Bristol,Richmond,Talladega,Kansas,Charlotte,Dover,Pocono,Michigan,Sonoma,Kentucky,Loudon,Indianapolis,Watkins Glen,Darlington,Chicago,Homestead,Charlotte (Road),Daytona (Road),Bristol (Dirt),Austin,Nashville,Elkhart Lake,Indianapolis G.P.,plate_tracks,flat_tracks,int_tracks,short_tracks,road_courses
0,Aric Almirola,18.8,18.0,16.1,10.1,12.0,17.3,14.8,23.8,12.8,11.1,17.5,17.5,17.9,14.7,14.9,14.7,10.0,12.0,13.2,17.8,16.5,21.7,16.8,18.2,20.5,36.0,26.0,4.0,14.0,19.0,,,,,
1,Christopher Bell,20.5,15.7,22.0,14.8,38.0,16.8,9.0,22.0,7.3,22.5,17.2,18.0,23.3,23.0,14.3,24.0,7.0,15.0,12.0,7.0,20.6,,14.0,16.0,11.0,34.0,38.0,9.0,2.0,36.0,,,,,
2,Josh Bilicki,24.0,35.0,35.7,33.3,,30.0,29.0,35.0,36.5,33.5,33.6,35.5,32.7,34.0,32.3,32.5,32.0,35.0,27.0,33.7,31.8,33.0,35.3,33.0,36.0,30.0,30.0,26.0,23.0,18.0,,,,,
3,Ryan Blaney,18.1,10.3,8.0,15.3,10.2,9.1,9.7,17.3,19.7,17.0,18.9,14.3,18.0,11.6,14.3,14.0,7.8,10.7,18.2,9.8,18.8,11.7,17.8,5.8,23.0,8.0,17.0,37.0,20.0,2.0,,,,,
4,Alex Bowman,16.5,10.8,14.9,20.2,11.7,14.4,19.0,15.6,13.1,22.1,10.0,14.2,11.9,13.6,18.3,10.7,25.0,12.2,28.0,16.0,15.7,5.5,16.2,6.0,11.0,22.0,8.0,14.0,22.0,17.0,,,,,
5,Chase Briscoe,20.0,19.0,17.5,28.5,,24.5,15.0,13.0,19.0,12.5,19.5,23.0,35.0,22.5,11.0,17.0,,27.0,,9.0,15.0,,18.0,22.0,32.0,20.0,6.0,31.0,6.0,26.0,,,,,
6,Chris Buescher,19.2,17.2,16.8,22.3,21.8,17.4,21.6,23.8,25.7,16.9,17.1,16.1,22.7,21.6,19.0,15.8,17.2,22.5,20.0,15.2,17.6,22.3,20.2,14.5,8.0,14.0,13.0,36.0,18.0,12.0,,,,,
7,Kyle Busch,24.7,6.3,8.9,6.2,3.5,6.6,9.1,9.9,6.7,20.2,10.5,10.1,13.0,7.0,5.7,4.2,8.0,16.3,21.2,6.2,10.6,12.7,4.6,25.8,36.0,17.0,10.0,11.0,3.0,20.0,,,,,
8,Kurt Busch,22.8,10.7,20.3,15.5,11.8,13.6,9.1,11.7,14.8,16.1,12.4,16.1,17.2,14.0,9.0,8.0,10.5,17.3,19.5,9.5,10.4,16.3,15.6,13.5,9.0,16.0,27.0,8.0,4.0,6.0,,,,,
9,William Byron,22.8,19.2,20.0,13.2,15.0,16.9,14.4,18.1,16.6,19.5,16.1,16.8,12.9,9.1,14.7,26.3,16.3,14.5,16.7,11.7,20.9,14.0,18.2,14.2,20.5,6.0,11.0,3.0,33.0,33.0,,,,,


In [23]:
import math

for i in range(len(fp_df)): # for each driver
    print(fp_df.iloc[i]['Driver'])
    avgs = []
    for trackset in track_types:
        trackset_copy = trackset.copy()
        total = 0
        for track in trackset: # for each track in the trackset
            if math.isnan(float(fp_df.iloc[i][track])): # if the finishing position is nan, the driver hasn't raced at that track. remove the track from the trackset
                trackset_copy.remove(track)
            else: # otherwise, add their average finish at the track to the total used for categorical mean calculation
                total += float(fp_df.iloc[i][track])
        print(trackset_copy)
        if len(trackset_copy) == 0: # don't calculate an average if the driver hasn't raced in any races
            avg = np.nan
        else:
            avg = total / len(trackset_copy) # otherwise, calculate the average for tracks the driver has raced at
        print(avg)
        avgs.append(avg)
        
    # loop to write data to dataframe
    for name in trackset_names:
        fp_df.at[i, name] = format(avgs[trackset_names.index(name)], '.1f')
    print('\n')
        
        

Aric Almirola
['Daytona', 'Talladega']
14.95
['Indianapolis', 'Loudon', 'Phoenix', 'Pocono']
12.5
['Atlanta', 'Darlington', 'Dover', 'Homestead', 'Kansas', 'Las Vegas', 'Michigan', 'Fontana', 'Fort Worth', 'Charlotte', 'Nashville']
15.090909090909093
['Bristol', 'Bristol (Dirt)', 'Martinsville', 'Richmond']
22.474999999999998
['Watkins Glen', 'Sonoma', 'Daytona (Road)', 'Charlotte (Road)', 'Elkhart Lake', 'Austin', 'Indianapolis G.P.']
18.599999999999998


Christopher Bell
['Daytona', 'Talladega']
21.5
['Indianapolis', 'Loudon', 'Phoenix', 'Pocono']
16.2
['Atlanta', 'Darlington', 'Dover', 'Homestead', 'Kansas', 'Las Vegas', 'Michigan', 'Fontana', 'Fort Worth', 'Charlotte', 'Nashville']
18.28181818181818
['Bristol', 'Bristol (Dirt)', 'Martinsville', 'Richmond']
20.025
['Watkins Glen', 'Sonoma', 'Daytona (Road)', 'Charlotte (Road)', 'Elkhart Lake', 'Austin', 'Indianapolis G.P.']
19.142857142857142


Josh Bilicki
['Daytona', 'Talladega']
28.75
['Indianapolis', 'Loudon', 'Phoenix', 'Pocono

['Atlanta', 'Darlington', 'Dover', 'Homestead', 'Kansas', 'Las Vegas', 'Michigan', 'Fontana', 'Fort Worth', 'Charlotte', 'Nashville']
18.700000000000003
['Bristol', 'Bristol (Dirt)', 'Martinsville', 'Richmond']
14.475
['Watkins Glen', 'Sonoma', 'Daytona (Road)', 'Charlotte (Road)', 'Elkhart Lake', 'Austin', 'Indianapolis G.P.']
25.642857142857142


Martin Truex Jr.
['Daytona', 'Talladega']
22.799999999999997
['Indianapolis', 'Loudon', 'Phoenix', 'Pocono']
14.850000000000001
['Atlanta', 'Darlington', 'Dover', 'Homestead', 'Kansas', 'Las Vegas', 'Michigan', 'Fontana', 'Fort Worth', 'Charlotte', 'Nashville']
8.5
['Bristol', 'Bristol (Dirt)', 'Martinsville', 'Richmond']
12.6
['Watkins Glen', 'Sonoma', 'Daytona (Road)', 'Charlotte (Road)', 'Elkhart Lake', 'Austin', 'Indianapolis G.P.']
13.314285714285715


Bubba Wallace
['Daytona', 'Talladega']
16.6
['Indianapolis', 'Loudon', 'Phoenix', 'Pocono']
21.15
['Atlanta', 'Darlington', 'Dover', 'Homestead', 'Kansas', 'Las Vegas', 'Michigan', 'Fonta

In [24]:
display(fp_df)

Unnamed: 0,Driver,Daytona,Atlanta,Las Vegas,Phoenix,Fontana,Martinsville,Fort Worth,Bristol,Richmond,Talladega,Kansas,Charlotte,Dover,Pocono,Michigan,Sonoma,Kentucky,Loudon,Indianapolis,Watkins Glen,Darlington,Chicago,Homestead,Charlotte (Road),Daytona (Road),Bristol (Dirt),Austin,Nashville,Elkhart Lake,Indianapolis G.P.,plate_tracks,flat_tracks,int_tracks,short_tracks,road_courses
0,Aric Almirola,18.8,18.0,16.1,10.1,12.0,17.3,14.8,23.8,12.8,11.1,17.5,17.5,17.9,14.7,14.9,14.7,10.0,12.0,13.2,17.8,16.5,21.7,16.8,18.2,20.5,36.0,26.0,4.0,14.0,19.0,14.9,12.5,15.1,22.5,18.6
1,Christopher Bell,20.5,15.7,22.0,14.8,38.0,16.8,9.0,22.0,7.3,22.5,17.2,18.0,23.3,23.0,14.3,24.0,7.0,15.0,12.0,7.0,20.6,,14.0,16.0,11.0,34.0,38.0,9.0,2.0,36.0,21.5,16.2,18.3,20.0,19.1
2,Josh Bilicki,24.0,35.0,35.7,33.3,,30.0,29.0,35.0,36.5,33.5,33.6,35.5,32.7,34.0,32.3,32.5,32.0,35.0,27.0,33.7,31.8,33.0,35.3,33.0,36.0,30.0,30.0,26.0,23.0,18.0,28.8,32.3,32.7,32.9,29.5
3,Ryan Blaney,18.1,10.3,8.0,15.3,10.2,9.1,9.7,17.3,19.7,17.0,18.9,14.3,18.0,11.6,14.3,14.0,7.8,10.7,18.2,9.8,18.8,11.7,17.8,5.8,23.0,8.0,17.0,37.0,20.0,2.0,17.6,14.0,16.1,13.5,13.1
4,Alex Bowman,16.5,10.8,14.9,20.2,11.7,14.4,19.0,15.6,13.1,22.1,10.0,14.2,11.9,13.6,18.3,10.7,25.0,12.2,28.0,16.0,15.7,5.5,16.2,6.0,11.0,22.0,8.0,14.0,22.0,17.0,19.3,18.5,14.2,16.3,13.0
5,Chase Briscoe,20.0,19.0,17.5,28.5,,24.5,15.0,13.0,19.0,12.5,19.5,23.0,35.0,22.5,11.0,17.0,,27.0,,9.0,15.0,,18.0,22.0,32.0,20.0,6.0,31.0,6.0,26.0,16.2,26.0,20.4,19.1,16.9
6,Chris Buescher,19.2,17.2,16.8,22.3,21.8,17.4,21.6,23.8,25.7,16.9,17.1,16.1,22.7,21.6,19.0,15.8,17.2,22.5,20.0,15.2,17.6,22.3,20.2,14.5,8.0,14.0,13.0,36.0,18.0,12.0,18.0,21.6,20.6,20.2,13.8
7,Kyle Busch,24.7,6.3,8.9,6.2,3.5,6.6,9.1,9.9,6.7,20.2,10.5,10.1,13.0,7.0,5.7,4.2,8.0,16.3,21.2,6.2,10.6,12.7,4.6,25.8,36.0,17.0,10.0,11.0,3.0,20.0,22.4,12.7,8.5,10.1,15.0
8,Kurt Busch,22.8,10.7,20.3,15.5,11.8,13.6,9.1,11.7,14.8,16.1,12.4,16.1,17.2,14.0,9.0,8.0,10.5,17.3,19.5,9.5,10.4,16.3,15.6,13.5,9.0,16.0,27.0,8.0,4.0,6.0,19.5,16.6,12.8,14.0,11.0
9,William Byron,22.8,19.2,20.0,13.2,15.0,16.9,14.4,18.1,16.6,19.5,16.1,16.8,12.9,9.1,14.7,26.3,16.3,14.5,16.7,11.7,20.9,14.0,18.2,14.2,20.5,6.0,11.0,3.0,33.0,33.0,21.1,13.4,15.6,14.4,21.4


In [25]:
# convert numeric columns from string to float
cols = list(fp_df.columns)
cols.pop(0)
for col in cols:
    fp_df[col] = fp_df[col].astype(float)

In [26]:
# function to sort 
def sort_avg_fp_track(trackname, tracktype):
    track_df = fp_df[['Driver', trackname, tracktype]]
    return track_df.sort_values(by=trackname)

In [27]:
track_data = sort_avg_fp_track('Watkins Glen', 'road_courses')

display(track_data)

Unnamed: 0,Driver,Watkins Glen,road_courses
35,Martin Truex Jr.,2.0,13.3
15,Chase Elliott,4.2,5.6
17,Denny Hamlin,6.2,10.0
7,Kyle Busch,6.2,15.0
1,Christopher Bell,7.0,19.1
5,Chase Briscoe,9.0,16.9
25,Kyle Larson,9.5,12.3
8,Kurt Busch,9.5,11.0
3,Ryan Blaney,9.8,13.1
31,Tyler Reddick,10.0,14.6


In [28]:
track_data

Unnamed: 0,Driver,Watkins Glen,road_courses
35,Martin Truex Jr.,2.0,13.3
15,Chase Elliott,4.2,5.6
17,Denny Hamlin,6.2,10.0
7,Kyle Busch,6.2,15.0
1,Christopher Bell,7.0,19.1
5,Chase Briscoe,9.0,16.9
25,Kyle Larson,9.5,12.3
8,Kurt Busch,9.5,11.0
3,Ryan Blaney,9.8,13.1
31,Tyler Reddick,10.0,14.6


In [29]:
# subset data to only include averages at each track type
tt_fp_df = fp_df[['Driver', 'plate_tracks', 'flat_tracks', 'int_tracks', 'short_tracks', 'road_courses']]

In [30]:
# sort average finishes for a specific track type
tt_fp_df.sort_values(by='road_courses', ascending = True)

Unnamed: 0,Driver,plate_tracks,flat_tracks,int_tracks,short_tracks,road_courses
15,Chase Elliott,17.9,14.5,14.2,11.2,5.6
17,Denny Hamlin,12.4,9.7,11.5,7.8,10.0
8,Kurt Busch,19.5,16.6,12.8,14.0,11.0
25,Kyle Larson,22.1,14.8,9.7,16.6,12.3
4,Alex Bowman,19.3,18.5,14.2,16.3,13.0
3,Ryan Blaney,17.6,14.0,16.1,13.5,13.1
35,Martin Truex Jr.,22.8,14.9,8.5,12.6,13.3
6,Chris Buescher,18.0,21.6,20.6,20.2,13.8
21,Erik Jones,21.3,18.8,15.9,15.4,14.5
31,Tyler Reddick,20.7,16.4,14.0,14.0,14.6


## Starting Position Diff

In [31]:
# create dataframe with columns representing each driver and track
sp_df = pd.DataFrame(columns = ['Driver'] + uniq_tracklst)

# add unique drivers to Driver column
sp_df['Driver'] = sorted(uniq_drivers)

# fix incorrect sorting with jrs.

# replace jr. names w/ non-jr. names
sp_df.replace('Ricky Stenhouse Jr.', 'Ricky Stenhouse', inplace=True)
sp_df.replace('Martin Truex Jr.', 'Martin Truex', inplace=True)

# sort by last name
sp_df = sp_df.sort_values(by='Driver', key=lambda x: x.str.split('\s+').str[-1])

# replace non-jr. names w/ jr. names
sp_df.replace('Ricky Stenhouse', 'Ricky Stenhouse Jr.', inplace=True)
sp_df.replace('Martin Truex', 'Martin Truex Jr.', inplace=True)

# drop inactive drivers from the dataframe
sp_df_copy = sp_df.copy()

# loop through drivers in the dataframe and remove them if they are not in active_drivers
for i in range(len(sp_df) - 1):
    #print(i)
    #print(tt_fp_df.iloc[i]['Driver'])
    driver = sp_df.iloc[i]['Driver']
    if driver not in active_drivers:
        sp_df_copy = sp_df_copy[sp_df_copy.Driver != driver]
        
sp_df = sp_df_copy
sp_df.reset_index(drop=True, inplace=True)

In [32]:
sp_df

Unnamed: 0,Driver,Daytona,Atlanta,Las Vegas,Phoenix,Fontana,Martinsville,Fort Worth,Bristol,Richmond,Talladega,Kansas,Charlotte,Dover,Pocono,Michigan,Sonoma,Kentucky,Loudon,Indianapolis,Watkins Glen,Darlington,Chicago,Homestead,Charlotte (Road),Daytona (Road),Bristol (Dirt),Austin,Nashville,Elkhart Lake,Indianapolis G.P.
0,Aric Almirola,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,Christopher Bell,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,Josh Bilicki,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,Ryan Blaney,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,Alex Bowman,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
5,Chase Briscoe,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
6,Chris Buescher,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
7,Kyle Busch,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
8,Kurt Busch,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
9,William Byron,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [33]:
# get average finishing position for a driver at a track
def get_sp_fp_diff(drivername, trackname):
    from statistics import mean
    temp_df = racedata.loc[(racedata['Driver'] == drivername) & (racedata['Track'] == trackname)]
    temp_df.reset_index(inplace = True)
    pos_gains = []
    if not temp_df.empty:
        for i in range(len(temp_df)):
            sp = temp_df['St'][i]
            fp = temp_df['Pos'][i]
            pos_change = fp-sp
#             print('start:', sp, 'finish:', fp, 'gain:', pos_change)
            pos_gains.append(pos_change)
        return(float(mean(pos_gains)))
    else:
        return np.nan

In [34]:
# for each driver, fill fp_df with average finishing position at each track
for colname in list(sp_df.columns):
    for i in range(len(sp_df['Driver'])):
        if colname != 'Driver':
            sp_df.at[i, colname] = format(float(get_sp_fp_diff(sp_df.at[i, 'Driver'], colname)), '.1f')

In [35]:
# function to sort drivers by their average finishing position at a track
def sort_avg_spfp_diff_track(trackname):
    track_df = sp_df[['Driver', trackname]]
    return track_df.sort_values(by=trackname, ascending = False)

In [36]:
sort_avg_spfp_diff_track('Watkins Glen')

Unnamed: 0,Driver,Watkins Glen
26,Joey Logano,16.0
19,Daniel Hemric,12.0
22,Brad Keselowski,10.0
29,Ryan Preece,6.0
25,Kyle Larson,5.0
32,Garrett Smithley,4.0
0,Aric Almirola,4.0
14,Austin Dillon,1.0
27,Michael McDowell,1.0
7,Kyle Busch,0.0


In [37]:
# for each driver, calculate individual fp-sp at each track type, calculate the mean, write to dataframe
def sort_avg_spfp_diff_tracktype(tracklist):
    from statistics import mean
    drivers = []
    spfp_diffs = []
    for i in range(len(sp_df)):
        diff_list = []
        driver = sp_df['Driver'][i]
        drivers.append(driver)
        for trackname in tracklist:
            diff_list.append(sp_df[trackname][i])
        spfp_diff = mean(diff_list)
        spfp_diffs.append(spfp_diff)
    return(pd.DataFrame({'Driver': drivers, 'Average Pos. Change': spfp_diffs}))

In [38]:
diff_df = sort_avg_spfp_diff_tracktype(road_courses)
diff_df.sort_values(by = 'Average Pos. Change', ascending = False)

Unnamed: 0,Driver,Average Pos. Change
9,William Byron,13.857143
34,Daniel Suarez,9.428571
25,Kyle Larson,8.0
26,Joey Logano,6.0
1,Christopher Bell,5.0
35,Martin Truex Jr.,4.142857
18,Kevin Harvick,3.285714
12,Cole Custer,3.0
31,Tyler Reddick,2.857143
14,Austin Dillon,2.0
