# Part 1: Get Races from 2017 - 2022

## Race Results

In [22]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd

BASE_URL = 'https://racing-reference.info'
years = range(2017, 2023)

# Download the season-stats page for every year in `years`, one response per year.
cup_results = []
for year in years:
    season_url = BASE_URL + f'/season-stats/{year}/W'
    cup_results.append(requests.get(season_url))

# Distinct HTTP status codes across all season pages (displayed as the cell output).
{page.status_code for page in cup_results}

{200}

In [23]:
# Collect, across all season pages, every element whose href matches the
# race-results pattern and the text of every <div class="date W">.
race_anchors = []
href_regex = re.compile('/race-results/.*/W')
race_dates = []

for c in cup_results:
    # Parse each page once and reuse the tree for both queries
    # (previously the same HTML was parsed twice per page).
    soup = BeautifulSoup(c.text, 'lxml')
    race_anchors.extend(soup.find_all(href=href_regex))
    race_dates.extend(res.text for res in soup.find_all("div", {"class": "date W"}))

In [26]:
# Number of date strings collected from the season pages.
len(race_dates)

216

In [38]:
from datetime import datetime

# Parse every scraped date string (month/day/two-digit year) into a datetime object,
# keeping the same order as `race_dates`.
race_dates_dt = [datetime.strptime(date_text, '%m/%d/%y') for date_text in race_dates]

In [27]:
# Number of race-result links collected from the season pages.
len(race_anchors)

216

In [28]:
# Download the page behind every collected race-result link, in link order.
races = []
for anchor in race_anchors:
    races.append(requests.get(anchor.attrs['href']))

In [29]:
# Distinct HTTP status codes across all downloaded race pages.
{response.status_code for response in races}

{200}

In [42]:
# display first race
from io import StringIO

# read_html is given a file-like object: passing a literal HTML string is
# deprecated since pandas 2.1. Of the tables containing the text
# 'Sponsor / Owner', the last one is shown, with its first row used as header.
pd.read_html(StringIO(races[0].text), match='Sponsor / Owner', header=0)[-1]

Unnamed: 0,Pos,St,#,Driver,Sponsor / Owner,Car,Laps,Status,Led,Pts,PPts
0,1,8,41,Kurt Busch,Haas Automation / Monster Energy (Stewart Haas...,Ford,200,running,1,48,5
1,2,36,21,Ryan Blaney,Motorcraft / Quick Lane Tire & Auto Center (Wo...,Ford,200,running,2,44,0
2,3,38,47,A.J. Allmendinger,Kroger ClickList / Stouffer's / Cheez-It (JTG-...,Chevrolet,200,running,2,39,0
3,4,13,43,Aric Almirola,Smithfield Foods (Richard Petty Motorsports),Ford,200,running,2,33,0
4,5,33,27,Paul Menard,Menards / Peak (Richard Childress),Chevrolet,200,running,0,32,0
5,6,15,22,Joey Logano,Shell / Pennzoil (Roger Penske),Ford,200,running,16,40,0
6,7,26,5,Kasey Kahne,Farmers Insurance (Rick Hendrick),Chevrolet,200,running,7,30,0
7,8,30,15,Michael Waltrip,"Aaron's ""Thanks, Mikey!"" (Jay Robinson)",Toyota,200,running,0,29,0
8,9,25,32,Matt DiBenedetto,E.J. Wade Foundation (Archie St. Hilaire),Ford,200,running,0,28,0
9,10,11,6,Trevor Bayne,AdvoCare (Jack Roush),Ford,200,running,0,27,0


In [43]:
# Build a dictionary mapping each year to the list of that year's track names.
years = range(2017, 2023)

# One result set of 'track W' elements per season page.
trackdata = [
    BeautifulSoup(page.text, 'lxml').find_all(class_='track W')
    for page in cup_results
]

# Whitespace-stripped text of every element, still grouped per season page.
tracks = [[element.text.strip() for element in season] for season in trackdata]

# Pair years with the per-page track lists positionally.
trackdict = dict(zip(years, tracks))

In [44]:
# Show the year -> list-of-track-names mapping built above.
trackdict

{2017: ['Daytona',
  'Atlanta',
  'Las Vegas',
  'Phoenix',
  'Fontana',
  'Martinsville',
  'Fort Worth',
  'Bristol',
  'Richmond',
  'Talladega',
  'Kansas',
  'Charlotte',
  'Dover',
  'Pocono',
  'Michigan',
  'Sonoma',
  'Daytona',
  'Kentucky',
  'Loudon',
  'Indianapolis',
  'Pocono',
  'Watkins Glen',
  'Michigan',
  'Bristol',
  'Darlington',
  'Richmond',
  'Chicago',
  'Loudon',
  'Dover',
  'Charlotte',
  'Talladega',
  'Kansas',
  'Martinsville',
  'Fort Worth',
  'Phoenix',
  'Homestead'],
 2018: ['Daytona',
  'Atlanta',
  'Las Vegas',
  'Phoenix',
  'Fontana',
  'Martinsville',
  'Fort Worth',
  'Bristol',
  'Richmond',
  'Talladega',
  'Dover',
  'Kansas',
  'Charlotte',
  'Pocono',
  'Michigan',
  'Sonoma',
  'Chicago',
  'Daytona',
  'Kentucky',
  'Loudon',
  'Pocono',
  'Watkins Glen',
  'Michigan',
  'Bristol',
  'Darlington',
  'Indianapolis',
  'Las Vegas',
  'Richmond',
  'Charlotte (Road)',
  'Dover',
  'Talladega',
  'Kansas',
  'Martinsville',
  'Fort Worth',

In [45]:
# Number of downloaded race pages.
len(races)

216

In [46]:
from io import StringIO

#flatten track list
tracklst = [track for year_tracks in tracks for track in year_tracks]

# Columns that lead the combined frame, in this order. Any other columns
# (the per-race 'Date' and 'Date_dtobj' added below) follow in order of
# first appearance.
racedata_columns = [
 'Pos',
 'St',
 '#',
 'Driver',
 'Sponsor / Owner',
 'Car',
 'Laps',
 'Status',
 'Led',
 'Pts',
 'PPts',
 'Track']

# Build one frame per race for all races from 2017-2022, then concatenate once.
# Concatenating inside the loop is quadratic, and concatenating onto an empty
# placeholder frame is deprecated in recent pandas.
# NOTE(review): tracklst, race_dates and race_dates_dt are assumed to line up
# index-for-index with `races` -- confirm they all have the same length.
race_frames = []
for i, response in enumerate(races):
    # StringIO: passing a literal HTML string to read_html is deprecated since pandas 2.1.
    race = pd.read_html(StringIO(response.text), match='Sponsor / Owner', header=0)[-1]
    race['Track'] = tracklst[i]
    race['Date'] = race_dates[i]
    race['Date_dtobj'] = race_dates_dt[i]
    race_frames.append(race)

if race_frames:
    # No ignore_index: each race keeps its own 0..n-1 row labels, as before.
    racedata = pd.concat(race_frames)
    extra_columns = [col for col in racedata.columns if col not in racedata_columns]
    racedata = racedata.reindex(columns=racedata_columns + extra_columns)
else:
    # pd.concat raises on an empty list; fall back to an empty frame with the expected columns.
    racedata = pd.DataFrame(columns=racedata_columns)

print(racedata)

   Pos  St   #                Driver  \
0    1   8  41            Kurt Busch   
1    2  36  21           Ryan Blaney   
2    3  38  47     A.J. Allmendinger   
3    4  13  43         Aric Almirola   
4    5  33  27           Paul Menard   
..  ..  ..  ..                   ...   
31  32  29  47  Ricky Stenhouse, Jr.   
32  33  35  15      Garrett Smithley   
33  34  24  48           Alex Bowman   
34  35  18   6       Brad Keselowski   
35  36  36  77        Landon Cassill   

                                      Sponsor / Owner        Car Laps  \
0   Haas Automation / Monster Energy (Stewart Haas...       Ford  200   
1   Motorcraft / Quick Lane Tire & Auto Center (Wo...       Ford  200   
2   Kroger ClickList / Stouffer's / Cheez-It (JTG-...  Chevrolet  200   
3        Smithfield Foods (Richard Petty Motorsports)       Ford  200   
4                  Menards / Peak (Richard Childress)  Chevrolet  200   
..                                                ...        ...  ...   
31      

In [48]:
# Write the combined race results (row index included) to a CSV file.
racedata.to_csv(path_or_buf='data/racedata_2017-2022.csv')

## Loop Data

In [142]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd

In [143]:
from io import StringIO

years = range(2017, 2023)
racenums = range(1,37,1)
frames = []
race_id = 0

# Column names assigned to every loop-data table, in table order.
LOOP_COLUMNS = ['Driver', 'Start', 'Mid Race', 'Finish', 'High Pos.', 'Low Pos.', 'Avg. Pos.', 'Pass Diff.', 'Green Flag Passes', 'Green Flag Times Passed', 'Quality Passes', 'Pct. Quality Passes', 'Fastest Lap', 'Top 15 Laps', 'Pct. Top 15 Laps', 'Laps Led', 'Pct. Laps Led', 'Total Laps', 'DRIVER RATING']


def fetch_loop_data(year, num, race_id):
    """Download and tidy the loop-data table for one race.

    Parameters
    ----------
    year : int
        Season year used in the URL.
    num : int
        Race number within the season; zero-padded to two digits in the URL.
    race_id : int
        Running identifier stored in the 'race_ID' column of the result.

    Returns
    -------
    pandas.DataFrame
        The fifth table on the page with row labels 0-2 dropped, the index
        reset, columns renamed to LOOP_COLUMNS and a 'race_ID' column added.
    """
    # :02d pads race numbers 1-9 with a leading zero and leaves 10+ unchanged,
    # replacing the former duplicated if/else branches.
    url = f'https://www.racing-reference.info/loopdata/{year}-{num:02d}/W/'
    print(f'getting loop data for race {num:02d} in {year} ({url})')
    response = requests.get(url)
    # StringIO: passing a literal HTML string to read_html is deprecated since pandas 2.1.
    # NOTE(review): table index 4 and the three dropped rows (presumably header
    # rows) depend on the page layout -- confirm against the site.
    df = pd.read_html(StringIO(response.text))[4].drop([0,1,2])
    df = df.reset_index(drop = True)
    df.columns = LOOP_COLUMNS
    df['race_ID'] = race_id
    return df


# loop through each year and racenumber and get loop data
for year in years:
    for num in racenums:
        race_id += 1
        frames.append(fetch_loop_data(year, num, race_id))

getting loop data for race 01 in 2017 (https://www.racing-reference.info/loopdata/2017-01/W/)
getting loop data for race 02 in 2017 (https://www.racing-reference.info/loopdata/2017-02/W/)
getting loop data for race 03 in 2017 (https://www.racing-reference.info/loopdata/2017-03/W/)
getting loop data for race 04 in 2017 (https://www.racing-reference.info/loopdata/2017-04/W/)
getting loop data for race 05 in 2017 (https://www.racing-reference.info/loopdata/2017-05/W/)
getting loop data for race 06 in 2017 (https://www.racing-reference.info/loopdata/2017-06/W/)
getting loop data for race 07 in 2017 (https://www.racing-reference.info/loopdata/2017-07/W/)
getting loop data for race 08 in 2017 (https://www.racing-reference.info/loopdata/2017-08/W/)
getting loop data for race 09 in 2017 (https://www.racing-reference.info/loopdata/2017-09/W/)
getting loop data for race 10 in 2017 (https://www.racing-reference.info/loopdata/2017-10/W/)
getting loop data for race 11 in 2017 (https://www.racing-re

getting loop data for race 17 in 2019 (https://www.racing-reference.info/loopdata/2019-17/W/)
getting loop data for race 18 in 2019 (https://www.racing-reference.info/loopdata/2019-18/W/)
getting loop data for race 19 in 2019 (https://www.racing-reference.info/loopdata/2019-19/W/)
getting loop data for race 20 in 2019 (https://www.racing-reference.info/loopdata/2019-20/W/)
getting loop data for race 21 in 2019 (https://www.racing-reference.info/loopdata/2019-21/W/)
getting loop data for race 22 in 2019 (https://www.racing-reference.info/loopdata/2019-22/W/)
getting loop data for race 23 in 2019 (https://www.racing-reference.info/loopdata/2019-23/W/)
getting loop data for race 24 in 2019 (https://www.racing-reference.info/loopdata/2019-24/W/)
getting loop data for race 25 in 2019 (https://www.racing-reference.info/loopdata/2019-25/W/)
getting loop data for race 26 in 2019 (https://www.racing-reference.info/loopdata/2019-26/W/)
getting loop data for race 27 in 2019 (https://www.racing-re

getting loop data for race 33 in 2021 (https://www.racing-reference.info/loopdata/2021-33/W/)
getting loop data for race 34 in 2021 (https://www.racing-reference.info/loopdata/2021-34/W/)
getting loop data for race 35 in 2021 (https://www.racing-reference.info/loopdata/2021-35/W/)
getting loop data for race 36 in 2021 (https://www.racing-reference.info/loopdata/2021-36/W/)
getting loop data for race 01 in 2022 (https://www.racing-reference.info/loopdata/2022-01/W/)
getting loop data for race 02 in 2022 (https://www.racing-reference.info/loopdata/2022-02/W/)
getting loop data for race 03 in 2022 (https://www.racing-reference.info/loopdata/2022-03/W/)
getting loop data for race 04 in 2022 (https://www.racing-reference.info/loopdata/2022-04/W/)
getting loop data for race 05 in 2022 (https://www.racing-reference.info/loopdata/2022-05/W/)
getting loop data for race 06 in 2022 (https://www.racing-reference.info/loopdata/2022-06/W/)
getting loop data for race 07 in 2022 (https://www.racing-re

In [145]:
# Stack the per-race loop-data frames row-wise into a single frame.
loopdata_df = pd.concat(objs=frames, axis='index')

In [149]:
# Display the combined loop data. Row labels restart for each race because the
# frames were concatenated without re-indexing; 'race_ID' tells races apart.
loopdata_df

Unnamed: 0,Driver,Start,Mid Race,Finish,High Pos.,Low Pos.,Avg. Pos.,Pass Diff.,Green Flag Passes,Green Flag Times Passed,Quality Passes,Pct. Quality Passes,Fastest Lap,Top 15 Laps,Pct. Top 15 Laps,Laps Led,Pct. Laps Led,Total Laps,DRIVER RATING,race_ID
0,Kurt Busch,8,23,1,1,37,12,17,248,231,181,73.0,5,133,66.5,1,0.5,200,107.0,1
1,Ryan Blaney,36,26,2,1,38,9,18,315,297,240,76.2,11,166,83.0,2,1.0,200,106.2,1
2,A.J. Allmendinger,38,15,3,1,38,18,53,250,197,105,42.0,6,75,37.5,2,1.0,200,80.5,1
3,Aric Almirola,13,13,4,1,31,15,-9,279,288,124,44.4,7,96,48.0,2,1.0,200,82.9,1
4,Paul Menard,33,35,5,3,37,16,14,303,289,172,56.8,8,109,54.5,0,0.0,200,92.9,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31,"Ricky Stenhouse, Jr.",29,34,32,24,34,32,2,28,26,0,0.0,0,0,0.0,0,0.0,307,32.5,216
32,Garrett Smithley,35,35,33,33,36,35,0,5,5,0,0.0,0,0,0.0,0,0.0,304,25.5,216
33,Alex Bowman,24,28,34,1,34,21,4,108,104,9,8.3,11,27,8.9,1,0.3,304,55.8,216
34,Brad Keselowski,18,21,35,14,30,25,-13,58,71,3,5.2,0,8,3.0,0,0.0,270,43.7,216


In [150]:
# Write the combined loop data (row index included) to a CSV file.
loopdata_df.to_csv(path_or_buf='data/loopdata_2017-2022.csv')