# Part 1: Get Races from 2017 - 2022

In [22]:
import re
from io import StringIO
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Root of the site being scraped; season pages are requested from
# /season-stats/<year>/W beneath it.
BASE_URL = 'https://racing-reference.info'
# Seasons to fetch: 2017 through 2022 inclusive (range's stop is exclusive).
years = range(2017, 2023)
# One response per season, in `years` order. The explicit timeout keeps a
# stalled connection from hanging the kernel; without one, requests waits
# indefinitely.
cup_results = [
    requests.get(BASE_URL + f'/season-stats/{year}/W', timeout=30)
    for year in years
]
# Distinct HTTP status codes returned for the season pages.
{r.status_code for r in cup_results}

{200}

In [23]:
# Collect the race-result links and the race dates from every season page.
# Both lists grow in page order, so later cells can pair them by position.
race_anchors = []
# Matches any href containing '/race-results/' followed later by '/W'.
href_regex = re.compile('/race-results/.*/W')
race_dates = []

for c in cup_results:
    # Parse each page once and reuse the tree for both queries.
    soup = BeautifulSoup(c.text, 'lxml')
    race_anchors.extend(soup.find_all(href=href_regex))
    race_dates.extend(
        date_div.text for date_div in soup.find_all("div", {"class": "date W"})
    )

In [26]:
# Number of race-date strings scraped from the season pages.
len(race_dates)

216

In [38]:
# Parse every scraped date string (month/day/two-digit year) into a datetime
# object, preserving the order of race_dates.
from datetime import datetime
race_dates_dt = [datetime.strptime(raw_date, '%m/%d/%y') for raw_date in race_dates]

In [27]:
# Number of race-result links collected. Race pages are fetched from these
# links and later paired with race_dates by position, so this should equal
# len(race_dates).
len(race_anchors)

216

In [28]:
# Download every race-result page, in the same order as race_anchors.
# A single Session reuses the underlying connection across requests, and the
# timeout stops one stalled request from hanging the kernel. Response bodies
# are read in full before the session closes, so `races` stays usable after.
with requests.Session() as session:
    races = [session.get(a.attrs['href'], timeout=30) for a in race_anchors]

In [29]:
# Distinct HTTP status codes across all downloaded race pages.
{response.status_code for response in races}

{200}

In [42]:
# display first race
# `match` keeps only the tables whose text contains 'Sponsor / Owner'; the
# last such table is shown. The HTML is wrapped in StringIO because newer
# pandas versions deprecate passing a literal HTML string to read_html.
pd.read_html(StringIO(races[0].text), match='Sponsor / Owner', header=0)[-1]

Unnamed: 0,Pos,St,#,Driver,Sponsor / Owner,Car,Laps,Status,Led,Pts,PPts
0,1,8,41,Kurt Busch,Haas Automation / Monster Energy (Stewart Haas...,Ford,200,running,1,48,5
1,2,36,21,Ryan Blaney,Motorcraft / Quick Lane Tire & Auto Center (Wo...,Ford,200,running,2,44,0
2,3,38,47,A.J. Allmendinger,Kroger ClickList / Stouffer's / Cheez-It (JTG-...,Chevrolet,200,running,2,39,0
3,4,13,43,Aric Almirola,Smithfield Foods (Richard Petty Motorsports),Ford,200,running,2,33,0
4,5,33,27,Paul Menard,Menards / Peak (Richard Childress),Chevrolet,200,running,0,32,0
5,6,15,22,Joey Logano,Shell / Pennzoil (Roger Penske),Ford,200,running,16,40,0
6,7,26,5,Kasey Kahne,Farmers Insurance (Rick Hendrick),Chevrolet,200,running,7,30,0
7,8,30,15,Michael Waltrip,"Aaron's ""Thanks, Mikey!"" (Jay Robinson)",Toyota,200,running,0,29,0
8,9,25,32,Matt DiBenedetto,E.J. Wade Foundation (Archie St. Hilaire),Ford,200,running,0,28,0
9,10,11,6,Trevor Bayne,AdvoCare (Jack Roush),Ford,200,running,0,27,0


In [43]:
# Build a mapping from season year to the ordered list of that season's
# track names.
years = range(2017, 2023)

# Per season page: every element carrying the 'track W' class.
trackdata = [
    BeautifulSoup(page.text, 'lxml').find_all(class_='track W')
    for page in cup_results
]

# Per season: the whitespace-stripped text of each of those elements.
tracks = [
    [cell.text.strip() for cell in season_cells]
    for season_cells in trackdata
]

# Pair each year with its track list by position.
trackdict = dict(zip(years, tracks))

In [44]:
# Show the year -> list-of-track-names mapping.
trackdict

{2017: ['Daytona',
  'Atlanta',
  'Las Vegas',
  'Phoenix',
  'Fontana',
  'Martinsville',
  'Fort Worth',
  'Bristol',
  'Richmond',
  'Talladega',
  'Kansas',
  'Charlotte',
  'Dover',
  'Pocono',
  'Michigan',
  'Sonoma',
  'Daytona',
  'Kentucky',
  'Loudon',
  'Indianapolis',
  'Pocono',
  'Watkins Glen',
  'Michigan',
  'Bristol',
  'Darlington',
  'Richmond',
  'Chicago',
  'Loudon',
  'Dover',
  'Charlotte',
  'Talladega',
  'Kansas',
  'Martinsville',
  'Fort Worth',
  'Phoenix',
  'Homestead'],
 2018: ['Daytona',
  'Atlanta',
  'Las Vegas',
  'Phoenix',
  'Fontana',
  'Martinsville',
  'Fort Worth',
  'Bristol',
  'Richmond',
  'Talladega',
  'Dover',
  'Kansas',
  'Charlotte',
  'Pocono',
  'Michigan',
  'Sonoma',
  'Chicago',
  'Daytona',
  'Kentucky',
  'Loudon',
  'Pocono',
  'Watkins Glen',
  'Michigan',
  'Bristol',
  'Darlington',
  'Indianapolis',
  'Las Vegas',
  'Richmond',
  'Charlotte (Road)',
  'Dover',
  'Talladega',
  'Kansas',
  'Martinsville',
  'Fort Worth',

In [45]:
# Number of downloaded race pages. The flattened track list, race_dates and
# race_dates_dt are matched to this list by position when building racedata.
len(races)

216

In [46]:
#flatten track list
tracklst = [track for season_tracks in tracks for track in season_tracks]

# Columns that lead the combined frame, in this order.
race_columns = [
    'Pos',
    'St',
    '#',
    'Driver',
    'Sponsor / Owner',
    'Car',
    'Laps',
    'Status',
    'Led',
    'Pts',
    'PPts',
    'Track',
]

# Track names, date strings and parsed dates are matched to races purely by
# position, so a length mismatch would silently attach the wrong track/date.
if not (len(races) == len(tracklst) == len(race_dates) == len(race_dates_dt)):
    raise ValueError(
        'Mismatched lengths: '
        f'races={len(races)}, tracklst={len(tracklst)}, '
        f'race_dates={len(race_dates)}, race_dates_dt={len(race_dates_dt)}'
    )

#create dataframe with data for all races from 2017-2022
race_frames = []
for response, track, date_str, date_dt in zip(races, tracklst, race_dates, race_dates_dt):
    # StringIO: newer pandas versions deprecate literal HTML strings in read_html.
    race = pd.read_html(StringIO(response.text), match='Sponsor / Owner', header=0)[-1]
    race['Track'] = track
    race['Date'] = date_str
    race['Date_dtobj'] = date_dt
    race_frames.append(race)

# Concatenate once instead of growing the frame inside the loop (which copies
# all accumulated rows on every iteration). Each race keeps its own 0-based
# row labels because ignore_index is not set.
if race_frames:
    racedata = pd.concat(race_frames)
else:
    racedata = pd.DataFrame(columns=race_columns)

# Put the listed columns first (created empty if absent), followed by any
# remaining columns in their order of appearance.
racedata = racedata.reindex(
    columns=race_columns + [col for col in racedata.columns if col not in race_columns]
)

print(racedata)

   Pos  St   #                Driver  \
0    1   8  41            Kurt Busch   
1    2  36  21           Ryan Blaney   
2    3  38  47     A.J. Allmendinger   
3    4  13  43         Aric Almirola   
4    5  33  27           Paul Menard   
..  ..  ..  ..                   ...   
31  32  29  47  Ricky Stenhouse, Jr.   
32  33  35  15      Garrett Smithley   
33  34  24  48           Alex Bowman   
34  35  18   6       Brad Keselowski   
35  36  36  77        Landon Cassill   

                                      Sponsor / Owner        Car Laps  \
0   Haas Automation / Monster Energy (Stewart Haas...       Ford  200   
1   Motorcraft / Quick Lane Tire & Auto Center (Wo...       Ford  200   
2   Kroger ClickList / Stouffer's / Cheez-It (JTG-...  Chevrolet  200   
3        Smithfield Foods (Richard Petty Motorsports)       Ford  200   
4                  Menards / Peak (Richard Childress)  Chevrolet  200   
..                                                ...        ...  ...   
31      

In [48]:
# dump racedata to csv
# to_csv does not create missing directories, so make sure data/ exists first.
csv_path = Path('data/racedata_2017-2022.csv')
csv_path.parent.mkdir(parents=True, exist_ok=True)
racedata.to_csv(csv_path)