In [1]:
import re
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

BASE_URL = 'https://racing-reference.info'
# Seasons to scrape; range() excludes its stop value, so this covers 2012 through 2016.
years = range(2012, 2017)

# Download one season-stats page per year. requests applies no timeout by default,
# so pass one explicitly to keep a stalled connection from hanging the kernel.
cup_results = [
    requests.get(f'{BASE_URL}/season-stats/{year}/W', timeout=30)
    for year in years
]

# Distinct HTTP status codes across all season pages ({200} means every request succeeded).
{r.status_code for r in cup_results}

{200}

In [2]:
# Pattern for hrefs that point at individual race-result pages.
href_regex = re.compile('/race-results/.*/W')

# Gather the matching tags from every season page into one flat list,
# keeping the order in which the season pages were downloaded.
race_anchors = [
    anchor
    for season_page in cup_results
    for anchor in BeautifulSoup(season_page.text, 'lxml').find_all(href=href_regex)
]

In [3]:
# Download the page behind every collected race link. The explicit timeout stops a
# single stalled request from blocking the whole (long) download loop indefinitely.
races = [requests.get(a.attrs['href'], timeout=30) for a in race_anchors]

In [4]:
# Distinct HTTP status codes returned by the race-page downloads.
{response.status_code for response in races}

{200}

In [5]:
# display first race
# read_html returns every table containing the match text; [-1] keeps the last one.
# The HTML is wrapped in StringIO because passing a literal HTML string to
# read_html is deprecated in recent pandas releases.
pd.read_html(StringIO(races[0].text), match='Sponsor / Owner', header=0)[-1]

Unnamed: 0,Pos,St,#,Driver,Sponsor / Owner,Car,Laps,Money,Status,Led,Pts
0,1,4,17,Matt Kenseth,Best Buy (Jack Roush),Ford,202,1588887,running,50,47
1,2,5,88,"Dale Earnhardt, Jr.",Diet Mountain Dew / National Guard (Rick Hendr...,Chevrolet,202,1103150,running,0,42
2,3,2,16,Greg Biffle,3M (Jack Roush),Ford,202,804163,running,44,42
3,4,31,11,Denny Hamlin,FedEx Express (Joe Gibbs),Toyota,202,702091,running,57,42
4,5,9,31,Jeff Burton,Caterpillar (Richard Childress),Chevrolet,202,559550,running,24,40
5,6,37,27,Paul Menard,Peak / Menards (Richard Childress),Chevrolet,202,427900,running,2,39
6,7,13,29,Kevin Harvick,Budweiser (Richard Childress),Chevrolet,202,415261,running,0,37
7,8,1,99,Carl Edwards,Fastenal (Jack Roush),Ford,202,403466,running,0,36
8,9,12,20,Joey Logano,The Home Depot (Joe Gibbs),Toyota,202,346063,running,2,36
9,10,22,55,Mark Martin,Aaron's Dream Machine (Michael Waltrip),Toyota,202,323313,running,2,35


In [6]:
# Create a dictionary mapping each year to the list of that year's tracks.
# NOTE: `years` is deliberately NOT rebound here. zip() stops at its shorter input,
# so pairing the season pages with a shorter year range would silently drop the
# last season from the dictionary.

# One list of matching tags per season page, in the same order as `cup_results`.
trackdata = [
    BeautifulSoup(page.text, 'lxml').find_all(class_='track W')
    for page in cup_results
]

# Reduce every tag to its stripped text, keeping the per-season grouping.
tracks = [[tag.text.strip() for tag in group] for group in trackdata]

# Guard against the years and the scraped seasons drifting out of step.
assert len(tracks) == len(years), (
    f'expected one track list per year, got {len(tracks)} lists for {len(years)} years'
)

trackdict = dict(zip(years, tracks))

In [7]:
# Show the year -> track-names mapping built in the previous cell.
trackdict

{2012: ['Daytona',
  'Phoenix',
  'Las Vegas',
  'Bristol',
  'Fontana',
  'Martinsville',
  'Fort Worth',
  'Kansas',
  'Richmond',
  'Talladega',
  'Darlington',
  'Charlotte',
  'Dover',
  'Pocono',
  'Michigan',
  'Sonoma',
  'Kentucky',
  'Daytona',
  'Loudon',
  'Indianapolis',
  'Pocono',
  'Watkins Glen',
  'Michigan',
  'Bristol',
  'Atlanta',
  'Richmond',
  'Chicago',
  'Loudon',
  'Dover',
  'Talladega',
  'Charlotte',
  'Kansas',
  'Martinsville',
  'Fort Worth',
  'Phoenix',
  'Homestead'],
 2013: ['Daytona',
  'Phoenix',
  'Las Vegas',
  'Bristol',
  'Fontana',
  'Martinsville',
  'Fort Worth',
  'Kansas',
  'Richmond',
  'Talladega',
  'Darlington',
  'Charlotte',
  'Dover',
  'Pocono',
  'Michigan',
  'Sonoma',
  'Kentucky',
  'Daytona',
  'Loudon',
  'Indianapolis',
  'Pocono',
  'Watkins Glen',
  'Michigan',
  'Bristol',
  'Atlanta',
  'Richmond',
  'Chicago',
  'Loudon',
  'Dover',
  'Kansas',
  'Charlotte',
  'Talladega',
  'Martinsville',
  'Fort Worth',
  'Phoeni

In [8]:
#flatten track list
tracklst = [track for season_tracks in tracks for track in season_tracks]

# Columns placed first in the combined frame. Any other column found in the
# scraped tables is kept and appended after these, in order of first appearance.
leading_columns = [
 'Pos',
 'St',
 '#',
 'Driver',
 'Sponsor / Owner',
 'Car',
 'Laps',
 'Status',
 'Led',
 'Pts',
 'PPts',
 'Track']

# Races are matched to tracks purely by position, so a length mismatch would
# label races with the wrong track; fail loudly instead.
if len(races) != len(tracklst):
    raise ValueError(
        f'{len(races)} race pages but {len(tracklst)} track names; cannot pair them'
    )

# Parse each race page into a frame and tag it with its track name.
# StringIO is used because passing a literal HTML string to read_html is
# deprecated in recent pandas releases.
race_frames = []
for response, track in zip(races, tracklst):
    race = pd.read_html(StringIO(response.text), match='Sponsor / Owner', header=0)[-1]
    race['Track'] = track
    race_frames.append(race)

#create dataframe with data for all races from 2012-2016
# Concatenate once instead of growing the frame inside the loop (which re-copies
# all accumulated rows on every iteration). Each race keeps its own row index.
if race_frames:
    combined = pd.concat(race_frames)
    extra_columns = [col for col in combined.columns if col not in leading_columns]
    racedata = combined.reindex(columns=leading_columns + extra_columns)
else:
    # pd.concat raises on an empty list; fall back to an empty frame with the expected columns.
    racedata = pd.DataFrame(columns=leading_columns)

print(racedata)

   Pos  St   #               Driver  \
0    1   4  17         Matt Kenseth   
1    2   5  88  Dale Earnhardt, Jr.   
2    3   2  16          Greg Biffle   
3    4  31  11         Denny Hamlin   
4    5   9  31          Jeff Burton   
..  ..  ..  ..                  ...   
35  36   6  78    Martin Truex, Jr.   
36  37  29   5          Kasey Kahne   
37  38  35   7          Regan Smith   
38  39  39  32         Dylan Lupton   
39  40  25  43        Aric Almirola   

                                      Sponsor / Owner        Car Laps  \
0                               Best Buy (Jack Roush)       Ford  202   
1   Diet Mountain Dew / National Guard (Rick Hendr...  Chevrolet  202   
2                                     3M (Jack Roush)       Ford  202   
3                           FedEx Express (Joe Gibbs)     Toyota  202   
4                     Caterpillar (Richard Childress)  Chevrolet  202   
..                                                ...        ...  ...   
35     Bass Pro Shop

In [9]:
# Persist the racedata frame as a pickle file so other notebooks can load it.
import pickle
pickle_path = 'racedata_2012-2016.pkl'
racedata.to_pickle(pickle_path)