In [1]:
import requests
import re
import pandas as pd
from bs4 import BeautifulSoup

In [2]:
# Fetch the Michigan State 2019 schedule page.
url = 'https://www.sports-reference.com/cbb/schools/michigan-state/2019-schedule.html'
r = requests.get(url, timeout=30)  # bound the wait so a stalled connection cannot hang the kernel
r.raise_for_status()  # fail with a clear HTTP error instead of parsing an error page

In [3]:
# Parse the raw response bytes with Python's built-in HTML parser.
soup = BeautifulSoup(markup=r.content, features='html.parser')

In [4]:
# Collect every <table> element whose id attribute is 'schedule'.
tb = soup.find_all(name='table', attrs={'id': 'schedule'})

In [5]:
lst_gameobjs = []  # Accumulates one dict per schedule-table row across all schools/seasons.

schools = ['michigan', 'michigan-state']  # School slugs exactly as they appear in the URL.

years = [2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019]
# Seasons to extract. A season is labelled by the calendar year of its January-onward
# portion: poll dates in months before November are placed in `year`, the rest in `year - 1`.


# Loop through each school's seasons.
for school in schools:
    for year in years:
        # Build the URL for this school/season and download the page.
        url = f'https://www.sports-reference.com/cbb/schools/{school}/{year}-schedule.html'
        print(url)
        r = requests.get(url, timeout=30)  # bound the wait so one stalled request cannot hang the run
        r.raise_for_status()  # surface HTTP errors directly rather than failing later on a missing table
        soup = BeautifulSoup(r.content, 'html.parser')

        # --- AP poll data -------------------------------------------------------------
        # poll_dict maps the date a ranking was released -> the rank text shown for it.
        poll_dict = {}
        poll_tb = soup.find_all('table', {'id': 'polls'})[0]
        head = poll_tb.thead.tr.find_all('th')
        body = poll_tb.tbody.tr.find_all('td')
        # The first header cell is skipped; header i+1 is paired with body cell i.
        # NOTE(review): assumes the first <th> is a row label with no matching <td> - confirm.
        for i, val in enumerate(head[1:]):
            str_date = val.get_text()
            if str_date == 'Pre':
                # The preseason poll has no date in the header; pin it to Nov 1 of the prior year.
                date = pd.to_datetime(f'{year - 1}/11/1')
            elif str_date == 'Final':
                # The final poll has no date either; place it one week after the previous column.
                # NOTE(review): relies on 'Final' never being the first column - confirm.
                date = date + pd.Timedelta(days=7)
            elif int(str_date.split('/')[0]) < 11:
                # Months before November belong to the season's labelled year.
                date = pd.to_datetime(f'{year}/{str_date}')
            else:
                # November and December belong to the prior calendar year.
                date = pd.to_datetime(f'{year - 1}/{str_date}')
            poll_dict[date] = body[i].get_text()

        # --- Schedule table -----------------------------------------------------------
        tb = soup.find_all('table', {'id': 'schedule'})
        polldates = list(poll_dict.keys())  # fixed for the whole season, so build it once

        # From the 2015 season on, rows carry an extra game-time cell right after the date,
        # which pushes every later cell one position to the right.
        has_gametime = year >= 2015
        shift = 1 if has_gametime else 0

        # Each row is a game and each <td> one piece of information about it. The first
        # <tr> is skipped.
        for tr in tb[0].find_all('tr')[1:]:
            gameobj = {}  # dict describing this game
            cells = tr.find_all('td')

            if len(cells) > 0:
                gameobj['school'] = school.replace('-', ' ')
                gameobj['sport'] = 'basketball'
                gameobj['gamedate'] = pd.to_datetime(cells[0].text)
                if has_gametime:
                    gameobj['gametime'] = cells[1].text
                gameobj['type'] = cells[1 + shift].text
                # cells[2 + shift] is intentionally not captured.
                gameobj['opponent'] = cells[3 + shift].text
                gameobj['conf'] = cells[4 + shift].text
                gameobj['result'] = cells[5 + shift].text
                gameobj['team_points'] = cells[6 + shift].text
                gameobj['opp_points'] = cells[7 + shift].text
                gameobj['ot'] = cells[8 + shift].text
                gameobj['w'] = cells[9 + shift].text
                gameobj['l'] = cells[10 + shift].text
                gameobj['streak'] = cells[11 + shift].text
                gameobj['arena'] = cells[12 + shift].text

                # Attach the AP rank in force on game day. NCAA tournament games are
                # skipped and therefore never receive an 'ap_rank' key.
                if gameobj['type'] != 'NCAA':
                    # Walk adjacent poll-date pairs only. Stopping at len - 1 keeps
                    # polldates[i + 1] in range; the previous full-length loop raised
                    # IndexError for any game that matched no interval (for example one
                    # played after the last poll date). Such games now stay unranked.
                    for i in range(len(polldates) - 1):
                        if polldates[i] < gameobj['gamedate'] < polldates[i + 1]:
                            # Strictly between two releases: the earlier ranking applies.
                            gameobj['ap_rank'] = poll_dict[polldates[i]]
                            break
                        elif gameobj['gamedate'] == polldates[i + 1]:
                            # Played on a release date: that day's ranking applies.
                            gameobj['ap_rank'] = poll_dict[polldates[i + 1]]
                            break

            # Rows without <td> cells are still appended as empty dicts (unchanged behavior);
            # they become all-NaN rows once the list is turned into a DataFrame.
            lst_gameobjs.append(gameobj)

https://www.sports-reference.com/cbb/schools/michigan/2010-schedule.html
https://www.sports-reference.com/cbb/schools/michigan/2011-schedule.html
https://www.sports-reference.com/cbb/schools/michigan/2012-schedule.html
https://www.sports-reference.com/cbb/schools/michigan/2013-schedule.html
https://www.sports-reference.com/cbb/schools/michigan/2014-schedule.html
https://www.sports-reference.com/cbb/schools/michigan/2015-schedule.html
https://www.sports-reference.com/cbb/schools/michigan/2016-schedule.html
https://www.sports-reference.com/cbb/schools/michigan/2017-schedule.html
https://www.sports-reference.com/cbb/schools/michigan/2018-schedule.html
https://www.sports-reference.com/cbb/schools/michigan/2019-schedule.html
https://www.sports-reference.com/cbb/schools/michigan-state/2010-schedule.html
https://www.sports-reference.com/cbb/schools/michigan-state/2011-schedule.html
https://www.sports-reference.com/cbb/schools/michigan-state/2012-schedule.html
https://www.sports-reference.com/

In [6]:
import pandas as pd

In [7]:
# Turn the collected game dicts into a DataFrame (one row per dict), then show (rows, columns).
df = pd.DataFrame(data=lst_gameobjs)
df.shape

(764, 16)

In [8]:
# Count the missing values in each column to get a sense of how much data is absent.
df.isna().sum()

school          38
sport           38
gamedate        38
type            38
opponent        38
conf            38
result          38
team_points     38
opp_points      38
ot              38
w               38
l               38
streak          38
arena           38
ap_rank         96
gametime       397
dtype: int64

In [9]:
# 38 rows are missing a value in every one of the core columns, i.e. they are completely empty.
# ap_rank and gametime show higher counts, which is expected because they are not always populated.
# Drop only the rows in which every column is null.
df = df.dropna(axis=0, how='all')

In [14]:
# We need to extract the opponent's rank if they had one.
# Look at a random sample of rows; the fixed seed makes the displayed rows reproducible
# when the notebook is re-run from a fresh kernel.
df.sample(20, random_state=42)



Unnamed: 0,school,sport,gamedate,type,opponent,conf,result,team_points,opp_points,ot,w,l,streak,arena,ap_rank,gametime
271,michigan,basketball,2017-01-04,REG,Penn State,Big Ten,W,72,69,,11,4,W 1,Crisler Arena,-,8:30p
335,michigan,basketball,2018-03-17,NCAA,Houston (21),AAC,W,64,63,,30,7,W 11,Intrust Bank Arena,,9:40p
338,michigan,basketball,2018-03-31,NCAA,Loyola (IL),MVC,W,69,57,,33,7,W 14,Alamodome,,6:09p
55,michigan,basketball,2011-02-03,REG,Ohio State (1),Big Ten,L,53,62,,13,10,L 1,Value City Arena,-,
363,michigan,basketball,2019-02-09,REG,Wisconsin (19),Big Ten,W,61,52,,22,2,W 2,Crisler Arena,7,12:00p
207,michigan,basketball,2015-02-01,REG,Michigan State,Big Ten,L,66,76,OT,13,9,L 1,Breslin Events Center,-,1:00p
306,michigan,basketball,2017-12-04,REG,Ohio State,Big Ten,L,62,71,,7,3,L 1,Value City Arena,-,6:30p
615,michigan state,basketball,2015-11-23,REG,Eastern Michigan,MAC,W,89,65,,4,0,W 4,Breslin Events Center,3,7:00p
215,michigan,basketball,2015-03-07,REG,Rutgers,Big Ten,W,79,69,,15,15,W 1,Crisler Arena,-,2:15p
492,michigan state,basketball,2012-03-22,NCAA,Louisville (17),Big East,L,44,57,,29,8,L 1,US Airways Center,,


In [21]:
# Capture the first run of digits that directly follows letters, whitespace or '('
# in the opponent text (e.g. the 21 in 'Houston (21)'); rows without a match yield NaN.
df.opponent.str.extract(pat=r'[A-Za-z\s\(]+([0-9]+)', expand=True)

Unnamed: 0,0
0,
1,
2,
3,
4,
...,...
759,
760,
761,12
762,1


In [18]:
# Write the games to a CSV file, leaving out the DataFrame index.
df.to_csv(path_or_buf='cbb_2010_2019.csv', index=False)

In [74]:
# Number of rows that have no recorded game time.
len(df.loc[df['gametime'].isna()])

397

In [75]:
# Show the dtype pandas assigned to each column.
df.dtypes

school         object
gamedate       object
type           object
opponent       object
conf           object
result         object
team_points    object
opp_points     object
ot             object
w              object
l              object
streak         object
arena          object
gametime       object
dtype: object

In [76]:
# List the distinct values found in the game-type column.
df['type'].unique()

array(['REG', nan, 'CTOURN', 'NCAA'], dtype=object)