In [2]:
import requests
import re
import pandas as pd
from bs4 import BeautifulSoup

In [3]:
url = 'https://www.sports-reference.com/cbb/schools/michigan-state/2019-schedule.html'
r = requests.get(url)

In [4]:
soup = BeautifulSoup(r.content,'html.parser')

In [5]:
tb = soup.find_all('table', {'id': 'schedule'})

In [36]:
lst_gameobjs = [] # Create an empty list to store game objects in.

schools = ['michigan','michigan-state'] # Create a list of schools to loop through and grab scores from

years = [2010,2011,2012, 2013,2014,2015,2016,2017,2018,2019]
# create a list of seasons that will be extracted.  The season's year represents the year
# that falls during the winter.


# loop through each school's seasons
for school in schools:
    for year in years:
        url = f'https://www.sports-reference.com/cbb/schools/{school}/{year}-schedule.html' #build the url based on school/season
        print(f'extracting data from {url}...') 
        r = requests.get(url) #get the html content from the webpage
        soup = BeautifulSoup(r.content,'html.parser') # convert it into a BeautifulSoup object to make it easier to reference tags
        
        
        # Get AP Poll Data
        poll_dict = {}  # create a blank dictionary object which will be filled with the dates the rankings were released and the values will be the ranks.
        poll_tb = soup.find_all('table', {'id': 'polls'})[0]
        head = poll_tb.thead.tr.find_all('th')
        body = poll_tb.tbody.tr.find_all('td')
        for i,val in enumerate(head[1:]):
            str_date = val.get_text()
            if str_date == 'Pre':
                str_date = '11/1'
                date = str(year - 1) + "/" + str_date
                date =  pd.to_datetime(date)
            elif str_date == 'Final':
                date = date + pd.Timedelta(days=7)
            elif int(str_date.split('/')[0]) < 11:
                date = str(year) + "/" + str_date
                date =  pd.to_datetime(date)

            else:
                date = str(year - 1) + "/" + str_date 
                date =  pd.to_datetime(date)
            poll_dict[date] = body[i].get_text()
            # poll_dates.append(val.get_text())
        

        tb = soup.find_all('table', {'id': 'schedule'})  # find the html table that has an id 'schedule' and extract the first instance
        
        
        # the table has rows representing games.  Each row has cells (html td tags) that represent a piece of information about the game
        # The following code will take one row and loop through all of the cells in that row assigning them to specific keys in the newly
        # created dictionary.  There is no variation in what cells contain for a game in each season. However, the source did not capture
        # time the game was scheduled to start data prior to the 2014/2015 season so that alters the placement of the data in the table
        for tr in tb[0].find_all('tr')[1:]: 
            # print(tr)
            gameobj = {} # Create/initialize a dictionary object for a specific game
            
            cells = tr.find_all('td')  # for each 
            
            if len(cells) > 0 and year >= 2015:
                gameobj['school'] = school.replace('-',' ')
                gameobj['sport'] = 'basketball'
                gameobj['gamedate'] = pd.to_datetime(cells[0].text)
                gameobj['gametime'] = cells[1].text
                gameobj['type'] = cells[2].text
                gameobj['opponent'] = cells[4].text
                gameobj['conf'] = cells[5].text
                gameobj['result'] = cells[6].text
                gameobj['team_points']=cells[7].text
                gameobj['opp_points']=cells[8].text
                gameobj['ot']=cells[9].text
                gameobj['w']=cells[10].text
                gameobj['l']=cells[11].text
                gameobj['streak']=cells[12].text
                gameobj['arena']=cells[13].text
                gameobj['attendance'] = 0

            elif len(cells) > 0 and year < 2015:
                gameobj['school'] = school.replace('-',' ')
                gameobj['sport'] = 'basketball'
                gameobj['gamedate'] = pd.to_datetime(cells[0].text)
                gameobj['type'] = cells[1].text
                gameobj['opponent'] = cells[3].text
                gameobj['conf'] = cells[4].text
                gameobj['result'] = cells[5].text
                gameobj['team_points']=cells[6].text
                gameobj['opp_points']=cells[7].text
                gameobj['ot']=cells[8].text
                gameobj['w']=cells[9].text
                gameobj['l']=cells[10].text
                gameobj['streak']=cells[11].text
                gameobj['arena']=cells[12].text
                gameobj['attendance'] = 0
            
            
            
            # If there were games played, we want to run through the list of dates when the rankings were updated (captured in the table at the top of each school's season page)
            # We want to confirm that each game being played is prior to the NCAA Tournament (AP Polls no longer matter at that point).  If the game is before the 
            # tourney, we want to verify what rank should be attributed to the school by comparing where the game fell in relation to the rankings.  Rankings come out weekly 
            # so we're looking to see whether the gamedate is between the loop's current date and the next date in the sequence (the next time rankings come out). If the date is greater than
            # or equal to the current iteration and less than the next iteration date, we know we can attribute that iteration date's associated ranking to the school when they played that game.
            
            if len(cells) > 0:
                polldates = list(poll_dict.keys())
                for i in range(len(polldates)):
                    if gameobj['type'] != 'NCAA':
                        if gameobj['gamedate'] > polldates[i] and gameobj['gamedate'] < polldates[i+1]:
                            gameobj['rank'] = poll_dict[polldates[i]]
                            break
                        elif gameobj['gamedate'] == polldates[i+1]:
                            gameobj['rank'] = poll_dict[polldates[i+1]]
                            break                            
                
                

            lst_gameobjs.append(gameobj) # add the gameobject to the list of game objects.
print('extraction complete')

extracting data from https://www.sports-reference.com/cbb/schools/michigan/2010-schedule.html...
extracting data from https://www.sports-reference.com/cbb/schools/michigan/2011-schedule.html...
extracting data from https://www.sports-reference.com/cbb/schools/michigan/2012-schedule.html...
extracting data from https://www.sports-reference.com/cbb/schools/michigan/2013-schedule.html...
extracting data from https://www.sports-reference.com/cbb/schools/michigan/2014-schedule.html...
extracting data from https://www.sports-reference.com/cbb/schools/michigan/2015-schedule.html...
extracting data from https://www.sports-reference.com/cbb/schools/michigan/2016-schedule.html...
extracting data from https://www.sports-reference.com/cbb/schools/michigan/2017-schedule.html...
extracting data from https://www.sports-reference.com/cbb/schools/michigan/2018-schedule.html...
extracting data from https://www.sports-reference.com/cbb/schools/michigan/2019-schedule.html...
extracting data from https://w

In [37]:
import pandas as pd

In [38]:
df = pd.DataFrame(lst_gameobjs)
df.shape

(764, 17)

In [39]:
# Running is the isnull() and will give a sense how much data is missing
df.isnull().sum()

school          38
sport           38
gamedate        38
type            38
opponent        38
conf            38
result          38
team_points     38
opp_points      38
ot              38
w               38
l               38
streak          38
arena           38
attendance      38
rank            96
gametime       397
dtype: int64

In [40]:
# We see that there are 38 instances where it appears data is missing and this is consistent across most of the columns.  Rank and gametime are higher but we expected this since they shouldn't always be filled.
# Dropping these values.
df = df.dropna(how='all')

In [41]:
# We need to extract the opponent's rank if they had one.  
df.sample(20)

Unnamed: 0,school,sport,gamedate,type,opponent,conf,result,team_points,opp_points,ot,w,l,streak,arena,attendance,rank,gametime
209,michigan,basketball,2015-02-08,REG,Indiana,Big Ten,L,67,70,,13,11,L 3,Assembly Hall,0.0,-,1:00p
151,michigan,basketball,2013-11-22,REG,Florida State,ACC,W,82,80,OT,4,1,W 2,Roberto Clemente Coliseum,0.0,15,
303,michigan,basketball,2017-11-26,REG,UC Riverside,Big West,W,87,42,,6,1,W 3,Crisler Arena,0.0,-,4:00p
88,michigan,basketball,2012-01-17,REG,Michigan State (9),Big Ten,W,60,59,,15,4,W 1,Crisler Arena,0.0,20,
351,michigan,basketball,2018-12-22,REG,Air Force,MWC,W,71,50,,12,0,W 12,Crisler Arena,0.0,4,4:00p
707,michigan state,basketball,2018-01-26,REG,Wisconsin,Big Ten,W,76,61,,19,3,W 3,Breslin Events Center,0.0,6,8:00p
436,michigan state,basketball,2011-01-22,REG,Purdue (14),Big Ten,L,76,86,,12,7,L 2,Mackey Arena,0.0,17,
473,michigan state,basketball,2012-01-21,REG,Purdue,Big Ten,W,83,58,,16,4,W 1,Breslin Events Center,0.0,9,
390,michigan state,basketball,2009-12-22,REG,Texas (2),Big 12,L,68,79,,9,3,L 1,Frank Erwin Center,0.0,9,
738,michigan state,basketball,2019-01-08,REG,Purdue,Big Ten,W,77,59,,14,2,W 9,Breslin Events Center,0.0,6,9:00p


In [None]:
#save off a file for completing the gametime data
df.to_csv('cbb_2010_2019.csv',index=False)

In [3]:
# Let's grab the opponent's rank as a separate column - it's in the opponent column which includes the name of the opponent
df = pd.read_csv('../04_finaldata/df_basketball.csv', parse_dates=['start_dt'])
df['opponent_rank'] = df['opponent'].str.extract(r'[A-Za-z\s\(]+([0-9]+)')
# make sure to include the sport
df['sport'] = 'basketball'
# We'll create the window to look for crime/incidents.  Each college game presumed to be about 2 hours + an 8-hour window for incidents.
df['end_inc_window'] = df['start_dt'] + pd.to_timedelta(10, unit='h')
df.sample(15)

Unnamed: 0.1,Unnamed: 0,school,gamedate,type,opponent,conf,result,team_points,opp_points,ot,...,streak,arena,rank,gametime,Game String,US State,start_dt,opponent_rank,sport,end_inc_window
25,25,michigan,2/20/2010,REG,Penn State,Big Ten,L,51,55,,...,L 1,Crisler Arena,-,,Penn State on February 20 6:00 PM,MICH.,2010-02-20 18:00:00,,basketball,2010-02-21 04:00:00
609,609,michigan-state,2016-02-23,REG,Ohio State,Big Ten,W,81,62,,...,W 3,Value City Arena,6,9:00p,OHIO STATE,OHIO,2016-02-23 21:00:00,,basketball,2016-02-24 07:00:00
302,302,michigan,1/15/2018,REG,Maryland,Big Ten,W,68,67,,...,W 2,Crisler Arena,23,6:30p,Maryland on January 15 6:30 PM,MICH.,2018-01-15 18:30:00,,basketball,2018-01-16 04:30:00
102,102,michigan,11/12/2012,REG,IUPUI,Summit,W,91,54,,...,W 2,Crisler Arena,5,,IUPUI on November 12 9:00 PM,MICH.,2012-11-12 21:00:00,,basketball,2012-11-13 07:00:00
654,654,michigan-state,2017-11-19,REG,Stony Brook,AEC,W,93,71,,...,W 1,Breslin Events Center,2,4:00p,STONY BROOK (PK80),MICH.,2017-11-19 16:00:00,,basketball,2017-11-20 02:00:00
715,715,michigan-state,2019-03-02,REG,Indiana,Big Ten,L,62,63,,...,L 1,Assembly Hall,6,12:00p,INDIANA,IND.,2019-03-02 12:00:00,,basketball,2019-03-02 22:00:00
474,474,michigan-state,2012-11-25,REG,Louisiana,Sun Belt,W,63,60,,...,W 5,Breslin Events Center,15,,BOISE STATE,MICH.,2012-11-25 20:00:00,,basketball,2012-11-26 06:00:00
227,227,michigan,1/20/2016,REG,Minnesota,Big Ten,W,74,69,,...,W 1,Crisler Arena,-,8:30p,Minnesota on January 20 8:30 PM,MICH.,2016-01-20 20:30:00,,basketball,2016-01-21 06:30:00
93,93,michigan,2/18/2012,REG,Ohio State (6),Big Ten,W,56,51,,...,W 3,Crisler Arena,17,,Ohio State on February 18 9:00 PM,MICH.,2012-02-18 21:00:00,6.0,basketball,2012-02-19 07:00:00
434,434,michigan-state,2011-11-18,REG,Texas Southern,SWAC,W,76,41,,...,W 1,Breslin Events Center,-,,TEXAS SOUTHERN,MICH.,2011-11-18 18:30:00,,basketball,2011-11-19 04:30:00


In [4]:
# Save off file
df.to_csv('../04_finaldata/df_basketball_final.csv',index=False)