In [2]:
import pandas as pd
import json
import re

import time
import datetime

import requests
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service

[Requests](#requests)

[Selenium](#selenium)
  - [Game strings](#game_strings)
  - [Start times](#start_times)
  - [US state](#us_state)
  - ~[Attendance](#attendance)~

[CBB DataFrame](#cbb_df)\
[Selenium DataFrame](#selenium_df)\
[Final DataFrame](#final_df)


<a id='requests'></a>



&nbsp;

## Requests



#### Obtaining data from sports-reference.com


In [2]:
url = 'https://www.sports-reference.com/cbb/schools/michigan-state/2019-schedule.html'
r = requests.get(url)

In [3]:
soup = BeautifulSoup(r.content,'html.parser')

In [4]:
tb = soup.find_all('table', {'id': 'schedule'})

In [5]:
lst_gameobjs = [] # Create an empty list to store game objects in.

schools = ['michigan','michigan-state'] # Create a list of schools to loop through and grab scores from

years = [2010,2011,2012, 2013,2014,2015,2016,2017,2018,2019]
# create a list of seasons that will be extracted.  The season's year represents the year
# that falls during the winter.


# loop through each school's seasons
for school in schools:
    for year in years:
        url = f'https://www.sports-reference.com/cbb/schools/{school}/{year}-schedule.html' #build the url based on school/season
        print(f'extracting data from {url}...') 
        r = requests.get(url) #get the html content from the webpage
        soup = BeautifulSoup(r.content,'html.parser') # convert it into a BeautifulSoup object to make it easier to reference tags
        
        
        # Get AP Poll Data
        poll_dict = {}  # create a blank dictionary object which will be filled with the dates the rankings were released and the values will be the ranks.
        poll_tb = soup.find_all('table', {'id': 'polls'})[0]
        head = poll_tb.thead.tr.find_all('th')
        body = poll_tb.tbody.tr.find_all('td')
        for i,val in enumerate(head[1:]):
            str_date = val.get_text()
            if str_date == 'Pre':
                str_date = '11/1'
                date = str(year - 1) + "/" + str_date
                date =  pd.to_datetime(date)
            elif str_date == 'Final':
                date = date + pd.Timedelta(days=7)
            elif int(str_date.split('/')[0]) < 11:
                date = str(year) + "/" + str_date
                date =  pd.to_datetime(date)

            else:
                date = str(year - 1) + "/" + str_date 
                date =  pd.to_datetime(date)
            poll_dict[date] = body[i].get_text()
            # poll_dates.append(val.get_text())
        

        tb = soup.find_all('table', {'id': 'schedule'})  # find the html table that has an id 'schedule' and extract the first instance
        
        
        # the table has rows representing games.  Each row has cells (html td tags) that represent a piece of information about the game
        # The following code will take one row and loop through all of the cells in that row assigning them to specific keys in the newly
        # created dictionary.  There is no variation in what cells contain for a game in each season. However, the source did not capture
        # time the game was scheduled to start data prior to the 2014/2015 season so that alters the placement of the data in the table
        for tr in tb[0].find_all('tr')[1:]: 
            # print(tr)
            gameobj = {} # Create/initialize a dictionary object for a specific game
            
            cells = tr.find_all('td')  # for each 
            
            if len(cells) > 0 and year >= 2015:
                gameobj['school'] = school.replace('-',' ')
                gameobj['sport'] = 'basketball'
                gameobj['gamedate'] = pd.to_datetime(cells[0].text)
                gameobj['gametime'] = cells[1].text
                gameobj['type'] = cells[2].text
                gameobj['opponent'] = cells[4].text
                gameobj['conf'] = cells[5].text
                gameobj['result'] = cells[6].text
                gameobj['team_points']=cells[7].text
                gameobj['opp_points']=cells[8].text
                gameobj['ot']=cells[9].text
                gameobj['w']=cells[10].text
                gameobj['l']=cells[11].text
                gameobj['streak']=cells[12].text
                gameobj['arena']=cells[13].text
                gameobj['attendance'] = 0

            elif len(cells) > 0 and year < 2015:
                gameobj['school'] = school.replace('-',' ')
                gameobj['sport'] = 'basketball'
                gameobj['gamedate'] = pd.to_datetime(cells[0].text)
                gameobj['type'] = cells[1].text
                gameobj['opponent'] = cells[3].text
                gameobj['conf'] = cells[4].text
                gameobj['result'] = cells[5].text
                gameobj['team_points']=cells[6].text
                gameobj['opp_points']=cells[7].text
                gameobj['ot']=cells[8].text
                gameobj['w']=cells[9].text
                gameobj['l']=cells[10].text
                gameobj['streak']=cells[11].text
                gameobj['arena']=cells[12].text
                gameobj['attendance'] = 0
            
            
            
            # If there were games played, we want to run through the list of dates when the rankings were updated (captured in the table at the top of each school's season page)
            # We want to confirm that each game being played is prior to the NCAA Tournament (AP Polls no longer matter at that point).  If the game is before the 
            # tourney, we want to verify what rank should be attributed to the school by comparing where the game fell in relation to the rankings.  Rankings come out weekly 
            # so we're looking to see whether the gamedate is between the loop's current date and the next date in the sequence (the next time rankings come out). If the date is greater than
            # or equal to the current iteration and less than the next iteration date, we know we can attribute that iteration date's associated ranking to the school when they played that game.
            
            if len(cells) > 0:
                polldates = list(poll_dict.keys())
                for i in range(len(polldates)):
                    if gameobj['type'] != 'NCAA':
                        if gameobj['gamedate'] > polldates[i] and gameobj['gamedate'] < polldates[i+1]:
                            gameobj['rank'] = poll_dict[polldates[i]]
                            break
                        elif gameobj['gamedate'] == polldates[i+1]:
                            gameobj['rank'] = poll_dict[polldates[i+1]]
                            break                            
                
                

            lst_gameobjs.append(gameobj) # add the gameobject to the list of game objects.
print('extraction complete')

extracting data from https://www.sports-reference.com/cbb/schools/michigan/2010-schedule.html...
extracting data from https://www.sports-reference.com/cbb/schools/michigan/2011-schedule.html...
extracting data from https://www.sports-reference.com/cbb/schools/michigan/2012-schedule.html...
extracting data from https://www.sports-reference.com/cbb/schools/michigan/2013-schedule.html...
extracting data from https://www.sports-reference.com/cbb/schools/michigan/2014-schedule.html...
extracting data from https://www.sports-reference.com/cbb/schools/michigan/2015-schedule.html...
extracting data from https://www.sports-reference.com/cbb/schools/michigan/2016-schedule.html...
extracting data from https://www.sports-reference.com/cbb/schools/michigan/2017-schedule.html...
extracting data from https://www.sports-reference.com/cbb/schools/michigan/2018-schedule.html...
extracting data from https://www.sports-reference.com/cbb/schools/michigan/2019-schedule.html...
extracting data from https://w


&nbsp;

#### Cleaning data from sports-reference.com


In [6]:
df = pd.DataFrame(lst_gameobjs)
df.shape

(764, 17)

In [7]:
# Running is the isnull() and will give a sense how much data is missing
df.isnull().sum()

school          38
sport           38
gamedate        38
type            38
opponent        38
conf            38
result          38
team_points     38
opp_points      38
ot              38
w               38
l               38
streak          38
arena           38
attendance      38
rank            96
gametime       397
dtype: int64

In [8]:
# We see that there are 38 instances where it appears data is missing and this is consistent across most of the columns.  Rank and gametime are higher but we expected this since they shouldn't always be filled.
# Dropping these values.
df = df.dropna(how='all')

In [9]:
# We need to extract the opponent's rank if they had one.  
df.sample(5)

Unnamed: 0,school,sport,gamedate,type,opponent,conf,result,team_points,opp_points,ot,w,l,streak,arena,attendance,rank,gametime
664,michigan state,basketball,2017-01-04,REG,Rutgers,Big Ten,W,93,65,,11,5,W 4,Breslin Events Center,0.0,-,6:30p
108,michigan,basketball,2012-11-13,REG,Cleveland State,Horizon,W,77,47,,3,0,W 3,Crisler Arena,0.0,5,
384,michigan state,basketball,2009-11-28,REG,Massachusetts,A-10,W,106,68,,5,1,W 1,Boardwalk Hall,0.0,2,
321,michigan,basketball,2018-02-03,REG,Minnesota,Big Ten,W,76,73,OT,19,6,W 2,Crisler Arena,0.0,24,2:30p
221,michigan,basketball,2015-11-20,REG,Xavier,Big East,L,70,86,,2,1,L 1,Crisler Arena,0.0,24,9:00p


In [10]:
#save off a file for completing the gametime data
df.to_csv('../02_sport_rawdata/cbb_2010_2019.csv', index=False)


&nbsp;


In [11]:
# df = pd.read_csv('../04_finaldata/df_basketball.csv', parse_dates=['start_dt'])

In [12]:
# Save off file
# df.to_csv('../04_finaldata/df_basketball_final.csv',index=False)

<a id='selenium'></a>


&nbsp;

# Selenium



All data was collected with chromedriver via Windows 10 ver. 21H2 (OS Build 19044.1706)

Data sourced from [sports-reference.com](https://www.sports-reference.com/cbb) only had *half* of our basketball game start times, between 2015 and 2019, for our ten-year period.\
\
[mgoblue.com](https://mgoblue.com/sports/mens-basketball/schedule/2012-13) also shifts Michigan game start times to their Eastern Standard Time equivalents while Michigan's sport-reference.com data does not, e.g.:
- A game is played in Pacific Standard Time location:
  - mgoblue.com will have a game start at **8:59pm** EST while...
  - sports-reference.com will have a game start at **11:59pm**
---
To fill in the proper start times for our basketball games, we needed to scrape each program's basketball schedule site for:
  - The game string, to drop exhibition games absent from sports-reference.com data
  - The game start time
  - The state the game was played in, to shift times to their proper EST equivalent
  - ~The game's attendance - this was later dropped. Michigan State does not post attendance values on msusparts.com.~
---
Each program's basketball scheudle data had its own idiosyncracies / style.\
\
The approach taken was to collect the data for each, clean & manipulate, then combine and append to the College Basketball DataFrame (the basketball data sourced from sports-reference.com)


In [3]:
import sys
sys.platform

'win32'

In [4]:
seasons = ['2009-10', '2010-11', '2011-12', '2012-13', '2013-14', '2014-15', '2015-16', '2016-17', '2017-18', '2018-19']

if sys.platform == 'win32': # for Windows machines
    s = Service('../06_assets/chromedriver_win32/chromedriver.exe')
elif sys.platform == 'darwin': # for Apple machines
    s = Service('..06_assets/chromedriver_mac64/chromedriver')
browser = webdriver.Chrome(service=s)

<a id='game_strings'></a>


&nbsp;

### Obtain game string



Occassionally, a Chrome error occurs:

`cannot determine loading status from target frame detached` or `target frame detached`

If this happens, re-run the cell where the error was thrown - it should run successfully on next attempt.



#### Michigan


In [14]:
%%time # takes about 5 min.

m_strings = []
for season in seasons: # convert repeated `for loop` to function and call in subsequent cells - simplify code if there's time
    url = f'https://mgoblue.com/sports/mens-basketball/schedule/{season}'
    browser.get(url)
    time.sleep(2)
    browser.implicitly_wait(0.5)
    
    # pull strings from anchor tags
    elements = browser.find_elements(By.TAG_NAME, 'a') # convert this block, too, to a function w/ arg for tag name
    for e in elements:
        if 'Story' in e.accessible_name or 'Notes' in e.accessible_name or 'Box score' in e.accessible_name or 'Boxscore' in e.accessible_name or 'Boxcsore' in e.accessible_name or 'Stats for' in e.accessible_name or 'Quotes' in e.accessible_name or "Photos for" in e.accessible_name:
            pass
        elif ' PM' in e.accessible_name or ' AM' in e.accessible_name or ' p.m.' in e.accessible_name or 'Noon' in e.accessible_name or ' noon' in e.accessible_name or '(Exhib.)' in e.accessible_name:
            m_strings.append(e.accessible_name)
    browser.implicitly_wait(0.5)

Wall time: 5min 20s


In [15]:
print(len(m_strings))
m_strings

373


['Wayne State (EX) on November 6 7:00 PM',
 'Northern Michigan on November 14 7:00 PM',
 'Houston Baptist on November 20 7:00 PM',
 'Creighton on November 26 12:00 PM',
 'Marquette on November 27 12:00 PM',
 'Alabama on November 29 5:00 PM',
 'Boston College on December 2 7:30 PM',
 'Arkansas - Pine Bluff on December 5 2:00 PM',
 'Utah on December 9 7:00 PM',
 'Detroit on December 13 12:00 PM',
 'Kansas on December 19 11:00 AM',
 'Coppin State on December 22 7:00 PM',
 'Indiana on December 31 12:00 PM',
 'Ohio State on January 3 4:30 PM',
 'Penn State on January 7 7:00 PM',
 'Northwestern on January 10 2:30 PM',
 'Indiana on January 14 9:00 PM',
 'Connecticut on January 17 1:30 PM',
 'Wisconsin on January 20 7:30 PM',
 'Purdue on January 23 4:00 PM',
 'Michigan State on January 26 7:00 PM',
 'Iowa on January 30 4:35 PM',
 'Northwestern on February 2 6:00 PM',
 'Wisconsin on February 6 4:00 PM',
 'Minnesota on February 11 6:00 PM',
 'Iowa on February 16 8:05 PM',
 'Penn State on Februar

In [16]:
assert len(m_strings) == 373, f"Number of Michigan strings incorrect, got {len(m_strings)}"

In [17]:
m_strings = pd.Series(m_strings, name='Game String')

#### Michigan State

In [5]:
%%time

msu_strings = []
for season in seasons:
    url = f'https://msuspartans.com/sports/mens-basketball/schedule/{season}'
    browser.get(url)
    time.sleep(2)
    browser.implicitly_wait(0.5)
    
    # pull strings from anchor tags
    elements = browser.find_elements(By.XPATH, "//div[@class='sidearm-schedule-game-opponent-name']")
    for e in elements:
        msu_strings.append(e.text)
    browser.implicitly_wait(0.5)

Wall time: 35.6 s


In [6]:
print(len(msu_strings))
msu_strings

390


['GREEN AND WHITE GAME',
 'NORTHWOOD UNIVERSITY (EXHIB.)',
 'GRAND VALLEY STATE',
 'FLORIDA GULF COAST',
 'GONZAGA',
 'TOLEDO',
 'VALPARAISO',
 'FLORIDA',
 'MASSACHUSETTS',
 'NORTH CAROLINA (BIG TEN/ACC CHALLENGE)',
 'WOFFORD',
 'THE CITADEL',
 'OAKLAND',
 'IPFW',
 'TEXAS',
 'TEXAS-ARLINGTON',
 'NORTHWESTERN',
 'WISCONSIN',
 'IOWA',
 'MINNESOTA',
 'ILLINOIS',
 'IOWA',
 'MINNESOTA',
 'MICHIGAN',
 'NORTHWESTERN',
 'WISCONSIN',
 'ILLINOIS',
 'PURDUE',
 'PENN STATE',
 'INDIANA',
 'OHIO STATE',
 'PURDUE',
 'PENN STATE',
 'MICHIGAN',
 'MINNESOTA',
 'NEW MEXICO STATE',
 'MARYLAND',
 'NORTHERN IOWA',
 'TENNESSEE',
 'BUTLER',
 'MIDNIGHT MADNESS',
 'SAGINAW VALLEY STATE',
 'NEBRASKA-OMAHA',
 'EASTERN MICHIGAN',
 'SOUTH CAROLINA',
 'CHAMINADE',
 'CONNECTICUT',
 'WASHINGTON',
 'TENNESSEE TECH',
 'DUKE (ACC/BIG TEN CHALLENGE)',
 'BOWLING GREEN',
 'SYRACUSE',
 'OAKLAND',
 'PRAIRIE VIEW A&M',
 'TEXAS',
 'MINNESOTA',
 'NORTHWESTERN',
 'PENN STATE',
 'WISCONSIN',
 'NORTHWESTERN',
 'ILLINOIS',
 'PURDUE'

In [9]:
assert len(msu_strings) == 390, f"Number of msu_strings incorrect, got {len(msu_strings)}"

In [21]:
msu_strings = pd.Series(msu_strings, name='Game String')

<a id='start_times'></a>


&nbsp;

### Obtain game start times



#### Michigan


In [22]:
%%time # takes about 3 min.

m_times = []
for season in seasons:
    url = f'https://mgoblue.com/sports/mens-basketball/schedule/{season}'
    browser.get(url)
    time.sleep(2)
    browser.implicitly_wait(0.5)
    
    # pull times from anchor tags
    elements = browser.find_elements(By.TAG_NAME, 'a')
    for e in elements:
        if ' vs ' in e.accessible_name or ' at ' in e.accessible_name:
            pass
        elif ' PM' in e.accessible_name or ' p.m.' in e.accessible_name or ' AM' in e.accessible_name or ' ET' in e.accessible_name or ' Noon' in e.accessible_name or ' noon' in e.accessible_name:
            m_times.append(re.findall('\d+:\d+ \w+|(?<=\d )\d+ [\w\.]+|[Nn]oon', e.accessible_name)[0])
    browser.implicitly_wait(0.5)

Wall time: 3min 29s


In [23]:
print(len(m_times))
m_times

373


['7:00 PM',
 '7:00 PM',
 '7:00 PM',
 '12:00 PM',
 '12:00 PM',
 '5:00 PM',
 '7:30 PM',
 '2:00 PM',
 '7:00 PM',
 '12:00 PM',
 '11:00 AM',
 '7:00 PM',
 '12:00 PM',
 '4:30 PM',
 '7:00 PM',
 '2:30 PM',
 '9:00 PM',
 '1:30 PM',
 '7:30 PM',
 '4:00 PM',
 '7:00 PM',
 '4:35 PM',
 '6:00 PM',
 '4:00 PM',
 '6:00 PM',
 '8:05 PM',
 '6:00 PM',
 '7:00 PM',
 '12:00 PM',
 '7:00 PM',
 '4:00 PM',
 '2:30 PM',
 '12:00 PM',
 '7:00 PM',
 '7:00 PM',
 '7:00 PM',
 '2:00 PM',
 '8:00 PM',
 '5:30 PM',
 '9:00 PM',
 '1:00 PM',
 '7:30 PM',
 '6:30 PM',
 '7:00 PM',
 '12:00 PM',
 '6:00 PM',
 '2:00 PM',
 '4:00 PM',
 '7:30 PM',
 '4:30 PM',
 '6:30 PM',
 '8:00 PM',
 '8:00 PM',
 '7:00 PM',
 '7:00 PM',
 '4:00 PM',
 '7:00 PM',
 '12:00 PM',
 '6:30 PM',
 '4:00 PM',
 '7:30 PM',
 '3:30 PM',
 '6:30 PM',
 '3:30 PM',
 '2:00 PM',
 '2:30 PM',
 '1:40 PM',
 '12:40 PM',
 '2:45 PM',
 '7:00 PM',
 '7:00 PM',
 '8:30 PM',
 '8:30 PM',
 '10:00 AM',
 '2:00 PM',
 '2:30 PM',
 '7:00 PM',
 '12:00 PM',
 '4:00 PM',
 '7:00 PM',
 '12:00 PM',
 '6:30 PM',
 '7

In [24]:
assert len(m_times) == 373, f"Number of Michigan basketball start times incorrect, got {len(m_times)}"

In [25]:
m_times = pd.Series(m_times, name='Start time')


#### Michigan State


In [26]:
%%time

msu_times = []
for season in seasons:
    url = f'https://msuspartans.com/sports/mens-basketball/schedule/{season}'
    browser.get(url)
    time.sleep(2)
    browser.implicitly_wait(0.5)
    
    # pull times from anchor tags
    elements = browser.find_elements(By.XPATH, "//div[@class='sidearm-schedule-game-opponent-date flex-item-1']/span/following-sibling::span")
    for e in elements:
        if e.text == '':
            pass
        else:
            msu_times.append(e.text)
    browser.implicitly_wait(0.5)

Wall time: 42.1 s


In [27]:
print(len(msu_times))
msu_times

390


['3:30 PM',
 '4:00 PM',
 '7:00 PM',
 '7:00 PM',
 '8:00 PM',
 '6:30 PM',
 '12:00 PM',
 '8:00 PM',
 '5:30 PM',
 '9:00 PM',
 '7:00 PM',
 '7:00 PM',
 '7:00 PM',
 '12:00 PM',
 '7:00 PM',
 '7:00 PM',
 '6:30 PM',
 '6:30 PM',
 '5:35 PM',
 '6:30 PM',
 '3:30 PM',
 '6:30 PM',
 '12:00 PM',
 '7:00 PM',
 '7:00 PM',
 '9:00 PM',
 '9:00 PM',
 '9:00 PM',
 '12:00 PM',
 '7:00 PM',
 '12:00 PM',
 '4:00 PM',
 '7:00 PM',
 '4:00 PM',
 '9:00 PM',
 '7:20 PM',
 '2:30 PM',
 '9:37 PM',
 '2:20 PM',
 '6:07 PM',
 '9:30 PM',
 '7:00 PM',
 '7:00 PM',
 '8:30 PM',
 '10:00 PM',
 '9:30 PM',
 '7:00 PM',
 '5:00 PM',
 '1:00 PM',
 '9:30 PM',
 '1:30 PM',
 '9:00 PM',
 '12:30 PM',
 '6:30 PM',
 '7:00 PM',
 '4:00 PM',
 '7:30 PM',
 '1:00 PM',
 '7:00 PM',
 '1:00 PM',
 '7:00 PM',
 '9:00 PM',
 '7:00 PM',
 '6:00 PM',
 '8:30 PM',
 '1:00 PM',
 '7:00 PM',
 '9:00 PM',
 '9:00 PM',
 '9:00 PM',
 '1:00 PM',
 '6:30 PM',
 '2:00 PM',
 '5:00 PM',
 '6:30 PM',
 '4:15 PM',
 '9:20 PM',
 '2:00 PM',
 '7:00 PM',
 '7:00 PM',
 '7:00 PM',
 '6:30 PM',
 '6:00 PM

In [28]:
assert len(msu_times) == 390, f"Number of MSU basketball times incorrect, got {len(msu_times)}"

In [29]:
msu_times = pd.Series(msu_times, name='Start time')

<a id='us_state'></a>


&nbsp;

### Obtain US state locations



#### Michigan


In [30]:
%%time

m_states = []
for season in seasons:
    url = (f'https://mgoblue.com/sports/mens-basketball/schedule/{season}')
    browser.get(url)
    browser.implicitly_wait(0.5)
    
    # pull states out of class
    elements = browser.find_elements(By.CLASS_NAME, 'sidearm-schedule-game-location')
    for e in elements:
        if e.text == 'CRISLER CENTER':
            m_states.append('MICH.')
        else:
            m_states.append(re.findall('(?<=, ).+', e.text)[0])
    browser.implicitly_wait(0.5)

Wall time: 37.6 s


In [31]:
len(m_states)

373

In [10]:
assert len(m_states) == 373, f"Number of US states where Michigan basketball games were played is incorrect, got {len(m_states)}"

NameError: name 'm_states' is not defined

In [33]:
m_states = pd.Series(m_states, name='US State')


#### Michigan State


In [34]:
%%time

msu_states = []
for season in seasons:
    url = (f'https://msuspartans.com/sports/mens-basketball/schedule/{season}')
    browser.get(url)
    browser.implicitly_wait(0.5)
    
    # pull states out of class
    elements = browser.find_elements(By.CLASS_NAME, 'sidearm-schedule-game-location')
    for e in elements:
        if e.text == 'CRISLER CENTER':
            states.append('MICH.')
        else:
            msu_states.append(re.findall('(?<=, ).+', e.text)[0])
    browser.implicitly_wait(0.5)

Wall time: 15.6 s


In [35]:
len(msu_states)

390

In [36]:
assert len(msu_states) == 390, f"Number of US states where MSU basketball games were played is incorrect, got {len(msu_states)}"

In [37]:
msu_states = pd.Series(msu_states, name='US State')

<a id='attendance'></a>


&nbsp;

### ~Obtain attendance~


In [38]:
# %%time

# attendance = []
# for season in seasons:
#     url = f'https://mgoblue.com/sports/mens-basketball/schedule/{season}'
#     browser.get(url)
#     browser.implicitly_wait(0.5)

#     # open each game's flyout
#     elements = browser.find_elements(By.XPATH, "//button[@class='sidearm-schedule-game-toggle noprint']")
#     for e in elements:
#         browser.execute_script("arguments[0].click();", e)
#         browser.implicitly_wait(0.5)
#         time.sleep(1)

#     # grab each game's attendance from <dd> tag after expanding each game
#     elements = browser.find_elements(By.XPATH, "//div[@class='sidearm-schedule-game-extra-leaders flex-item-1']//dd[@data-bind='text: attendance']")
#     for e in elements:
#         if e.text != '':
#             attendance.append(e.get_attribute('textContent'))
#             browser.implicitly_wait(0.5)
#             time.sleep(0.2)
#     browser.implicitly_wait(0.5)

In [39]:
# print(len(attendance))
# attendance

In [40]:
# # insert empty values for the three exhibition games Michigan played w/out <dd> tags
# attendance.insert(0, '') # Wayne State EX game with no attendance recorded
# attendance.insert(33, '') # EX game with no attendance recorded
# attendance.insert(69, '') # EX game with no attendance recorded
# attendance.insert(341, '') # George Washington game had no attendance recorded

In [41]:
# attendance_series = pd.Series(attendance, name='Attendance')

<a id='cbb_df'></a>


&nbsp;

# CBB DataFrame


In [42]:
df_cbb = pd.read_csv(r'../02_sport_rawdata/cbb_2010_2019.csv')

In [43]:
print(len(df_cbb[(df_cbb['school'] == 'michigan')]))
print(len(df_cbb[(df_cbb['school'] == 'michigan-state')]))

361
0


In [44]:
df_cbb.dtypes

school          object
sport           object
gamedate        object
type            object
opponent        object
conf            object
result          object
team_points      int64
opp_points       int64
ot              object
w                int64
l                int64
streak          object
arena           object
attendance     float64
rank            object
gametime        object
dtype: object

In [45]:
# df_cbb['gamedate'] = pd.to_datetime(df_cbb['gamedate'])

In [46]:
df_cbb[(df_cbb['school'] == ('michigan-state')) 
        & (df_cbb['gamedate'] >= ('2015-10-01')) 
        & (df_cbb['gamedate'] <= ('2016-05-01'))]

Unnamed: 0,school,sport,gamedate,type,opponent,conf,result,team_points,opp_points,ot,w,l,streak,arena,attendance,rank,gametime



Michigan's game times differ from those of sports-reference.com in that Michigan applies timezone adjustment.


In [47]:
df_cbb[df_cbb['school'] == 'michigan'].tail()

Unnamed: 0,school,sport,gamedate,type,opponent,conf,result,team_points,opp_points,ot,w,l,streak,arena,attendance,rank,gametime
356,michigan,basketball,2019-03-16,CTOURN,Minnesota,Big Ten,W,76,49,,28,5,W 2,United Center,0.0,10.0,3:30p
357,michigan,basketball,2019-03-17,CTOURN,Michigan State (6),Big Ten,L,60,65,,28,6,L 1,United Center,0.0,10.0,3:30p
358,michigan,basketball,2019-03-21,NCAA,Montana,Big Sky,W,74,55,,29,6,W 1,Wells Fargo Arena,0.0,,9:20p
359,michigan,basketball,2019-03-23,NCAA,Florida,SEC,W,64,49,,30,6,W 2,Wells Fargo Arena,0.0,,5:15p
360,michigan,basketball,2019-03-28,NCAA,Texas Tech (9),Big 12,L,44,63,,30,7,L 1,Honda Center,0.0,,9:39p


<a id='selenium_df'></a>


&nbsp;

# Selenium DataFrame


#### Michigan

In [48]:
m_selenium = pd.concat([m_strings, m_times, m_states], axis=1)
m_selenium

Unnamed: 0,Game String,Start time,US State
0,Wayne State (EX) on November 6 7:00 PM,7:00 PM,MICH.
1,Northern Michigan on November 14 7:00 PM,7:00 PM,MICH.
2,Houston Baptist on November 20 7:00 PM,7:00 PM,MICH.
3,Creighton on November 26 12:00 PM,12:00 PM,FLA.
4,Marquette on November 27 12:00 PM,12:00 PM,FLA.
...,...,...,...
368,Minnesota on March 16 2:30 PM CT,2:30 PM,ILL.
369,Michigan State on March 17 2:30 PM CT,2:30 PM,ILL.
370,Montana on March 21 8:20 PM CT,8:20 PM,IOWA
371,Florida on March 23 4:15 PM CT,4:15 PM,IOWA


In [49]:
# remove rows where '(EX)' in 'Game String' before pd.concat w/ df_cbb
m_selenium = m_selenium[~m_selenium['Game String'].str.contains('EXHIBITION|\([Ee][Xx]\)|EXHIB')]

In [50]:
m_selenium = m_selenium.reset_index(drop=True)

In [51]:
# drop exhibition games by row number (improve)
# m_selenium = m_selenium.drop([0, 38, 39, 97, 121, 122, 124, 151, 152, 191, 192, 232, 233, 269, 270, 306, 307, 308, 344])

In [52]:
# m_selenium.to_csv(r'C:\Users\RJ\Downloads\m_selenium.csv')

In [53]:
m_selenium = m_selenium.reset_index(drop=True)
m_selenium

Unnamed: 0,Game String,Start time,US State
0,Northern Michigan on November 14 7:00 PM,7:00 PM,MICH.
1,Houston Baptist on November 20 7:00 PM,7:00 PM,MICH.
2,Creighton on November 26 12:00 PM,12:00 PM,FLA.
3,Marquette on November 27 12:00 PM,12:00 PM,FLA.
4,Alabama on November 29 5:00 PM,5:00 PM,FLA.
...,...,...,...
356,Minnesota on March 16 2:30 PM CT,2:30 PM,ILL.
357,Michigan State on March 17 2:30 PM CT,2:30 PM,ILL.
358,Montana on March 21 8:20 PM CT,8:20 PM,IOWA
359,Florida on March 23 4:15 PM CT,4:15 PM,IOWA


In [54]:
len(df_cbb[df_cbb['school'] == 'michigan'])

361

In [None]:
assert len(df_cbb[df_cbb['school'] == 'michigan']) == 361, "Incorrect length of Michigan Selenium frame"

#### Michigan State

In [55]:
msu_selenium = pd.concat([msu_strings, msu_times, msu_states], axis=1)
msu_selenium

Unnamed: 0,Game String,Start time,US State
0,GREEN AND WHITE GAME,3:30 PM,MICH.
1,NORTHWOOD UNIVERSITY (EXHIB.),4:00 PM,MICH.
2,GRAND VALLEY STATE,7:00 PM,MICH.
3,FLORIDA GULF COAST,7:00 PM,MICH.
4,GONZAGA,8:00 PM,MICH.
...,...,...,...
385,BRADLEY,2:45 PM ET,IOWA / WELLS FARGO ARENA
386,MINNESOTA,7:45 PM ET,IOWA / WELLS FARGO ARENA
387,#12 LSU,7 PM ET,D.C. / CAPITAL ONE ARENA
388,#1 DUKE,5:05 PM ET,D.C. / CAPITAL ONE ARENA


In [56]:
# remove rows where '(EX)' in 'Game String' before pd.concat w/ df_cbb
msu_selenium = msu_selenium[~msu_selenium['Game String'].str.contains('EXHIBITION|\(EX\)|EXHIB|MIDNIGHT|GREEN AND WHITE')]

In [57]:
msu_selenium = msu_selenium.reset_index(drop=True)

In [58]:
# drop exhibition games by row number (improve)
msu_selenium = msu_selenium.drop([0, 38, 39, 97, 121, 122, 124, 151, 152, 191, 192, 232, 233, 269, 270, 306, 307, 308, 344])

In [59]:
# msu_selenium.to_csv(r'C:\Users\RJ\Downloads\df_msu.csv')

In [60]:
msu_selenium = msu_selenium.reset_index(drop=True)
msu_selenium

Unnamed: 0,Game String,Start time,US State
0,FLORIDA GULF COAST,7:00 PM,MICH.
1,GONZAGA,8:00 PM,MICH.
2,TOLEDO,6:30 PM,MICH.
3,VALPARAISO,12:00 PM,MICH.
4,FLORIDA,8:00 PM,N.J.
...,...,...,...
360,BRADLEY,2:45 PM ET,IOWA / WELLS FARGO ARENA
361,MINNESOTA,7:45 PM ET,IOWA / WELLS FARGO ARENA
362,#12 LSU,7 PM ET,D.C. / CAPITAL ONE ARENA
363,#1 DUKE,5:05 PM ET,D.C. / CAPITAL ONE ARENA


In [61]:
len(df_cbb[df_cbb['school'] == 'michigan state'])

365

In [None]:
assert len(df_cbb[df_cbb['school'] == 'michigan state']) == 365, "Incorrect length of MSU Selenium frame"

<a id='final_df'></a>


&nbsp;

# Final College Basketball DataFrame


Split Michigan and Michigan State into separate DataFrames, concat columns w/ axis=1, then concat vertically w/ axis=0


#### Michigan State


In [62]:
df_msu = df_cbb[df_cbb['school'] == 'michigan state']

In [63]:
df_msu = df_msu.reset_index()

In [64]:
df_msu = pd.concat([df_msu, msu_selenium], axis=1)

In [65]:
# reimport df_msu from file - used so you don't have to scrape from Selenium each time
# df_msu = pd.read_csv(r'../04_finaldata/bball_data_michigan_state.csv')

In [66]:
df_msu['Start time'][326:365] = df_msu['Start time'][326:365].replace({'\.': '', 
                                      ' ET': '',
                                      'NOON': '12:00 PM',
                                      '(?<!\d:\d\d)(?= PM)': ':00'}, regex=True)
df_msu['Start time'][326:365] = df_msu['Start time'][326:365].replace({'7 PM': '7:00 PM', '6 PM': '6:00 PM'})
df_msu['Start time'][326:365] = df_msu['Start time'][326:365].replace('(?=\/)\/.+', '', regex=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_msu['Start time'][326:365] = df_msu['Start time'][326:365].replace({'\.': '',
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_msu['Start time'][326:365] = df_msu['Start time'][326:365].replace({'7 PM': '7:00 PM', '6 PM': '6:00 PM'})
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_msu['Start time'][326:365] = df_msu['Start time'][326:365].replace('(?=\/)\/.+', '', regex=True)


In [67]:
df_msu['start_dt'] = df_msu['gamedate'] + ' ' + df_msu['Start time']

In [68]:
df_msu['start_dt']

0       2009-11-13 7:00 PM
1       2009-11-17 8:00 PM
2       2009-11-20 6:30 PM
3      2009-11-22 12:00 PM
4       2009-11-27 8:00 PM
              ...         
360     2019-03-21 2:45 PM
361     2019-03-23 7:45 PM
362     2019-03-29 7:00 PM
363     2019-03-31 5:05 PM
364     2019-04-06 8:49 PM
Name: start_dt, Length: 365, dtype: object

In [69]:
df_msu['start_dt'] = pd.to_datetime(df_msu['start_dt'])

In [70]:
df_msu[(df_msu['gamedate'] >= ('2018-10-01')) 
       & (df_msu['gamedate'] <= ('2019-05-01'))]

Unnamed: 0,index,school,sport,gamedate,type,opponent,conf,result,team_points,opp_points,...,l,streak,arena,attendance,rank,gametime,Game String,Start time,US State,start_dt
326,687,michigan state,basketball,2018-11-06,REG,Kansas (1),Big 12,L,87,92,...,1,L 1,Bankers Life Fieldhouse,0.0,10.0,7:00p,#1 KANSAS,7:00 PM,IND.,2018-11-06 19:00:00
327,688,michigan state,basketball,2018-11-11,REG,Florida Gulf Coast,A-Sun,W,106,82,...,1,W 1,Breslin Events Center,0.0,10.0,6:00p,FLORIDA GULF COAST,6:00 PM,MICH. BRESLIN CENTER,2018-11-11 18:00:00
328,689,michigan state,basketball,2018-11-14,REG,Louisiana-Monroe,Sun Belt,W,80,59,...,1,W 2,Breslin Events Center,0.0,11.0,7:00p,LOUISIANA MONROE,7:00 PM,MICH. BRESLIN CENTER,2018-11-14 19:00:00
329,690,michigan state,basketball,2018-11-18,REG,Tennessee Tech,OVC,W,101,33,...,1,W 3,Breslin Events Center,0.0,11.0,6:00p,TENNESSEE TECH,6:00 PM,MICH. BRESLIN CENTER,2018-11-18 18:00:00
330,691,michigan state,basketball,2018-11-22,REG,UCLA (17),Pac-12,W,87,67,...,1,W 4,Orleans Arena,0.0,11.0,10:00p,#17 UCLA,10:00 PM,NEV.,2018-11-22 22:00:00
331,692,michigan state,basketball,2018-11-23,REG,Texas,Big 12,W,78,68,...,1,W 5,Orleans Arena,0.0,11.0,6:30p,TEXAS,6:30 PM,NEV.,2018-11-23 18:30:00
332,693,michigan state,basketball,2018-11-27,REG,Louisville,ACC,L,78,82,...,2,L 1,KFC Yum! Center,0.0,9.0,7:30p,LOUISVILLE,7:30 PM,KY.,2018-11-27 19:30:00
333,694,michigan state,basketball,2018-11-30,REG,Rutgers,Big Ten,W,78,67,...,2,W 1,Louis Brown Athletic Center,0.0,9.0,6:00p,RUTGERS,6:00 PM,N.J.,2018-11-30 18:00:00
334,695,michigan state,basketball,2018-12-03,REG,Iowa (18),Big Ten,W,90,68,...,2,W 2,Breslin Events Center,0.0,10.0,6:30p,#18 IOWA,6:30 PM,MICH. BRESLIN CENTER,2018-12-03 18:30:00
335,696,michigan state,basketball,2018-12-08,REG,Florida,SEC,W,63,59,...,2,W 3,Stephen C. O'Connell Center,0.0,10.0,12:00p,FLORIDA,12:00 PM,FLA.,2018-12-08 12:00:00


In [71]:
# df_msu.to_csv(r'C:\Users\RJ\Downloads\bball_data_michigan_state.csv')


#### Michigan


In [72]:
df_m = df_cbb[df_cbb['school'] == 'michigan']

In [73]:
df_m = df_m.reset_index()

In [74]:
df_m = pd.concat([df_m, m_selenium], axis=1)

In [75]:
# reimport df w/ Michigan bball data scraped w/ Selenium - used so you don't have to scrape from Selenium each time
# df_m = pd.read_csv(r'../04_finaldata/bball_data_with_michigans_numbers.csv', usecols=range(2, 20))

In [76]:
df_m['Start time'] = df_m['Start time'].replace('Noon', '12:00 PM')

In [77]:
df_m['start_dt'] = df_m['gamedate'] + ' ' + df_m['Start time']

In [78]:
df_m['start_dt'] = pd.to_datetime(df_m['start_dt'])

In [79]:
# Michigan's mgoblue gametimes all map to location's local time. Use dict to move each time back to EST.
time_dict = {'ILL.': 1, 'WIS.': 1, 'UTAH': 2, 'KAN.': 1, 'MINN.': 1, 'IOWA': 1, 'HAWAII': 6, 'ARK.': 1,
             'NEB.': 1, 'TEXAS': 1, 'ARIZ.': 2, 'CALIF.': 3, 'MO.': 1, 'HAWAI\'I': 1}

In [80]:
def time_converter(dt, state):
    if state not in time_dict:
        return dt
    else:
        return dt + datetime.timedelta(hours=time_dict[state])

In [81]:
df_m['start_dt'] = df_m.apply(lambda x: time_converter(x['start_dt'], x['US State']), axis=1)

In [82]:
df_m[-20:-1]

Unnamed: 0,index,school,sport,gamedate,type,opponent,conf,result,team_points,opp_points,...,l,streak,arena,attendance,rank,gametime,Game String,Start time,US State,start_dt
341,341,michigan,basketball,2019-01-19,REG,Wisconsin,Big Ten,L,54,64,...,1,L 1,Kohl Center,0.0,2.0,12:00p,Wisconsin on January 19 11:00 AM CT,11:00 AM,WIS.,2019-01-19 12:00:00
342,342,michigan,basketball,2019-01-22,REG,Minnesota,Big Ten,W,59,57,...,1,W 1,Crisler Arena,0.0,5.0,7:00p,Minnesota on January 22 7:00 PM,7:00 PM,MICH.,2019-01-22 19:00:00
343,343,michigan,basketball,2019-01-25,REG,Indiana,Big Ten,W,69,46,...,1,W 2,Assembly Hall,0.0,5.0,6:30p,Indiana on January 25 6:30 PM,6:30 PM,IND.,2019-01-25 18:30:00
344,344,michigan,basketball,2019-01-29,REG,Ohio State,Big Ten,W,65,49,...,1,W 3,Crisler Arena,0.0,5.0,9:00p,Ohio State on January 29 9:00 PM,9:00 PM,MICH.,2019-01-29 21:00:00
345,345,michigan,basketball,2019-02-01,REG,Iowa,Big Ten,L,59,74,...,2,L 1,Carver-Hawkeye Arena,0.0,5.0,7:00p,Iowa on February 1 6:00 PM CT,6:00 PM,IOWA,2019-02-01 19:00:00
346,346,michigan,basketball,2019-02-05,REG,Rutgers,Big Ten,W,77,65,...,2,W 1,Louis Brown Athletic Center,0.0,7.0,8:00p,Rutgers on February 5 8:00 PM,8:00 PM,N.J.,2019-02-05 20:00:00
347,347,michigan,basketball,2019-02-09,REG,Wisconsin (19),Big Ten,W,61,52,...,2,W 2,Crisler Arena,0.0,7.0,12:00p,Wisconsin on February 9 Noon,12:00 PM,MICH.,2019-02-09 12:00:00
348,348,michigan,basketball,2019-02-12,REG,Penn State,Big Ten,L,69,75,...,3,L 1,Bryce Jordan Center,0.0,6.0,8:30p,Penn State on February 12 8:30 PM,8:30 PM,PA.,2019-02-12 20:30:00
349,349,michigan,basketball,2019-02-16,REG,Maryland (24),Big Ten,W,65,52,...,3,W 1,Crisler Arena,0.0,6.0,12:00p,Maryland on February 16 Noon,12:00 PM,MICH.,2019-02-16 12:00:00
350,350,michigan,basketball,2019-02-21,REG,Minnesota,Big Ten,W,69,60,...,3,W 2,Williams Arena,0.0,7.0,7:00p,Minnesota on February 21 6:00 PM CT,6:00 PM,MINN.,2019-02-21 19:00:00



#### Concatenate


In [83]:
df_basketball = pd.concat([df_m, df_msu]).reset_index(drop=True)

In [84]:
df_basketball[(df_basketball['school'] == ('michigan')) 
              & (df_basketball['start_dt'] >= ('2014-10-01')) 
              & (df_basketball['start_dt'] <= ('2015-05-01'))]

Unnamed: 0,index,school,sport,gamedate,type,opponent,conf,result,team_points,opp_points,...,l,streak,arena,attendance,rank,gametime,Game String,Start time,US State,start_dt
177,177,michigan,basketball,2014-11-15,REG,Hillsdale,,W,92,68,...,0,W 1,Crisler Arena,0.0,24,2:00p,Hillsdale College on November 15 2:00 PM,2:00 PM,MICH.,2014-11-15 14:00:00
178,178,michigan,basketball,2014-11-17,REG,Bucknell,Patriot,W,77,53,...,0,W 2,Crisler Arena,0.0,24,8:00p,Bucknell on November 17 8:00 PM,8:00 PM,MICH.,2014-11-17 20:00:00
179,179,michigan,basketball,2014-11-20,REG,Detroit Mercy,Horizon,W,71,62,...,0,W 3,Crisler Arena,0.0,24,6:00p,Detroit on November 20 6:00 PM,6:00 PM,MICH.,2014-11-20 18:00:00
180,180,michigan,basketball,2014-11-24,REG,Oregon,Pac-12,W,70,63,...,0,W 4,Barclays Center,0.0,19,9:45p,Oregon on November 24 9:00 PM,9:00 PM,N.Y.,2014-11-24 21:00:00
181,181,michigan,basketball,2014-11-25,REG,Villanova (12),Big East,L,55,60,...,1,L 1,Barclays Center,0.0,19,10:15p,Villanova on November 25 10:00 PM,10:00 PM,N.Y.,2014-11-25 22:00:00
182,182,michigan,basketball,2014-11-29,REG,Nicholls State,Southland,W,91,62,...,1,W 1,Crisler Arena,0.0,19,4:00p,Nicholls State on November 29 4:00 PM,4:00 PM,MICH.,2014-11-29 16:00:00
183,183,michigan,basketball,2014-12-02,REG,Syracuse,ACC,W,68,65,...,1,W 2,Crisler Arena,0.0,17,7:30p,Syracuse on December 2 7:30 PM,7:30 PM,MICH.,2014-12-02 19:30:00
184,184,michigan,basketball,2014-12-06,REG,NJIT,Ind,L,70,72,...,2,L 1,Crisler Arena,0.0,17,12:00p,NJIT on December 6 12:00 PM,12:00 PM,MICH.,2014-12-06 12:00:00
185,185,michigan,basketball,2014-12-09,REG,Eastern Michigan,MAC,L,42,45,...,3,L 2,Crisler Arena,0.0,-,9:00p,Eastern Michigan on December 9 9:00 PM,9:00 PM,MICH.,2014-12-09 21:00:00
186,186,michigan,basketball,2014-12-13,REG,Arizona (3),Pac-12,L,53,80,...,4,L 3,McKale Center,0.0,-,5:15p,Arizona on December 13 3:25 PM,3:25 PM,ARIZ.,2014-12-13 17:25:00


In [85]:
df_basketball[-20:-1]

Unnamed: 0,index,school,sport,gamedate,type,opponent,conf,result,team_points,opp_points,...,l,streak,arena,attendance,rank,gametime,Game String,Start time,US State,start_dt
706,706,michigan state,basketball,2019-01-24,REG,Iowa (19),Big Ten,W,82,67,...,2,W 13,Carver-Hawkeye Arena,0.0,6.0,7:00p,#19 IOWA,7:00 PM,IOWA,2019-01-24 19:00:00
707,707,michigan state,basketball,2019-01-27,REG,Purdue,Big Ten,L,63,73,...,3,L 1,Mackey Arena,0.0,6.0,1:00p,PURDUE,1:00 PM,IND.,2019-01-27 13:00:00
708,708,michigan state,basketball,2019-02-02,REG,Indiana,Big Ten,L,75,79,...,4,L 2,Breslin Events Center,0.0,6.0,6:00p,INDIANA,6:00 PM,MICH. BRESLIN CENTER,2019-02-02 18:00:00
709,709,michigan state,basketball,2019-02-05,REG,Illinois,Big Ten,L,74,79,...,5,L 3,State Farm Center,0.0,9.0,7:00p,ILLINOIS,7:00 PM,ILL.,2019-02-05 19:00:00
710,710,michigan state,basketball,2019-02-09,REG,Minnesota,Big Ten,W,79,55,...,5,W 1,Breslin Events Center,0.0,9.0,2:00p,MINNESOTA,2:00 PM,MICH. BRESLIN CENTER,2019-02-09 14:00:00
711,711,michigan state,basketball,2019-02-12,REG,Wisconsin (20),Big Ten,W,67,59,...,5,W 2,Kohl Center,0.0,11.0,7:00p,#20 WISCONSIN,7:00 PM,WIS.,2019-02-12 19:00:00
712,712,michigan state,basketball,2019-02-17,REG,Ohio State,Big Ten,W,62,44,...,5,W 3,Breslin Events Center,0.0,11.0,1:00p,OHIO STATE,1:00 PM,MICH. BRESLIN CENTER,2019-02-17 13:00:00
713,713,michigan state,basketball,2019-02-20,REG,Rutgers,Big Ten,W,71,60,...,5,W 4,Breslin Events Center,0.0,10.0,6:30p,RUTGERS,6:30 PM,MICH. BRESLIN CENTER,2019-02-20 18:30:00
714,714,michigan state,basketball,2019-02-24,REG,Michigan (7),Big Ten,W,77,70,...,5,W 5,Crisler Arena,0.0,10.0,3:45p,#7 MICHIGAN,3:45 PM,MICH.,2019-02-24 15:45:00
715,715,michigan state,basketball,2019-03-02,REG,Indiana,Big Ten,L,62,63,...,6,L 1,Assembly Hall,0.0,6.0,12:00p,INDIANA,12:00 PM,IND.,2019-03-02 12:00:00


In [87]:
df_basketball = df_basketball.drop(['Start time', 'index'], axis=1)

In [89]:
# Let's grab the opponent's rank as a separate column - it's in the opponent column which includes the name of the opponent
df_basketball['opponent_rank'] = df_basketball['opponent'].str.extract(r'[A-Za-z\s\(]+([0-9]+)')
# make sure to include the sport
df_basketball['sport'] = 'basketball'
# We'll create the window to look for crime/incidents.  Each college game presumed to be about 2 hours + an 8-hour window for incidents.
df_basketball['end_inc_window'] = df_basketball['start_dt'] + pd.to_timedelta(10, unit='h')
df_basketball.sample(15)

Unnamed: 0,school,sport,gamedate,type,opponent,conf,result,team_points,opp_points,ot,...,streak,arena,attendance,rank,gametime,Game String,US State,start_dt,opponent_rank,end_inc_window
43,michigan,basketball,2010-12-23,REG,Bryant,NEC,W,87,71,,...,W 7,Crisler Arena,0.0,-,,Bryant on December 23 6:00 PM,MICH.,2010-12-23 18:00:00,,2010-12-24 04:00:00
336,michigan,basketball,2018-12-30,REG,Binghamton,AEC,W,74,52,,...,W 13,Crisler Arena,0.0,2,12:00p,Binghamton on December 30 Noon,MICH.,2018-12-30 12:00:00,,2018-12-30 22:00:00
680,michigan state,basketball,2018-02-17,REG,Northwestern,Big Ten,W,65,60,,...,W 10,Allstate Arena,0.0,2,2:00p,NORTHWESTERN,ILL.,2018-02-17 14:00:00,,2018-02-18 00:00:00
419,michigan state,basketball,2011-02-02,REG,Iowa,Big Ten,L,52,72,,...,L 1,Carver-Hawkeye Arena,0.0,-,,IOWA,IOWA,2011-02-02 20:30:00,,2011-02-03 06:30:00
708,michigan state,basketball,2019-02-02,REG,Indiana,Big Ten,L,75,79,OT,...,L 2,Breslin Events Center,0.0,6,6:00p,INDIANA,MICH. BRESLIN CENTER,2019-02-02 18:00:00,,2019-02-03 04:00:00
687,michigan state,basketball,2018-11-06,REG,Kansas (1),Big 12,L,87,92,,...,L 1,Bankers Life Fieldhouse,0.0,10,7:00p,#1 KANSAS,IND.,2018-11-06 19:00:00,1.0,2018-11-07 05:00:00
232,michigan,basketball,2016-02-06,REG,Michigan State (10),Big Ten,L,73,89,,...,L 2,Crisler Arena,0.0,-,2:00p,Michigan State on February 6 2:00 PM,MICH.,2016-02-06 14:00:00,10.0,2016-02-07 00:00:00
142,michigan,basketball,2013-11-17,REG,Iowa State,Big 12,L,70,77,,...,L 1,James H. Hilton Coliseum,0.0,7,,Iowa State on November 17 4:00 PM,IOWA,2013-11-17 17:00:00,,2013-11-18 03:00:00
7,michigan,basketball,2009-12-09,REG,Utah,MWC,L,52,68,,...,L 1,Jon M. Huntsman Center,0.0,-,,Utah on December 9 7:00 PM,UTAH,2009-12-09 21:00:00,,2009-12-10 07:00:00
704,michigan state,basketball,2019-01-17,REG,Nebraska,Big Ten,W,70,64,,...,W 11,Pinnacle Bank Arena,0.0,6,8:00p,NEBRASKA,NEB.,2019-01-17 20:00:00,,2019-01-18 06:00:00


In [91]:
df_basketball.to_csv(r'../04_finaldata/df_basketball_final.csv', sep=',')