In [1]:
import pandas as pd
import json
import re

import time
import datetime

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service

[Selenium](#selenium)
  - [Game strings](#game_strings)
  - [Start times](#start_times)
  - [US state](#us_state)
  - ~[Attendance](#attendance)~

[CBB DataFrame](#cbb_df)\
[Selenium DataFrame](#selenium_df)\
[Final DataFrame](#final_df)

<a id='selenium'></a>


&nbsp;

# Selenium



Data sourced from [sports-reference.com](https://www.sports-reference.com/cbb) only had *half* of our basketball game start times, between 2015 and 2019, for our ten-year period.\
\
[mgoblue.com](https://mgoblue.com/sports/mens-basketball/schedule/2012-13) also shifts Michigan game start times to their Eastern Standard Time equivalents while Michigan's sport-reference.com data does not, e.g.:
- A game is played in Pacific Standard Time location:
  - mgoblue.com will have a game start at **8:59pm** EST while...
  - sports-reference.com will have a game start at **11:59pm**
---
To fill in the proper start times for our basketball games, we needed to scrape each program's basketball schedule site for:
  - The game string, to drop exhibition games absent from sports-reference.com data
  - The game start time
  - The state the game was played in, to shift times to their proper EST equivalent
  - ~The game's attendance - this was later dropped. Michigan State does not post attendance values on msusparts.com.~
---
Each program's basketball scheudle data had its own idiosyncracies / style.\
\
The approach taken was to collect the data for each, clean & manipulate, then combine and append to the College Basketball DataFrame (the basketball data sourced from sports-reference.com)


In [2]:
seasons = ['2009-10', '2010-11', '2011-12', '2012-13', '2013-14', '2014-15', '2015-16', '2016-17', '2017-18', '2018-19']

s = Service('../07_assets/chromedriver.exe')
browser = webdriver.Chrome(service=s)

<a id='game_strings'></a>


&nbsp;

### Obtain game string



#### Michigan


In [3]:
%%time

m_strings = []
for season in seasons: # convert first for loop to function and call in subsequent cells
    url = f'https://mgoblue.com/sports/mens-basketball/schedule/{season}'
    browser.get(url)
    time.sleep(2)
    browser.implicitly_wait(0.5)
    
    # pull strings from anchor tags
    elements = browser.find_elements(By.TAG_NAME, 'a') # convert this block, too, to a function w/ arg for tag name
    for e in elements:
        if 'Story' in e.accessible_name or 'Notes' in e.accessible_name or 'Box score' in e.accessible_name or 'Boxscore' in e.accessible_name or 'Boxcsore' in e.accessible_name or 'Stats for' in e.accessible_name or 'Quotes' in e.accessible_name or "Photos for" in e.accessible_name:
            pass
        elif ' PM' in e.accessible_name or ' AM' in e.accessible_name or ' p.m.' in e.accessible_name or 'Noon' in e.accessible_name or ' noon' in e.accessible_name or '(Exhib.)' in e.accessible_name:
            m_strings.append(e.accessible_name)
    browser.implicitly_wait(0.5)

Wall time: 5min 12s


In [4]:
print(len(m_strings))
m_strings

373


['Wayne State (EX) on November 6 7:00 PM',
 'Northern Michigan on November 14 7:00 PM',
 'Houston Baptist on November 20 7:00 PM',
 'Creighton on November 26 12:00 PM',
 'Marquette on November 27 12:00 PM',
 'Alabama on November 29 5:00 PM',
 'Boston College on December 2 7:30 PM',
 'Arkansas - Pine Bluff on December 5 2:00 PM',
 'Utah on December 9 7:00 PM',
 'Detroit on December 13 12:00 PM',
 'Kansas on December 19 11:00 AM',
 'Coppin State on December 22 7:00 PM',
 'Indiana on December 31 12:00 PM',
 'Ohio State on January 3 4:30 PM',
 'Penn State on January 7 7:00 PM',
 'Northwestern on January 10 2:30 PM',
 'Indiana on January 14 9:00 PM',
 'Connecticut on January 17 1:30 PM',
 'Wisconsin on January 20 7:30 PM',
 'Purdue on January 23 4:00 PM',
 'Michigan State on January 26 7:00 PM',
 'Iowa on January 30 4:35 PM',
 'Northwestern on February 2 6:00 PM',
 'Wisconsin on February 6 4:00 PM',
 'Minnesota on February 11 6:00 PM',
 'Iowa on February 16 8:05 PM',
 'Penn State on Februar

In [5]:
m_strings = pd.Series(m_strings, name='Game String')

#### Michigan State

In [6]:
%%time

msu_strings = []
for season in seasons:
    url = f'https://msuspartans.com/sports/mens-basketball/schedule/{season}'
    browser.get(url)
    time.sleep(2)
    browser.implicitly_wait(0.5)
    
    # pull strings from anchor tags
    elements = browser.find_elements(By.XPATH, "//div[@class='sidearm-schedule-game-opponent-name']")
    for e in elements:
        msu_strings.append(e.text)
    browser.implicitly_wait(0.5)

Wall time: 33.1 s


In [7]:
print(len(msu_strings)) # 390
msu_strings

390


['GREEN AND WHITE GAME',
 'NORTHWOOD UNIVERSITY (EXHIB.)',
 'GRAND VALLEY STATE',
 'FLORIDA GULF COAST',
 'GONZAGA',
 'TOLEDO',
 'VALPARAISO',
 'FLORIDA',
 'MASSACHUSETTS',
 'NORTH CAROLINA (BIG TEN/ACC CHALLENGE)',
 'WOFFORD',
 'THE CITADEL',
 'OAKLAND',
 'IPFW',
 'TEXAS',
 'TEXAS-ARLINGTON',
 'NORTHWESTERN',
 'WISCONSIN',
 'IOWA',
 'MINNESOTA',
 'ILLINOIS',
 'IOWA',
 'MINNESOTA',
 'MICHIGAN',
 'NORTHWESTERN',
 'WISCONSIN',
 'ILLINOIS',
 'PURDUE',
 'PENN STATE',
 'INDIANA',
 'OHIO STATE',
 'PURDUE',
 'PENN STATE',
 'MICHIGAN',
 'MINNESOTA',
 'NEW MEXICO STATE',
 'MARYLAND',
 'NORTHERN IOWA',
 'TENNESSEE',
 'BUTLER',
 'MIDNIGHT MADNESS',
 'SAGINAW VALLEY STATE',
 'NEBRASKA-OMAHA',
 'EASTERN MICHIGAN',
 'SOUTH CAROLINA',
 'CHAMINADE',
 'CONNECTICUT',
 'WASHINGTON',
 'TENNESSEE TECH',
 'DUKE (ACC/BIG TEN CHALLENGE)',
 'BOWLING GREEN',
 'SYRACUSE',
 'OAKLAND',
 'PRAIRIE VIEW A&M',
 'TEXAS',
 'MINNESOTA',
 'NORTHWESTERN',
 'PENN STATE',
 'WISCONSIN',
 'NORTHWESTERN',
 'ILLINOIS',
 'PURDUE'

In [8]:
msu_strings = pd.Series(msu_strings, name='Game String')

<a id='start_times'></a>


&nbsp;

### Obtain game start times



#### Michigan


In [9]:
%%time

m_times = []
for season in seasons:
    url = f'https://mgoblue.com/sports/mens-basketball/schedule/{season}'
    browser.get(url)
    time.sleep(2)
    browser.implicitly_wait(0.5)
    
    # pull times from anchor tags
    elements = browser.find_elements(By.TAG_NAME, 'a')
    for e in elements:
        if ' vs ' in e.accessible_name or ' at ' in e.accessible_name:
            pass
        elif ' PM' in e.accessible_name or ' p.m.' in e.accessible_name or ' AM' in e.accessible_name or ' ET' in e.accessible_name or ' Noon' in e.accessible_name or ' noon' in e.accessible_name:
            m_times.append(re.findall('\d+:\d+ \w+|(?<=\d )\d+ [\w\.]+|[Nn]oon', e.accessible_name)[0])
    browser.implicitly_wait(0.5)

Wall time: 4min 2s


In [10]:
print(len(m_times))
m_times

373


['7:00 PM',
 '7:00 PM',
 '7:00 PM',
 '12:00 PM',
 '12:00 PM',
 '5:00 PM',
 '7:30 PM',
 '2:00 PM',
 '7:00 PM',
 '12:00 PM',
 '11:00 AM',
 '7:00 PM',
 '12:00 PM',
 '4:30 PM',
 '7:00 PM',
 '2:30 PM',
 '9:00 PM',
 '1:30 PM',
 '7:30 PM',
 '4:00 PM',
 '7:00 PM',
 '4:35 PM',
 '6:00 PM',
 '4:00 PM',
 '6:00 PM',
 '8:05 PM',
 '6:00 PM',
 '7:00 PM',
 '12:00 PM',
 '7:00 PM',
 '4:00 PM',
 '2:30 PM',
 '12:00 PM',
 '7:00 PM',
 '7:00 PM',
 '7:00 PM',
 '2:00 PM',
 '8:00 PM',
 '5:30 PM',
 '9:00 PM',
 '1:00 PM',
 '7:30 PM',
 '6:30 PM',
 '7:00 PM',
 '12:00 PM',
 '6:00 PM',
 '2:00 PM',
 '4:00 PM',
 '7:30 PM',
 '4:30 PM',
 '6:30 PM',
 '8:00 PM',
 '8:00 PM',
 '7:00 PM',
 '7:00 PM',
 '4:00 PM',
 '7:00 PM',
 '12:00 PM',
 '6:30 PM',
 '4:00 PM',
 '7:30 PM',
 '3:30 PM',
 '6:30 PM',
 '3:30 PM',
 '2:00 PM',
 '2:30 PM',
 '1:40 PM',
 '12:40 PM',
 '2:45 PM',
 '7:00 PM',
 '7:00 PM',
 '8:30 PM',
 '8:30 PM',
 '10:00 AM',
 '2:00 PM',
 '2:30 PM',
 '7:00 PM',
 '12:00 PM',
 '4:00 PM',
 '7:00 PM',
 '12:00 PM',
 '6:30 PM',
 '7

In [11]:
m_times = pd.Series(m_times, name='Start time')


#### Michigan State


In [12]:
%%time

msu_times = []
for season in seasons:
    url = f'https://msuspartans.com/sports/mens-basketball/schedule/{season}'
    browser.get(url)
    time.sleep(2)
    browser.implicitly_wait(0.5)
    
    # pull times from anchor tags
    elements = browser.find_elements(By.XPATH, "//div[@class='sidearm-schedule-game-opponent-date flex-item-1']/span/following-sibling::span")
    for e in elements:
        if e.text == '':
            pass
        else:
            msu_times.append(e.text)
    browser.implicitly_wait(0.5)

Wall time: 37.3 s


In [13]:
print(len(msu_times))
msu_times

390


['3:30 PM',
 '4:00 PM',
 '7:00 PM',
 '7:00 PM',
 '8:00 PM',
 '6:30 PM',
 '12:00 PM',
 '8:00 PM',
 '5:30 PM',
 '9:00 PM',
 '7:00 PM',
 '7:00 PM',
 '7:00 PM',
 '12:00 PM',
 '7:00 PM',
 '7:00 PM',
 '6:30 PM',
 '6:30 PM',
 '5:35 PM',
 '6:30 PM',
 '3:30 PM',
 '6:30 PM',
 '12:00 PM',
 '7:00 PM',
 '7:00 PM',
 '9:00 PM',
 '9:00 PM',
 '9:00 PM',
 '12:00 PM',
 '7:00 PM',
 '12:00 PM',
 '4:00 PM',
 '7:00 PM',
 '4:00 PM',
 '9:00 PM',
 '7:20 PM',
 '2:30 PM',
 '9:37 PM',
 '2:20 PM',
 '6:07 PM',
 '9:30 PM',
 '7:00 PM',
 '7:00 PM',
 '8:30 PM',
 '10:00 PM',
 '9:30 PM',
 '7:00 PM',
 '5:00 PM',
 '1:00 PM',
 '9:30 PM',
 '1:30 PM',
 '9:00 PM',
 '12:30 PM',
 '6:30 PM',
 '7:00 PM',
 '4:00 PM',
 '7:30 PM',
 '1:00 PM',
 '7:00 PM',
 '1:00 PM',
 '7:00 PM',
 '9:00 PM',
 '7:00 PM',
 '6:00 PM',
 '8:30 PM',
 '1:00 PM',
 '7:00 PM',
 '9:00 PM',
 '9:00 PM',
 '9:00 PM',
 '1:00 PM',
 '6:30 PM',
 '2:00 PM',
 '5:00 PM',
 '6:30 PM',
 '4:15 PM',
 '9:20 PM',
 '2:00 PM',
 '7:00 PM',
 '7:00 PM',
 '7:00 PM',
 '6:30 PM',
 '6:00 PM

In [14]:
msu_times = pd.Series(msu_times, name='Start time')

<a id='us_state'></a>


&nbsp;

### Obtain US state locations



#### Michigan


In [15]:
%%time

m_states = []
for season in seasons:
    url = (f'https://mgoblue.com/sports/mens-basketball/schedule/{season}')
    browser.get(url)
    browser.implicitly_wait(0.5)
    
    # pull states out of class
    elements = browser.find_elements(By.CLASS_NAME, 'sidearm-schedule-game-location')
    for e in elements:
        if e.text == 'CRISLER CENTER':
            m_states.append('MICH.')
        else:
            m_states.append(re.findall('(?<=, ).+', e.text)[0])
    browser.implicitly_wait(0.5)

Wall time: 37.6 s


In [16]:
len(m_states)

373

In [17]:
m_states = pd.Series(m_states, name='US State')


#### Michigan State


In [18]:
%%time

msu_states = []
for season in seasons:
    url = (f'https://msuspartans.com/sports/mens-basketball/schedule/{season}')
    browser.get(url)
    browser.implicitly_wait(0.5)
    
    # pull states out of class
    elements = browser.find_elements(By.CLASS_NAME, 'sidearm-schedule-game-location')
    for e in elements:
        if e.text == 'CRISLER CENTER':
            states.append('MICH.')
        else:
            msu_states.append(re.findall('(?<=, ).+', e.text)[0])
    browser.implicitly_wait(0.5)

Wall time: 18.9 s


In [19]:
len(msu_states)

390

In [20]:
msu_states = pd.Series(msu_states, name='US State')

<a id='attendance'></a>


&nbsp;

### ~Obtain attendance~


In [None]:
%%time

attendance = []
for season in seasons:
    url = f'https://mgoblue.com/sports/mens-basketball/schedule/{season}'
    browser.get(url)
    browser.implicitly_wait(0.5)

    # open each game's flyout
    elements = browser.find_elements(By.XPATH, "//button[@class='sidearm-schedule-game-toggle noprint']")
    for e in elements:
        browser.execute_script("arguments[0].click();", e)
        browser.implicitly_wait(0.5)
        time.sleep(1)

    # grab each game's attendance from <dd> tag after expanding each game
    elements = browser.find_elements(By.XPATH, "//div[@class='sidearm-schedule-game-extra-leaders flex-item-1']//dd[@data-bind='text: attendance']")
    for e in elements:
        if e.text != '':
            attendance.append(e.get_attribute('textContent'))
            browser.implicitly_wait(0.5)
            time.sleep(0.2)
    browser.implicitly_wait(0.5)

In [None]:
print(len(attendance))
attendance

In [None]:
# insert empty values for the three exhibition games Michigan played w/out <dd> tags
attendance.insert(0, '') # Wayne State EX game with no attendance recorded
attendance.insert(33, '') # EX game with no attendance recorded
attendance.insert(69, '') # EX game with no attendance recorded
attendance.insert(341, '') # George Washington game had no attendance recorded

In [None]:
attendance_series = pd.Series(attendance, name='Attendance')

<a id='cbb_df'></a>


&nbsp;

# CBB DataFrame


In [72]:
df_cbb = pd.read_csv(r'C:\Users\RJ\OneDrive\education\mads\591_592\milestone\02_sport_rawdata\cbb_2010_2019.csv')

In [73]:
print(len(df_cbb[(df_cbb['school'] == 'michigan')]))
print(len(df_cbb[(df_cbb['school'] == 'michigan-state')]))

361
365


In [74]:
df_cbb.dtypes

Unnamed: 0      int64
school         object
gamedate       object
type           object
opponent       object
conf           object
result         object
team_points     int64
opp_points      int64
ot             object
w               int64
l               int64
streak         object
arena          object
rank           object
gametime       object
dtype: object

In [75]:
# df_cbb['gamedate'] = pd.to_datetime(df_cbb['gamedate'])

In [76]:
df_cbb[(df_cbb['school'] == ('michigan-state')) 
        & (df_cbb['gamedate'] >= ('2015-10-01')) 
        & (df_cbb['gamedate'] <= ('2016-05-01'))]

Unnamed: 0.1,Unnamed: 0,school,gamedate,type,opponent,conf,result,team_points,opp_points,ot,w,l,streak,arena,rank,gametime
582,612,michigan-state,2015-11-13,REG,Florida Atlantic,CUSA,W,82,55,,1,0,W 1,Breslin Events Center,13.0,7:00p
583,613,michigan-state,2015-11-17,REG,Kansas (4),Big 12,W,79,73,,2,0,W 2,United Center,13.0,10:00p
584,614,michigan-state,2015-11-20,REG,Arkansas-Pine Bluff,SWAC,W,92,46,,3,0,W 3,Breslin Events Center,13.0,7:00p
585,615,michigan-state,2015-11-23,REG,Eastern Michigan,MAC,W,89,65,,4,0,W 4,Breslin Events Center,3.0,7:00p
586,616,michigan-state,2015-11-26,REG,Boston College,ACC,W,99,68,,5,0,W 5,Titan Gym,3.0,6:30p
587,617,michigan-state,2015-11-27,REG,Boise State,MWC,W,77,67,,6,0,W 6,Titan Gym,3.0,5:30p
588,618,michigan-state,2015-11-29,REG,Providence,Big East,W,77,64,,7,0,W 7,Honda Center,3.0,10:00p
589,619,michigan-state,2015-12-02,REG,Louisville (24),ACC,W,71,67,,8,0,W 8,Breslin Events Center,3.0,7:15p
590,620,michigan-state,2015-12-05,REG,Binghamton,AEC,W,76,33,,9,0,W 9,Breslin Events Center,3.0,12:00p
591,621,michigan-state,2015-12-09,REG,Maryland-Eastern Shore,MEAC,W,78,35,,10,0,W 10,Breslin Events Center,1.0,7:00p



Michigan's game times differ from those of sports-reference.com in that Michigan applies timezone adjustment.


In [77]:
df_cbb[df_cbb['school'] == 'michigan'].tail()

Unnamed: 0.1,Unnamed: 0,school,gamedate,type,opponent,conf,result,team_points,opp_points,ot,w,l,streak,arena,rank,gametime
356,373,michigan,2019-03-16,CTOURN,Minnesota,Big Ten,W,76,49,,28,5,W 2,United Center,10.0,3:30p
357,374,michigan,2019-03-17,CTOURN,Michigan State (6),Big Ten,L,60,65,,28,6,L 1,United Center,10.0,3:30p
358,376,michigan,2019-03-21,NCAA,Montana,Big Sky,W,74,55,,29,6,W 1,Wells Fargo Arena,,9:20p
359,377,michigan,2019-03-23,NCAA,Florida,SEC,W,64,49,,30,6,W 2,Wells Fargo Arena,,5:15p
360,378,michigan,2019-03-28,NCAA,Texas Tech (9),Big 12,L,44,63,,30,7,L 1,Honda Center,,9:39p


<a id='selenium_df'></a>


&nbsp;

# Selenium DataFrame


#### Michgian

In [48]:
m_selenium = pd.concat([m_strings, m_times, m_states], axis=1)
m_selenium

Unnamed: 0,Game String,Start time,US State
0,Wayne State (EX) on November 6 7:00 PM,7:00 PM,MICH.
1,Northern Michigan on November 14 7:00 PM,7:00 PM,MICH.
2,Houston Baptist on November 20 7:00 PM,7:00 PM,MICH.
3,Creighton on November 26 12:00 PM,12:00 PM,FLA.
4,Marquette on November 27 12:00 PM,12:00 PM,FLA.
...,...,...,...
368,Minnesota on March 16 2:30 PM CT,2:30 PM,ILL.
369,Michigan State on March 17 2:30 PM CT,2:30 PM,ILL.
370,Montana on March 21 8:20 PM CT,8:20 PM,IOWA
371,Florida on March 23 4:15 PM CT,4:15 PM,IOWA


In [49]:
# remove rows where '(EX)' in 'Game String' before pd.concat w/ df_cbb
m_selenium = m_selenium[~m_selenium['Game String'].str.contains('EXHIBITION|\([Ee][Xx]\)|EXHIB')]

In [50]:
m_selenium = m_selenium.reset_index(drop=True)

In [51]:
# drop exhibition games by row number (improve)
# m_selenium = m_selenium.drop([0, 38, 39, 97, 121, 122, 124, 151, 152, 191, 192, 232, 233, 269, 270, 306, 307, 308, 344])

In [52]:
# m_selenium.to_csv(r'C:\Users\RJ\Downloads\m_selenium.csv')

In [53]:
m_selenium = m_selenium.reset_index(drop=True)
m_selenium

Unnamed: 0,Game String,Start time,US State
0,Northern Michigan on November 14 7:00 PM,7:00 PM,MICH.
1,Houston Baptist on November 20 7:00 PM,7:00 PM,MICH.
2,Creighton on November 26 12:00 PM,12:00 PM,FLA.
3,Marquette on November 27 12:00 PM,12:00 PM,FLA.
4,Alabama on November 29 5:00 PM,5:00 PM,FLA.
...,...,...,...
356,Minnesota on March 16 2:30 PM CT,2:30 PM,ILL.
357,Michigan State on March 17 2:30 PM CT,2:30 PM,ILL.
358,Montana on March 21 8:20 PM CT,8:20 PM,IOWA
359,Florida on March 23 4:15 PM CT,4:15 PM,IOWA


In [35]:
len(df_cbb[df_cbb['school'] == 'michigan'])

361

#### Michigan State

In [36]:
msu_selenium = pd.concat([msu_strings, msu_times, msu_states], axis=1)
msu_selenium

Unnamed: 0,Game String,Start time,US State
0,GREEN AND WHITE GAME,3:30 PM,MICH.
1,NORTHWOOD UNIVERSITY (EXHIB.),4:00 PM,MICH.
2,GRAND VALLEY STATE,7:00 PM,MICH.
3,FLORIDA GULF COAST,7:00 PM,MICH.
4,GONZAGA,8:00 PM,MICH.
...,...,...,...
385,BRADLEY,2:45 PM ET,IOWA / WELLS FARGO ARENA
386,MINNESOTA,7:45 PM ET,IOWA / WELLS FARGO ARENA
387,#12 LSU,7 PM ET,D.C. / CAPITAL ONE ARENA
388,#1 DUKE,5:05 PM ET,D.C. / CAPITAL ONE ARENA


In [37]:
# remove rows where '(EX)' in 'Game String' before pd.concat w/ df_cbb
msu_selenium = msu_selenium[~msu_selenium['Game String'].str.contains('EXHIBITION|\(EX\)|EXHIB|MIDNIGHT|GREEN AND WHITE')]

In [38]:
msu_selenium = msu_selenium.reset_index(drop=True)

In [39]:
# drop exhibition games by row number (improve)
msu_selenium = msu_selenium.drop([0, 38, 39, 97, 121, 122, 124, 151, 152, 191, 192, 232, 233, 269, 270, 306, 307, 308, 344])

In [40]:
# msu_selenium.to_csv(r'C:\Users\RJ\Downloads\df_msu.csv')

In [41]:
msu_selenium = msu_selenium.reset_index(drop=True)
msu_selenium

Unnamed: 0,Game String,Start time,US State
0,FLORIDA GULF COAST,7:00 PM,MICH.
1,GONZAGA,8:00 PM,MICH.
2,TOLEDO,6:30 PM,MICH.
3,VALPARAISO,12:00 PM,MICH.
4,FLORIDA,8:00 PM,N.J.
...,...,...,...
360,BRADLEY,2:45 PM ET,IOWA / WELLS FARGO ARENA
361,MINNESOTA,7:45 PM ET,IOWA / WELLS FARGO ARENA
362,#12 LSU,7 PM ET,D.C. / CAPITAL ONE ARENA
363,#1 DUKE,5:05 PM ET,D.C. / CAPITAL ONE ARENA


In [43]:
len(df_cbb[df_cbb['school'] == 'michigan-state'])

365

<a id='final_df'></a>


&nbsp;

# Final College Basketball DataFrame


Split Michigan and Michigan State into separate DataFrames, concat columns w/ axis=1, then concat vertically w/ axis=0


#### Michigan State


In [78]:
df_msu = df_cbb[df_cbb['school'] == 'michigan-state']

In [79]:
df_msu = df_msu.reset_index()

In [80]:
df_msu = pd.concat([df_msu, msu_selenium], axis=1)

In [81]:
# reimport df_msu from file - used so you don't have to scrape from Selenium each time
# df_msu = pd.read_csv(r'../04_finaldata/bball_data_michigan_state.csv')

In [82]:
df_msu['Start time'][326:365] = df_msu['Start time'][326:365].replace({'\.': '', 
                                      ' ET': '',
                                      'NOON': '12:00 PM',
                                      '(?<!\d:\d\d)(?= PM)': ':00'}, regex=True)
df_msu['Start time'][326:365] = df_msu['Start time'][326:365].replace({'7 PM': '7:00 PM', '6 PM': '6:00 PM'})
df_msu['Start time'][326:365] = df_msu['Start time'][326:365].replace('(?=\/)\/.+', '', regex=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_msu['Start time'][326:365] = df_msu['Start time'][326:365].replace({'\.': '',
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_msu['Start time'][326:365] = df_msu['Start time'][326:365].replace({'7 PM': '7:00 PM', '6 PM': '6:00 PM'})
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_msu['Start time'][326:365] = df_msu['Start time'][326:365].replace('(?=\/)\/.+', '', regex=True)


In [83]:
df_msu['start_dt'] = df_msu['gamedate'] + ' ' + df_msu['Start time']

In [84]:
df_msu['start_dt']

0       2009-11-13 7:00 PM
1       2009-11-17 8:00 PM
2       2009-11-20 6:30 PM
3      2009-11-22 12:00 PM
4       2009-11-27 8:00 PM
              ...         
360     2019-03-21 2:45 PM
361     2019-03-23 7:45 PM
362     2019-03-29 7:00 PM
363     2019-03-31 5:05 PM
364     2019-04-06 8:49 PM
Name: start_dt, Length: 365, dtype: object

In [85]:
df_msu['start_dt'] = pd.to_datetime(df_msu['start_dt'])

In [86]:
df_msu[(df_msu['gamedate'] >= ('2018-10-01')) 
       & (df_msu['gamedate'] <= ('2019-05-01'))]

Unnamed: 0.1,index,Unnamed: 0,school,gamedate,type,opponent,conf,result,team_points,opp_points,...,w,l,streak,arena,rank,gametime,Game String,Start time,US State,start_dt
326,687,723,michigan-state,2018-11-06,REG,Kansas (1),Big 12,L,87,92,...,0,1,L 1,Bankers Life Fieldhouse,10.0,7:00p,#1 KANSAS,7:00 PM,IND.,2018-11-06 19:00:00
327,688,724,michigan-state,2018-11-11,REG,Florida Gulf Coast,A-Sun,W,106,82,...,1,1,W 1,Breslin Events Center,10.0,6:00p,FLORIDA GULF COAST,6:00 PM,MICH. BRESLIN CENTER,2018-11-11 18:00:00
328,689,725,michigan-state,2018-11-14,REG,Louisiana-Monroe,Sun Belt,W,80,59,...,2,1,W 2,Breslin Events Center,11.0,7:00p,LOUISIANA MONROE,7:00 PM,MICH. BRESLIN CENTER,2018-11-14 19:00:00
329,690,726,michigan-state,2018-11-18,REG,Tennessee Tech,OVC,W,101,33,...,3,1,W 3,Breslin Events Center,11.0,6:00p,TENNESSEE TECH,6:00 PM,MICH. BRESLIN CENTER,2018-11-18 18:00:00
330,691,727,michigan-state,2018-11-22,REG,UCLA (17),Pac-12,W,87,67,...,4,1,W 4,Orleans Arena,11.0,10:00p,#17 UCLA,10:00 PM,NEV.,2018-11-22 22:00:00
331,692,728,michigan-state,2018-11-23,REG,Texas,Big 12,W,78,68,...,5,1,W 5,Orleans Arena,11.0,6:30p,TEXAS,6:30 PM,NEV.,2018-11-23 18:30:00
332,693,729,michigan-state,2018-11-27,REG,Louisville,ACC,L,78,82,...,5,2,L 1,KFC Yum! Center,9.0,7:30p,LOUISVILLE,7:30 PM,KY.,2018-11-27 19:30:00
333,694,730,michigan-state,2018-11-30,REG,Rutgers,Big Ten,W,78,67,...,6,2,W 1,Louis Brown Athletic Center,9.0,6:00p,RUTGERS,6:00 PM,N.J.,2018-11-30 18:00:00
334,695,731,michigan-state,2018-12-03,REG,Iowa (18),Big Ten,W,90,68,...,7,2,W 2,Breslin Events Center,10.0,6:30p,#18 IOWA,6:30 PM,MICH. BRESLIN CENTER,2018-12-03 18:30:00
335,696,732,michigan-state,2018-12-08,REG,Florida,SEC,W,63,59,...,8,2,W 3,Stephen C. O'Connell Center,10.0,12:00p,FLORIDA,12:00 PM,FLA.,2018-12-08 12:00:00


In [87]:
# df_msu.to_csv(r'C:\Users\RJ\Downloads\bball_data_michigan_state.csv')


#### Michigan


In [88]:
df_m = df_cbb[df_cbb['school'] == 'michigan']

In [89]:
df_m = df_m.reset_index()

In [90]:
df_m = pd.concat([df_m, m_selenium], axis=1)

In [91]:
# reimport df w/ Michigan bball data scraped w/ Selenium - used so you don't have to scrape from Selenium each time
# df_m = pd.read_csv(r'../04_finaldata/bball_data_with_michigans_numbers.csv', usecols=range(2, 20))

In [92]:
df_m['Start time'] = df_m['Start time'].replace('Noon', '12:00 PM')

In [93]:
df_m['start_dt'] = df_m['gamedate'] + ' ' + df_m['Start time']

In [94]:
df_m['start_dt'] = pd.to_datetime(df_m['start_dt'])

In [95]:
# Michigan's mgoblue gametimes all map to location's local time. Use dict to move each time back to EST.
time_dict = {'ILL.': 1, 'WIS.': 1, 'UTAH': 2, 'KAN.': 1, 'MINN.': 1, 'IOWA': 1, 'HAWAII': 6, 'ARK.': 1,
             'NEB.': 1, 'TEXAS': 1, 'ARIZ.': 2, 'CALIF.': 3, 'MO.': 1, 'HAWAI\'I': 1}

In [96]:
def time_converter(dt, state):
    if state not in time_dict:
        return dt
    else:
        return dt + datetime.timedelta(hours=time_dict[state])

In [97]:
df_m['start_dt'] = df_m.apply(lambda x: time_converter(x['start_dt'], x['US State']), axis=1)

In [98]:
df_m[-20:-1]

Unnamed: 0.1,index,Unnamed: 0,school,gamedate,type,opponent,conf,result,team_points,opp_points,...,w,l,streak,arena,rank,gametime,Game String,Start time,US State,start_dt
341,341,357,michigan,2019-01-19,REG,Wisconsin,Big Ten,L,54,64,...,17,1,L 1,Kohl Center,2.0,12:00p,Wisconsin on January 19 11:00 AM CT,11:00 AM,WIS.,2019-01-19 12:00:00
342,342,358,michigan,2019-01-22,REG,Minnesota,Big Ten,W,59,57,...,18,1,W 1,Crisler Arena,5.0,7:00p,Minnesota on January 22 7:00 PM,7:00 PM,MICH.,2019-01-22 19:00:00
343,343,359,michigan,2019-01-25,REG,Indiana,Big Ten,W,69,46,...,19,1,W 2,Assembly Hall,5.0,6:30p,Indiana on January 25 6:30 PM,6:30 PM,IND.,2019-01-25 18:30:00
344,344,360,michigan,2019-01-29,REG,Ohio State,Big Ten,W,65,49,...,20,1,W 3,Crisler Arena,5.0,9:00p,Ohio State on January 29 9:00 PM,9:00 PM,MICH.,2019-01-29 21:00:00
345,345,361,michigan,2019-02-01,REG,Iowa,Big Ten,L,59,74,...,20,2,L 1,Carver-Hawkeye Arena,5.0,7:00p,Iowa on February 1 6:00 PM CT,6:00 PM,IOWA,2019-02-01 19:00:00
346,346,362,michigan,2019-02-05,REG,Rutgers,Big Ten,W,77,65,...,21,2,W 1,Louis Brown Athletic Center,7.0,8:00p,Rutgers on February 5 8:00 PM,8:00 PM,N.J.,2019-02-05 20:00:00
347,347,363,michigan,2019-02-09,REG,Wisconsin (19),Big Ten,W,61,52,...,22,2,W 2,Crisler Arena,7.0,12:00p,Wisconsin on February 9 Noon,12:00 PM,MICH.,2019-02-09 12:00:00
348,348,364,michigan,2019-02-12,REG,Penn State,Big Ten,L,69,75,...,22,3,L 1,Bryce Jordan Center,6.0,8:30p,Penn State on February 12 8:30 PM,8:30 PM,PA.,2019-02-12 20:30:00
349,349,365,michigan,2019-02-16,REG,Maryland (24),Big Ten,W,65,52,...,23,3,W 1,Crisler Arena,6.0,12:00p,Maryland on February 16 Noon,12:00 PM,MICH.,2019-02-16 12:00:00
350,350,366,michigan,2019-02-21,REG,Minnesota,Big Ten,W,69,60,...,24,3,W 2,Williams Arena,7.0,7:00p,Minnesota on February 21 6:00 PM CT,6:00 PM,MINN.,2019-02-21 19:00:00



#### Concatenate


In [99]:
df_basketball = pd.concat([df_m, df_msu]).reset_index(drop=True)

In [100]:
df_basketball[(df_basketball['school'] == ('michigan')) 
              & (df_basketball['start_dt'] >= ('2014-10-01')) 
              & (df_basketball['start_dt'] <= ('2015-05-01'))]

Unnamed: 0.1,index,Unnamed: 0,school,gamedate,type,opponent,conf,result,team_points,opp_points,...,w,l,streak,arena,rank,gametime,Game String,Start time,US State,start_dt
177,177,186,michigan,2014-11-15,REG,Hillsdale,,W,92,68,...,1,0,W 1,Crisler Arena,24,2:00p,Hillsdale College on November 15 2:00 PM,2:00 PM,MICH.,2014-11-15 14:00:00
178,178,187,michigan,2014-11-17,REG,Bucknell,Patriot,W,77,53,...,2,0,W 2,Crisler Arena,24,8:00p,Bucknell on November 17 8:00 PM,8:00 PM,MICH.,2014-11-17 20:00:00
179,179,188,michigan,2014-11-20,REG,Detroit Mercy,Horizon,W,71,62,...,3,0,W 3,Crisler Arena,24,6:00p,Detroit on November 20 6:00 PM,6:00 PM,MICH.,2014-11-20 18:00:00
180,180,189,michigan,2014-11-24,REG,Oregon,Pac-12,W,70,63,...,4,0,W 4,Barclays Center,19,9:45p,Oregon on November 24 9:00 PM,9:00 PM,N.Y.,2014-11-24 21:00:00
181,181,190,michigan,2014-11-25,REG,Villanova (12),Big East,L,55,60,...,4,1,L 1,Barclays Center,19,10:15p,Villanova on November 25 10:00 PM,10:00 PM,N.Y.,2014-11-25 22:00:00
182,182,191,michigan,2014-11-29,REG,Nicholls State,Southland,W,91,62,...,5,1,W 1,Crisler Arena,19,4:00p,Nicholls State on November 29 4:00 PM,4:00 PM,MICH.,2014-11-29 16:00:00
183,183,192,michigan,2014-12-02,REG,Syracuse,ACC,W,68,65,...,6,1,W 2,Crisler Arena,17,7:30p,Syracuse on December 2 7:30 PM,7:30 PM,MICH.,2014-12-02 19:30:00
184,184,193,michigan,2014-12-06,REG,NJIT,Ind,L,70,72,...,6,2,L 1,Crisler Arena,17,12:00p,NJIT on December 6 12:00 PM,12:00 PM,MICH.,2014-12-06 12:00:00
185,185,194,michigan,2014-12-09,REG,Eastern Michigan,MAC,L,42,45,...,6,3,L 2,Crisler Arena,-,9:00p,Eastern Michigan on December 9 9:00 PM,9:00 PM,MICH.,2014-12-09 21:00:00
186,186,195,michigan,2014-12-13,REG,Arizona (3),Pac-12,L,53,80,...,6,4,L 3,McKale Center,-,5:15p,Arizona on December 13 3:25 PM,3:25 PM,ARIZ.,2014-12-13 17:25:00


In [101]:
df_basketball[-20:-1]

Unnamed: 0.1,index,Unnamed: 0,school,gamedate,type,opponent,conf,result,team_points,opp_points,...,w,l,streak,arena,rank,gametime,Game String,Start time,US State,start_dt
706,706,742,michigan-state,2019-01-24,REG,Iowa (19),Big Ten,W,82,67,...,18,2,W 13,Carver-Hawkeye Arena,6.0,7:00p,#19 IOWA,7:00 PM,IOWA,2019-01-24 19:00:00
707,707,743,michigan-state,2019-01-27,REG,Purdue,Big Ten,L,63,73,...,18,3,L 1,Mackey Arena,6.0,1:00p,PURDUE,1:00 PM,IND.,2019-01-27 13:00:00
708,708,744,michigan-state,2019-02-02,REG,Indiana,Big Ten,L,75,79,...,18,4,L 2,Breslin Events Center,6.0,6:00p,INDIANA,6:00 PM,MICH. BRESLIN CENTER,2019-02-02 18:00:00
709,709,745,michigan-state,2019-02-05,REG,Illinois,Big Ten,L,74,79,...,18,5,L 3,State Farm Center,9.0,7:00p,ILLINOIS,7:00 PM,ILL.,2019-02-05 19:00:00
710,710,746,michigan-state,2019-02-09,REG,Minnesota,Big Ten,W,79,55,...,19,5,W 1,Breslin Events Center,9.0,2:00p,MINNESOTA,2:00 PM,MICH. BRESLIN CENTER,2019-02-09 14:00:00
711,711,747,michigan-state,2019-02-12,REG,Wisconsin (20),Big Ten,W,67,59,...,20,5,W 2,Kohl Center,11.0,7:00p,#20 WISCONSIN,7:00 PM,WIS.,2019-02-12 19:00:00
712,712,748,michigan-state,2019-02-17,REG,Ohio State,Big Ten,W,62,44,...,21,5,W 3,Breslin Events Center,11.0,1:00p,OHIO STATE,1:00 PM,MICH. BRESLIN CENTER,2019-02-17 13:00:00
713,713,749,michigan-state,2019-02-20,REG,Rutgers,Big Ten,W,71,60,...,22,5,W 4,Breslin Events Center,10.0,6:30p,RUTGERS,6:30 PM,MICH. BRESLIN CENTER,2019-02-20 18:30:00
714,714,750,michigan-state,2019-02-24,REG,Michigan (7),Big Ten,W,77,70,...,23,5,W 5,Crisler Arena,10.0,3:45p,#7 MICHIGAN,3:45 PM,MICH.,2019-02-24 15:45:00
715,715,751,michigan-state,2019-03-02,REG,Indiana,Big Ten,L,62,63,...,23,6,L 1,Assembly Hall,6.0,12:00p,INDIANA,12:00 PM,IND.,2019-03-02 12:00:00


In [103]:
df_basketball = df_basketball.drop(['Start time', 'Unnamed: 0', 'index'], axis=1)

In [104]:
df_basketball

Unnamed: 0,school,gamedate,type,opponent,conf,result,team_points,opp_points,ot,w,l,streak,arena,rank,gametime,Game String,US State,start_dt
0,michigan,2009-11-14,REG,Northern Michigan,,W,97,50,,1,0,W 1,Crisler Arena,15,,Northern Michigan on November 14 7:00 PM,MICH.,2009-11-14 19:00:00
1,michigan,2009-11-20,REG,Houston Baptist,GWC,W,77,55,,2,0,W 2,Crisler Arena,15,,Houston Baptist on November 20 7:00 PM,MICH.,2009-11-20 19:00:00
2,michigan,2009-11-26,REG,Creighton,MVC,W,83,76,OT,3,0,W 3,The Milk House,15,,Creighton on November 26 12:00 PM,FLA.,2009-11-26 12:00:00
3,michigan,2009-11-27,REG,Marquette,Big East,L,65,79,,3,1,L 1,The Milk House,15,,Marquette on November 27 12:00 PM,FLA.,2009-11-27 12:00:00
4,michigan,2009-11-29,REG,Alabama,SEC,L,66,68,,3,2,L 2,The Milk House,15,,Alabama on November 29 5:00 PM,FLA.,2009-11-29 17:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
721,michigan-state,2019-03-21,NCAA,Bradley,MVC,W,76,65,,29,6,W 6,Wells Fargo Arena,,2:45p,BRADLEY,IOWA / WELLS FARGO ARENA,2019-03-21 14:45:00
722,michigan-state,2019-03-23,NCAA,Minnesota,Big Ten,W,70,50,,30,6,W 7,Wells Fargo Arena,,7:45p,MINNESOTA,IOWA / WELLS FARGO ARENA,2019-03-23 19:45:00
723,michigan-state,2019-03-29,NCAA,Louisiana State (12),SEC,W,80,63,,31,6,W 8,Capital One Arena,,7:09p,#12 LSU,D.C. / CAPITAL ONE ARENA,2019-03-29 19:00:00
724,michigan-state,2019-03-31,NCAA,Duke (1),ACC,W,68,67,,32,6,W 9,Capital One Arena,,5:05p,#1 DUKE,D.C. / CAPITAL ONE ARENA,2019-03-31 17:05:00


In [105]:
df_basketball.to_csv(r'C:\Users\RJ\Downloads\df_basketball.csv', sep=',')