In [87]:
%matplotlib inline 

import requests
from bs4 import BeautifulSoup
from bs4 import NavigableString
import pandas as pd
import StringIO
import urllib
from datetime import date, datetime, timedelta
import itertools
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.patches as mpatches
from scipy.stats.stats import pearsonr
import math
import sys

from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.cross_validation import cross_val_score

plt.style.use('ggplot')

In [243]:
# Small helper used with Series.apply: pick one integer out of an "A-B" string
def split(x, num):
    """Return piece number ``num`` of the dash-separated string ``x`` as an int."""
    pieces = x.split('-')
    return int(pieces[num])

# Convert a countdown clock reading within a period into elapsed game time
def _elapsed_from_clock(clock, period):
    """Translate a countdown clock into elapsed (minutes, seconds) of game time.

    clock:  'M:SS' text taken from a play-by-play row (time left in the period)
    period: 1-based index of the period table; 1 and 2 are treated as
            20-minute halves, 3 and above as 5-minute overtime periods
    """
    pieces = clock.split(':')
    clock_min, clock_sec = int(pieces[0]), int(pieces[1])

    if period > 2:
        minutes = 4 - clock_min + 40 + ((period - 3) * 5)
    else:
        minutes = 19 - clock_min + ((period - 1) * 20)
    seconds = 60 - clock_sec

    # A clock reading of exactly M:00 rolls over to the next full minute
    if seconds == 60:
        minutes = minutes + 1
        seconds = 0
    return minutes, seconds


# Write a function that scrapes the team trend graph for each game
def ncaatrendgraph(gameid):
    """Scrape the ESPN play-by-play page for one game into a DataFrame.

    Inputs:
    gameid: The ESPN game id (anything ``str()`` can render into the URL)

    Outputs:
    A DataFrame with one row per play-by-play table row and the columns
    'Minutes' and 'Seconds' (elapsed game time), 'Team' (upper-cased file stem
    of the logo image in that row), 'Event' (row text), one running-score
    column per team named after the away and home team, and 'PercDone'
    (elapsed time as a percentage of the total game length, overtime included).

    Requires network access. The page layout is assumed to be: first and last
    <table> are not period tables, every table in between is one period.
    """
    # Scrape the reference team page, making sure the connection is released
    url = 'http://espn.go.com/mens-college-basketball/playbyplay?gameId=' + str(gameid)
    html = urllib.urlopen(url)
    try:
        soup = BeautifulSoup(html, "html.parser")
    finally:
        html.close()

    # Identify which team is which from the basic page
    # (attrs must be a dict; a set literal only matched by accident)
    teams = soup.find_all('div', {'class': 'team-container'})
    away = str(teams[0].find('span', {'class': 'long-name'}).contents[0]).split(';')[0]
    home = str(teams[1].find('span', {'class': 'long-name'}).contents[0]).split(';')[0]

    # Run through the period tables and collect every event row
    tables = soup.find_all('table')   # parsed once instead of once per loop pass
    tps = len(tables) - 1
    qes = []
    for num in range(1, tps):
        for item in tables[num].find_all('tr')[1:]:
            event = []
            for td in item.find_all('td'):
                if len(td.contents) == 0:
                    continue
                first = td.contents[0]
                if 'img' in str(first):
                    # Logo cell: keep the image file stem as the team identifier
                    event.append(str(first['src']).split('/')[-1].split('.')[0].upper())
                elif ':' in str(first):
                    # Clock cell: store elapsed minutes and seconds as two fields
                    minutes, seconds = _elapsed_from_clock(first, num)
                    event.append(minutes)
                    event.append(seconds)
                else:
                    event.append(str(first))
            qes.append(event)

    # Make this data into a Dataframe and split the "away-home" score text
    bsd = pd.DataFrame(qes, columns=['Minutes', 'Seconds', 'Team', 'Event', 'Score'])
    bsd[away] = bsd['Score'].apply(lambda x: split(x, 0))
    bsd[home] = bsd['Score'].apply(lambda x: split(x, 1))
    bsd = bsd.drop('Score', axis=1)

    # Convert the minutes and seconds to a percentage of the whole game:
    # 40 regulation minutes plus 5 for every period table beyond the two halves
    numminutes = 40 + 5 * (tps - 3)
    lengame = numminutes * 60.0
    bsd['PercDone'] = [float(100 * round((mins * 60 + sec) / lengame, 4))
                       for (mins, sec) in zip(bsd['Minutes'], bsd['Seconds'])]
    return bsd

In [244]:
# Write a function that processes the events that happen during this period
def gamevents(gamedata, start, finish, begin, end, lenhalf=20*60):
    """
    Return the game events that happened inside a wall-clock time window.

    Wall-clock times are mapped onto the 0-100 'PercDone' scale by assuming two
    equally long halves separated by a halftime break of ``lenhalf`` seconds
    centred between ``begin`` and ``end``. Times that fall inside the break
    map to 50.

    Inputs:
    gamedata: The dataframe of game events; must contain a 'PercDone' column
    start: The time at which you'd like to start looking (%H:%M, ex. 7:20)
    finish: The time at which you'd like to stop looking (see above)
    begin: datetime at which the game started (parsed with the same '%H:%M'
           format, so it shares its date with start/finish)
    end: datetime at which the game ended (see above)
    lenhalf: Length of the halftime break in seconds (default 20 minutes)

    Outputs:
    The rows of gamedata whose 'PercDone' lies strictly between the
    percentages corresponding to start and finish.

    Raises:
    ValueError if the begin-end span is not longer than the halftime break.
    """
    start_conv = datetime.strptime(start, '%H:%M')
    finish_conv = datetime.strptime(finish, '%H:%M')

    # Make quick sanity checks: clamp the window to the span of the game
    if start_conv < begin:
        start_conv = begin
    if finish_conv > end:
        finish_conv = end

    # Locate halftime; total_seconds() is used because .seconds wraps around
    # for negative differences
    game_secs = (end - begin).total_seconds()
    if game_secs <= lenhalf:
        raise ValueError("the begin-end span must be longer than the halftime break")
    htbegin = begin + timedelta(seconds=(game_secs - lenhalf) / 2.0)
    htend = begin + timedelta(seconds=(game_secs + lenhalf) / 2.0)
    first_half_secs = (htbegin - begin).total_seconds()
    second_half_secs = (end - htend).total_seconds()

    def to_percent(moment):
        # Each endpoint is placed on its own side of halftime, so a window
        # that straddles the break is handled correctly
        if moment <= htbegin:
            return round(50.0 * (moment - begin).total_seconds() / first_half_secs, 2)
        if moment <= htend:
            return 50.0
        return round(50.0 + 50.0 * (moment - htend).total_seconds() / second_half_secs, 2)

    perc1, perc2 = to_percent(start_conv), to_percent(finish_conv)

    # Grab the data from the gamedata argument (not from a notebook global)
    return gamedata[(gamedata['PercDone'] > perc1) & (gamedata['PercDone'] < perc2)]

In [245]:
def databytime(gp1, gp2, tp1, tp2):
    """
    Return the relevant game events and the tweets (with their times) from
    the time period that you specify. Currently works by submitting actual times.

    Inputs:
    gp1: The first game participant
    gp2: The second game participant
    tp1: Beginning of time period (%H:%M, ex. 7:20), measured by EST
    tp2: End of time period

    Outputs:
    dt1: Dataframe containing all the game events from that period
    dt2: Dataframe containing the tweets from that period

    Reads "GameMetadata.csv" (columns Team1, Team2, espn_id, Start, End,
    Filename) and the tweet CSV named in its Filename column (column 'time').
    Tweet times are shifted back by a fixed 4 hours before being compared
    (presumably UTC to US Eastern daylight time -- confirm for other dates).

    Raises:
    IndexError if the metadata file has no game between gp1 and gp2.
    """
    # Get the important metadata for this matchup (teams may be in either order)
    gmdat = pd.read_csv("GameMetadata.csv")
    pair = [gp1, gp2]
    metadat = gmdat[gmdat['Team1'].isin(pair) & gmdat['Team2'].isin(pair)]
    if len(metadat) == 0:
        raise IndexError("no game between %r and %r in GameMetadata.csv" % (gp1, gp2))
    eid = metadat['espn_id'].iloc[0]
    begin = datetime.strptime(metadat['Start'].iloc[0], '%H:%M')
    end = datetime.strptime(metadat['End'].iloc[0], '%H:%M')

    # Grab the game data for the time period
    gdata = ncaatrendgraph(eid)
    dt1 = gamevents(gdata, tp1, tp2, begin, end)

    # Grab the relevant data from the twitter file
    fname = metadat['Filename'].iloc[0]
    tweets = pd.read_csv(fname)
    tweets['time_chg'] = tweets['time'].apply(lambda x: x.split(' ')[3])

    # A tweet must fall inside both the game and the requested window; the
    # bounds are parsed once here rather than once per tweet
    window_start = max(begin, datetime.strptime(tp1, '%H:%M'))
    window_end = min(end, datetime.strptime(tp2, '%H:%M'))

    def in_window(clock_text):
        shifted = datetime.strptime(clock_text, "%H:%M:%S") - timedelta(hours=4)
        # The shift can roll a time-only stamp back onto the previous day
        # (e.g. 00:10 -> 20:10 the day before); re-anchor it on the game's
        # date so late-evening tweets are not silently dropped
        local = datetime.combine(begin.date(), shifted.time())
        return (local > window_start) and (local < window_end)

    # Boolean mask aligned on the index; astype keeps it boolean when empty
    keep = tweets['time_chg'].apply(in_window).astype(bool)
    dt2 = tweets[keep]

    return dt1, dt2

In [246]:
# Game events and tweets for the 18:10-18:20 window of the Virginia-Syracuse game
samp1, samp2 = databytime('Virginia', 'Syracuse', '18:10', '18:20')

In [247]:
# Display the game events returned for the sample window
samp1

Unnamed: 0,Minutes,Seconds,Team,Event,Syracuse,Virginia,PercDone
1,0,24,183,Trevor Cooney missed Three Point Jumper.,0,0,1.0
2,0,24,183,Malachi Richardson Offensive Rebound.,0,0,1.0
3,0,31,183,Malachi Richardson Turnover.,0,0,1.29
4,0,45,258,Malcolm Brogdon missed Three Point Jumper.,0,0,1.87
5,0,45,183,Malachi Richardson Defensive Rebound.,0,0,1.87
6,1,9,183,Michael Gbinije missed Jumper.,0,0,2.88
7,1,9,258,Malcolm Brogdon Block.,0,0,2.88
8,1,9,258,London Perrantes Defensive Rebound.,0,0,2.88
9,1,22,258,London Perrantes missed Three Point Jumper.,0,0,3.42
10,1,22,183,Michael Gbinije Defensive Rebound.,0,0,3.42


In [248]:
# Display the tweets returned for the sample window
samp2

Unnamed: 0,text,time,time_chg
1999,RT @fullcourtprez: Time to watch 2-3 zone for ...,Sun Mar 27 22:10:01 +0000 2016,22:10:01
2000,Malachi scored 23 points in the regular season...,Sun Mar 27 22:10:01 +0000 2016,22:10:01
2001,Almost time for tip off between #UVA and #Syra...,Sun Mar 27 22:10:03 +0000 2016,22:10:03
2002,#UVA looking for first Final 4 since this guy ...,Sun Mar 27 22:10:03 +0000 2016,22:10:03
2003,We are set here in #Chicago!!!! #Syracuse #Vir...,Sun Mar 27 22:10:04 +0000 2016,22:10:04
2004,Go cavaliers go #UVA,Sun Mar 27 22:10:06 +0000 2016,22:10:06
2005,Syracuse Horns #1 - FastModel Sports https://t...,Sun Mar 27 22:10:06 +0000 2016,22:10:06
2006,#GoHoos,Sun Mar 27 22:10:08 +0000 2016,22:10:08
2007,.@BrentAxeMedia I like Boehiem's suit tonight....,Sun Mar 27 22:10:09 +0000 2016,22:10:09
2008,RT @Cuse_MBB: Halftime stats\r\n#CuseMode #Mar...,Sun Mar 27 22:10:11 +0000 2016,22:10:11
