In [13]:
%matplotlib inline 

import requests
from bs4 import BeautifulSoup
from bs4 import NavigableString
import pandas as pd
import StringIO
import urllib
from datetime import date, datetime, timedelta
import itertools
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.patches as mpatches
from scipy.stats.stats import pearsonr
import math
import sys
from sentiment import Classifier

from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.cross_validation import cross_val_score

plt.style.use('ggplot')

In [14]:
# Build the sentiment classifier once; later cells call classifier.classify(text)
# on individual tweets.
classifier = Classifier()

In [15]:
# Small helper for pulling one number out of a dash-separated string
def split(x, num):
    """Return field ``num`` of the '-'-separated string ``x`` as an int.

    x: string whose fields are separated by '-' (for example '45-50')
    num: index of the field to convert
    """
    pieces = x.split('-')
    return int(pieces[num])

# Write a function that scrapes the team trend graph for each game
def ncaatrendgraph(gameid):
    """Scrape the ESPN play-by-play page of one game into a DataFrame.

    Inputs:
    gameid: ESPN game id; it is passed through str() and appended to the
            play-by-play URL

    Outputs:
    DataFrame with one row per scraped play-by-play table row and the columns
    'Minutes' and 'Seconds' (elapsed game time), 'Team', 'Event', one score
    column per team (named after the away and home team as they appear on the
    page) and 'PercDone' (elapsed share of the game, in percent).
    """

    # Scrape the reference team page. The parser consumes the whole response
    # while building the soup, so the connection can be closed right after.
    url = 'http://espn.go.com/mens-college-basketball/playbyplay?gameId=' + str(gameid)
    html = urllib.urlopen(url)
    try:
        soup = BeautifulSoup(html, "html.parser")
    finally:
        html.close()

    # Identify which team is which from the basic page. The attribute filters
    # are real dicts: a set literal only works because bs4 treats any non-dict
    # as a list of acceptable class names.
    teams = soup.find_all('div', {'class': "team-container"})

    away = str(teams[0].find('span', {'class': 'long-name'}).contents[0]).split(';')[0]
    home = str(teams[1].find('span', {'class': 'long-name'}).contents[0]).split(';')[0]

    # Run through the tables and collect the events of every period.
    # The first and the last table on the page are skipped.
    tables = soup.find_all('table')
    tps = len(tables) - 1
    qes = []
    for num in range(1, tps):
        for item in tables[num].find_all('tr')[1:]:
            event = []
            for td in item.find_all('td'):
                if len(td.contents) == 0:
                    continue
                cell = td.contents[0]
                if 'img' in str(cell):
                    # Logo cell: the image file name (without extension) is the team code
                    event.append(str(cell['src']).split('/')[-1].split('.')[0].upper())
                elif ':' in str(cell):
                    # Clock cell: convert the time left in the period into time elapsed
                    clock = cell.split(':')
                    if num > 2:
                        # Tables past the second are 5-minute overtime periods
                        minutes = 4 - int(clock[0]) + 40 + ((num - 3) * 5)
                    else:
                        minutes = 19 - int(clock[0]) + ((num - 1) * 20)
                    seconds = 60 - int(clock[1])

                    # Make an adjustment for exact minute calculations
                    if seconds == 60:
                        minutes = minutes + 1
                        seconds = 0

                    event.append(minutes)
                    event.append(seconds)
                else:
                    event.append(str(cell))
            qes.append(event)

    # Make this data into a Dataframe and split 'away-home' scores per team
    bsd = pd.DataFrame(qes, columns=['Minutes', 'Seconds', 'Team', 'Event', 'Score'])
    bsd[away] = bsd['Score'].apply(lambda x: split(x, 0))
    bsd[home] = bsd['Score'].apply(lambda x: split(x, 1))
    bsd = bsd.drop('Score', axis=1)

    # Convert the minutes and seconds to a percentage of the full game:
    # 40 regulation minutes plus 5 for every period table beyond the first two
    numminutes = 40 + 5 * (tps - 3)
    lengame = numminutes * 60.0
    bsd['PercDone'] = [float(100 * round((mins * 60 + sec) / lengame, 4))
                       for (mins, sec) in zip(bsd['Minutes'], bsd['Seconds'])]
    return bsd

In [16]:
# Write a function that processes the events that happen during this period
def _clock_to_percent(moment, begin, htbegin, htend, end):
    """Map a wall-clock datetime onto the 0-100 'PercDone' scale.

    Each half covers 50 points; any moment inside the halftime break
    (htbegin..htend) is pinned to 50 because the game clock is stopped.
    """
    if moment <= htbegin:
        first_half = (htbegin - begin).total_seconds()
        return round(50.0 * (moment - begin).total_seconds() / first_half, 2)
    if moment <= htend:
        return 50.0
    second_half = (end - htend).total_seconds()
    return round(50.0 * (moment - htend).total_seconds() / second_half, 2) + 50


def gamevents(gamedata, start, finish, begin, end):
    """Return the game events that fall inside a wall-clock time window.

    Gamedata: The dataframe of game events; must contain a 'PercDone' column
    Start: The time at which you'd like to start looking (%H:%M, ex. 7:20 (EST))
    Finish: The time at which you'd like to stop looking (see above)
    Begin: datetime at which the game started
    End: datetime at which the game ended

    Returns the rows of gamedata whose 'PercDone' lies strictly between the
    percentages that Start and Finish map to.
    """
    start_conv = datetime.strptime(start, '%H:%M')
    finish_conv = datetime.strptime(finish, '%H:%M')

    # Make quick sanity checks: clamp the window to the game itself
    if start_conv < begin:
        start_conv = begin
    if finish_conv > end:
        finish_conv = end

    # Halftime is modelled as a fixed 20-minute break centred on the midpoint
    # between begin and end
    lenhalf = 20 * 60
    midpoint = (end - begin).seconds // 2
    htbegin = begin + timedelta(0, midpoint - lenhalf // 2)
    htend = begin + timedelta(0, midpoint + lenhalf // 2)

    # Convert each endpoint on its own, so a window may start in one half and
    # end in the other
    perc1 = _clock_to_percent(start_conv, begin, htbegin, htend, end)
    perc2 = _clock_to_percent(finish_conv, begin, htbegin, htend, end)

    # Grab the data from the gamedata
    return gamedata[(gamedata['PercDone'] > perc1) & (gamedata['PercDone'] < perc2)]

In [17]:
"""
Large function that will return to you the relevant game events and then a list of tweets (with their times) 
from the time period that you specify. Currently works by submitting actual times.

Inputs:
gp1: The first game participant
gp2: The second game participant
tp1: Beginning of time period (%H:%M, ex. 7:20), measured by EST
tp2: End of time period

Outputs:
dt1: Dataframe containing all the game events from that period 
dt2: Dataframe containing the tweets from that period
"""

def databytime(gp1, gp2, tp1, tp2):
    
    # Get the important metadata and define a useful constant
    gmdat = pd.read_csv("GameMetadata.csv")
    metadat = gmdat[((gmdat['Team1'] == gp1) | (gmdat['Team1'] == gp2)) & ((gmdat['Team2'] == gp1) | (gmdat['Team2'] == gp2))]
    eid = metadat['espn_id'].iloc[0]
    begin, end = datetime.strptime(metadat['Start'].iloc[0], '%H:%M'), datetime.strptime(metadat['End'].iloc[0], '%H:%M')
    lenhalf = 20*60
    
    # Grab the game data for the time period
    gdata = ncaatrendgraph(eid)
    dt1 = gamevents(gdata, tp1, tp2, begin, end)
    
    # Grab the relevant data from the twitter file
    fname = '../separated/' + metadat['Filename'].iloc[0]
    tweets =  pd.read_csv(fname)
    tweets['time_chg'] = tweets['time'].apply(lambda x: x.split(' ')[3])
    
    indices = []
    for num in range(0, len(tweets['time_chg'])):
        tweetstamp = datetime.strptime(tweets['time_chg'].iloc[num], "%H:%M:%S") - timedelta(hours=4)
        if (tweetstamp > begin) and (tweetstamp < end):
            if (tweetstamp > datetime.strptime(tp1, '%H:%M')) and (tweetstamp < datetime.strptime(tp2, '%H:%M')):
                indices.append(num)
    dt2 = tweets[tweets.index.isin(indices)]
    
    return dt1, dt2

In [18]:
"""
Large function that will return to you the relevant game events and then a list of tweets (with their times) 
from the time period that you specify. Currently works by submitting actual times.

Inputs:
gp1: The first game participant
gp2: The second game participant
gtp1: Beginning of game time period (20:00 -- Halftime, 35:00 -- After halftime)
gtp2: End of game time period

Outputs:
dt1: Dataframe containing all the game events from that period 
dt2: Dataframe containing the tweets from that period
"""
def databygametime(gp1, gp2, gtp1, gtp2):
    
    # Get the important metadata and define a useful constant
    gmdat = pd.read_csv("GameMetadata.csv")
    metadat = gmdat[((gmdat['Team1'] == gp1) | (gmdat['Team1'] == gp2)) & ((gmdat['Team2'] == gp1) | (gmdat['Team2'] == gp2))]
    eid = metadat['espn_id'].iloc[0]
    begin, end = datetime.strptime(metadat['Start'].iloc[0], '%H:%M'), datetime.strptime(metadat['End'].iloc[0], '%H:%M')
    lenhalf = 20*60
    htbegin = begin + timedelta(0, (end-begin).seconds/2 - lenhalf/2)
    htend = begin + timedelta(0,(end-begin).seconds/2 + lenhalf/2)
    
    # Grab the initial data by mapping gametimes to parts of the game
    fullgamedat = ncaatrendgraph(eid)
    bmin, bsec = gtp1.split(":")
    emin, esec = gtp2.split(":")
    btot = int(bmin)*60 + int(bsec)
    etot = int(emin)*60 + int(bsec)
    maxmin = np.max(fullgamedat['Minutes'])*60
    bperc, eperc = (100.0*btot)/maxmin, (100.0*etot)/maxmin
    dt1 = fullgamedat[(fullgamedat['PercDone'] >= bperc) & (fullgamedat['PercDone'] <= eperc)]
    
    # Use the percentages of the game to map gametimes to actual times
    acttimes = []
    for item in [bperc, eperc]:
        hperc = (item*2)/100
        if item <= 50.0:
            secdelt = round((htbegin-begin).seconds*hperc,0)
            acttimes.append(secdelt)
        elif item > 50.0:
            secdelt = round((end-htend).seconds*(hperc-1.0) + (htend-begin).seconds,0)
            acttimes.append(secdelt)
    
    # Convert the time to usable timedelt objects
    tp1, tp2 = begin + timedelta(0,seconds = acttimes[0]), begin + timedelta(0,seconds = acttimes[1])
    tp1, tp2 = str(tp1).split(" ")[1], str(tp2).split(" ")[1]
    
    # Get the relevant Tweets over that time period using identical code to before
    fname = '../separated/'+metadat['Filename'].iloc[0]
    tweets =  pd.read_csv(fname)
    tweets['time_chg'] = tweets['time'].apply(lambda x: x.split(' ')[3])
    
    indices = []
    for num in range(0, len(tweets['time_chg'])):
        tweetstamp = datetime.strptime(tweets['time_chg'].iloc[num], "%H:%M:%S") - timedelta(hours=4)
        if (tweetstamp > begin) and (tweetstamp < end):
            if (tweetstamp > datetime.strptime(tp1, '%H:%M:%S')) and (tweetstamp < datetime.strptime(tp2, '%H:%M:%S')):
                indices.append(num)
    dt2 = tweets[tweets.index.isin(indices)]
    
    return dt1, dt2

In [38]:
# Events (samp1) and tweets (samp2) for Virginia vs. Syracuse, selected by
# elapsed game clock from 22:10 to 40:00
samp1, samp2 = databygametime(gp1='Virginia', gp2='Syracuse', gtp1='22:10', gtp2='40:00')

In [20]:
# Events (samp3) and tweets (samp4) for the same game, selected by wall-clock
# time from 18:10 to 18:20
samp3, samp4 = databytime(gp1='Virginia', gp2='Syracuse', tp1='18:10', tp2='18:20')

In [21]:
# Spot-check the classifier on one tweet from the wall-clock sample
# (positional row 120 of samp4)
tw = samp4['text'].iloc[120]
classifier.classify(tw)

'neutral'

In [12]:
# Smoke-test the classifier on a hand-written string; the cell displays the returned label
classifier.classify("go lions yay team")

'neutral'

In [39]:
# Classify every tweet in the game-clock sample and print only the labels
# that are neither 'neutral' nor 'irrelevant'. Iterating the column directly
# yields the tweet text, so no (index, value) tuple has to be unpacked and
# the deprecated Series.iteritems() is not needed.
for tw in samp2['text']:
    res = classifier.classify(tw)
    if res != 'neutral' and res != 'irrelevant':
        print(res)

negative
negative
negative
positive
negative
positive
negative
positive
negative
negative
positive
negative
negative
negative
negative
positive
positive
negative
positive
negative
negative
negative
negative
positive
negative
negative
negative
negative
negative
positive
negative
positive
positive
negative
negative
negative
negative
negative
positive
negative
positive
negative
negative
positive
negative
negative
negative
negative
positive
negative
negative
positive
negative
positive
negative
negative
positive
positive
negative
negative
positive
positive
positive
positive
positive
positive
negative
positive
negative
positive
positive
positive
negative
negative
positive
negative
positive
positive
negative
positive
positive
positive
positive
positive
negative
positive
positive
positive
