In [5]:
# Box Score Scraper
# David Freed and Samuel Green
# February 29, 2016

In [14]:
%matplotlib inline 

import requests
from bs4 import BeautifulSoup
from bs4 import NavigableString
import pandas as pd
import StringIO
import urllib
from datetime import date, datetime
import itertools
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.patches as mpatches
from scipy.stats.stats import pearsonr
import math

plt.style.use('ggplot')

In [7]:
# Small helper: return one side of a hyphen-separated score string (e.g. '98-101') as an int
def split(x, num):
    """Return field number `num` of the hyphen-separated string `x` as an int.

    `x` is split on '-' and the piece at index `num` is converted with
    `int()`, e.g. split('98-101', 1) gives 101.
    """
    pieces = x.split('-')
    chosen = pieces[num]
    return int(chosen)

In [85]:
# Write a function that scrapes the team trend graph for each game
# Find game id by going to box scores and taking the end of the url
def trendgraph(gameid):
    """Scrape an ESPN play-by-play page and plot both teams' scores over time.

    Parameters
    ----------
    gameid : int or str
        Game id appended (via ``str(gameid)``) to the play-by-play URL.

    Returns
    -------
    None
        The result is a matplotlib figure displayed with ``plt.show()``:
        the away team's score in red and the home team's score in blue,
        plotted against the percentage of a 48-minute game completed.

    Notes
    -----
    Exactly four tables of the page are read (indices 4 down to 1), one per
    12-minute quarter.
    NOTE(review): overtime periods are presumably not handled -- confirm
    against a page for an overtime game.
    """

    # Scrape the play-by-play page for this game
    url = 'http://espn.go.com/nba/playbyplay?gameId=' + str(gameid)
    html = urllib.urlopen(url)
    try:
        soup = BeautifulSoup(html, "html.parser")
    finally:
        # The page is fully parsed at this point, so release the connection
        html.close()

    # Identify which team is which from the basic page.
    # attrs must be a dict mapping attribute name -> value (not a set literal).
    teams = soup.find_all('div', {'class': "team-container"})

    # Team codes are taken from the logo image file name, upper-cased
    away = str(teams[0].contents[0].find('img')['src'].split('/')[-1].split('.')[0]).upper()
    home = str(teams[1].contents[0].find('img')['src'].split('/')[-1].split('.')[0]).upper()

    # Look the tables up once instead of re-searching the page every quarter
    tables = soup.find_all('table')

    # Run through each quarter's table and collect every event row
    qes = []
    for num in range(1, 5):
        # Quarter `num` is read from table index 5-num; the first row is
        # skipped and the rest are walked in reverse order
        quarter_rows = tables[5 - num].find_all('tr')[1:]
        for item in reversed(quarter_rows):
            event = []
            for td in item.find_all('td'):
                if len(td.contents) == 0:
                    continue

                cell = td.contents[0]
                if 'img' in str(cell):
                    # Logo cell: record the team code from the image file name
                    event.append(str(cell['src']).split('/')[-1].split('.')[0].upper())
                elif ':' in str(cell):
                    # Clock cell 'M:SS' counts down within the quarter;
                    # convert it to minutes/seconds elapsed in the game
                    clock = cell.split(':')
                    minutes = 11 - int(clock[0]) + ((num - 1) * 12)
                    seconds = 60 - int(clock[1])

                    # Make an adjustment for exact minute calculations
                    if seconds == 60:
                        minutes = minutes + 1
                        seconds = 0

                    event.append(minutes)
                    event.append(seconds)
                else:
                    event.append(str(cell))
            qes.append(event)

    # Make this data into a DataFrame
    bsd = pd.DataFrame(qes, columns=['Minutes', 'Seconds', 'Team', 'Event', 'Score'])

    # 'Score' is split on '-': first number goes to the away column,
    # second number to the home column
    bsd[away] = bsd['Score'].apply(lambda x: split(x, 0))
    bsd[home] = bsd['Score'].apply(lambda x: split(x, 1))
    # axis is passed by keyword; newer pandas no longer accepts it positionally
    bsd = bsd.drop('Score', axis=1)

    # Convert the elapsed minutes and seconds to a percentage of the game
    lengame = 48 * 60.0
    bsd['PercDone'] = [float(100 * round((mins * 60 + sec) / lengame, 4))
                       for (mins, sec) in zip(bsd['Minutes'], bsd['Seconds'])]

    # Make a quick line graph of the scores over time
    plt.figure(figsize=(14, 14))
    plt.title('Game Trends: ' + away + ' at ' + home)
    plt.xlabel('Percent of Game Completed')
    plt.ylabel('Points')
    plt.plot(list(bsd['PercDone']), list(bsd[away]), c='r')
    plt.plot(list(bsd['PercDone']), list(bsd[home]), c='b')

    # Plot quarter boundaries
    for boundary in (25, 50, 75):
        plt.axvline(x=boundary, linewidth=0.5, color='gray')

    red_patch = mpatches.Patch(color='red', label=away)
    blue_patch = mpatches.Patch(color='blue', label=home)
    plt.legend(handles=[red_patch, blue_patch], loc=2)
    plt.grid()
    plt.show()