# Scraping script for worldfootball.net

In [13]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import time
# Pandas display options: wider text output, up to 100 columns, HTML table rendering.
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
# Seaborn plot styling: white grid background, "poster" context.
sns.set_style("whitegrid")
sns.set_context("poster")
from pyquery import PyQuery as pq
from bs4 import BeautifulSoup
import requests
import re

### Defining the list of teams for BPL 12-13

In [75]:
# Team slugs as they appear in the worldfootball.net report URLs, one per line.
teams = [
    'arsenal-fc',
    'aston-villa',
    'chelsea-fc',
    'everton-fc',
    'fulham-fc',
    'liverpool-fc',
    'manchester-city',
    'manchester-united',
    'newcastle-united',
    'norwich-city',
    'queens-park-rangers',
    'reading-fc',
    'southampton-fc',
    'stoke-city',
    'sunderland-afc',
    'swansea-city',
    'tottenham-hotspur',
    'west-bromwich-albion',
    'west-ham-united',
    'wigan-athletic',
]

In [5]:
# Sanity check: expect 20 teams.
len(teams)

20

### Defining the scraping function:

This function takes as its argument a BeautifulSoup-parsed HTML page (e.g. BeautifulSoup(test,"html.parser")).

It returns a dictionary with keys away and home, whose values are lists of triples and quadruples of the form:

$\star$(Goal scorer, assist player, time of goal, type of goal) if there is an assist player

$\star$(Goal scorer, time of goal, type of goal) if there isn't an assist player

The type of goal is an integer equal to the scoring team's goal difference immediately after the goal:

$\bullet$ if type = +1: goal is a tie breaker (e.g 1-0,0-1, 2-1, 1-2...)

$\bullet$ if type = 0: goal is an equaliser (e.g 1-1,2-2,..)

$\bullet$ if type = -1: goal is a score reducer (e.g 0-2 to 1-2)

$\bullet$ if type = +/-k for k>1: goal is a score increaser/reducer (e.g 0-1 to 0-2 or 3-0 to 3-1)

In [71]:
def summary(page):
    """Extract goal scorers (and assist providers) from a parsed match report.

    Parameters
    ----------
    page : BeautifulSoup
        Parsed HTML of a match report page. Only ``find_all`` on the page and
        ``find`` / ``find_all`` / ``has_attr`` / ``get_text`` on the returned
        tags are used.

    Returns
    -------
    dict
        ``{'home': [...], 'away': [...]}``. Each goal is a tuple
        ``(scorer, assist, minute, goal_type)`` when the goal cell links a
        second player, and ``(scorer, minute, goal_type)`` otherwise.
        ``minute`` is the first integer found in the cell text. ``goal_type``
        is the scoring team's goal difference right after the goal: ``+1``
        takes the lead, ``0`` equalises, ``-1`` pulls one back, and ``+/-k``
        extends or reduces a larger margin. Both lists are empty when the
        numbers in the final result sum to zero.

    Raises
    ------
    IndexError
        If the expected tables are missing, or a goal cell contains no link
        or no number.
    """
    tables = page.find_all("table", attrs={'class': "standard_tabelle", 'cellpadding': '3'})
    number = r'\b\d+\b'

    # The first table carries the final result; with no goals there are no scorers to scrape.
    result_text = tables[0].find("div", attrs={'class': "resultat"}).get_text()
    finalscore = [int(n) for n in re.findall(number, result_text)]

    home = []
    away = []

    # Return dictionary with empty home and away scorer lists:
    if sum(finalscore) == 0:
        return dict([('home', home), ('away', away)])

    # Running goal difference from the home team's point of view.
    score = 0
    for td in tables[1].find_all('td'):
        # Skip the "Goals" heading (it carries an align attribute) and any cell without a class.
        if td.has_attr('align') or not td.has_attr('class'):
            continue

        # Away goal cells are the ones carrying a style attribute.
        # (The original tested an undefined name `i` here instead of `td`.)
        is_away = td.has_attr('style')
        score += -1 if is_away else 1

        links = td.find_all("a")
        scorer = links[0].get_text()
        # Index the findall() result directly: map(...)[0] only works where map returns a list.
        minute = int(re.findall(number, td.get_text())[0])
        # Negate for the away side so that, e.g., an away go-ahead goal is still reported as +1.
        goal_type = -score if is_away else score

        # A second link in the cell is the player assisting on the goal.
        if len(links) > 1:
            goal = (scorer, links[1].get_text(), minute, goal_type)
        else:
            goal = (scorer, minute, goal_type)

        if is_away:
            away.append(goal)
        else:
            home.append(goal)

    return dict([('home', home), ('away', away)])

### Testing the function:

In [51]:
# Fetch one match report to try the scraper on; the timeout stops a stalled
# connection from blocking the notebook indefinitely.
test = requests.get("http://www.worldfootball.net/report/premier-league-2012-2013-norwich-city-aston-villa/", timeout=30).text

In [52]:
# Parse the raw HTML; note that `test` is rebound from the HTML string to the parsed page.
test = BeautifulSoup(test,"html.parser")

##### YAY!!!!!

In [54]:
# Run the scraper on the parsed Norwich City vs Aston Villa report.
summary(test)

{'away': [(u'Gabriel Agbonlahor', u'Ashley Westwood', 55, 1),
  (u'Gabriel Agbonlahor', u'Ashley Westwood', 89, 1)],
 'home': [(u'Grant Holt', 74, 0)]}

### Now the entire season 12-13:

We first create the list of games using the format of the url needed:

In [79]:
# Every ordered (home, away) pairing of two different teams, joined the way the
# report URLs expect: "<home>-<away>".
games = [host + "-" + guest for host in teams for guest in teams if host != guest]

We then scrape the website and create a dictionary to store the webpages per game:

In [80]:
# Download and parse each match report in a single pass, so the raw HTML of
# every game is not kept around before parsing. The timeout keeps one stalled
# connection from hanging the whole scrape.
games_page = {}
for game in games:
    response = requests.get("http://www.worldfootball.net/report/premier-league-2012-2013-%s" % game,
                            timeout=30)
    games_page[game] = BeautifulSoup(response.text, "html.parser")

In [81]:
#sanity check: 20 teams give 20*19 = 380 ordered home/away pairings
len(games_page)

380

In [82]:
#sanity check for pages that lack the match-report tables summary() relies on
#(a parsed page never compares equal to [], so `a == []` could not flag anything)
any(not a.find_all("table",attrs={'class': "standard_tabelle",'cellpadding': '3'}) for a in games_page.values())

False

We now extract the relevant information from each page and create a new dictionary with games as keys and the output of summary for values:

In [83]:
# Run the scraper over every parsed report, printing each game as it is processed.
games_scorers_assists = {}
for game, parsed_page in games_page.items():
    print(game)
    games_scorers_assists[game] = summary(parsed_page)

sunderland-afc-manchester-city
west-ham-united-aston-villa
norwich-city-arsenal-fc
reading-fc-arsenal-fc
norwich-city-tottenham-hotspur
tottenham-hotspur-southampton-fc
newcastle-united-reading-fc
fulham-fc-liverpool-fc
west-ham-united-norwich-city
aston-villa-manchester-united
everton-fc-manchester-city
manchester-united-norwich-city
swansea-city-arsenal-fc
everton-fc-fulham-fc
arsenal-fc-west-ham-united
tottenham-hotspur-wigan-athletic
sunderland-afc-fulham-fc
sunderland-afc-southampton-fc
newcastle-united-queens-park-rangers
west-ham-united-tottenham-hotspur
swansea-city-west-ham-united
swansea-city-liverpool-fc
swansea-city-southampton-fc
sunderland-afc-reading-fc
chelsea-fc-tottenham-hotspur
fulham-fc-manchester-city
norwich-city-fulham-fc
norwich-city-wigan-athletic
sunderland-afc-manchester-united
sunderland-afc-everton-fc
swansea-city-queens-park-rangers
fulham-fc-newcastle-united
aston-villa-norwich-city
southampton-fc-everton-fc
tottenham-hotspur-arsenal-fc
reading-fc-wigan-a

In [92]:
# Spot check one game against its match report on the website (linked below):
games_scorers_assists['tottenham-hotspur-manchester-united']

{'away': [(u'Robin van Persie', u'Tom Cleverley', 25, 1)],
 'home': [(u'Clint Dempsey', u'Aaron Lennon', 90, 0)]}

http://www.worldfootball.net/report/premier-league-2012-2013-tottenham-hotspur-manchester-united/

Early goal from RVP and an equaliser from Dempsey (USA USA USA) in the last minute :-)

In [89]:
import json

# Save the scraped results so the season does not have to be downloaded again.
with open('BPL12-13.json', 'w') as out_file:
    json.dump(games_scorers_assists, out_file)

In [90]:
#In case you need to reload it (read from the same path the results were dumped to):
with open('BPL12-13.json', 'r') as fp:
    data = json.load(fp)

In [91]:
# After the JSON round trip each goal comes back as a list rather than a tuple.
data['tottenham-hotspur-manchester-united']

{u'away': [[u'Robin van Persie', u'Tom Cleverley', 25, 1]],
 u'home': [[u'Clint Dempsey', u'Aaron Lennon', 90, 0]]}