# Scraping script for worldfootball.net

In [1]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import time
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")
from pyquery import PyQuery as pq
from bs4 import BeautifulSoup
import requests
import re

### Defining the list of teams for BPL 12-13

In [90]:
#URL slugs of the 20 clubs, one per line so the list is easy to diff and audit
teams = [
    'arsenal-fc',
    'aston-villa',
    'chelsea-fc',
    'everton-fc',
    'fulham-fc',
    'liverpool-fc',
    'manchester-city',
    'manchester-united',
    'newcastle-united',
    'norwich-city',
    'queens-park-rangers',
    'reading-fc',
    'southampton-fc',
    'stoke-city',
    'sunderland-afc',
    'swansea-city',
    'tottenham-hotspur',
    'west-bromwich-albion',
    'west-ham-united',
    'wigan-athletic',
]

In [91]:
#Sanity check: the list should hold 20 clubs
len(teams)

20

### Defining the scraping function:

This function takes as its argument a BeautifulSoup parsed HTML object (e.g. BeautifulSoup(test,"html.parser"))

It returns a dictionary with keys away and home whose respective values are lists of 6-tuples and 4-tuples of the form:

$\star$(Goal scorer, assist player, time of goal, type of goal, scorer was subbed in, assister was subbed in) if there is an assist player

$\star$(Goal scorer, time of goal, type of goal, scorer was subbed in) if there isn't an assist player

The type of goal is an integer corresponding:

$\bullet$ if type = +1: goal is a tie breaker (e.g 1-0,0-1, 2-1, 1-2...)

$\bullet$ if type = 0: goal is an equaliser (e.g 1-1,2-2,..)

$\bullet$ if type = -1: goal is a score reducer (e.g 0-2 to 1-2)

$\bullet$ if type = +/-k for k>1: goal is a score increaser/reducer (e.g 0-1 to 0-2 or 3-0 to 3-1)

The subbed-in feature is binary:

$\bullet$ 0 if not subbed in

$\bullet$ 1 if subbed in

In [87]:
def summary(page):
    """Extract every goal of one match report.

    Parameters
    ----------
    page : BeautifulSoup
        Parsed match-report page. Its ``standard_tabelle`` tables with
        ``cellpadding="3"`` are read by position: [0] result box, [1] goals,
        [2] home line-up, [3] away line-up.

    Returns
    -------
    dict
        ``{'home': [...], 'away': [...]}`` where each goal is a tuple
        ``(scorer, assister, minute, type, scorer_subbed_in, assister_subbed_in)``
        when an assist is listed, or ``(scorer, minute, type, scorer_subbed_in)``
        otherwise. ``type`` is the scoring team's goal difference right after
        the goal (0 = equaliser, +1 = goal that puts them one ahead, -1 = goal
        that leaves them one behind, ...). The subbed-in flags are 1 when the
        player's name appears among that team's substitutes, else 0.
        Both lists are empty for a goalless game.

    Raises
    ------
    IndexError
        If an expected table is missing, or a goal cell contains no player
        link or no number.
    """
    report_tables = page.find_all("table",attrs={'class': "standard_tabelle",'cellpadding': '3'})

    #Getting the score: every integer found in the result box
    result_text = report_tables[0].find("div",attrs={'class':"resultat"}).get_text()
    finalscore = [int(number) for number in re.findall(r'\b\d+\b',result_text)]

    #A goalless game has no scorers to scrape:
    if sum(finalscore)==0:
        return dict([('home',[]),('away',[])])

    home = []
    away = []
    #Running goal difference seen from the home side
    score = 0

    for td in report_tables[1].find_all('td'):
        #Not considering the Goals heading of the table:
        if td.has_attr('align'):
            continue
        #Cells without a class are not goal cells:
        if not td.has_attr('class'):
            continue

        if td.has_attr('style'):
            #Away goal. We store -score so that an away goal that puts the
            #away side one ahead is still reported as +1.
            score -= 1
            away.append(_goal_record(td,-score))
        else:
            #Home goal
            score += 1
            home.append(_goal_record(td,score))

    #Adding the subbed in feature to the existing tuples:
    home = _add_sub_flags(home,_substitute_names(report_tables[2]))
    away = _add_sub_flags(away,_substitute_names(report_tables[3]))

    return dict([('home',home),('away',away)])


#Position of the first substitute cell in a line-up table, and the number of
#cells per substitute row; the player name sits in the second cell of a row.
#NOTE(review): both values mirror the page layout the scraper was written
#against (cells before index 34 are presumably the starting line-up) - confirm
#if the site layout changes.
_FIRST_SUB_CELL = 34
_CELLS_PER_SUB = 3


def _goal_record(td,goal_type):
    """Build the (scorer, [assister,] minute, type) tuple for one goal cell."""
    text = td.get_text()
    links = td.find_all("a")
    #The minute is the first integer appearing in the cell text
    minute = int(re.findall(r'\b\d+\b',text)[0])
    scorer = links[0].get_text()
    #A second link in the cell is the player assisting on the goal
    if len(links)>1:
        return (scorer,links[1].get_text(),minute,goal_type)
    return (scorer,minute,goal_type)


def _substitute_names(lineup_table):
    """Return the stripped names of the players listed as substitutes."""
    cells = lineup_table.find_all('td')
    #Stopping one cell early guarantees the name cell (index + 1) exists
    return [cells[index+1].get_text().strip()
            for index in range(_FIRST_SUB_CELL,len(cells)-1,_CELLS_PER_SUB)]


def _add_sub_flags(goals,substitutes):
    """Append the 0/1 subbed-in flag(s) to every goal tuple of one team."""
    flagged = []
    for goal in goals:
        if len(goal)==4:
            #Goal with an assist: one flag for the scorer, one for the assister
            flags = (int(goal[0] in substitutes),int(goal[1] in substitutes))
        else:
            #Case with no assist player
            flags = (int(goal[0] in substitutes),)
        flagged.append(goal+flags)
    return flagged

### Testing the function:

In [84]:
#Download the raw HTML of a single match report to try the scraper on
test = requests.get("http://www.worldfootball.net/report/premier-league-2012-2013-norwich-city-aston-villa/").text

In [85]:
#Parse the HTML; note that `test` is rebound from the raw text to the parsed page
test = BeautifulSoup(test,"html.parser")

##### YAY!!!!!

In [88]:
#Run the scraper on the parsed test page and display the resulting dictionary
summary(test)

{'away': [(u'Gabriel Agbonlahor', u'Ashley Westwood', 55, 1, 0, 0),
  (u'Gabriel Agbonlahor', u'Ashley Westwood', 89, 1, 0, 0)],
 'home': [(u'Grant Holt', 74, 0, 0)]}

### Now the entire season 12-13:

We first create the list of games using the format of the url needed:

In [92]:
#Every ordered pair of distinct clubs, written "<first>-<second>" as the report URLs expect
games = [first + "-" + second
         for first in teams
         for second in teams
         if first != second]

We then scrape the website and create a dictionary to store the webpages per game:

In [93]:
#Download and parse every match report in a single pass.
#A shared session reuses the connection across requests, the timeout keeps one
#stalled request from hanging the notebook, and raise_for_status() stops on an
#HTTP error instead of storing an error page as if it were a report.
session = requests.Session()
games_page = {}
for game in games:
    response = session.get("http://www.worldfootball.net/report/premier-league-2012-2013-%s" % game, timeout=30)
    response.raise_for_status()
    #Don't forget that step: store the parsed page, not the raw text
    games_page[game] = BeautifulSoup(response.text,"html.parser")

In [94]:
#sanity check: one entry per fixture is expected
len(games_page)

380

In [95]:
#sanity check for empty pages
#NOTE(review): by this point the values are the parsed BeautifulSoup objects
#assigned in the previous cell, not lists, so this comparison with [] looks
#unable to flag an empty download - confirm, and consider checking the page
#content instead.
any(a == [] for a in games_page.values())

False

We now extract the relevant information from each page and create a new dictionary with games as keys and the output of summary for values:

In [96]:
#Map each fixture to the goals extracted from its parsed report
games_scorers_assists = dict(
    (fixture, summary(report)) for fixture, report in games_page.items()
)

In [97]:
#Testing ! It worked super well ! check the website below:
#Spot check of one fixture, to be compared by eye with the report page linked under the output
games_scorers_assists['tottenham-hotspur-manchester-united']

{'away': [(u'Robin van Persie', u'Tom Cleverley', 25, 1, 0, 0)],
 'home': [(u'Clint Dempsey', u'Aaron Lennon', 90, 0, 0, 0)]}

http://www.worldfootball.net/report/premier-league-2012-2013-tottenham-hotspur-manchester-united/

Early goal from RVP and equaliser from Dempsey (USA USA USA) at the last minute :-)

In [98]:
import json
#Save the season's results to disk. JSON has no tuple type, so each goal tuple
#is written as an array and comes back as a list when the file is reloaded.
with open('BPL12-13.json', 'w') as fp:
    json.dump(games_scorers_assists, fp)

In [90]:
#In case you need to reload it:
#NOTE: this reads from the BPL/ sub-directory, whereas the cell that writes the
#file saves it as 'BPL12-13.json' in the working directory - presumably the
#file is moved into BPL/ by hand first; confirm before re-running.
#`json` must already have been imported by the cell that writes the file.
with open('BPL/BPL12-13.json', 'r') as fp:
    data = json.load(fp)

In [91]:
#Spot check of the reloaded data (goals come back as lists rather than tuples)
data['tottenham-hotspur-manchester-united']

{u'away': [[u'Robin van Persie', u'Tom Cleverley', 25, 1]],
 u'home': [[u'Clint Dempsey', u'Aaron Lennon', 90, 0]]}

### Now the entire season 13-14:

In [99]:
#URL slugs of the 20 clubs of the 13-14 season, one per line
teams13 = [
    'arsenal-fc',
    'aston-villa',
    'cardiff-city',
    'chelsea-fc',
    'crystal-palace',
    'everton-fc',
    'fulham-fc',
    'hull-city',
    'liverpool-fc',
    'manchester-city',
    'manchester-united',
    'newcastle-united',
    'norwich-city',
    'southampton-fc',
    'stoke-city',
    'sunderland-afc',
    'swansea-city',
    'tottenham-hotspur',
    'west-bromwich-albion',
    'west-ham-united',
]

In [100]:
#Sanity check: the list should hold 20 clubs
len(teams13)

20

In [101]:
#Every ordered pair of distinct clubs, written "<first>-<second>" as the report URLs expect
games13 = [first + "-" + second
           for first in teams13
           for second in teams13
           if first != second]

In [102]:
#Download and parse every match report in a single pass.
#A shared session reuses the connection across requests, the timeout keeps one
#stalled request from hanging the notebook, and raise_for_status() stops on an
#HTTP error instead of storing an error page as if it were a report.
session = requests.Session()
games_page13 = {}
for game in games13:
    response = session.get("http://www.worldfootball.net/report/premier-league-2013-2014-%s" % game, timeout=30)
    response.raise_for_status()
    #Don't forget that step: store the parsed page, not the raw text
    games_page13[game] = BeautifulSoup(response.text,"html.parser")

In [103]:
#sanity check: one entry per fixture is expected
print len(games_page13)
#sanity check for empty pages
#NOTE(review): the values are parsed BeautifulSoup objects at this point, not
#lists, so this comparison with [] looks unable to flag an empty download -
#confirm, and consider checking the page content instead.
print any(a == [] for a in games_page13.values())

380
False


In [104]:
#Map each fixture to the goals extracted from its parsed report
games_scorers_assists13 = dict(
    (fixture, summary(report)) for fixture, report in games_page13.items()
)

In [105]:
import json
#Save the season's results to disk. JSON has no tuple type, so each goal tuple
#is written as an array and comes back as a list when the file is reloaded.
with open('BPL13-14.json', 'w') as fp:
    json.dump(games_scorers_assists13, fp)

In [None]:
#In case you need to reload it:
#NOTE: this reads from the BPL/ sub-directory, whereas the cell that writes the
#file saves it as 'BPL13-14.json' in the working directory - presumably the
#file is moved into BPL/ by hand first; confirm before re-running.
#`data` is the same name the 12-13 reload cell uses, so it is overwritten here.
with open('BPL/BPL13-14.json', 'r') as fp:
    data = json.load(fp)

### French league:

#### Season 12-13

In [106]:
#URL slugs of the 20 Ligue 1 clubs of the 12-13 season, one per line
frenchteams12 = [
    'ac-ajaccio',
    'as-nancy',
    'as-saint-etienne',
    'estac-troyes',
    'evian-thonon-gaillard',
    'fc-lorient',
    'fc-sochaux',
    'girondins-bordeaux',
    'lille-osc',
    'montpellier-hsc',
    'ogc-nice',
    'olympique-lyon',
    'olympique-marseille',
    'paris-saint-germain',
    'sc-bastia',
    'stade-brest',
    'stade-reims',
    'stade-rennes',
    'toulouse-fc',
    'valenciennes-fc',
]

In [107]:
#Sanity check: number of distinct clubs, which also catches a slug entered twice
len(list(set(frenchteams12)))

20

In [108]:
#Every ordered pair of distinct clubs, written "<first>-<second>" as the report URLs expect
frenchgames12 = [first + "-" + second
                 for first in frenchteams12
                 for second in frenchteams12
                 if first != second]

In [109]:
#Sanity check: 20 clubs paired with each of the 19 others should give 380 fixtures
len(frenchgames12)

380

In [110]:
#Download and parse every match report in a single pass.
#A shared session reuses the connection across requests, the timeout keeps one
#stalled request from hanging the notebook, and raise_for_status() stops on an
#HTTP error instead of storing an error page as if it were a report.
session = requests.Session()
frenchgames_page12 = {}
for game in frenchgames12:
    response = session.get("http://www.worldfootball.net/report/ligue-1-2012-2013-%s" % game, timeout=30)
    response.raise_for_status()
    #Don't forget that step: store the parsed page, not the raw text
    frenchgames_page12[game] = BeautifulSoup(response.text,"html.parser")

In [111]:
#sanity check: one entry per fixture is expected
print len(frenchgames_page12)
#sanity check for empty pages
#NOTE(review): the values are parsed BeautifulSoup objects at this point, not
#lists, so this comparison with [] looks unable to flag an empty download -
#confirm, and consider checking the page content instead.
print any(a == [] for a in frenchgames_page12.values())

380
False


In [112]:
#Map each fixture to the goals extracted from its parsed report
frenchgames_scorers_assists12 = dict(
    (fixture, summary(report)) for fixture, report in frenchgames_page12.items()
)

In [19]:
#test: spot check of one fixture
#NOTE(review): the output saved under this cell has no subbed-in flags, so it
#was presumably produced by an earlier version of summary() - re-run to refresh.
frenchgames_scorers_assists12['montpellier-hsc-toulouse-fc']

{'away': [(u'Wissam Ben Yedder', u'Serge Aurier', 72, 0)],
 'home': [(u'Souleymane Camara', u'John Utaka', 34, 1)]}

In [113]:
import json
#Save the season's results to disk. JSON has no tuple type, so each goal tuple
#is written as an array and comes back as a list when the file is reloaded.
with open('French12-13.json', 'w') as fp:
    json.dump(frenchgames_scorers_assists12, fp)

In [None]:
#In case you need to reload it:
#NOTE: this reads from the French/ sub-directory, whereas the cell that writes
#the file saves it as 'French12-13.json' in the working directory - presumably
#the file is moved into French/ by hand first; confirm before re-running.
#`data` is the same name the other reload cells use, so it is overwritten here.
with open('French/French12-13.json', 'r') as fp:
    data = json.load(fp)

#### Season 13-14

In [114]:
#URL slugs of the 20 Ligue 1 clubs of the 13-14 season, one per line
frenchteams13 = [
    'ac-ajaccio',
    'as-monaco',
    'as-saint-etienne',
    'ea-guingamp',
    'evian-thonon-gaillard',
    'fc-lorient',
    'fc-nantes',
    'fc-sochaux',
    'girondins-bordeaux',
    'lille-osc',
    'montpellier-hsc',
    'ogc-nice',
    'olympique-lyon',
    'olympique-marseille',
    'paris-saint-germain',
    'sc-bastia',
    'stade-reims',
    'stade-rennes',
    'toulouse-fc',
    'valenciennes-fc',
]
#Sanity check: a bare len(...) in the middle of a cell displays nothing, so
#assert instead that there are 20 distinct clubs
assert len(set(frenchteams13)) == 20

#Every ordered pair of distinct clubs, written "<first>-<second>" as the report URLs expect
frenchgames13 = []
for t1 in frenchteams13:
    for t2 in frenchteams13:
        if t1!=t2:
            frenchgames13.append(t1+"-"+t2)

#Download and parse every match report in a single pass.
#A shared session reuses the connection across requests, the timeout keeps one
#stalled request from hanging the notebook, and raise_for_status() stops on an
#HTTP error instead of storing an error page as if it were a report.
session = requests.Session()
frenchgames_page13 = {}
for game in frenchgames13:
    response = session.get("http://www.worldfootball.net/report/ligue-1-2013-2014-%s" % game, timeout=30)
    response.raise_for_status()
    #Don't forget that step: store the parsed page, not the raw text
    frenchgames_page13[game] = BeautifulSoup(response.text,"html.parser")

In [None]:
#sanity check: one entry per fixture is expected
print len(frenchgames_page13)
#sanity check for empty pages
#NOTE(review): the values are parsed BeautifulSoup objects at this point, not
#lists, so this comparison with [] looks unable to flag an empty download -
#confirm, and consider checking the page content instead.
print any(a == [] for a in frenchgames_page13.values())

380
False


In [None]:
#Map each fixture to the goals extracted from its parsed report
frenchgames_scorers_assists13 = dict(
    (fixture, summary(report)) for fixture, report in frenchgames_page13.items()
)

In [None]:
import json
#Save the season's results to disk. JSON has no tuple type, so each goal tuple
#is written as an array and comes back as a list when the file is reloaded.
with open('French13-14.json', 'w') as fp:
    json.dump(frenchgames_scorers_assists13, fp)

In [None]:
#In case you need to reload it:
#NOTE: this reads from the French/ sub-directory, whereas the cell that writes
#the file saves it as 'French13-14.json' in the working directory - presumably
#the file is moved into French/ by hand first; confirm before re-running.
#`data` is the same name the other reload cells use, so it is overwritten here.
with open('French/French13-14.json', 'r') as fp:
    data = json.load(fp)