# Scraping script for worldfootball.net

In [1]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import time
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")
from pyquery import PyQuery as pq
from bs4 import BeautifulSoup
import requests
import re

### Defining the list of teams for BPL 12-13

In [16]:
teams = ['arsenal-fc','aston-villa','chelsea-fc','everton-fc','fulham-fc','liverpool-fc','manchester-city','manchester-united','newcastle-united','norwich-city','queens-park-rangers','reading-fc','southampton-fc','stoke-city','sunderland-afc','swansea-city','tottenham-hotspur','west-bromwich-albion','west-ham-united','wigan-athletic']

In [17]:
len(teams)

20

### Defining the scraping function:

This function takes for arguments: a BeautifoulSoup HTML parsed object (e.g. BeautifulSoup(test,"html.parser")) and boolean variable that indicates if there is a integer in the name of a league (e.g Ligue 1 in France)

It returns a dictionary with keys away and home and their respective values are lists of triples and quadruples of the form:

$\star$(Goal scorer, assist player, time of goal, type of goal, scorer was subbed in, assister was subbed in) if there is an assist player

$\star$(Goal scorer, " " , time of goal, type of goal, scorer was subbed in) if there isn't an assist player

The type of goal is an integer corresponding:

$\bullet$ if type = +1: goal is a tie breaker (e.g 1-0,0-1, 2-1, 1-2...)

$\bullet$ if type = 0: goal is an equaliser (e.g 1-1,2-2,..)

$\bullet$ if type = -1: goal is a score reducer (e.g 0-2 to 1-2)

$\bullet$ if type = +/-k for k>1: goal is a score increaser/reducer (e.g 0-1 to 0-2 or 3-0 to 3-1)

The subbed in features is binary:

$\bullet$ 0 if not subbed in

$\bullet$ 1 if subbed in

In [117]:
def summary(page,num_in_league_title):
    
    if num_in_league_title:
        day = int(re.findall(r'\b\d+\b',page.find_all("div", attrs={'class': 'breadcrumb'})[0].get_text())[3])
    else:
        day = int(re.findall(r'\b\d+\b',page.find_all("div", attrs={'class': 'breadcrumb'})[0].get_text())[2])
        
    home = []
    away = []
    
    score = 0
    
    #Checking if the score was 0-0 as there won't be any scorers to scrape
    
    #Getting the score:
    finalscore = page.find_all("table",attrs={'class': "standard_tabelle",'cellpadding': '3'})[0]
    finalscore = map(int,re.findall(r'\b\d+\b',finalscore.find("div",attrs={'class':"resultat"}).get_text()))
    
    #Return dictionary with empyt home and away scorer lists:
    if np.sum(finalscore)==0:
        return dict([('home',[]),('day',day),('away',[])])
    
    else:
        for td in page.find_all("table",attrs={'class': "standard_tabelle",'cellpadding': '3'})[1].find_all('td'):
            #Not considering the Goals heading of the table:
            if td.has_attr('align'):
                continue
                
            #Getting the home scorers:   
            elif td.has_attr('class') and (not td.has_attr('style')):
                #Update the score
                score += 1
                text = td.get_text()
                ga = td.find_all("a")
                
                #Checking if there is a player assisting on the goal:
                if len(ga)>1:
                    sc = ga[0].get_text()                    
                    assis = ga[1].get_text()
                    #To fix some bad HTML on the site:
                    if sc == assis:
                        assis = ' '
                    home.append((sc,assis,map(int,re.findall(r'\b\d+\b', text))[0],score))
                    
                #If no assist:
                else:
                    sc = ga[0].get_text()
                    home.append((sc," ",map(int,re.findall(r'\b\d+\b', text))[0],score))


            elif td.has_attr('class') and td.has_attr('style'):
                #Update the score
                score -= 1
                text = td.get_text()
                ga = td.find_all("a")
                
                #Checking if there is a player assisting on the goal:
                if len(ga)>1:
                    sc = ga[0].get_text()                    
                    assis = ga[1].get_text()
                    if sc == assis:
                        assis = ' '
                    #Note we return -score to account for the fact that if the away team scores the tie breaker goal, we still output +1:
                    away.append((sc,assis,map(int,re.findall(r'\b\d+\b', text))[0],-score))
                    
                #If no assist:
                else:
                    sc = ga[0].get_text()
                    #Same remark
                    away.append((sc," ",map(int,re.findall(r'\b\d+\b', text))[0],-score))
                    
            else:
                continue
                
        #Obtaining the list of players:    
        home_players = page.find_all("table",attrs={'class': "standard_tabelle",'cellpadding': '3'})[2].find_all('td')
        #Obtaining the list of players subbed in:
        home_sub =[]
        for td in range(34,len(home_players),3):
            home_sub.append(home_players[td+1].get_text().rstrip().lstrip())
        
        #Similarly for the away team:
        away_players = page.find_all("table",attrs={'class': "standard_tabelle",'cellpadding': '3'})[3].find_all('td')
        away_sub =[]
        for td in range(34,len(away_players),3):
            away_sub.append(away_players[td+1].get_text().rstrip().lstrip())

        #Adding the subbed in feature to the existing tuples:
        for i in range(len(home)):
            #Four cases:
            # - both subbed in
            if home[i][0] in home_sub:
                if home[i][1] in home_sub:
                    temp = list(home[i])
                    temp.append(1)
                    temp.append(1)
                    home[i] = tuple(temp)
                # - Only scorer subbed in
                else:
                    temp = list(home[i])
                    temp.append(1)
                    temp.append(0)
                    home[i] = tuple(temp)
            else:
                # - Only assist playr subbed in
                if home[i][1] in home_sub:
                    temp = list(home[i])
                    temp.append(0)
                    temp.append(1)
                    home[i] = tuple(temp)
                # - None of them subbed in
                else:
                    temp = list(home[i])
                    temp.append(0)
                    temp.append(0)
                    home[i] = tuple(temp)
            
                    
        #Repeat for away team           
        for j in range(len(away)):
            if away[j][0] in away_sub:
                if away[j][1] in away_sub:                        
                    temp = list(away[j])
                    temp.append(1)
                    temp.append(1)
                    away[j] = tuple(temp)
                else:
                    temp = list(away[j])
                    temp.append(1)
                    temp.append(0)
                    away[j] = tuple(temp)
            else:
                if away[j][1] in away_sub:
                    temp = list(away[j])
                    temp.append(0)
                    temp.append(1)
                    away[j] = tuple(temp)
                else:
                    temp = list(away[j])
                    temp.append(0)
                    temp.append(0)
                    away[j] = tuple(temp)
   
    return dict([('day',day),('home',home),('away',away)])

### Testing the function:

In [86]:
test = requests.get("http://www.worldfootball.net/report/premier-league-2012-2013-norwich-city-aston-villa/").text

In [87]:
test = BeautifulSoup(test,"html.parser")

##### YAY!!!!!

In [88]:
summary(test,False)

{'away': [(u'Gabriel Agbonlahor', u'Ashley Westwood', 55, 1, 0, 0),
  (u'Gabriel Agbonlahor', u'Ashley Westwood', 89, 1, 0, 0)],
 'day': 36,
 'home': [(u'Grant Holt', ' ', 74, 0, 0, 0)]}

### Now the entire season 12-13:

We first create the list of games using the format of the url needed:

In [18]:
games = []
for t1 in teams:
    for t2 in teams:
        if t1!=t2:
            games.append(t1+"-"+t2)

We then scrape the website and create a dictionary to store the webpages per game:

In [19]:
games_page = dict([game,requests.get("http://www.worldfootball.net/report/premier-league-2012-2013-%s" % game).text] 
             for game in games)

#Don't forget that step:
for k in games_page.keys():
    games_page[k] = BeautifulSoup(games_page[k],"html.parser")

In [20]:
#sanity check
len(games_page)

380

In [21]:
#sanity check for empty pages
any(a == [] for a in games_page.values())

False

We now extract the relevant information from each page and create a new dictionary with games as keys and the output of summary for values:

In [119]:
games_scorers_assists = {}
for k in games_page.keys():
    games_scorers_assists[k] = summary(games_page[k],False)

In [120]:
#Testing ! It worked super well ! check the website below:
games_scorers_assists['arsenal-fc-manchester-united']

{'away': [(u'Robin van Persie', ' ', 44, 0, 0, 0)],
 'day': 35,
 'home': [(u'Theo Walcott',
   u'Tom\xc3\xa1\xc5\xa1 Rosick\xc3\xbd',
   2,
   1,
   0,
   0)]}

http://www.worldfootball.net/report/premier-league-2012-2013-tottenham-hotspur-manchester-united/

Early goal form RVP and equaliser of Dempsey (USA USA USA) at the last minute :-)

In [121]:
import json
with open('BPL12-13.json', 'w') as fp:
    json.dump(games_scorers_assists, fp)

In [25]:
#In case you need to reload it:
#with open('BPL/BPL12-13.json', 'r') as fp:
#    data = json.load(fp)

In [26]:
#data['tottenham-hotspur-manchester-united']

### Now the entire season 13-14:

In [27]:
teams13 = ['arsenal-fc','aston-villa','cardiff-city','chelsea-fc','crystal-palace','everton-fc','fulham-fc','hull-city','liverpool-fc','manchester-city','manchester-united','newcastle-united','norwich-city','southampton-fc','stoke-city','sunderland-afc','swansea-city','tottenham-hotspur','west-bromwich-albion','west-ham-united']

In [28]:
len(teams13)

20

In [29]:
games13 = []
for t1 in teams13:
    for t2 in teams13:
        if t1!=t2:
            games13.append(t1+"-"+t2)

In [30]:
games_page13 = dict([game,requests.get("http://www.worldfootball.net/report/premier-league-2013-2014-%s" % game).text] 
             for game in games13)

#Don't forget that step:
for k in games_page13.keys():
    games_page13[k] = BeautifulSoup(games_page13[k],"html.parser")

In [31]:
#sanity check
print len(games_page13)
#sanity check for empty pages
print any(a == [] for a in games_page13.values())

380
False


In [122]:
games_scorers_assists13 = {}
for k in games_page13.keys():
    games_scorers_assists13[k] = summary(games_page13[k],False)

In [123]:
import json
with open('BPL13-14.json', 'w') as fp:
    json.dump(games_scorers_assists13, fp)

In [34]:
#In case you need to reload it:
#with open('BPL/BPL13-14.json', 'r') as fp:
#    data = json.load(fp)

### French league:

#### Season 12-13

In [35]:
frenchteams12 = ['ac-ajaccio','as-nancy','as-saint-etienne','estac-troyes','evian-thonon-gaillard','fc-lorient','fc-sochaux','girondins-bordeaux','lille-osc','montpellier-hsc','ogc-nice','olympique-lyon','olympique-marseille','paris-saint-germain','sc-bastia','stade-brest','stade-reims','stade-rennes','toulouse-fc','valenciennes-fc']

In [36]:
len(list(set(frenchteams12)))

20

In [37]:
frenchgames12 = []
for t1 in frenchteams12:
    for t2 in frenchteams12:
        if t1!=t2:
            frenchgames12.append(t1+"-"+t2)

In [38]:
len(frenchgames12)

380

In [39]:
frenchgames_page12 = dict([game,requests.get("http://www.worldfootball.net/report/ligue-1-2012-2013-%s" % game).text] 
             for game in frenchgames12)

#Don't forget that step:
for k in frenchgames_page12.keys():
    frenchgames_page12[k] = BeautifulSoup(frenchgames_page12[k],"html.parser")

In [40]:
#sanity check
print len(frenchgames_page12)
#sanity check for empty pages
print any(a == [] for a in frenchgames_page12.values())

380
False


In [124]:
frenchgames_scorers_assists12 = {}
for k in frenchgames_page12.keys():
    frenchgames_scorers_assists12[k] = summary(frenchgames_page12[k],True)

In [99]:
#test:
frenchgames_scorers_assists12['valenciennes-fc-fc-sochaux']

{'away': [(u'S\xc3\xa9bastien Roudet',
   u'J\xc3\xa9r\xc3\xb4me Roussillon',
   62,
   -2,
   0,
   0)],
 'day': 11,
 'home': [(u'Foued Kadir', ' ', 12, 1, 0, 0),
  (u'Gr\xc3\xa9gory Pujol', u'Foued Kadir', 27, 2, 0, 0),
  (u'Gr\xc3\xa9gory Pujol', u'Foued Kadir', 60, 3, 0, 0)]}

In [125]:
import json
with open('French12-13.json', 'w') as fp:
    json.dump(frenchgames_scorers_assists12, fp)

In [44]:
#In case you need to reload it:
#with open('French/French12-13.json', 'r') as fp:
#    data = json.load(fp)

#### Season 13-14

In [45]:
frenchteams13 = ['ac-ajaccio','as-monaco','as-saint-etienne','ea-guingamp','evian-thonon-gaillard','fc-lorient','fc-nantes','fc-sochaux','girondins-bordeaux','lille-osc','montpellier-hsc','ogc-nice','olympique-lyon','olympique-marseille','paris-saint-germain','sc-bastia','stade-reims','stade-rennes','toulouse-fc','valenciennes-fc']
len(frenchteams13)
frenchgames13 = []

for t1 in frenchteams13:
    for t2 in frenchteams13:
        if t1!=t2:
            frenchgames13.append(t1+"-"+t2)
            
frenchgames_page13 = dict([game,requests.get("http://www.worldfootball.net/report/ligue-1-2013-2014-%s" % game).text] 
             for game in frenchgames13)

#Don't forget that step:
for k in frenchgames_page13.keys():
    frenchgames_page13[k] = BeautifulSoup(frenchgames_page13[k],"html.parser")

In [46]:
#sanity check
print len(frenchgames_page13)
#sanity check for empty pages
print any(a == [] for a in frenchgames_page13.values())

380
False


In [126]:
frenchgames_scorers_assists13 = {}
for k in frenchgames_page13.keys():
    frenchgames_scorers_assists13[k] = summary(frenchgames_page13[k],True)

In [127]:
import json
with open('French13-14.json', 'w') as fp:
    json.dump(frenchgames_scorers_assists13, fp)

In [49]:
#In case you need to reload it:
#with open('French/French13-14.json', 'r') as fp:
#    data = json.load(fp)

### Spanish division:

#### Season 12/13

In [106]:
spanishteams12 = ['real-madrid','ca-osasuna','fc-barcelona','malaga-cf','granada-cf','getafe-cf','sevilla-fc','valencia-cf','deportivo-la-coruna','real-sociedad','celta-vigo','espanyol-barcelona','rcd-mallorca','real-valladolid','real-zaragoza','atletico-madrid','levante-ud','real-betis','rayo-vallecano','athletic-bilbao']
print len(spanishteams12)

spanishgames12 = []

for t1 in spanishteams12:
    for t2 in spanishteams12:
        if t1!=t2:
            spanishgames12.append(t1+"-"+t2)
            
spanishgames_page12 = dict([game,requests.get("http://www.worldfootball.net/report/primera-division-2012-2013-%s" % game).text] 
             for game in spanishgames12)

#Don't forget that step:
for k in spanishgames_page12.keys():
    spanishgames_page12[k] = BeautifulSoup(spanishgames_page12[k],"html.parser")

20


In [107]:
#sanity check
print len(spanishgames_page12)
#sanity check for empty pages
print any(a == [] for a in spanishgames_page12.values())

380
False


In [128]:
spanishgames_scorers_assists12 = {}
for k in spanishgames_page12.keys():
    spanishgames_scorers_assists12[k] = summary(spanishgames_page12[k],False)

In [129]:
summary(spanishgames_page12['fc-barcelona-malaga-cf'],False)

{'away': [(u'Pedro Morales', u'Jes\xc3\xbas G\xc3\xa1mez', 55, -3, 1, 0)],
 'day': 38,
 'home': [(u'David Villa', u'Pedro', 2, 1, 0, 0),
  (u'Cesc F\xc3\xa0bregas', u'Iniesta', 13, 2, 0, 0),
  (u'Mart\xc3\xadn Montoya', ' ', 15, 3, 0, 0),
  (u'Iniesta', ' ', 50, 4, 0, 0)]}

In [130]:
import json
with open('Spanish12-13.json', 'w') as fp:
    json.dump(spanishgames_scorers_assists12, fp)

In [None]:
#In case you need to reload it:
#with open('Spanish/Spanish12-13.json', 'r') as fp:
#    data = json.load(fp)

#### Season 13/14

In [112]:
spanishteams13 = ['real-madrid','ca-osasuna','fc-barcelona','malaga-cf','granada-cf','getafe-cf','sevilla-fc','valencia-cf','elche-cf','real-sociedad','celta-vigo','espanyol-barcelona','villarreal-cf','real-valladolid','ud-almeria','atletico-madrid','levante-ud','real-betis','rayo-vallecano','athletic-bilbao']
print len(spanishteams13)

spanishgames13 = []

for t1 in spanishteams13:
    for t2 in spanishteams13:
        if t1!=t2:
            spanishgames13.append(t1+"-"+t2)
            
spanishgames_page13 = dict([game,requests.get("http://www.worldfootball.net/report/primera-division-2013-2014-%s" % game).text] 
             for game in spanishgames13)

#Don't forget that step:
for k in spanishgames_page13.keys():
    spanishgames_page13[k] = BeautifulSoup(spanishgames_page13[k],"html.parser")

20


In [132]:
print len(spanishgames_page13)
#sanity check for empty pages
print any(a == [] for a in spanishgames_page13.values())

380
False


In [131]:
spanishgames_scorers_assists13 = {}
for k in spanishgames_page13.keys():
    spanishgames_scorers_assists13[k] = summary(spanishgames_page13[k],False)

In [133]:
import json
with open('Spanish13-14.json', 'w') as fp:
    json.dump(spanishgames_scorers_assists13, fp)

In [None]:
#In case you need to reload it:
#with open('Spanish/Spanish13-14.json', 'r') as fp:
#    data = json.load(fp)

### Bundesliga

#### Season 12/13

In [138]:
bundesteams12 = ['bor-moenchengladbach','bayern-muenchen','sc-freiburg','fc-schalke-04','hamburger-sv','bayer-leverkusen','eintracht-frankfurt','vfl-wolfsburg','1-fc-nuernberg','werder-bremen','borussia-dortmund','1899-hoffenheim','vfb-stuttgart','1-fsv-mainz-05','fc-augsburg','spvgg-greuther-fuerth','hannover-96','fortuna-duesseldorf']
print len(bundesteams12)

bundesgames12 = []

for t1 in bundesteams12:
    for t2 in bundesteams12:
        if t1!=t2:
            bundesgames12.append(t1+"-"+t2)
            
bundesgames_page12 = dict([game,requests.get("http://www.worldfootball.net/report/bundesliga-2012-2013-%s" % game).text] 
             for game in bundesgames12)

#Don't forget that step:
for k in bundesgames_page12.keys():
    bundesgames_page12[k] = BeautifulSoup(bundesgames_page12[k],"html.parser")

18


In [139]:
#sanity check
print len(bundesgames_page12)
#sanity check for empty pages
print any(a == [] for a in bundesgames_page12.values())

306
False


In [140]:
bundesgames_scorers_assists12 = {}
for k in bundesgames_page12.keys():
    bundesgames_scorers_assists12[k] = summary(bundesgames_page12[k],False)

In [141]:
bundesgames_scorers_assists12['bor-moenchengladbach-bayern-muenchen']

{'away': [(u'Javi Mart\xc3\xadnez', u'Franck Rib\xc3\xa9ry', 7, -1, 0, 0),
  (u'Franck Rib\xc3\xa9ry', u'Thomas M\xc3\xbcller', 18, -1, 0, 0),
  (u'Franck Rib\xc3\xa9ry', u'Philipp Lahm', 53, 0, 0, 0),
  (u'Arjen Robben', u'Franck Rib\xc3\xa9ry', 59, 1, 0, 0)],
 'day': 34,
 'home': [(u'Martin Stranzl', u'Juan Arango', 4, 1, 0, 0),
  (u'Mike Hanke', u'Branimir Hrgota', 5, 2, 0, 0),
  (u'H\xc3\xa5vard Nordtveit', u'Patrick Herrmann', 10, 2, 0, 0)]}

In [142]:
import json
with open('Bundes12-13.json', 'w') as fp:
    json.dump(bundesgames_scorers_assists12, fp)

In [None]:
#In case you need to reload it:
#with open('Germany/Bundes12-13.json', 'r') as fp:
#    data = json.load(fp)

#### Season 13/14

In [143]:
bundesteams13 = ['bor-moenchengladbach','bayern-muenchen','sc-freiburg','fc-schalke-04','hamburger-sv','bayer-leverkusen','eintracht-frankfurt','vfl-wolfsburg','1-fc-nuernberg','werder-bremen','borussia-dortmund','1899-hoffenheim','vfb-stuttgart','1-fsv-mainz-05','fc-augsburg','eintracht-braunschweig','hannover-96','hertha-bsc']
print len(bundesteams13)

bundesgames13 = []

for t1 in bundesteams13:
    for t2 in bundesteams13:
        if t1!=t2:
            bundesgames13.append(t1+"-"+t2)
            
bundesgames_page13 = dict([game,requests.get("http://www.worldfootball.net/report/bundesliga-2013-2014-%s" % game).text] 
             for game in bundesgames13)

#Don't forget that step:
for k in bundesgames_page13.keys():
    bundesgames_page13[k] = BeautifulSoup(bundesgames_page13[k],"html.parser")

18


In [144]:
#sanity check
print len(bundesgames_page13)
#sanity check for empty pages
print any(a == [] for a in bundesgames_page13.values())

306
False


In [145]:
bundesgames_scorers_assists13 = {}
for k in bundesgames_page13.keys():
    bundesgames_scorers_assists13[k] = summary(bundesgames_page13[k],False)

In [146]:
bundesgames_scorers_assists13['bor-moenchengladbach-bayern-muenchen']

{'away': [(u'Mario G\xc3\xb6tze', u'Thomas M\xc3\xbcller', 7, 1, 0, 0),
  (u'Thomas M\xc3\xbcller', ' ', 53, 2, 0, 0)],
 'day': 18,
 'home': []}

In [147]:
import json
with open('Bundes13-14.json', 'w') as fp:
    json.dump(bundesgames_scorers_assists13, fp)

In [None]:
#In case you need to reload it:
#with open('Germany/Bundes13-14.json', 'r') as fp:
#    data = json.load(fp)

### Italy

#### Season 12/13

In [150]:
itateams12 = ['sampdoria','juventus','atalanta-bergamo','chievo-verona','bologna-fc','genoa-cfc','torino-fc','calcio-catania','cagliari-calcio','lazio-roma','delfino-pescara-1936','acf-fiorentina','ac-siena','ac-milan','as-roma','ssc-napoli','us-palermo','parma-fc','inter','udinese-calcio']
print len(itateams12)

itagames12 = []

for t1 in itateams12:
    for t2 in itateams12:
        if t1!=t2:
            itagames12.append(t1+"-"+t2)
            
itagames_page12 = dict([game,requests.get("http://www.worldfootball.net/report/serie-a-2012-2013-%s" % game).text] 
             for game in itagames12)

#Don't forget that step:
for k in itagames_page12.keys():
    itagames_page12[k] = BeautifulSoup(itagames_page12[k],"html.parser")

20


In [151]:
#sanity check
print len(itagames_page12)
#sanity check for empty pages
print any(a == [] for a in itagames_page12.values())

380
False


In [155]:
itagames_scorers_assists12 = {}
for k in itagames_page12.keys():
    if k == 'cagliari-calcio-as-roma':
        continue
    else:
        itagames_scorers_assists12[k] = summary(itagames_page12[k],False)
#This game was awarded to Roma due to legal ruling, we will count it as 0 0:
itagames_scorers_assists12['cagliari-calcio-as-roma'] = dict([('home',[]),('day',4),('away',[])])

In [156]:
itagames_scorers_assists12['sampdoria-juventus']

{'away': [(u'Fabio Quagliarella', u'Andrea Pirlo', 25, 1, 0, 0),
  (u'Emanuele Giaccherini', u'Stephan Lichtsteiner', 90, -1, 0, 1)],
 'day': 38,
 'home': [(u'\xc3\x89der', ' ', 31, 0, 0, 0),
  (u'Lorenzo De Silvestri', u'Marcelo Estigarribia', 57, 1, 0, 0),
  (u'Mauro Icardi', u'Marcelo Estigarribia', 75, 2, 0, 0)]}

In [157]:
import json
with open('Italy12-13.json', 'w') as fp:
    json.dump(itagames_scorers_assists12, fp)

In [None]:
#In case you need to reload it:
#with open('Italy/Italy12-13.json', 'r') as fp:
#    data = json.load(fp)

#### Season 13/14 

In [158]:
itateams13 = ['sampdoria','juventus','atalanta-bergamo','chievo-verona','bologna-fc','genoa-cfc','torino-fc','calcio-catania','cagliari-calcio','lazio-roma','sassuolo-calcio','acf-fiorentina','hellas-verona','ac-milan','as-roma','ssc-napoli','as-livorno','parma-fc','inter','udinese-calcio']
print len(itateams13)

itagames13 = []

for t1 in itateams13:
    for t2 in itateams13:
        if t1!=t2:
            itagames13.append(t1+"-"+t2)
            
itagames_page13 = dict([game,requests.get("http://www.worldfootball.net/report/serie-a-2013-2014-%s" % game).text] 
             for game in itagames13)

#Don't forget that step:
for k in itagames_page13.keys():
    itagames_page13[k] = BeautifulSoup(itagames_page13[k],"html.parser")

20


In [159]:
#sanity check
print len(itagames_page13)
#sanity check for empty pages
print any(a == [] for a in itagames_page13.values())

380
False


In [167]:
'atalanta-bergamo' in 'ssc-napoli-atalanta-bergamo'

True

In [168]:
itagames_scorers_assists13 = {}
for k in itagames_page13.keys():
    print k
    if 'atalanta-bergamo' in k:
        continue
    else:
        itagames_scorers_assists13[k] = summary(itagames_page13[k],False)

calcio-catania-genoa-cfc
torino-fc-sampdoria
sassuolo-calcio-udinese-calcio
chievo-verona-juventus
juventus-bologna-fc
hellas-verona-juventus
inter-torino-fc
calcio-catania-bologna-fc
sassuolo-calcio-juventus
calcio-catania-udinese-calcio
juventus-ac-milan
as-livorno-inter
lazio-roma-as-livorno
acf-fiorentina-as-livorno
atalanta-bergamo-udinese-calcio
acf-fiorentina-lazio-roma
ssc-napoli-atalanta-bergamo
as-livorno-hellas-verona
chievo-verona-genoa-cfc
sampdoria-genoa-cfc
torino-fc-inter
calcio-catania-cagliari-calcio
chievo-verona-udinese-calcio
cagliari-calcio-acf-fiorentina
udinese-calcio-chievo-verona
cagliari-calcio-inter
genoa-cfc-ac-milan
sassuolo-calcio-genoa-cfc
atalanta-bergamo-ac-milan
lazio-roma-genoa-cfc
sassuolo-calcio-bologna-fc
ssc-napoli-genoa-cfc
hellas-verona-atalanta-bergamo
cagliari-calcio-torino-fc
bologna-fc-sampdoria
udinese-calcio-parma-fc
hellas-verona-parma-fc
sassuolo-calcio-calcio-catania
torino-fc-sassuolo-calcio
atalanta-bergamo-cagliari-calcio
genoa-cfc-

In [169]:
atl = []
for k in itagames_page13.keys():
    if 'atalanta-bergamo' in k:
        atl.append(k)
print atl

['atalanta-bergamo-udinese-calcio', 'ssc-napoli-atalanta-bergamo', 'atalanta-bergamo-ac-milan', 'hellas-verona-atalanta-bergamo', 'atalanta-bergamo-cagliari-calcio', 'genoa-cfc-atalanta-bergamo', 'atalanta-bergamo-hellas-verona', 'atalanta-bergamo-as-livorno', 'cagliari-calcio-atalanta-bergamo', 'atalanta-bergamo-lazio-roma', 'udinese-calcio-atalanta-bergamo', 'atalanta-bergamo-torino-fc', 'atalanta-bergamo-as-roma', 'bologna-fc-atalanta-bergamo', 'sampdoria-atalanta-bergamo', 'acf-fiorentina-atalanta-bergamo', 'atalanta-bergamo-juventus', 'chievo-verona-atalanta-bergamo', 'calcio-catania-atalanta-bergamo', 'atalanta-bergamo-genoa-cfc', 'atalanta-bergamo-bologna-fc', 'sassuolo-calcio-atalanta-bergamo', 'lazio-roma-atalanta-bergamo', 'as-roma-atalanta-bergamo', 'juventus-atalanta-bergamo', 'as-livorno-atalanta-bergamo', 'atalanta-bergamo-sassuolo-calcio', 'parma-fc-atalanta-bergamo', 'atalanta-bergamo-sampdoria', 'ac-milan-atalanta-bergamo', 'atalanta-bergamo-ssc-napoli', 'atalanta-berg

In [170]:
itagames_scorers_assists13['ssc-napoli-atalanta-bergamo'] =  dict([('home',[('Gonzalo Higuain',' ',71,1,0,0),('José Callejón',' ',81,2,0,0)]),('day',4),('away',[])])
itagames_scorers_assists13['atalanta-bergamo-udinese-calcio'] = dict([('home',[('Germán Denis',' ',45,1,0,0),('Germán Denis',' ',63,2,0,0)]),('day',4),('away',[])])
itagames_scorers_assists13['atalanta-bergamo-ac-milan'] = dict([('home',[('Germán Denis',' ',68,1,0,0),('Franco Brienza',' ',96,1,0,0)]),('day',37),('away',[])])
itagames_scorers_assists13['hellas-verona-atalanta-bergamo'] = dict([('home',[('Juanito',' ',82,0,0,0),('Jorginho', ' ',87,1,0,0)]),('day',15),('away',[('Germán Denis',' ',42,1,0,0)])])
itagames_scorers_assists13['atalanta-bergamo-cagliari-calcio'] = dict([('home',[('Giacomo Bonaventura',' ',68,1,0,0)]),('day',20),('away',[])])
itagames_scorers_assists13['genoa-cfc-atalanta-bergamo'] = dict([('home',[('Andrea Bertolacci',' ',72,1,0,0)]),('day',16),('away',[('Giuseppe De Luca',' ',90,0,0,0)])])
itagames_scorers_assists13['atalanta-bergamo-hellas-verona']=dict([('home',[('Germán Denis',' ',87,-1,0,0)]),('day',34),('away',[('Massimo Donati', ' ',53,1,0,0),('Luca Toni',' ',74,2,0,0)])])
itagames_scorers_assists13['atalanta-bergamo-as-livorno']=dict([('home',[('Giuseppe De Luca',' ',22,1,0,0),('Germán Denis',' ',59,2,0,0)]),('day',[30]),('away',[])])
itagames_scorers_assists13['cagliari-calcio-atalanta-bergamo']=dict([('home',[('Guglielmo Stendardo', ' ',27,1,0,0)]),('day',1),('away',[('Radja Nainggolan',' ',28,0,0,0),('Matías Cabrera', ' ',63,1,0,0)])])
itagames_scorers_assists13['atalanta-bergamo-lazio-roma']=dict([('home',[('Luca Cigarini',' ',42,1,0,0),('Germán Denis',' ',84,1,0,0)]),('day',8),('away',[('Brayan Perea',' ',53,0,0,0)])])
itagames_scorers_assists13['udinese-calcio-atalanta-bergamo']=dict([('home',[('Antonio Di Natale',' ',74,0,0,0)]),('day',25),('away',[('Davide Brivio',' ',27,1,0,0)])])
itagames_scorers_assists13['atalanta-bergamo-torino-fc']=dict([('home',[('Guglielmo Stendardo', ' ',57,1,0,0),('Stefano Lucchini',' ',81,2,0,0)]),('day',2),('away',[])])
itagames_scorers_assists13['atalanta-bergamo-as-roma']=dict([('home',[('Davide Brivio',' ',51,1,0,0)]),('day',14),('away',[('Kevin Strootman',' ',90,0,0,0)])])
itagames_scorers_assists13['bologna-fc-atalanta-bergamo']=dict([('home',[]),('day',31),('away',[('Giuseppe De Luca',' ',22,1,0,0),('Marcelo Estigarribia',' ',27,2,0,0)])])
itagames_scorers_assists13['sampdoria-atalanta-bergamo']=dict([('home',[('Shkodran Mostafi',' ',57,1,0,0)]),('day',9),('away',[])])
itagames_scorers_assists13['acf-fiorentina-atalanta-bergamo']=dict([('home',[('Josip Iličić',' ',16,1,0,0),('Rafał Wolski',' ',80,2,0,0)]),('day',22),('away',[])])
itagames_scorers_assists13['atalanta-bergamo-juventus']=dict([('home',[('Maximiliano Moralez',' ',15,0,0,0)]),('day',17),('away',[('Carlos Tevez',' ',6,1,0,0),('Paul Pogba',' ',46,1,0,0),('Fernando Llorente',' ',75,2,0,0),('Arturo Vidal',' ',79,3,0,0)])])
itagames_scorers_assists13['chievo-verona-atalanta-bergam']=dict([('home',[]),('day',7),('away',[('Maximiliano Moralez',' ',16,1,0,0)])])
itagames_scorers_assists13['calcio-catania-atalanta-bergamo']=dict([('home',[('Francesco Lodi',' ',65,1,0,0),('Gonzalo Rubén Bergessio',' ',90,1,0,0)]),('day',38),('away',[('Moussa Kone',' ',80,0,0,0)])])
itagames_scorers_assists13['atalanta-bergamo-genoa-cfc']=dict([('home',[('Giuseppe De Luca',' ',82,0,0,0)]),('day',35),('away',[('Paolo De Ceglie',' ',27,1,0,0)])])
itagames_scorers_assists13['atalanta-bergamo-bologna-fc']=dict([('home',[('Davide Brivio',' ',74,1,0,0),('Marko Livaja',' ',90,1,0,0)]),('day',12),('away',[('Rolando Bianchi',' ',77,0,0,0)])])
itagames_scorers_assists13['sassuolo-calcio-atalanta-bergamo']=dict([('home',[('Simone Zaza',' ',63,1,0,0),('Domenico Berardi',' ',67,2,0,0)]),('day',13),('away',[])])
itagames_scorers_assists13['lazio-roma-atalanta-bergamo']=dict([('home',[]),('day',27),('away',[('Maximiliano Moralez',' ',60,1,0,0)])])
itagames_scorers_assists13['as-roma-atalanta-bergamo']=dict([('home',[('Rodrigo Taddei',' ',13,1,0,0),('Adem Ljajić',' ',44,2,0,0),('Gervinho',' ',63,3,0,0)]),('day',33),('away',[('Giulio Migliaccio',' ',78,-2,0,0)])])
itagames_scorers_assists13['juventus-atalanta-bergamo']=dict([('home',[('Simone Padoin',' ',72,1,0,0)]),('day',36),('away',[])])
itagames_scorers_assists13['as-livorno-atalanta-bergamo']=dict([('home',[('Paulinho', ' ',11,1,0,0)]),('day',11),('away',[])])
itagames_scorers_assists13['atalanta-bergamo-sassuolo-calcio']=dict([('home',[]),('day',32),('away',[('Nicola Sansone', ' ',33,1,0,0),('Nicola Sansone', ' ',70,2,0,0)])])
itagames_scorers_assists13['parma-fc-atalanta-bergamo']=dict([('home',[('Djamel Mesbah', ' ',19,1,0,0),('Marco Parolo', ' ',28,1,0,0),('Marco Parolo', ' ',40,3,0,0),('Aleandro Rosi', ' ',38,2,0,0)]),('day',5),('away',[('Giacomo Bonaventura', ' ',20,0,0,0),('Germán Denis', ' ',44,-2,0,0),('Marko Livaja', ' ',79,1,0,0)])])
itagames_scorers_assists13['atalanta-bergamo-sampdoria']=dict([('home',[('Carlos Carmona', ' ',36,1,0,0),('Giacomo Bonaventura', ' ',42,2,0,0),('Germán Denis', ' ',55,3,0,0)]),('day',28),('away',[])])
itagames_scorers_assists13['ac-milan-atalanta-bergamo']=dict([('home',[('Kaká', ' ',35,1,0,0),('Kaka', ' ',65,2,0,0),('Bryan Cristante', ' ',67,3,0,0)]),('day',18),('away',[])])
itagames_scorers_assists13['atalanta-bergamo-ssc-napoli']=dict([('home',[('Germán Denis', ' ',47,1,0,0),('Germán Denis', ' ',64,2,0,0),('Maximiliano Moralez', ' ',70,3,0,0)]),('day',22),('away',[])])
itagames_scorers_assists13['atalanta-bergamo-chievo-verona']=dict([('home',[('Carlos Carmona', ' ',21,1,0,0),('Luca Cigarini', ' ',80,1,0,0)]),('day',26),('away',[('Dario Dainelli',' ',70,0,0,0)])])
itagames_scorers_assists13['torino-fc-atalanta-bergamo']=dict([('home',[('Alessio Cerci', ' ',60,1,0,0)]),('day',21),('away',[])])
itagames_scorers_assists13['atalanta-bergamo-calcio-catania']=dict([('home',[('Germán Denis', ' ',67,1,0,0),('Maximiliano Moralez', ' ',87,2,0,0)]),('day',19),('away',[('Sebastián Leto',' ',89,-1,0,0)])])
itagames_scorers_assists13['inter-atalanta-bergamo']=dict([('home',[('Mauro Icardi', ' ',36,1,0,0)]),('day',29),('away',[('Giacomo Bonaventura', ' ',35,1,0,0),('Giacomo Bonaventura', ' ',90,1,0,0)])])
itagames_scorers_assists13['atalanta-bergamo-acf-fiorentina']=dict([('home',[]),('day',4),('away',[('Matías Fernández', ' ',41,1,0,0),('Giuseppe Rossi', ' ',69,2,0,0)])])
itagames_scorers_assists13['atalanta-bergamo-inter']=dict([('home',[('Germán Denis', ' ',25,0,0,0)]),('day',10),('away',[('Ricky Álvarez',' ',16,1,0,0)])])
itagames_scorers_assists13['atalanta-bergamo-parma-fc']=dict([('home',[]),('day',24),('away',[('Cristian Molinaro',' ',9,1,0,0),('Antonio Cassano',' ',70,3,0,0),('Ezequiel Schelotto',' ',90,4,0,0)])])

In [171]:
itagames_scorers_assists13['ssc-napoli-hellas-verona']

{'away': [(u'Juan Iturbe', ' ', 66, -3, 0, 0)],
 'day': 38,
 'home': [(u'Jos\xc3\xa9 Callej\xc3\xb3n', u'Dries Mertens', 5, 1, 0, 0),
  (u'Duv\xc3\xa1n Zapata', u'Dries Mertens', 13, 2, 0, 0),
  (u'Duv\xc3\xa1n Zapata', u'Jos\xc3\xa9 Callej\xc3\xb3n', 25, 3, 0, 0),
  (u'Dries Mertens', ' ', 62, 4, 0, 0),
  (u'Dries Mertens', u'Lorenzo Insigne', 77, 4, 0, 0)]}

In [172]:
import json
with open('Italy13-14.json', 'w') as fp:
    json.dump(itagames_scorers_assists13, fp)

In [None]:
#In case you need to reload it:
#with open('Italy/Italy13-14.json', 'r') as fp:
#    data = json.load(fp)