# Initial Data Cleaning

This notebook is for the initial stages of data cleaning. It will be updated and cleaned up as we solidify our methods and complete our scraping.

In [1]:
% matplotlib inline
import json
import pandas as pd
import numpy as np
import copy
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")

In [2]:
# Season files to clean: 2012-13 and 2013-14 for each of the five league folders
filenames = [
    'BPL/BPL12-13.json', 'BPL/BPL13-14.json',
    'French/French12-13.json', 'French/French13-14.json',
    'Germany/Bundes12-13.json', 'Germany/Bundes13-14.json',
    'Italy/Italy12-13.json', 'Italy/Italy13-14.json',
    'Spanish/Spanish12-13.json', 'Spanish/Spanish13-14.json',
]

# Start with the first file only, to explore the record layout
with open(filenames[0], 'r') as fp:
    data = json.load(fp)

### Gaining familiarity with data structure and early testing

In [3]:
# Inspect one fixture's raw record: 'home'/'away' goal-event lists plus the match 'day'
data['arsenal-fc-aston-villa']

{u'away': [[u'Andreas Weimann', u' ', 67, 0, 0, 0]],
 u'day': 27,
 u'home': [[u'Santi Cazorla', u' ', 5, 1, 0, 0],
  [u'Santi Cazorla', u'Nacho Monreal', 84, 1, 0, 0]]}

In [4]:
# The 'home' list holds one entry per goal event
data['arsenal-fc-aston-villa']['home']

[[u'Santi Cazorla', u' ', 5, 1, 0, 0],
 [u'Santi Cazorla', u'Nacho Monreal', 84, 1, 0, 0]]

In [5]:
# Tally goals per scorer among the 'home' events of one fixture
home_goal_events = data['arsenal-fc-aston-villa']['home']
home_scorer_names = [event[0] for event in home_goal_events]
homescorer = [
    (name, home_scorer_names.count(name))
    for name in set(home_scorer_names)
]
homescorer

[(u'Santi Cazorla', 2)]

In [6]:
# Collect the assist field of each 'home' event, keeping only unicode values.
# The scraper's blank placeholder (u' ') is itself unicode, so it is kept too.
home_assist_events = data['arsenal-fc-aston-villa']['home']
homeassist = [
    event[1]
    for event in home_assist_events
    if isinstance(event[1], unicode)
]
homeassist

[u' ', u'Nacho Monreal']

In [7]:
# Tally goals per scorer among the 'away' events of the same fixture
away_goal_events = data['arsenal-fc-aston-villa']['away']
away_scorer_names = [event[0] for event in away_goal_events]
awayscorer = [
    (name, away_scorer_names.count(name))
    for name in set(away_scorer_names)
]
awayscorer

[(u'Andreas Weimann', 1)]

In [8]:
# Another fixture, to compare the record layout
data['aston-villa-tottenham-hotspur']

{u'away': [[u'Jermain Defoe', u'Kyle Naughton', 58, 1, 0, 0],
  [u'Gareth Bale', u' ', 61, 2, 0, 0],
  [u'Gareth Bale', u'Aaron Lennon', 73, 3, 0, 0],
  [u'Gareth Bale', u'Gylfi Sigur\xc3\xb0sson', 84, 4, 0, 1]],
 u'day': 19,
 u'home': []}

In [9]:
# Club slugs as they appear in the fixture keys of the first season file
teams = [
    'arsenal-fc', 'aston-villa', 'chelsea-fc', 'everton-fc', 'fulham-fc',
    'liverpool-fc', 'manchester-city', 'manchester-united', 'newcastle-united',
    'norwich-city', 'queens-park-rangers', 'reading-fc', 'southampton-fc',
    'stoke-city', 'sunderland-afc', 'swansea-city', 'tottenham-hotspur',
    'west-bromwich-albion', 'west-ham-united', 'wigan-athletic',
]

# Every ordered pairing of two distinct clubs, joined as "<first>-<second>"
games = [
    "-".join((first, second))
    for first in teams
    for second in teams
    if first != second
]
games[:3]

['arsenal-fc-aston-villa', 'arsenal-fc-chelsea-fc', 'arsenal-fc-everton-fc']

In [10]:
# Check that a generated fixture key indexes straight into the scraped dictionary
data[games[0]]

{u'away': [[u'Andreas Weimann', u' ', 67, 0, 0, 0]],
 u'day': 27,
 u'home': [[u'Santi Cazorla', u' ', 5, 1, 0, 0],
  [u'Santi Cazorla', u'Nacho Monreal', 84, 1, 0, 0]]}

### The following function reads the data scraped from http://www.worldfootball.net/ by scraping.ipynb, which is stored as .json files in our repository. What needs to change is the number of columns per player: to keep things uniform, each player currently has an entry for every match of the season. I want to change this so that entries are ordered chronologically by match.

In [11]:
def cleaner_to_df(data, matches, num_days=38):
    """
    Function
    --------
    cleaner_to_df

    Build a per-player, per-matchday table of goal contributions from one
    season of scraped match data.

    Inputs
    ------
    data : dictionary of dictionaries
       dictionary provided from scraping.ipynb in the format of
       {match: {'day': int, 'home': [events], 'away': [events]}}.
       Each event is either
           [scorer, assister, minute, flag, scorer_sub, assister_sub]
       or, when no assist field was recorded (second item is not a string),
           [scorer, minute, flag, scorer_sub]
       `flag` == 1 marks a tiebreaker and `flag` == 0 an equalizer.
       A non-positive minute marks an own goal (the event is listed under
       the opposing side, so the scorer's `was_home` is inverted).
    matches : list of match keys to read from `data`
    num_days : int, optional
       number of matchdays to create rows for (1..num_days); defaults to 38

    Returns
    -------
    A dataframe with one column per player and one row per matchday.
    Each cell contains a dictionary of the player's contributions that day:
         {goals: [(minute, was_tiebreaker, was_equalizer), ...],
          assists: [(minute, was_tiebreaker, was_equalizer), ...],
          was_sub: whether player was a substitute,
          was_home: whether player's club was home team}
    `was_sub` / `was_home` reflect the last event processed for that player
    on that day and stay at 0 on days without any event.

    Raises
    ------
    KeyError if a match is missing from `data` or its 'day' lies outside
    1..num_days.

    Notes
    -----
    The scraper's blank assister placeholder (e.g. u' ') is a string, so it is
    treated like any other name and receives its own column; an empty-string
    column is likewise created whenever an event has no assist field. Both are
    kept so frames stay compatible with previously pickled output.
    NOTE(review): consider dropping these pseudo-players downstream.

    Written by TWK ... 11 November '15
    """
    # json.load yields `unicode` names on Python 2 and `str` on Python 3
    try:
        text_type = unicode
    except NameError:
        text_type = str

    def new_season():
        # Fresh, unshared containers for every matchday of one player
        return {day: {'goals': [], 'assists': [], 'was_sub': 0, 'was_home': 0}
                for day in range(1, num_days + 1)}

    players = {}
    for match in matches:
        match_day = data[match]['day']
        for team in ('home', 'away'):
            is_home = team == 'home'
            for goal in data[match][team]:
                (scorer, assister, goaltuple,
                 scorer_sub, assister_sub) = _split_goal_event(goal, text_type)

                # Register everyone involved; '' stands in for a missing assist field
                for name in (scorer, '' if assister is None else assister):
                    if name not in players:
                        players[name] = new_season()

                # Fill in relevant information for assist man
                if assister is not None:
                    entry = players[assister][match_day]
                    entry['assists'].append(goaltuple)
                    entry['was_sub'] = assister_sub == 1
                    entry['was_home'] = is_home

                # Fill in relevant information for goal scorer (after the
                # assist, so the scorer's flags win if both names coincide)
                entry = players[scorer][match_day]
                entry['goals'].append(goaltuple)
                entry['was_sub'] = scorer_sub == 1
                # Own goals carry a non-positive minute and are listed under
                # the opposing side, so flip the home flag for them
                entry['was_home'] = is_home if goaltuple[0] > 0 else not is_home

    # Return player dictionary as a dataframe
    return pd.DataFrame(players)


def _split_goal_event(goal, text_type):
    """Normalise one scraped event to (scorer, assister, goaltuple, scorer_sub, assister_sub).

    `assister` is None when the event has no assist field; `goaltuple` is
    (minute, was_tiebreaker, was_equalizer).
    """
    if isinstance(goal[1], text_type):
        # Full record: [scorer, assister, minute, flag, scorer_sub, assister_sub]
        return goal[0], goal[1], (goal[2], goal[3] == 1, goal[3] == 0), goal[4], goal[5]
    # Short record: [scorer, minute, flag, scorer_sub]
    return goal[0], None, (goal[1], goal[2] == 1, goal[2] == 0), goal[3], 0

### For each .json file stored in our repository as of 10 PM on 11 Nov. '15, we load the data set, build the corresponding dataframe, and save it under the same file name with hyphens replaced by underscores and a .pkl extension (the dataframes are saved with pandas' `DataFrame.to_pickle`).

In [12]:
# Club slugs for each season file, in the same order as `filenames`
season_teams = {
    filenames[0]: ['arsenal-fc','aston-villa','chelsea-fc','everton-fc','fulham-fc','liverpool-fc','manchester-city','manchester-united','newcastle-united','norwich-city','queens-park-rangers','reading-fc','southampton-fc','stoke-city','sunderland-afc','swansea-city','tottenham-hotspur','west-bromwich-albion','west-ham-united','wigan-athletic'],
    filenames[1]: ['arsenal-fc','aston-villa','cardiff-city','chelsea-fc','crystal-palace','everton-fc','fulham-fc','hull-city','liverpool-fc','manchester-city','manchester-united','newcastle-united','norwich-city','southampton-fc','stoke-city','sunderland-afc','swansea-city','tottenham-hotspur','west-bromwich-albion','west-ham-united'],
    filenames[2]: ['ac-ajaccio','as-nancy','as-saint-etienne','estac-troyes','evian-thonon-gaillard','fc-lorient','fc-sochaux','girondins-bordeaux','lille-osc','montpellier-hsc','ogc-nice','olympique-lyon','olympique-marseille','paris-saint-germain','sc-bastia','stade-brest','stade-reims','stade-rennes','toulouse-fc','valenciennes-fc'],
    filenames[3]: ['ac-ajaccio','as-monaco','as-saint-etienne','ea-guingamp','evian-thonon-gaillard','fc-lorient','fc-nantes','fc-sochaux','girondins-bordeaux','lille-osc','montpellier-hsc','ogc-nice','olympique-lyon','olympique-marseille','paris-saint-germain','sc-bastia','stade-reims','stade-rennes','toulouse-fc','valenciennes-fc'],
    filenames[4]: ['bor-moenchengladbach','bayern-muenchen','sc-freiburg','fc-schalke-04','hamburger-sv','bayer-leverkusen','eintracht-frankfurt','vfl-wolfsburg','1-fc-nuernberg','werder-bremen','borussia-dortmund','1899-hoffenheim','vfb-stuttgart','1-fsv-mainz-05','fc-augsburg','spvgg-greuther-fuerth','hannover-96','fortuna-duesseldorf'],
    filenames[5]: ['bor-moenchengladbach','bayern-muenchen','sc-freiburg','fc-schalke-04','hamburger-sv','bayer-leverkusen','eintracht-frankfurt','vfl-wolfsburg','1-fc-nuernberg','werder-bremen','borussia-dortmund','1899-hoffenheim','vfb-stuttgart','1-fsv-mainz-05','fc-augsburg','eintracht-braunschweig','hannover-96','hertha-bsc'],
    filenames[6]: ['sampdoria','juventus','atalanta-bergamo','chievo-verona','bologna-fc','genoa-cfc','torino-fc','calcio-catania','cagliari-calcio','lazio-roma','delfino-pescara-1936','acf-fiorentina','ac-siena','ac-milan','as-roma','ssc-napoli','us-palermo','parma-fc','inter','udinese-calcio'],
    filenames[7]: ['sampdoria','juventus','atalanta-bergamo','chievo-verona','bologna-fc','genoa-cfc','torino-fc','calcio-catania','cagliari-calcio','lazio-roma','sassuolo-calcio','acf-fiorentina','hellas-verona','ac-milan','as-roma','ssc-napoli','as-livorno','parma-fc','inter','udinese-calcio'],
    filenames[8]: ['real-madrid','ca-osasuna','fc-barcelona','malaga-cf','granada-cf','getafe-cf','sevilla-fc','valencia-cf','deportivo-la-coruna','real-sociedad','celta-vigo','espanyol-barcelona','rcd-mallorca','real-valladolid','real-zaragoza','atletico-madrid','levante-ud','real-betis','rayo-vallecano','athletic-bilbao'],
    filenames[9]: ['real-madrid','ca-osasuna','fc-barcelona','malaga-cf','granada-cf','getafe-cf','sevilla-fc','valencia-cf','elche-cf','real-sociedad','celta-vigo','espanyol-barcelona','villarreal-cf','real-valladolid','ud-almeria','atletico-madrid','levante-ud','real-betis','rayo-vallecano','athletic-bilbao'],
}

for filedir in filenames:
    # Read the scraped season
    with open(filedir, 'r') as fp:
        data = json.load(fp)

    # Every ordered pairing of two distinct clubs is a fixture key in `data`
    teams = season_teams[filedir]
    matches = [t1+"-"+t2 for t1 in teams for t2 in teams if t1!=t2]
    playerdf = cleaner_to_df(data, matches)

    # Same path as the input, hyphens -> underscores, '.json' -> '.pkl'
    filename = filedir[:-5].replace("-","_") + '.pkl'
    playerdf.to_pickle(filename)

### Example of how to load one of our dataframes

In [13]:
# Reload the frame pickled from 'BPL/BPL12-13.json'.
# Only unpickle files produced by this notebook: loading a pickle can execute arbitrary code.
playerdf = pd.read_pickle('BPL/BPL12_13.pkl')

In [14]:
# One player's contributions on matchday 8 (columns are players, rows are matchdays)
playerdf["Robin van Persie"][8]

{'assists': [(27, False, True)],
 'goals': [(44, True, False)],
 'was_home': True,
 'was_sub': False}

In [15]:
# Same matchday for a second player
playerdf['Wayne Rooney'][8]

{'assists': [(44, True, False), (46, False, False)],
 'goals': [(27, False, True), (65, False, False), (-11, True, False)],
 'was_home': True,
 'was_sub': False}

## FIXED
#### What you see above is that Rooney had an own goal that gets counted for him, and since he gets counted during the away team's search, his 'was_home' field gets switched. We need to have a flag (like a negative minute or something) to mark if the goal was an own goal. I want to keep own goals so that they are counted against a player's value...

In [16]:
# Load a second league/season frame for comparison (same pickle caveat: trusted files only)
playerdf2 = pd.read_pickle('Italy/Italy13_14.pkl')

In [17]:
# Spot-check one player on matchday 17 in the second frame
playerdf2["Paul Pogba"][17]

{'assists': [],
 'goals': [(46, True, False)],
 'was_home': False,
 'was_sub': False}