# Initial Data Cleaning

This notebook is for the initial stages of data cleaning. It will be updated and cleaned up as we solidify our methods and have completed our scraping.

In [1]:
% matplotlib inline
import json
import pandas as pd
import numpy as np
import copy

In [2]:
# Load in data
filenames = ['BPL/BPL12-13.json', 'BPL/BPL13-14.json', 'French/French12-13.json', 'French/French13-14.json']
# The exploratory cells below index into `data`, so it has to be bound here;
# with the load commented out they raise NameError on a fresh kernel
# (Restart & Run All). The first entry is the file the original commented-out
# load pointed at.
with open(filenames[0], 'r') as fp:
    data = json.load(fp)

### Gaining familiarity with the data structure and early testing

In [16]:
# Peek at the raw record for one fixture; the output shows a dict with a
# 'home' and an 'away' list, each holding one list per goal.
data['arsenal-fc-aston-villa']

{u'away': [[u'Andreas Weimann', 67, 0, 0]],
 u'home': [[u'Santi Cazorla', 5, 1, 0],
  [u'Santi Cazorla', u'Nacho Monreal', 84, 1, 0, 0]]}

In [15]:
# Only the 'home' goal list of the same fixture. Note the two record shapes:
# 4 fields without an assister name, 6 fields when slot 1 holds one.
data['arsenal-fc-aston-villa']['home']

[[u'Santi Cazorla', 5, 1, 0],
 [u'Santi Cazorla', u'Nacho Monreal', 84, 1, 0, 0]]

In [21]:
# Tally goals per scorer for the 'home' list of one fixture.
# Slot 0 of every goal record is the scorer's name.
home_scorer_names = [event[0] for event in data['arsenal-fc-aston-villa']['home']]
homescorer = [
    (name, home_scorer_names.count(name))
    for name in set(home_scorer_names)
]
homescorer

[(u'Santi Cazorla', 2)]

In [28]:
# Collect assister names for the 'home' list of one fixture.
# Slot 1 is an assister only when it is text; otherwise it is a number.
homeassist = []
for event in data['arsenal-fc-aston-villa']['home']:
    if isinstance(event[1], unicode):
        homeassist.append(event[1])
homeassist

[u'Nacho Monreal']

In [24]:
# Tally goals per scorer for the 'away' list of one fixture.
# Slot 0 of every goal record is the scorer's name.
away_scorer_names = [event[0] for event in data['arsenal-fc-aston-villa']['away']]
awayscorer = [
    (name, away_scorer_names.count(name))
    for name in set(away_scorer_names)
]
awayscorer

[(u'Andreas Weimann', 1)]

In [30]:
# A fixture whose 'home' list is empty (see output), i.e. a side with no
# recorded goals still has its key present.
data['aston-villa-tottenham-hotspur']

{u'away': [[u'Jermain Defoe', u'Kyle Naughton', 58, 1, 0, 0],
  [u'Gareth Bale', 61, 2, 0],
  [u'Gareth Bale', u'Aaron Lennon', 73, 3, 0, 0],
  [u'Gareth Bale', u'Gylfi Sigur\xc3\xb0sson', 84, 4, 0, 1]],
 u'home': []}

In [34]:
# Club slugs used to build the fixture keys.
teams = [
    'arsenal-fc',
    'aston-villa',
    'chelsea-fc',
    'everton-fc',
    'fulham-fc',
    'liverpool-fc',
    'manchester-city',
    'manchester-united',
    'newcastle-united',
    'norwich-city',
    'queens-park-rangers',
    'reading-fc',
    'southampton-fc',
    'stoke-city',
    'sunderland-afc',
    'swansea-city',
    'tottenham-hotspur',
    'west-bromwich-albion',
    'west-ham-united',
    'wigan-athletic',
]

# Every ordered pair of distinct clubs, joined with '-', in list order.
games = []
for first in teams:
    for second in teams:
        if first != second:
            games.append(first + "-" + second)
games[:3]

['arsenal-fc-aston-villa', 'arsenal-fc-chelsea-fc', 'arsenal-fc-everton-fc']

In [35]:
# Check that a generated fixture key is usable as a key of the scraped data.
data[games[0]]

{u'away': [[u'Andreas Weimann', 67, 0, 0]],
 u'home': [[u'Santi Cazorla', 5, 1, 0],
  [u'Santi Cazorla', u'Nacho Monreal', 84, 1, 0, 0]]}

### The following function reads the data scraped from http://www.worldfootball.net/ (via the scraping.ipynb code), which is stored as .json files in our repository. What still needs to change is the number of match entries per player: to keep things uniform, each player currently has an entry for every match of the season. I want to change this so that the entries are ordered chronologically by matchday.

In [3]:
def cleaner_to_df(data,games):
    """
    Function
    --------
    cleaner_to_df

    Turn the scraped per-match goal lists into a per-player, per-match table.

    Inputs
    ------
    data : dictionary of dictionaries
       dictionary provided from scraping.ipynb in the format of
       {match: {'home': [...], 'away': [...]}}. Each goal is a list in one of
       two layouts, told apart by whether slot 1 is text:
           [scorer, minute, state, scorer_sub]
           [scorer, assister, minute, state, scorer_sub, assister_sub]
       `state` is compared against 1 (was_tiebreaker) and 0 (was_equalizer);
       presumably it is the scoring side's margin after the goal -- TODO
       confirm against scraping.ipynb.
    games : list of match pairings
       keys of `data` to process; every one must be present in `data`.

    Returns
    -------
     A dataframe with one column per player and one row per match
     (will be adjusted to matchday), i.e. df[player][match]:
         Each cell contains a dictionary for the player's contributions during that particular match
         {goals: [(minute, was_tiebreaker, was_equalizer), ...],
         assists: [(minute, was_tiebreaker, was_equalizer), ...],
         was_sub: boolean, whether player was a substitute or not
         was_home: boolean, whether player's club was home team}
     Cells for matches in which the player had no goal or assist keep the
     defaults: empty lists and 0 for both flags.

    Notes
    -----
    Current version of code creates an entry for every match for each player (very wrong I know),
    this will be updated to contain matchday information so that the columns are consistent across match day

    Unassisted goals add a placeholder player named '' whose cells are all
    defaults; it is kept so the set of columns is unchanged for existing users.

    Written by TWK ... 11 November '15
    """
    # Names are `unicode` on Python 2 (json.load) and `str` on Python 3.
    try:
        text_type = unicode
    except NameError:
        text_type = str

    # Pass 1: collect every scorer and assister name ('' when unassisted).
    players_list = set()
    for game in games:
        for team in ('home', 'away'):
            for goal in data[game][team]:
                players_list.add(goal[0])
                players_list.add(goal[1] if isinstance(goal[1], text_type) else '')

    # Every player gets an independent default record for every match, so the
    # lists must not be shared between players or matches.
    players = {}
    for player in players_list:
        players[player] = dict(
            (game, {'goals': [], 'assists': [], 'was_sub': 0, 'was_home': 0})
            for game in games
        )

    # Pass 2: file each goal under its scorer and, when present, its assister.
    for game in games:
        for team in ('home', 'away'):
            is_home = team == 'home'
            for goal in data[game][team]:
                scorerName = goal[0]
                if isinstance(goal[1], text_type):
                    assistName = goal[1]
                    goaltuple = (goal[2], goal[3] == 1, goal[3] == 0)
                    scorerSub = goal[4]
                    assistSub = goal[5]
                    assist_entry = players[assistName][game]
                    assist_entry["assists"].append(goaltuple)
                    # Each player carries their own substitute flag. The old
                    # `scorerSub == 1 | assistSub == 1` parsed as the chained
                    # comparison `scorerSub == (1 | assistSub) == 1`, which
                    # stamped the scorer's flag onto the assister.
                    assist_entry["was_sub"] = assistSub == 1
                    assist_entry["was_home"] = is_home
                else:
                    goaltuple = (goal[1], goal[2] == 1, goal[2] == 0)
                    scorerSub = goal[3]

                scorer_entry = players[scorerName][game]
                scorer_entry["goals"].append(goaltuple)
                scorer_entry["was_sub"] = scorerSub == 1
                scorer_entry["was_home"] = is_home

    return pd.DataFrame(players)

### For each .json file stored in our repository as of 10 PM on 11 Nov. '15, we load the data set, populate the corresponding dataframe, and save it under the same file name (with `-` replaced by `_`) and a .pkl extension, using pandas' `to_pickle`.

In [13]:
# Club slugs for each season, listed in the same order as the entries of `filenames`.
season_rosters = [
    ['arsenal-fc','aston-villa','chelsea-fc','everton-fc','fulham-fc','liverpool-fc','manchester-city','manchester-united','newcastle-united','norwich-city','queens-park-rangers','reading-fc','southampton-fc','stoke-city','sunderland-afc','swansea-city','tottenham-hotspur','west-bromwich-albion','west-ham-united','wigan-athletic'],
    ['arsenal-fc','aston-villa','cardiff-city','chelsea-fc','crystal-palace','everton-fc','fulham-fc','hull-city','liverpool-fc','manchester-city','manchester-united','newcastle-united','norwich-city','southampton-fc','stoke-city','sunderland-afc','swansea-city','tottenham-hotspur','west-bromwich-albion','west-ham-united'],
    ['ac-ajaccio','as-nancy','as-saint-etienne','estac-troyes','evian-thonon-gaillard','fc-lorient','fc-sochaux','girondins-bordeaux','lille-osc','montpellier-hsc','ogc-nice','olympique-lyon','olympique-marseille','paris-saint-germain','sc-bastia','stade-brest','stade-reims','stade-rennes','toulouse-fc','valenciennes-fc'],
    ['ac-ajaccio','as-monaco','as-saint-etienne','ea-guingamp','evian-thonon-gaillard','fc-lorient','fc-nantes','fc-sochaux','girondins-bordeaux','lille-osc','montpellier-hsc','ogc-nice','olympique-lyon','olympique-marseille','paris-saint-germain','sc-bastia','stade-reims','stade-rennes','toulouse-fc','valenciennes-fc'],
]

for filedir in filenames:
    # Read the scraped season.
    with open(filedir, 'r') as fp:
        data = json.load(fp)

    # Pick the roster by the file's first position in `filenames`; a file
    # beyond the known rosters leaves `teams` as it was.
    season_index = filenames.index(filedir)
    if season_index < len(season_rosters):
        teams = list(season_rosters[season_index])

    # Every ordered pair of distinct clubs, joined with '-'.
    games = []
    for first in teams:
        for second in teams:
            if first != second:
                games.append(first + "-" + second)

    playerdf = cleaner_to_df(data, games)

    # 'BPL/BPL12-13.json' -> 'BPL12_13.pkl': drop the directory and the
    # 5-character '.json' suffix, then swap '-' for '_'.
    base_name = filedir.split("/")[1][:-5]
    filename = base_name.replace("-","_") + '.pkl'

    playerdf.to_pickle(filename)

### Example of how to load one of our dataframes

In [17]:
# Read one season's frame back from the pickle written by the export loop
# (the name is the json file name with '-' -> '_' and a .pkl extension).
playerdf = pd.read_pickle('BPL12_13.pkl')

In [19]:
# Look up one cell: select the player column first, then the match row.
playerdf["Robin van Persie"]["manchester-united-sunderland-afc"]

{'assists': [(59, False, False)],
 'goals': [(16, True, False)],
 'was_home': True,
 'was_sub': False}