# Linking and Cleaning Data

## Imports

In [271]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from datetime import datetime 
from dateutil import parser
import os
import time
import itertools
import re
import pickle
import locale

## Process Stats Data

### Load Data

In [469]:
def _load_pickle(path):
    """Unpickle and return the object stored at ``path``.

    The file is opened in a context manager so the handle is closed as soon as
    loading finishes (the bare ``pickle.load(open(...))`` form leaves it open).
    NOTE: unpickling can execute arbitrary code - only load trusted files.
    """
    with open(path, 'rb') as pickle_file:
        return pickle.load(pickle_file)


# One pickled table per stat category.
goals_df = _load_pickle('./data/goals_df.pkl')
assists_df = _load_pickle('./data/assists_df.pkl')
shots_df = _load_pickle('./data/shots_df.pkl')
fouls_df = _load_pickle('./data/fouls_df.pkl')
gk_df = _load_pickle('./data/goalkeeping_df.pkl')

### Merge Field Player Stats

In [446]:
def merge_stats(df1, df2):
    """Outer-merge two stat tables on every column name they have in common.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Tables to combine. All column names present in both are used as join keys.

    Returns
    -------
    pandas.DataFrame
        Outer join of ``df1`` and ``df2``; rows found in only one input are kept,
        with the other input's columns left missing.

    Raises
    ------
    ValueError
        If the two frames share no column names, so there is nothing to join on.
    """
    # Walk df1's columns in order (de-duplicated) rather than using a set
    # intersection: a set has no stable order, so the join-key order could
    # change from run to run.
    cols = [col for col in dict.fromkeys(df1.columns) if col in df2.columns]
    if not cols:
        raise ValueError('merge_stats: the two frames share no columns to merge on')
    # Every shared column is a key, so the suffixes only act as a safety net.
    merged_df = pd.merge(df1, df2, how='outer', on=cols, suffixes=('', '_Dup'))
    return merged_df

In [457]:
# Fold the per-category tables into one field-player table, starting from goals.
fp_df = goals_df
for extra_stats in (assists_df, shots_df, fouls_df):
    fp_df = merge_stats(fp_df, extra_stats)

### Clean Field Player Stats

In [461]:
# Missing values become the string '0' so later numeric casts succeed
fp_df = fp_df.fillna('0')

In [462]:
# Split the combined 'PKG/A' column into two columns ('PKG', 'PKA') and drop it.
# n=1 plus reindex guarantees exactly two result columns, even when no value
# contains a '/' (the two-column assignment would otherwise fail).
pk_parts = fp_df['PKG/A'].str.split('/', n=1, expand=True).reindex(columns=[0, 1])
fp_df['PKG'] = pk_parts[0]
fp_df['PKA'] = pk_parts[1]
fp_df = fp_df.drop('PKG/A', axis=1)
# A value without '/' (such as the '0' null filler) leaves the second piece
# missing; treat it as zero so the later integer cast does not fail.
fp_df[['PKG', 'PKA']] = fp_df[['PKG', 'PKA']].fillna('0')
# Empty pieces (e.g. from a bare '/') also count as zero.
fp_df = fp_df.replace('', '0')

In [570]:
# Split each full name at its first space: the leading token is the first name,
# everything after it is the last name (empty when there is no space).
name_parts = fp_df['Player'].str.partition(' ')
first_name = name_parts[0].tolist()
last_name = name_parts[2].tolist()
fp_df['Last Name'] = last_name
fp_df['First Name'] = first_name

#### Set Column Dtypes

In [464]:
# Count-like field-player columns that should hold integers.
int_cols = [
    'GP', 'GS', 'MINS', 'G', 'A', 'SHTS', 'SOG',
    'GWG', 'HmG', 'RdG', 'Year', 'GWA', 'HmA', 'RdA',
    'FC', 'FS', 'OFF', 'YC', 'RC', 'PKG', 'PKA',
]
# Cast all of them in a single astype call.
fp_df = fp_df.astype({col: int for col in int_cols})

In [465]:
# Rate / percentage columns are cast to float in one astype call.
float_cols = ['G/90min', 'SC%', 'A/90min', 'SOG%']
fp_df = fp_df.astype({col: float for col in float_cols})

In [466]:
# Replace '/' and '%' in column names with plain identifier-style names.
fp_renames = {
    'G/90min': 'Gp90',
    'SC%': 'SCpct',
    'A/90min': 'Ap90',
    'SOG%': 'SOGpct',
}
fp_df = fp_df.rename(columns=fp_renames)

#### Save Clean Data

In [None]:
# Write the cleaned field-player table to 'fieldplayer.csv' (relative to the working directory).
fp_df.to_csv('fieldplayer.csv')

### Clean Goalkeeper Stats

In [473]:
# Missing values become the string '0' so later numeric casts succeed
gk_df = gk_df.fillna('0')

In [474]:
# Split the combined 'PKG/A' column into two columns ('PKG', 'PKA') and drop it.
# n=1 plus reindex guarantees exactly two result columns, even when no value
# contains a '/' (the two-column assignment would otherwise fail).
gk_pk_parts = gk_df['PKG/A'].str.split('/', n=1, expand=True).reindex(columns=[0, 1])
gk_df['PKG'] = gk_pk_parts[0]
gk_df['PKA'] = gk_pk_parts[1]
gk_df = gk_df.drop('PKG/A', axis=1)
# A value without '/' (such as the '0' null filler) leaves the second piece
# missing; treat it as zero so the later integer cast does not fail.
gk_df[['PKG', 'PKA']] = gk_df[['PKG', 'PKA']].fillna('0')
# Empty pieces (e.g. from a bare '/') also count as zero.
gk_df = gk_df.replace('', '0')

#### Set Column Dtypes

In [481]:
# Count-like goalkeeper columns that should hold integers.
int_cols = [
    'GP', 'GS', 'MINS', 'SHTS', 'SV', 'GA', 'W',
    'L', 'T', 'ShO', 'Year', 'PKG', 'PKA',
]
# Cast all of them in a single astype call.
gk_df = gk_df.astype({col: int for col in int_cols})

In [482]:
# Average / percentage columns are cast to float in one astype call.
float_cols = ['GAA', 'W%', 'Sv%']
gk_df = gk_df.astype({col: float for col in float_cols})

#### Save Clean Data

In [484]:
# Write the cleaned goalkeeper table to 'goalkeeper.csv' (relative to the working directory).
gk_df.to_csv('goalkeeper.csv')

### Merge FP with GK Stats

In [215]:
# Combine the cleaned field-player and goalkeeper tables built above.
# The names used here before (fieldplayer_df / goalkeeping_df) are never
# defined in this notebook, so the cell failed on a fresh kernel.
stats_df = merge_stats(fp_df, gk_df)

In [171]:
# Write the combined field-player + goalkeeper stats to 'stats.csv'.
stats_df.to_csv('stats.csv')

## Salary Data

### Load Data

In [666]:
# Load the pickled salary table; the context manager closes the file handle
# as soon as loading finishes.
# NOTE: unpickling can execute arbitrary code - only load trusted files.
with open('./data/salary_df.pkl', 'rb') as salary_file:
    salary_df = pickle.load(salary_file)

### Clean Data

In [669]:
# Trim stray whitespace from both name parts, then build the full-name
# 'Player' key ("First Last") used for linking against the stats tables
for name_col in ('Last Name', 'First Name'):
    salary_df[name_col] = salary_df[name_col].str.strip()
salary_df['Player'] = salary_df['First Name'] + ' ' + salary_df['Last Name']

# The stats tables call the position column 'POS'; match that name for linking
salary_df = salary_df.rename(columns={'Pos': 'POS'})

# Season year taken from each record's date
salary_df['Year'] = salary_df['Date'].map(lambda stamp: stamp.year)

In [670]:
# Convert numeric str to float
def convert_money(money_str):
    """Parse a currency string such as ``'$129,996.00'`` into a float.

    Parameters
    ----------
    money_str : str
        Amount text; may contain '$', thousands separators and surrounding
        whitespace. ``None`` / NaN are accepted and treated as missing, and
        values that are already numeric are passed through ``float``.

    Returns
    -------
    float
        The parsed amount. Empty text, a lone '-' placeholder and missing
        values all map to ``0.0``.

    Raises
    ------
    ValueError
        If the remaining text is not a valid number.
    """
    # Missing values: None, or a float NaN (NaN is the only float != itself).
    if money_str is None or (isinstance(money_str, float) and money_str != money_str):
        return 0.0
    if not isinstance(money_str, str):
        return float(money_str)
    # Raw-string pattern: '\$' in a normal string literal is an invalid escape.
    cleaned = re.sub(r'[$,]', '', money_str).strip()
    # Only an empty value or a lone '-' placeholder means zero; replacing every
    # '-' with '0' would corrupt signed amounts ('-5' -> '05').
    if cleaned in ('', '-'):
        return 0.0
    return float(cleaned)

# Convert both money columns from text to float
for money_col in ('Base Salary', 'Guaranteed Compensation'):
    salary_df[money_col] = salary_df[money_col].map(convert_money)

# # Pure Pandas Implementation
# salary_df = salary_df.replace('', 0)
# salary_df = salary_df.replace('-', 0)
# salary_df['Base Salary'] = salary_df['Base Salary'].replace('[\$,]', '', regex=True).astype(float)
# salary_df['Guaranteed Compensation'] = salary_df['Guaranteed Compensation'].replace('[\$,]', '', regex=True).astype(float)

In [707]:
# Collapse multiple salary records per player/club/year into their average.
# numeric_only=True states explicitly that only the numeric money columns are
# averaged. Without it the result depends on the pandas version: pandas >= 2.0
# no longer drops non-numeric columns by default and would average and keep
# 'Date'. The saved merge output further down shows no 'Date' column.
salary_keys = ['Year', 'Club', 'Last Name', 'First Name', 'Player', 'POS']
salary_df = salary_df.groupby(salary_keys, as_index=False).mean(numeric_only=True)

### Split Salary Data into Field Players and Goalkeepers

In [710]:
# Boolean mask marking goalkeeper rows; its complement selects field players.
gk_idx = salary_df['POS'].eq('GK')
gk_salary_df = salary_df.loc[gk_idx]
fp_salary_df = salary_df.loc[~gk_idx]

### Merge Field Player Stats and Salary Data

In [720]:
# Link field-player stats to salaries on season, player name and club.
# The outer join keeps rows that appear on only one side; overlapping
# non-key columns from the salary side get the '_Dup' suffix.
fp_link_keys = ['Year', 'Player', 'Club']
fp_combo_df = fp_df.merge(
    fp_salary_df,
    how='outer',
    on=fp_link_keys,
    suffixes=['', '_Dup'],
)

In [721]:
# Show only the rows that have no missing value in any column.
fp_combo_df.dropna(axis=0, how='any')

Unnamed: 0,Player,Club,POS,GP,GS,MINS,G,A,SHTS,SOG,...,RC,PKG,PKA,First Name,Last Name,Last Name_Dup,First Name_Dup,POS_Dup,Base Salary,Guaranteed Compensation
4,Maykel Galindo,CHV,F,28.0,24.0,2021.0,12.0,5.0,55.0,28.0,...,0.0,0.0,0.0,Maykel,Galindo,Galindo,Maykel,F,72500.00,72500.00
5,Ante Razov,CHV,F,26.0,24.0,2041.0,11.0,8.0,85.0,42.0,...,0.0,1.0,1.0,Ante,Razov,Razov,Ante,F,245000.00,248750.00
12,Alejandro Moreno,CLB,F,29.0,27.0,2357.0,7.0,7.0,54.0,32.0,...,0.0,0.0,0.0,Alejandro,Moreno,Moreno,Alejandro,F,120000.00,125000.00
17,Brian Ching,HOU,F,20.0,17.0,1590.0,7.0,2.0,48.0,18.0,...,0.0,0.0,0.0,Brian,Ching,Ching,Brian,F,220000.00,220000.00
19,Nate Jaqua,HOU,F,25.0,19.0,1772.0,7.0,2.0,49.0,19.0,...,0.0,0.0,0.0,Nate,Jaqua,Jaqua,Nate,F,112500.00,119525.00
20,Chad Barrett,CHI,F,30.0,25.0,1983.0,7.0,2.0,59.0,37.0,...,0.0,1.0,1.0,Chad,Barrett,Barrett,Chad,F,41212.50,48712.50
21,Carlos Ruiz,DAL,F,22.0,19.0,1721.0,7.0,2.0,59.0,25.0,...,1.0,1.0,1.0,Carlos,Ruiz,Ruiz,Carlos,F,325000.00,435000.00
23,Chris Rolfe,CHI,F,19.0,18.0,1451.0,6.0,3.0,38.0,19.0,...,0.0,1.0,1.0,Chris,Rolfe,Rolfe,Chris,F,70000.00,74700.00
28,Jovan Kirovski,COL,M-F,28.0,28.0,2161.0,6.0,1.0,36.0,14.0,...,0.0,4.0,4.0,Jovan,Kirovski,Kirovski,Jovan,F,200000.00,200000.00
29,Guillermo Barros Schelotto,CLB,F,22.0,19.0,1605.0,5.0,11.0,39.0,18.0,...,0.0,1.0,1.0,Guillermo,Barros Schelotto,Schelotto,Guillermo Barros,F,150000.00,150000.00


In [713]:
# Write the merged field-player stats + salary table to 'fp_combo2.csv'.
fp_combo_df.to_csv('fp_combo2.csv')

In [504]:
# Pull one goalkeeper's stat rows as a worked example for the linking problem.
is_guzan_stat = gk_df['Player'].eq('Brad Guzan')
guzan_stats = gk_df.loc[is_guzan_stat]
guzan_stats

Unnamed: 0,Player,Club,POS,GP,GS,MINS,SHTS,SV,GA,GAA,W,L,T,ShO,W%,Sv%,Year,Season,PKG,PKA
8,Brad Guzan,ATL,GK,27,27,2430,119,87,25,0.93,14,6,7,13,51.9,73.1,2007,REG,2,3
57,Brad Guzan,ATL,GK,15,15,1350,71,48,20,1.33,6,5,4,4,40.0,67.6,2008,REG,1,1
593,Brad Guzan,ATL,GK,14,14,1260,47,38,10,0.71,6,1,7,8,42.9,80.9,2017,REG,1,1


In [505]:
# The same player's salary rows, for comparison with his stat rows.
is_guzan_salary = salary_df['Player'].eq('Brad Guzan')
guzan_money = salary_df.loc[is_guzan_salary]
guzan_money

Unnamed: 0,Club,Last Name,First Name,POS,Base Salary,Guaranteed Compensation,Date,Player,Year
216,CHV,Guzan,Brad,GK,52237.5,67237.5,2007-08-31,Brad Guzan,2007
656,CHV,Guzan,Brad,GK,88974.0,103974.38,2008-10-07,Brad Guzan,2008
5083,ATL,Guzan,Brad,GK,340008.0,400008.0,2017-09-15,Brad Guzan,2017


In [508]:
# Outer-join the example player's stats and salary rows on season, club and name.
guzan_keys = ['Year', 'Club', 'Player']
guzan_stats.merge(guzan_money, how='outer', on=guzan_keys, suffixes=['', '_Dup'])

Unnamed: 0,Player,Club,POS,GP,GS,MINS,SHTS,SV,GA,GAA,...,Year,Season,PKG,PKA,Last Name,First Name,POS_Dup,Base Salary,Guaranteed Compensation,Date
0,Brad Guzan,ATL,GK,27.0,27.0,2430.0,119.0,87.0,25.0,0.93,...,2007,REG,2.0,3.0,,,,,,
1,Brad Guzan,ATL,GK,15.0,15.0,1350.0,71.0,48.0,20.0,1.33,...,2008,REG,1.0,1.0,,,,,,
2,Brad Guzan,ATL,GK,14.0,14.0,1260.0,47.0,38.0,10.0,0.71,...,2017,REG,1.0,1.0,Guzan,Brad,GK,340008.0,400008.0,2017-09-15
3,Brad Guzan,CHV,,,,,,,,,...,2007,,,,Guzan,Brad,GK,52237.5,67237.5,2007-08-31
4,Brad Guzan,CHV,,,,,,,,,...,2008,,,,Guzan,Brad,GK,88974.0,103974.38,2008-10-07


In [509]:
# Link stats to salaries on season and player name only (no club key).
# NOTE(review): `merged_df` is not defined at notebook scope in the visible cells
# (it only exists as a local inside merge_stats), so this cell fails on a fresh
# kernel - confirm which stats frame was intended here.
cols = ['Year', 'Player']
df = pd.merge(merged_df, salary_df, how='outer', left_on=cols, right_on=cols, suffixes=['', '_Dup'])

In [511]:
# One row per (season, club, goalkeeper): the last record seen in each group.
gk_group_keys = ['Year', 'Club', 'Player']
gk_df.groupby(gk_group_keys).last()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,POS,GP,GS,MINS,SHTS,SV,GA,GAA,W,L,T,ShO,W%,Sv%,Season,PKG,PKA
Year,Club,Player,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2007,ATL,Brad Guzan,GK,27,27,2430,119,87,25,0.93,14,6,7,13,51.9,73.1,REG,2,3
2007,CHI,Jon Busch,GK,3,3,270,20,15,5,1.67,1,2,0,1,33.3,75.0,REG,2,2
2007,CHI,Matt Pickens,GK,27,27,2430,137,102,31,1.15,9,8,10,10,33.3,74.5,REG,4,4
2007,CHI,Nick Noble,GK,0,0,0,0,0,0,0.00,0,0,0,0,0.0,0.0,REG,0,0
2007,CHV,Justin Myers,GK,0,0,0,0,0,0,0.00,0,0,0,0,0.0,0.0,REG,0,0
2007,CHV,Preston Burpo,GK,3,3,270,16,13,3,1.00,1,1,1,1,33.3,81.3,REG,0,0
2007,CLB,Andy Gruenebaum,GK,10,10,900,53,35,15,1.50,1,4,5,3,10.0,66.0,REG,0,0
2007,CLB,Bill Gaudette,GK,0,0,0,0,0,0,0.00,0,0,0,0,0.0,0.0,REG,0,0
2007,CLB,William Hesmer,GK,20,20,1800,99,71,29,1.45,8,7,5,5,40.0,71.7,REG,2,3
2007,COL,Bouna Coundoul,GK,30,30,2668,158,120,32,1.07,9,12,8,9,30.0,75.9,REG,2,2


In [246]:
# Write the stats + salary frame `df` to 'field_player.csv'.
df.to_csv('field_player.csv')

#### Fix Player Name Association

In [249]:
# Import the `fuzz` submodule explicitly. Importing only the top-level package
# does not load it, so the `fwuzz.fuzz.*` calls below would raise
# AttributeError on a fresh kernel unless something else had imported it.
import fuzzywuzzy.fuzz
import fuzzywuzzy as fwuzz

In [600]:
# Distinct player names across the field-player and goalkeeper stats tables.
stats_player_names = pd.concat([fp_df['Player'], gk_df['Player']], ignore_index=True)
players1 = stats_player_names.unique()

In [601]:
# Distinct player names as they appear in the salary table.
players2 = salary_df['Player'].unique()

In [624]:
# For every salary-side name, score it against every stats-side name.
# Each value is a (len(players1), 3) float array whose columns hold, in order:
# ratio, partial_ratio and token_set_ratio.
fuzzy_scorers = (fwuzz.fuzz.ratio, fwuzz.fuzz.partial_ratio, fwuzz.fuzz.token_set_ratio)
true_player = {
    p2: np.array(
        [[scorer(p1, p2) for scorer in fuzzy_scorers] for p1 in players1],
        dtype=float,
    ).reshape(len(players1), 3)
    for p2 in players2
}

In [625]:
# List the salary-side player names that were scored in the previous cell.
true_player.keys()

dict_keys(['Nana Attakora-Gyan', 'Michael Banner', 'Juan Sebastian Botero', 'Desmond Brooks', 'Bobby Burling', 'Salvatore Caccavale', 'Blake Camp', 'Jeff Carroll', 'Mike Caso', 'Danny Cepero', 'Shawn Crowe', 'Steven Curfman', 'Jeff Curtin', 'Andrew Daniels', 'Michael Dello-Russo', 'Stephen DeRoux', 'John DiRaimondo', 'Eric Ebert', 'Gary Flood', 'Lance Friesz', 'Gabriel Gala', 'Sandy Gbandi', 'Miguel Gonzalez', 'Willy Guadarrama', 'David Guzman', 'Nick Hatzke', 'Kyle Helton', 'Kenneth Hoerner', 'Justin Hughes', 'Ben Hunter', 'Jordan James', 'Scott Jones', 'Ryan Junge', 'Chris Karcz', 'Kosuke Kimura', 'Dustin Kirby', 'Brad Knighton', 'Chris Konopka', 'Michael Kraus', 'Nick LaBrocca', 'Jerrod Laventure', 'Chris Loftus', 'Amir Lowery', 'Stephen Lumley', 'Bruno Marques', 'Joey Melo', 'Mira Mupier', 'Justin Myers', 'Nick Noble', 'Brad North', 'Arsene Oka', 'Randi Patterson', 'Andrew Peterson', 'Eder Robles', 'Jordan Russolillo', 'Erasmo Solorzano', 'Erik Ustruck', 'Clifton Wilmes', 'Daniel W

In [662]:
# For each salary-side name keep the stats-side name with the highest
# token_set_ratio (score column 2), together with all three scores scaled to 0-1.
player_map = {}
for player, score_table in true_player.items():
    best_row = score_table[:, 2].argmax()
    match = players1[best_row]
    match_score = score_table[best_row, :] / 100
    player_map[player] = (match, match_score)
    # Print the borderline matches (0.8 <= score < 0.9) for manual review.
    if 0.8 <= match_score[2] < 0.9:
        print(player, '=>', match, ':', match_score)

Michael Banner => Michael Zaher : [ 0.81  0.77  0.81]
Salvatore Caccavale => Sal Caccavale : [ 0.81  0.77  0.82]
Gabriel Gala => Gabe Gala : [ 0.86  0.67  0.86]
Michael Randolph => Mike Randolph : [ 0.83  0.77  0.83]
Daniel Gargan => Dan Gargan : [ 0.87  0.7   0.87]
Ricardo Pereira => Ricardo Perez : [ 0.86  0.92  0.86]
Robert Findley => Robbie Findley : [ 0.86  0.86  0.86]
Hercules Gomez => Hérculez Gómez : [ 0.79  0.79  0.85]
Zachary Wells => Zach Wells : [ 0.87  0.7   0.87]
Daniel Dichio => Danny Dichio : [ 0.8   0.75  0.8 ]
Andrew Welsh => Andy Welsh : [ 0.82  0.7   0.82]
Miguel Arce => Mikel Arce : [ 0.86  0.8   0.86]
Daniel Cepero => Danny Cepero : [ 0.8   0.75  0.8 ]
Mike Palacio => Michael Palacio : [ 0.81  0.75  0.81]
Michael Chabala => Mike Chabala : [ 0.81  0.75  0.81]
Mkhokheli Dube => Kheli Dube : [ 0.75  0.9   0.83]
Christopher Sharpe => Chris Sharpe : [ 0.8   0.75  0.8 ]
Anthony Beltran => Tony Beltran : [ 0.81  0.92  0.89]
Michael Fucito => Mike Fucito : [ 0.8   0.73  0

In [659]:
# Spot-check the best match recorded for one salary-side name.
# NOTE(review): the saved output shows KeyError: 'credits', which does not match
# the key looked up here - the output looks stale; confirm by re-running.
player_map['Jose Martin']

KeyError: 'credits'

In [667]:
# Inspect all salary rows for one last name to see how multi-part names are stored.
salary_df.loc[salary_df['Last Name'].eq('Ortiz')]

Unnamed: 0,Club,Last Name,First Name,Pos,Base Salary,Guaranteed Compensation,Date
1087,DC,Ortiz,Jose Guillermo,F,"$129,996.00","$137,621.00",2017-04-15
1682,DAL,Ortiz,Juan Esteban,M,"$160,000","$204,500.00",2016-09-15
2243,DAL,Ortiz,Juan Esteban,M,"$160,000","$204,500.00",2016-05-15


In [668]:
# Average salary figures per season and club.
# Needs the 'Year' column created in the cleaning cell (the saved
# KeyError: 'Year' is what happens when this runs before it).
# numeric_only=True keeps the name/position text columns out of the mean;
# pandas >= 2.0 raises a TypeError for them instead of silently dropping them.
salary_df.groupby(['Year', 'Club']).mean(numeric_only=True)

KeyError: 'Year'

## Playing with Multi-Index

In [683]:
# Average salary figures per (Year, Club); as_index=True makes the group keys a
# two-level MultiIndex on the result.
# numeric_only=True keeps the name/position text columns out of the mean;
# pandas >= 2.0 raises a TypeError for them instead of silently dropping them.
tmp = salary_df.groupby(['Year', 'Club'], as_index=True).mean(numeric_only=True)
tmp

Unnamed: 0_level_0,Unnamed: 1_level_0,Base Salary,Guaranteed Compensation
Year,Club,Unnamed: 2_level_1,Unnamed: 3_level_1
2007,CHI,152155.775862,162631.189655
2007,CHV,59938.239286,63915.917857
2007,CLB,65156.933103,72697.450345
2007,COL,75212.172414,78665.620690
2007,DAL,90023.862903,99121.443548
2007,DC,77826.481481,84310.740741
2007,HOU,87038.781111,90153.595926
2007,KC,84204.310345,90950.000000
2007,LA,316521.108846,358540.553462
2007,NE,68549.925926,72068.444444


In [684]:
# Inspect the (Year, Club) MultiIndex produced by the groupby above.
tmp.index

MultiIndex(levels=[[2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017], ['', 'ATL', 'Araujo', 'CHI', 'CHV', 'CLB', 'COL', 'DAL', 'DC', 'HOU', 'KC', 'LA', 'LAFC', 'MNUFC', 'MTL', 'NE', 'NY', 'NYCFC', 'NYRB', 'None', 'ORL', 'PHI', 'POOL', 'POR', 'Pool', 'RSL', 'SEA', 'SJ', 'TFC', 'TOR', 'Unassigned', 'VAN']],
           labels=[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 1

In [687]:
# Positional access: the first row of the grouped frame, regardless of its index labels.
tmp.iloc[0]

Base Salary                152155.775862
Guaranteed Compensation    162631.189655
Name: (2007, CHI), dtype: float64

In [690]:
# Label access: select the row whose (Year, Club) index entry is (2007, 'CHI').
tmp.xs((2007, 'CHI'))

Base Salary                152155.775862
Guaranteed Compensation    162631.189655
Name: (2007, CHI), dtype: float64

In [692]:
# Average salary figures per (Year, Club, Player), indexed by a three-level MultiIndex.
# numeric_only=True keeps the remaining text columns out of the mean;
# pandas >= 2.0 raises a TypeError for them instead of silently dropping them.
tmp2 = salary_df.groupby(['Year', 'Club', 'Player'], as_index=True).mean(numeric_only=True)

In [693]:
# Partial label access: fixing Year=2017 and Club='DC' leaves a frame indexed by Player.
tmp2.xs((2017, 'DC'))

Unnamed: 0_level_0,Base Salary,Guaranteed Compensation
Player,Unnamed: 1_level_1,Unnamed: 2_level_1
Alhaji Kamara,80000.0,88250.0
Bill Hamid,350000.0,395500.0
Bobby Boswell,260000.0,260000.0
Bruno Miranda,53000.04,57476.13
Chris Durkin,70000.0,79166.67
Chris Korb,65004.0,71004.0
Chris Odoi-Atsem,65000.04,72500.04
Chris Rolfe,275000.0,282500.0
Deshorn Brown,264996.0,337329.33
Eric Klenofsky,53004.0,56754.0
