# Linking and Cleaning Data

## Imports

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from datetime import datetime 
from dateutil import parser
import os
import time
import itertools
import re
import pickle
import locale
from fuzzywuzzy import fuzz

## Process Stats Data

### Load Data

In [2]:
goals_df = pickle.load(open('./data/goals_df.pkl', 'rb'))
assists_df = pickle.load(open('./data/assists_df.pkl', 'rb'))
shots_df = pickle.load(open('./data/shots_df.pkl', 'rb'))
fouls_df = pickle.load(open('./data/fouls_df.pkl', 'rb'))
gk_df = pickle.load(open('./data/goalkeeping_df.pkl', 'rb'))

### Merge Field Player Stats

In [3]:
def merge_stats(df1, df2):
    cols = list(set(df1.columns).intersection(df2.columns))
    merged_df = pd.merge(df1, df2, how='outer', left_on=cols, right_on=cols, suffixes=['', '_Dup'])
    return merged_df

In [4]:
fp_df = merge_stats(goals_df, assists_df)
fp_df = merge_stats(fp_df, shots_df)
fp_df = merge_stats(fp_df, fouls_df)

### Clean Field Player Stats

In [5]:
# Replace null values with 0
fp_df = fp_df.replace(np.nan, '0')

In [6]:
# Split PKG/A column into to and drop
fp_df[['PKG', 'PKA']] = fp_df['PKG/A'].str.split('/', expand=True)
fp_df.drop('PKG/A', axis=1, inplace=True)
fp_df = fp_df.replace('', '0')

In [7]:
# Split Player into Last Name and First Name columns
first_name = []
last_name = []
for item in fp_df['Player'].str.split(' '):
    first_name.append(item[0])
    last_name.append(' '.join(item[1:]))
fp_df['Last Name'] = last_name    
fp_df['First Name'] = first_name

#### Set Column Dtypes

In [8]:
int_cols = ['GP', 'GS', 'MINS', 'G', 'A', 'SHTS', 'SOG', 'GWG', 
            'HmG', 'RdG', 'Year', 'GWA', 'HmA', 'RdA', 'FC', 'FS', 
            'OFF', 'YC', 'RC', 'PKG', 'PKA']
for col in int_cols:
    fp_df[col] = fp_df[col].astype(int)

In [9]:
float_cols = ['G/90min', 'SC%', 'A/90min', 'SOG%']
for col in float_cols:
    fp_df[col] = fp_df[col].astype(float)

In [10]:
fp_df.rename(columns={'G/90min': 'Gp90', 
                      'SC%': 'SCpct', 
                      'A/90min': 'Ap90', 
                      'SOG%': 'SOGpct'}, inplace=True)

#### Strip Whitespace

In [11]:
str_cols = ['Player', 'Club', 'POS', 'Season', 'Last Name', 'First Name']
for col in str_cols:
    fp_df[col] = fp_df[col].str.strip()

#### Save Clean Data

In [12]:
fp_df.to_csv('fieldplayer.csv')
fp_df.to_pickle('fieldplayer.pkl')

### Clean Goalkeeper Stats

In [13]:
# Replace null values with 0
gk_df = gk_df.replace(np.nan, '0')

In [14]:
# Split PKG/A column into to and drop
gk_df[['PKG', 'PKA']] = gk_df['PKG/A'].str.split('/', expand=True)
gk_df.drop('PKG/A', axis=1, inplace=True)
gk_df = gk_df.replace('', '0')

In [15]:
# Split Player into Last Name and First Name columns
first_name = []
last_name = []
for item in gk_df['Player'].str.split(' '):
    first_name.append(item[0])
    last_name.append(' '.join(item[1:]))
gk_df['Last Name'] = last_name    
gk_df['First Name'] = first_name

#### Set Column DTypes

In [16]:
int_cols = ['GP', 'GS', 'MINS', 'SHTS', 'SV', 'GA', 'W', 'L', 
            'T', 'ShO', 'Year', 'PKG', 'PKA']
for col in int_cols:
    gk_df[col] = gk_df[col].astype(int)

In [17]:
float_cols = ['GAA', 'W%', 'Sv%']
for col in float_cols:
    gk_df[col] = gk_df[col].astype(float)

#### Strip Whitespace

In [11]:
str_cols = ['Player', 'Club', 'POS', 'Season', 'Last Name', 'First Name']
for col in str_cols:
    gk_df[col] = gk_df[col].str.strip()

#### Save Clean Data

In [18]:
gk_df.to_csv('goalkeeper.csv')
gk_df.to_pickle('goalkeeper.pkl')

### Merge FP with GK Stats

In [19]:
stats_df = merge_stats(fp_df, gk_df)

In [20]:
stats_df.to_csv('stats.csv')

## Salary Data

### Load Data

In [241]:
salary_df = pickle.load(open('./data/salary_df.pkl', 'rb'))

### Clean Data

In [243]:
# Create Player column for linking
salary_df['Last Name'] = salary_df['Last Name'].str.strip()
salary_df['First Name'] = salary_df['First Name'].str.strip()
salary_df['Player'] = salary_df['First Name'] + ' ' + salary_df['Last Name']

# Replace NAN names with empty string
salary_df['First Name'] = salary_df['First Name'].replace(np.nan, '')
salary_df['Last Name'] = salary_df['Last Name'].replace(np.nan, '')

# Rename Pos to POS for linking
salary_df.rename(columns={'Pos': 'POS'}, inplace=True)

# Create Year Column
salary_df['Year'] = salary_df.Date.map(lambda x: x.year)

# Convert money str to float
def convert_money(salary_df, columns):
    if isinstance(columns, str):
        columns = [columns]
    for col in columns:
        salary_df[col] = salary_df[col].replace('', 0)
        salary_df[col] = salary_df[col].replace('-', 0)
        salary_df[col] = salary_df[col].replace('[\$,]', '', regex=True).astype(float)
    return salary_df
salary_df = convert_money(salary_df, ['Base Salary', 'Guaranteed Compensation'])     

#### Strip Whitespace

In [244]:
str_cols = ['Player', 'Club', 'POS', 'Last Name', 'First Name']
for col in str_cols:
    salary_df[col] = salary_df[col].str.strip()

In [245]:
salary_df[salary_df['Last Name'] == 'Kaka']

Unnamed: 0,Club,Last Name,First Name,POS,Base Salary,Guaranteed Compensation,Date,Player,Year
0,ORL,Kaka,,M,6660000.0,7167500.0,2017-09-15,Kaka,2017
960,ORL,Kaka,,M,6660000.0,7167500.0,2017-04-15,Kaka,2017
1855,ORL,Kaka,,M,6660000.0,7167500.0,2016-09-15,Kaka,2016
2410,ORL,Kaka,,M,6660000.0,7167500.0,2016-05-15,Kaka,2016
2984,ORL,Kaka,,M,6660000.0,7167500.0,2015-09-15,Kaka,2015
3550,ORL,Kaka,,M,6660000.0,7167500.0,2015-07-15,Kaka,2015
4122,ORL,Kaka,,M,6660000.0,7167500.0,2014-09-15,Kaka,2014


In [246]:
salary_df[salary_df['Last Name'] == 'Findley']

Unnamed: 0,Club,Last Name,First Name,POS,Base Salary,Guaranteed Compensation,Date,Player,Year
2874,TOR,Findley,Robbie,F,225000.0,255500.0,2015-09-15,Robbie Findley,2015
3447,TOR,Findley,Robbie,F,225000.0,255500.0,2015-07-15,Robbie Findley,2015
4035,RSL,Findley,Robbie,F,215000.0,245500.0,2014-09-15,Robbie Findley,2014
4595,RSL,Findley,Robbie,F,215000.0,245500.0,2014-04-01,Robbie Findley,2014
5157,RSL,Findley,Robbie,F,175000.0,205500.0,2013-09-15,Robbie Findley,2013
5724,RSL,Findley,Robbie,F,175000.0,205500.0,2013-08-01,Robbie Findley,2013
6284,RSL,Findley,Robbie,F,175000.0,205500.0,2013-05-01,Robbie Findley,2013
8744,RSL,Findley,Robert,F,73566.0,87316.0,2010-08-12,Robert Findley,2010
9146,RSL,Findley,Robert,F,60060.0,72560.0,2009-09-15,Robert Findley,2009
9559,RSL,Findley,Robert,F,47100.0,59600.0,2008-10-07,Robert Findley,2008


#### Keep Last Reported Salary
Salary data may have been released multiple times during the year, generally once in the spring and once in the fall but sometimes more often. We want the last salary reported for the year. Salary is usually consistent across datasets for the same year; however, these sometimes differ and we want to guarantee we're using the lastest.   

In [247]:
# Sort dataframe by Date, then keep last reported salary from the groupby
salary_df = salary_df.sort_values(['Date', 'Year', 'Club'])
salary_df = salary_df.groupby(['Year', 'Club', 'Last Name', 'First Name', 'Player', 'POS'], as_index=False).first()

In [248]:
salary_df[salary_df['Last Name'] == 'Findley']

Unnamed: 0,Year,Club,Last Name,First Name,Player,POS,Base Salary,Guaranteed Compensation,Date
323,2007,RSL,Findley,Robert,Robert Findley,F,36000.0,48500.0,2007-08-31
706,2008,RSL,Findley,Robert,Robert Findley,F,47100.0,59600.0,2008-10-07
1082,2009,RSL,Findley,Robert,Robert Findley,F,60060.0,72560.0,2009-09-15
1486,2010,RSL,Findley,Robert,Robert Findley,F,73566.0,87316.0,2010-08-12
3232,2013,RSL,Findley,Robbie,Robbie Findley,F,175000.0,205500.0,2013-05-01
3877,2014,RSL,Findley,Robbie,Robbie Findley,F,215000.0,245500.0,2014-04-01
4590,2015,TOR,Findley,Robbie,Robbie Findley,F,225000.0,255500.0,2015-07-15


### Split Salary Data into Field Players and Goalkeepers

In [249]:
gk_idx = (salary_df['POS'] == 'GK')
fp_salary_df = salary_df[~gk_idx]
gk_salary_df = salary_df[gk_idx]

### Merge Field Player Stats with Field Player Salary Data

In [250]:
# Not merging on Club because this causes issues 
cols = ['Year', 'Last Name']
df = pd.merge(fp_df, fp_salary_df, how='outer', left_on=cols, right_on=cols, suffixes=['', '_Dup'])

In [251]:
good_merge = df[~df.isnull().any(axis=1)]

In [252]:
len(good_merge)

5921

In [253]:
good_merge[good_merge['Last Name'] == 'Findley']

Unnamed: 0,Player,Club,POS,GP,GS,MINS,G,A,SHTS,SOG,...,PKA,Last Name,First Name,Club_Dup,First Name_Dup,Player_Dup,POS_Dup,Base Salary,Guaranteed Compensation,Date
10,Robbie Findley,LA,F,25.0,14.0,1353.0,8.0,0.0,31.0,16.0,...,0.0,Findley,Robbie,RSL,Robert,Robert Findley,F,36000.0,48500.0,2007-08-31
445,Robbie Findley,RSL,F,29.0,15.0,1493.0,6.0,5.0,44.0,18.0,...,0.0,Findley,Robbie,RSL,Robert,Robert Findley,F,47100.0,59600.0,2008-10-07
876,Robbie Findley,RSL,F,27.0,18.0,1751.0,12.0,4.0,60.0,26.0,...,1.0,Findley,Robbie,RSL,Robert,Robert Findley,F,60060.0,72560.0,2009-09-15
1316,Robbie Findley,RSL,F,24.0,15.0,1355.0,5.0,4.0,33.0,14.0,...,2.0,Findley,Robbie,RSL,Robert,Robert Findley,F,73566.0,87316.0,2010-08-12
3083,Robbie Findley,RSL,F,25.0,17.0,1260.0,6.0,2.0,36.0,15.0,...,0.0,Findley,Robbie,RSL,Robbie,Robbie Findley,F,175000.0,205500.0,2013-05-01
3976,Robbie Findley,RSL,F,16.0,8.0,764.0,1.0,0.0,24.0,9.0,...,0.0,Findley,Robbie,RSL,Robbie,Robbie Findley,F,215000.0,245500.0,2014-04-01
4495,Robbie Findley,TOR,F,25.0,18.0,1476.0,2.0,3.0,28.0,8.0,...,0.0,Findley,Robbie,TOR,Robbie,Robbie Findley,F,225000.0,255500.0,2015-07-15


In [255]:
good_merge.to_csv('goodmerge_fp.csv')
good_merge.to_pickle('goodmerge_fp.pkl')

In [256]:
bad_merge = df[df.isnull().any(axis=1)]

In [257]:
len(bad_merge)

915

In [259]:
bad_merge[bad_merge['Base Salary'] > 100000]

Unnamed: 0,Player,Club,POS,GP,GS,MINS,G,A,SHTS,SOG,...,PKA,Last Name,First Name,Club_Dup,First Name_Dup,Player_Dup,POS_Dup,Base Salary,Guaranteed Compensation,Date
6445,,,,,,,,,,,...,,Schelotto,,CLB,Guillermo Barros,Guillermo Barros Schelotto,F,150000.00,150000.00,2007-08-31
6446,,,,,,,,,,,...,,De Oliveira,,DAL,Denilson,Denilson De Oliveira,M,872736.00,879936.00,2007-08-31
6448,,,,,,,,,,,...,,Carreiro,,DC,Fred da Silva,Fred da Silva Carreiro,M-F,190008.00,222008.00,2007-08-31
6450,,,,,,,,,,,...,,DeRosario,,HOU,Dwayne,Dwayne DeRosario,M,324999.96,324999.96,2007-08-31
6453,,,,,,,,,,,...,,Angel,,NY,Juan Pablo,Juan Pablo Angel,F,1500000.00,1593750.00,2007-08-31
6454,,,,,,,,,,,...,,Van Den Bergh,,NY,Dave,Dave Van Den Bergh,M,180000.00,214583.33,2007-08-31
6458,,,,,,,,,,,...,,Schelotto,,CLB,Guillermo Barros,Guillermo Barros Schelotto,F,250000.00,375000.00,2008-10-07
6460,,,,,,,,,,,...,,Carreiro da Silva,,DC,Fred,Fred Carreiro da Silva,M,210000.00,242000.00,2008-10-07
6461,,,,,,,,,,,...,,DeRosario,,HOU,Dwayne,Dwayne DeRosario,M,325000.00,324999.96,2008-10-07
6464,,,,,,,,,,,...,,Angel,,NY,Juan Pablo,Juan Pablo Angel,F,1500000.00,1593750.00,2008-10-07


In [260]:
bad_merge.to_csv('badmerge_fp.csv')
bad_merge.to_pickle('badmerge_fp.pkl')

### Fix Player Name Association
Some names are different between the datasets; therefore, they fail to link during the merge operation.  Given time, it would be nice to explore fixing this association for as many as possible.

In [35]:
players1 = pd.concat([fp_df.Player, gk_df.Player], ignore_index=True).unique()

In [36]:
players2 = salary_df.Player.unique()

In [None]:
true_player = {}
for p2 in players2:
    scores = np.zeros((len(players1), 3))
    for ii, p1 in enumerate(players1):
        scores[ii, :] = np.array([fuzz.ratio(p1, p2), 
                                  fuzz.partial_ratio(p1, p2),
                                  fuzz.token_set_ratio(p1, p2)])
    true_player[p2] = scores

In [None]:
player_map = {}
for player in true_player.keys():
    match_idx = true_player[player][:, 2].argmax()
    match = players1[match_idx]
    match_score = true_player[player][match_idx, :] / 100
    player_map[player] = (match, match_score)
    if (match_score[2] < 0.9) & (match_score[2] >= 0.8):
        print(player, '=>', match, ':', match_score)

In [None]:
test = fp_df.groupby(['Year', 'Club']).last()
test

In [None]:
for 

## Playing with Multi-Index

In [None]:
tmp = salary_df.groupby(['Year', 'Club'], as_index=True).mean()
tmp

In [None]:
tmp.index

In [None]:
tmp.iloc[0]

In [None]:
tmp.loc[2007, 'CHI']

In [None]:
tmp2 = salary_df.groupby(['Year', 'Club', 'Player'], as_index=True).mean()

In [None]:
tmp2.loc[2017, 'DC']