# Linking and Cleaning Data

## Imports

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from datetime import datetime 
from dateutil import parser
import os
import time
import itertools
import re
import pickle
import locale
from fuzzywuzzy import fuzz

## Process Stats Data

### Load Data

In [2]:
goals_df = pickle.load(open('./scraped_data/goals_df.pkl', 'rb'))
assists_df = pickle.load(open('./scraped_data/assists_df.pkl', 'rb'))
shots_df = pickle.load(open('./scraped_data/shots_df.pkl', 'rb'))
fouls_df = pickle.load(open('./scraped_data/fouls_df.pkl', 'rb'))
gk_df = pickle.load(open('./scraped_data/goalkeeping_df.pkl', 'rb'))

### Merge Field Player Stats

In [3]:
def merge_stats(df1, df2):
    cols = list(set(df1.columns).intersection(df2.columns))
    merged_df = pd.merge(df1, df2, how='outer', left_on=cols, right_on=cols, suffixes=['', '_Dup'])
    return merged_df

In [4]:
fp_df = merge_stats(goals_df, assists_df)
fp_df = merge_stats(fp_df, shots_df)
fp_df = merge_stats(fp_df, fouls_df)

### Helper Functions

In [5]:
def map_club_to_name(club):
    ''' Maps club names to consistent naming convention'''

    club_name_map = {'': 'UNK',
                     '0': 'UNK',
                     'ATL': 'ATL',
                     'Araujo': 'UNK',
                     'CAN': 'UNK',
                     'CHI': 'CHI',
                     'CHV': 'CHV',
                     'CIV': 'UNK',
                     'CLB': 'CLB',
                     'COL': 'COL',
                     'DAL': 'DAL',
                     'DC': 'DC',
                     'ECU': 'UNK',
                     'GHA': 'UNK',
                     'HAI': 'UNK',
                     'HON': 'UNK', 
                     'HOU': 'HOU',
                     'JAM': 'JAM',
                     'KC': 'KC',
                     'LA': 'LA',
                     'LFC': 'LAFC',
                     'LAFC': 'LAFC',
                     'MIN': 'MIN',
                     'MNUFC': 'MIN',
                     'MTL': 'MTL',
                     'MTQ': 'UNK',
                     'NE': 'NE',
                     'NY': 'NY',
                     'NYC': 'NYCFC',
                     'NYCFC': 'NYCFC',
                     'NYR': 'NY',
                     'NYRB': 'NY',
                     'None': 'UNK',
                     'POOL': 'UNK',
                     'OCS': 'ORL',
                     'ORL': 'ORL',
                     'PAN': 'UNK',
                     'PHI': 'PHI',
                     'POR': 'POR',
                     'ROC': 'UNK',
                     'RSL': 'RSL',
                     'SEA': 'SEA',
                     'SJ': 'SJ',
                     'SKC': 'KC',
                     'SLV': 'UNK',
                     'TOR': 'TOR',
                     'TFC': 'TOR',
                     'USA': 'UNK',
                     'VAN': 'VAN',
                     'Unassigned': 'UNK'}
    try: 
        club_name = club_name_map[club]
    except:
        club_name = 'UNK'
        
    if (len(club_name) < 0) | (club_name == 0):
        club_name = 'UNK'
        
    return club_name

### Clean Field Player Stats

In [6]:
# Replace null values with 0
fp_df = fp_df.replace(np.nan, '0')

# Split PKG/A column into to and drop
fp_df[['PKG', 'PKA']] = fp_df['PKG/A'].str.split('/', expand=True)
fp_df.drop('PKG/A', axis=1, inplace=True)
fp_df = fp_df.replace('', '0')

# Split Player into Last Name and First Name columns
first_name = []
last_name = []
for item in fp_df['Player'].str.split(' '):
    first_name.append(item[0])
    last_name.append(' '.join(item[1:]))
fp_df['Last Name'] = last_name    
fp_df['First Name'] = first_name

#### Set Column Dtypes

In [7]:
# Integer Columns
int_cols = ['GP', 'GS', 'MINS', 'G', 'A', 'SHTS', 'SOG', 'GWG', 
            'HmG', 'RdG', 'Year', 'GWA', 'HmA', 'RdA', 'FC', 'FS', 
            'OFF', 'YC', 'RC', 'PKG', 'PKA']
for col in int_cols:
    fp_df[col] = fp_df[col].astype(int)

# Float Columns
float_cols = ['G/90min', 'SC%', 'A/90min', 'SOG%']
for col in float_cols:
    fp_df[col] = fp_df[col].astype(float)
    
# Strip whitespace from string columns
str_cols = ['Player', 'Club', 'POS', 'Season', 'Last Name', 'First Name']
for col in str_cols:
    fp_df[col] = fp_df[col].str.strip()

In [8]:
# Make club naming consistent
fp_df['Club Name'] = fp_df['Club'].map(lambda x: map_club_to_name(x))

#### Rename Columns 

In [9]:
# Rename Columns
fp_df.rename(columns={'G/90min': 'Gp90', 
                      'SC%': 'SCpct', 
                      'A/90min': 'Ap90', 
                      'SOG%': 'SOGpct'}, inplace=True)

#### Save Clean Data

In [10]:
fp_df.to_csv('fp_clean.csv')
fp_df.to_pickle('fp_clean.pkl')

### Clean Goalkeeper Stats

In [11]:
# Replace null values with 0
gk_df = gk_df.replace(np.nan, '0')

# Split PKG/A column into to and drop
gk_df[['PKG', 'PKA']] = gk_df['PKG/A'].str.split('/', expand=True)
gk_df.drop('PKG/A', axis=1, inplace=True)
gk_df = gk_df.replace('', '0')

# Split Player into Last Name and First Name columns
first_name = []
last_name = []
for item in gk_df['Player'].str.split(' '):
    first_name.append(item[0])
    last_name.append(' '.join(item[1:]))
gk_df['Last Name'] = last_name    
gk_df['First Name'] = first_name

#### Set Column DTypes

In [12]:
# Integer Columns
int_cols = ['GP', 'GS', 'MINS', 'SHTS', 'SV', 'GA', 'W', 'L', 
            'T', 'ShO', 'Year', 'PKG', 'PKA']
for col in int_cols:
    gk_df[col] = gk_df[col].astype(int)

# Float Columns
float_cols = ['GAA', 'W%', 'Sv%']
for col in float_cols:
    gk_df[col] = gk_df[col].astype(float)

# Strip whitespace from string columns
str_cols = ['Player', 'Club', 'POS', 'Season', 'Last Name', 'First Name']
for col in str_cols:
    gk_df[col] = gk_df[col].str.strip()

In [13]:
# Make club naming consistent
gk_df['Club Name'] = gk_df['Club'].map(lambda x: map_club_to_name(x))

#### Rename Columns

In [14]:
# Rename Columns
gk_df.rename(columns={'W%': 'Wpct', 
                      'Sv%': 'SvPct'}, inplace=True)

#### Save Clean Data

In [15]:
gk_df.to_csv('goalkeeper.csv')
gk_df.to_pickle('goalkeeper.pkl')

## Salary Data

### Load Data

In [16]:
salary_df = pickle.load(open('./scraped_data/salary_df.pkl', 'rb'))

### Clean Data

In [17]:
# Create Player column for linking
salary_df['Last Name'] = salary_df['Last Name'].str.strip()
salary_df['First Name'] = salary_df['First Name'].str.strip()
salary_df['Player'] = salary_df['First Name'] + ' ' + salary_df['Last Name']

# Replace NAN names with empty string
salary_df['First Name'] = salary_df['First Name'].replace(np.nan, '')
salary_df['Last Name'] = salary_df['Last Name'].replace(np.nan, '')

# Rename Pos to POS for linking
salary_df.rename(columns={'Pos': 'POS'}, inplace=True)

# Create Year Column
salary_df['Year'] = salary_df.Date.map(lambda x: x.year)

# Convert money str to float
def convert_money(salary_df, columns):
    if isinstance(columns, str):
        columns = [columns]
    for col in columns:
        salary_df[col] = salary_df[col].replace('', 0)
        salary_df[col] = salary_df[col].replace('-', 0)
        salary_df[col] = salary_df[col].replace('[\$,]', '', regex=True).astype(float)
    return salary_df
salary_df = convert_money(salary_df, ['Base Salary', 'Guaranteed Compensation'])    

#### Strip Whitespace

In [18]:
str_cols = ['Player', 'Club', 'POS', 'Last Name', 'First Name']
for col in str_cols:
    salary_df[col] = salary_df[col].str.strip()

In [19]:
# Make club naming consistent
salary_df['Club Name'] = salary_df['Club'].map(lambda x: map_club_to_name(x))

In [20]:
salary_df[salary_df['Last Name'] == 'Findley']

Unnamed: 0,Club,Last Name,First Name,POS,Base Salary,Guaranteed Compensation,Date,Player,Year,Club Name
2874,TOR,Findley,Robbie,F,225000.0,255500.0,2015-09-15,Robbie Findley,2015,TOR
3447,TOR,Findley,Robbie,F,225000.0,255500.0,2015-07-15,Robbie Findley,2015,TOR
4035,RSL,Findley,Robbie,F,215000.0,245500.0,2014-09-15,Robbie Findley,2014,RSL
4595,RSL,Findley,Robbie,F,215000.0,245500.0,2014-04-01,Robbie Findley,2014,RSL
5157,RSL,Findley,Robbie,F,175000.0,205500.0,2013-09-15,Robbie Findley,2013,RSL
5724,RSL,Findley,Robbie,F,175000.0,205500.0,2013-08-01,Robbie Findley,2013,RSL
6284,RSL,Findley,Robbie,F,175000.0,205500.0,2013-05-01,Robbie Findley,2013,RSL
8744,RSL,Findley,Robert,F,73566.0,87316.0,2010-08-12,Robert Findley,2010,RSL
9146,RSL,Findley,Robert,F,60060.0,72560.0,2009-09-15,Robert Findley,2009,RSL
9559,RSL,Findley,Robert,F,47100.0,59600.0,2008-10-07,Robert Findley,2008,RSL


#### Keep Last Reported Salary
Salary data may have been released multiple times during the year, generally once in the spring and once in the fall but sometimes more often. We want the last salary reported for the year. Salary is usually consistent across datasets for the same year; however, these sometimes differ and we want to guarantee we're using the lastest. Note: This also forces the team association to the team the player was with at the end of the year.    

In [21]:
# Sort dataframe by Date, then keep last reported salary from the groupby
salary_df = salary_df.sort_values(['Date', 'Year', 'Club Name'])
salary_df = salary_df.groupby(['Year', 'Club Name', 'Last Name', 'First Name', 'Player', 'POS'], as_index=False).first()

In [22]:
salary_df[salary_df['Last Name'] == 'Findley']

Unnamed: 0,Year,Club Name,Last Name,First Name,Player,POS,Club,Base Salary,Guaranteed Compensation,Date
319,2007,RSL,Findley,Robert,Robert Findley,F,RSL,36000.0,48500.0,2007-08-31
700,2008,RSL,Findley,Robert,Robert Findley,F,RSL,47100.0,59600.0,2008-10-07
1077,2009,RSL,Findley,Robert,Robert Findley,F,RSL,60060.0,72560.0,2009-09-15
1480,2010,RSL,Findley,Robert,Robert Findley,F,RSL,73566.0,87316.0,2010-08-12
3204,2013,RSL,Findley,Robbie,Robbie Findley,F,RSL,175000.0,205500.0,2013-05-01
3853,2014,RSL,Findley,Robbie,Robbie Findley,F,RSL,215000.0,245500.0,2014-04-01
4564,2015,TOR,Findley,Robbie,Robbie Findley,F,TOR,225000.0,255500.0,2015-07-15


### Split Salary Data into Field Players and Goalkeepers

In [23]:
gk_idx = (salary_df['POS'] == 'GK')
fp_salary_df = salary_df[~gk_idx]
gk_salary_df = salary_df[gk_idx]

### Merge Field Player Stats with Field Player Salary Data

In [40]:
# Can't merge on Club because it's inconsistent in the mlssoccer.com database
# cols = ['Year', 'Club Name', 'Player']
cols = ['Year', 'Player']
fp_merge = pd.merge(fp_df, fp_salary_df, how='outer', on=cols, suffixes=['', '_Dup'])

In [41]:
good_fp = fp_merge[~fp_merge.isnull().any(axis=1)]

In [42]:
len(good_fp)

3969

In [27]:
good_fp[good_fp['Last Name'] == 'Findley']

Unnamed: 0,Player,Club,POS,GP,GS,MINS,G,A,SHTS,SOG,...,First Name,Club Name,Club Name_Dup,Last Name_Dup,First Name_Dup,POS_Dup,Club_Dup,Base Salary,Guaranteed Compensation,Date
2638,Robbie Findley,RSL,F,25.0,17.0,1260.0,6.0,2.0,36.0,15.0,...,Robbie,RSL,RSL,Findley,Robbie,F,RSL,175000.0,205500.0,2013-05-01
3392,Robbie Findley,RSL,F,16.0,8.0,764.0,1.0,0.0,24.0,9.0,...,Robbie,RSL,RSL,Findley,Robbie,F,RSL,215000.0,245500.0,2014-04-01
3842,Robbie Findley,TOR,F,25.0,18.0,1476.0,2.0,3.0,28.0,8.0,...,Robbie,TOR,TOR,Findley,Robbie,F,TOR,225000.0,255500.0,2015-07-15


In [28]:
good_fp.to_csv('good_fp.csv')
good_fp.to_pickle('good_fp.pkl')

In [29]:
bad_fp = fp_merge[fp_merge.isnull().any(axis=1)]

In [30]:
len(bad_fp)

1562

In [31]:
bad_fp.to_csv('bad_fp.csv')
bad_fp.to_pickle('bad_fp.pkl')

### Merge Goalkeep Stats with Salary Data

In [48]:
# Can't merge on Club because it's inconsistent in the mlssoccer.com database
# cols = ['Year', 'Club Name', 'Player']
cols = ['Year', 'Player']
gk_merge = pd.merge(gk_df, gk_salary_df, how='outer', left_on=cols, right_on=cols, suffixes=['', '_Dup'])

In [49]:
good_gk= gk_merge[~gk_merge.isnull().any(axis=1)]

In [50]:
len(good_gk)

579

In [51]:
good_gk[good_gk.Club != good_gk.Club_Dup]

Unnamed: 0,Player,Club,POS,GP,GS,MINS,SHTS,SV,GA,GAA,...,First Name,Club Name,Club Name_Dup,Last Name_Dup,First Name_Dup,POS_Dup,Club_Dup,Base Salary,Guaranteed Compensation,Date
8,Brad Guzan,ATL,GK,27.0,27.0,2430.0,119.0,87.0,25.0,0.93,...,Brad,ATL,CHV,Guzan,Brad,GK,CHV,52237.5,67237.50,2007-08-31
15,Srdjan Djekanovic,TOR,GK,8.0,7.0,635.0,34.0,22.0,9.0,1.29,...,Srdjan,TOR,TOR,Djekanovic,Srdjan,GK,TFC,17700.0,17700.00,2007-08-31
16,Greg Sutton,TOR,GK,8.0,8.0,720.0,53.0,36.0,15.0,1.88,...,Greg,TOR,TOR,Sutton,Greg,GK,TFC,125000.0,132562.50,2007-08-31
22,Chris Seitz,HOU,GK,3.0,3.0,270.0,14.0,6.0,8.0,2.67,...,Chris,HOU,RSL,Seitz,Chris,GK,RSL,55000.0,90500.00,2007-08-31
27,David Monsalve,TOR,GK,1.0,1.0,90.0,10.0,7.0,3.0,3.00,...,David,TOR,UNK,Monsalve,David,GK,Pool,30000.0,30000.00,2007-08-31
39,Boris Pardo,DAL,GK,0.0,0.0,0.0,0.0,0.0,0.0,0.00,...,Boris,DAL,UNK,Pardo,Boris,GK,Pool,17700.0,17700.00,2007-08-31
53,Greg Sutton,TOR,GK,24.0,24.0,2160.0,160.0,116.0,35.0,1.46,...,Greg,TOR,TOR,Sutton,Greg,GK,TFC,150000.0,157562.50,2008-10-07
57,Brad Guzan,ATL,GK,15.0,15.0,1350.0,71.0,48.0,20.0,1.33,...,Brad,ATL,CHV,Guzan,Brad,GK,CHV,88974.0,103974.38,2008-10-07
64,Brian Edwards,TOR,GK,6.0,6.0,540.0,31.0,23.0,8.0,1.33,...,Brian,TOR,TOR,Edwards,Brian,GK,TFC,36000.0,44750.00,2008-10-07
78,David Monsalve,DAL,GK,0.0,0.0,0.0,0.0,0.0,0.0,0.00,...,David,DAL,UNK,Monsalve,David,GK,Pool,33000.0,33000.00,2008-10-07


In [36]:
good_gk.to_csv('good_gk.csv')
good_gk.to_pickle('good_gk.pkl')

In [46]:
bad_gk = gk_merge[gk_merge.isnull().any(axis=1)]

In [47]:
len(bad_gk)

388

In [39]:
bad_gk.to_csv('bad_gk.csv')
bad_gk.to_pickle('bad_gk.pkl')

### Playing with Name Association
Some names are different between the datasets; therefore, they fail to link during the merge operation.  Given time, it would be nice to explore fixing this association for as many as possible.

In [40]:
players1 = pd.concat([fp_df.Player, gk_df.Player], ignore_index=True).unique()

In [41]:
players2 = salary_df.Player.unique()

In [42]:
true_player = {}
for p2 in players2:
    scores = np.zeros((len(players1), 3))
    for ii, p1 in enumerate(players1):
        scores[ii, :] = np.array([fuzz.ratio(p1, p2), 
                                  fuzz.partial_ratio(p1, p2),
                                  fuzz.token_set_ratio(p1, p2)])
    true_player[p2] = scores

KeyboardInterrupt: 

In [None]:
player_map = {}
for player in true_player.keys():
    match_idx = true_player[player][:, 2].argmax()
    match = players1[match_idx]
    match_score = true_player[player][match_idx, :] / 100
    player_map[player] = (match, match_score)
    if (match_score[2] < 0.9) & (match_score[2] >= 0.8):
        print(player, '=>', match, ':', match_score)