# Linking and Cleaning Data

## Imports

In [271]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from datetime import datetime 
from dateutil import parser
import os
import time
import itertools
import re
import pickle
import locale

## Process Stats Data

### Load Data

In [469]:
# Load the pickled stat tables. Each file is opened in a `with` block so the
# handle is closed as soon as the object has been unpickled.
# NOTE: unpickling can execute arbitrary code; only load files you created.
with open('./data/goals_df.pkl', 'rb') as fh:
    goals_df = pickle.load(fh)
with open('./data/assists_df.pkl', 'rb') as fh:
    assists_df = pickle.load(fh)
with open('./data/shots_df.pkl', 'rb') as fh:
    shots_df = pickle.load(fh)
with open('./data/fouls_df.pkl', 'rb') as fh:
    fouls_df = pickle.load(fh)
with open('./data/goalkeeping_df.pkl', 'rb') as fh:
    gk_df = pickle.load(fh)

### Merge Field Player Stats

In [446]:
def merge_stats(df1, df2):
    """Outer-join two tables on every column name they have in common.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Tables to combine.

    Returns
    -------
    pandas.DataFrame
        All rows from both inputs, matched on the shared columns.
    """
    # Take the shared names in df1's column order. Intersecting two sets gave
    # an arbitrary order, so the join-key order could differ between runs.
    cols = list(dict.fromkeys(col for col in df1.columns if col in df2.columns))
    return pd.merge(df1, df2, how='outer', on=cols, suffixes=('', '_Dup'))

In [457]:
# Fold the remaining stat tables into the goals table one at a time
fp_df = goals_df
for extra_stats in (assists_df, shots_df, fouls_df):
    fp_df = merge_stats(fp_df, extra_stats)

### Clean Field Player Stats

In [461]:
# Missing values become the string '0' so the numeric casts below can parse them
fp_df = fp_df.replace(to_replace=np.nan, value='0')

In [462]:
# Split the combined 'PKG/A' column into two columns ('PKG' and 'PKA'),
# then drop the original.
# A value with no '/' (for example the '0' placeholder written by the NaN
# fill) would leave None in 'PKA' and break the integer cast later on, so
# any missing piece is filled with '0'.
pk_parts = (fp_df['PKG/A'].str.split('/', n=1, expand=True)
            .reindex(columns=[0, 1])
            .fillna('0'))
fp_df['PKG'] = pk_parts[0]
fp_df['PKA'] = pk_parts[1]
fp_df.drop('PKG/A', axis=1, inplace=True)
fp_df = fp_df.replace('', '0')

In [570]:
# First token is the first name; everything after it is the last name
name_parts = fp_df['Player'].str.split(' ')
first_name = [parts[0] for parts in name_parts]
last_name = [' '.join(parts[1:]) for parts in name_parts]

In [572]:
# Attach the split name parts as new columns
for name_col, name_values in (('First Name', first_name),
                              ('Last Name', last_name)):
    fp_df[name_col] = name_values

#### Set Column Dtypes

In [464]:
# Columns cast to int
int_cols = [
    'GP', 'GS', 'MINS', 'G', 'A', 'SHTS', 'SOG',
    'GWG', 'HmG', 'RdG', 'Year', 'GWA', 'HmA', 'RdA',
    'FC', 'FS', 'OFF', 'YC', 'RC', 'PKG', 'PKA',
]
fp_df = fp_df.astype({col: int for col in int_cols})

In [465]:
# Columns cast to float
float_cols = ['G/90min', 'SC%', 'A/90min', 'SOG%']
fp_df = fp_df.astype({col: float for col in float_cols})

In [466]:
# Give the rate columns names without '/' or '%'
renamed_cols = {
    'G/90min': 'Gp90',
    'SC%': 'SCpct',
    'A/90min': 'Ap90',
    'SOG%': 'SOGpct',
}
fp_df = fp_df.rename(columns=renamed_cols)

#### Save Clean Data

In [None]:
# Write the cleaned field-player table to 'fieldplayer.csv' in the working directory
fp_df.to_csv('fieldplayer.csv')

### Clean Goalkeeper Stats

In [473]:
# Missing values become the string '0' so the numeric casts below can parse them
gk_df = gk_df.replace(to_replace=np.nan, value='0')

In [474]:
# Split the combined 'PKG/A' column into two columns ('PKG' and 'PKA'),
# then drop the original.
# A value with no '/' (for example the '0' placeholder written by the NaN
# fill) would leave None in 'PKA' and break the integer cast later on, so
# any missing piece is filled with '0'.
pk_parts = (gk_df['PKG/A'].str.split('/', n=1, expand=True)
            .reindex(columns=[0, 1])
            .fillna('0'))
gk_df['PKG'] = pk_parts[0]
gk_df['PKA'] = pk_parts[1]
gk_df.drop('PKG/A', axis=1, inplace=True)
gk_df = gk_df.replace('', '0')

#### Set Column Dtypes

In [481]:
# Columns cast to int
int_cols = [
    'GP', 'GS', 'MINS', 'SHTS', 'SV', 'GA', 'W',
    'L', 'T', 'ShO', 'Year', 'PKG', 'PKA',
]
gk_df = gk_df.astype({col: int for col in int_cols})

In [482]:
# Columns cast to float
float_cols = ['GAA', 'W%', 'Sv%']
gk_df = gk_df.astype({col: float for col in float_cols})

#### Save Clean Data

In [484]:
# Write the cleaned goalkeeper table to 'goalkeeper.csv' in the working directory
gk_df.to_csv('goalkeeper.csv')

### Merge FP with GK Stats

In [215]:
# Combine the field-player and goalkeeper tables.
# The frames are named fp_df and gk_df in this notebook; the previous
# fieldplayer_df / goalkeeping_df names are never assigned in it, so this
# cell raised NameError on a clean top-to-bottom run.
stats_df = merge_stats(fp_df, gk_df)

In [171]:
# Write the combined stats table to 'stats.csv' in the working directory
stats_df.to_csv('stats.csv')

## Salary Data

### Load Data

In [485]:
# Load the pickled salary table; the `with` block closes the file handle.
# NOTE: unpickling can execute arbitrary code; only load files you created.
with open('./data/salary_df.pkl', 'rb') as fh:
    salary_df = pickle.load(fh)

### Clean Data

In [488]:
# Trim the name fields, then build the full-name 'Player' key used for linking
for name_col in ('Last Name', 'First Name'):
    salary_df[name_col] = salary_df[name_col].str.strip()
salary_df['Player'] = salary_df['First Name'] + ' ' + salary_df['Last Name']

# The stats tables call the position column 'POS'; use the same name here
salary_df = salary_df.rename(columns={'Pos': 'POS'})

# Take the year from each record's date
salary_df['Year'] = salary_df['Date'].map(lambda release_date: release_date.year)

In [489]:
# Convert numeric str to float
def convert_money(money_str):
    """Convert a formatted currency string such as ``'$1,234.50'`` to a float.

    Surrounding whitespace, dollar signs and thousands separators are removed.
    A blank value or a lone ``'-'`` placeholder yields ``0.0``. A minus sign
    on a real amount is kept (it used to be rewritten to ``'0'``, which
    silently flipped the sign).

    Raises
    ------
    ValueError
        If the remaining text is not a valid number.
    """
    # Raw string: '\$' inside a normal literal is an invalid escape sequence.
    cleaned = re.sub(r'[$,]', '', money_str.strip()).strip()
    if cleaned in ('', '-'):
        return 0.0
    return float(cleaned)

# Convert Salary Columns
for money_col in ('Base Salary', 'Guaranteed Compensation'):
    salary_df[money_col] = salary_df[money_col].map(convert_money)

# # Pure Pandas Implementation
# salary_df = salary_df.replace('', 0)
# salary_df = salary_df.replace('-', 0)
# salary_df['Base Salary'] = salary_df['Base Salary'].replace('[\$,]', '', regex=True).astype(float)
# salary_df['Guaranteed Compensation'] = salary_df['Guaranteed Compensation'].replace('[\$,]', '', regex=True).astype(float)

In [495]:
# Average pay per (Year, Club, Player).
# The two pay columns are selected explicitly: pandas >= 2.0 raises when
# mean() meets text columns instead of silently dropping them.
tmp = (salary_df
       .groupby(['Year', 'Club', 'Player'], as_index=False)
       [['Base Salary', 'Guaranteed Compensation']]
       .mean())
tmp

Unnamed: 0,Year,Club,Player,Base Salary,Guaranteed Compensation
0,2007,CHI,Bakary Soumare,45000.00,78000.00
1,2007,CHI,Brian Plotkin,30000.00,30000.00
2,2007,CHI,Bruno Marques,12900.00,12900.00
3,2007,CHI,C.J. Brown,106391.00,106391.00
4,2007,CHI,Calen Carr,38000.00,50500.00
5,2007,CHI,Chad Barrett,41212.50,48712.50
6,2007,CHI,Chris Armas,225000.00,225000.00
7,2007,CHI,Chris Rolfe,70000.00,74700.00
8,2007,CHI,Cuauhtemoc Blanco,2492316.00,2666778.00
9,2007,CHI,Daniel Woolard,12900.00,12900.00


In [497]:
# Attach each player's per-year averages; averaged columns get the '_Avg' suffix
test = salary_df.merge(
    tmp,
    how='inner',
    on=['Year', 'Club', 'Player'],
    suffixes=('', '_Avg'),
)

In [502]:
def subset_latest(salary_df):
    """Keep, for every year, only the rows dated on that year's latest 'Date'.

    Parameters
    ----------
    salary_df : pandas.DataFrame
        Must contain 'Year' and 'Date' columns.

    Returns
    -------
    pandas.DataFrame
        New frame with a fresh 0..n-1 index. Years appear in ascending order
        and rows keep their original order within a year. Empty input gives
        an empty frame with the same columns.
    """
    # Latest date per year; groupby sorts the years in ascending order.
    latest_dates = salary_df.groupby('Year')['Date'].max()
    # Collect the pieces and concatenate once instead of growing a frame
    # inside the loop.
    yearly_frames = [salary_df[salary_df['Date'] == latest]
                     for latest in latest_dates]
    if not yearly_frames:
        # pd.concat raises on an empty list, so return an empty copy.
        return salary_df.iloc[0:0].reset_index(drop=True)
    return pd.concat(yearly_frames, axis=0, ignore_index=True)

In [503]:
# Keep only the rows from each year's most recent 'Date'
salary_df = subset_latest(salary_df)

### Split Salary Data into Field Players and Goalkeepers

In [521]:
# Boolean mask that is True on goalkeeper rows
gk_idx = salary_df['POS'].eq('GK')

gk_salary_df = salary_df.loc[gk_idx]
fp_salary_df = salary_df.loc[~gk_idx]

### Merge Field Player Stats and Salary Data

In [531]:
# Link stats to salaries on (Year, Player); clashing salary columns get '_Dup'
fp_combo_df = fp_df.merge(
    fp_salary_df,
    how='outer',
    on=['Year', 'Player'],
    suffixes=('', '_Dup'),
)

In [532]:
# Show the rows that have at least one missing value after the outer merge
fp_combo_df[fp_combo_df.isnull().any(axis=1)]

Unnamed: 0,Player,Club,POS,GP,GS,MINS,G,A,SHTS,SOG,...,RC,PKG,PKA,Club_Dup,Last Name,First Name,POS_Dup,Base Salary,Guaranteed Compensation,Date
6,Christian Gomez,DC,M,27.0,27.0,2272.0,10.0,9.0,82.0,44.0,...,0.0,2.0,3.0,,,,,,,
7,Jozy Altidore,TOR,F,22.0,15.0,1399.0,9.0,4.0,43.0,20.0,...,0.0,0.0,0.0,,,,,,,
9,Robbie Findley,LA,F,25.0,14.0,1353.0,8.0,0.0,31.0,16.0,...,0.0,0.0,0.0,,,,,,,
10,Fred,DC,M,26.0,23.0,2096.0,7.0,8.0,28.0,14.0,...,1.0,0.0,0.0,,,,,,,
22,Dwayne De Rosario,HOU,M,24.0,22.0,1973.0,6.0,4.0,62.0,24.0,...,0.0,1.0,2.0,,,,,,,
25,Danny Dichio,TOR,F,17.0,14.0,1175.0,6.0,1.0,20.0,13.0,...,1.0,0.0,0.0,,,,,,,
26,Jeff Cunningham,RSL,F,23.0,20.0,1636.0,6.0,1.0,32.0,21.0,...,0.0,1.0,1.0,,,,,,,
27,Juan Toja,DAL,M,27.0,27.0,2388.0,6.0,1.0,33.0,16.0,...,0.0,0.0,0.0,,,,,,,
45,Hérculez Gómez,COL,F,20.0,16.0,1537.0,4.0,2.0,51.0,22.0,...,0.0,0.0,0.0,,,,,,,
65,Roberto Brown,COL,F,13.0,8.0,696.0,3.0,0.0,14.0,4.0,...,1.0,0.0,0.0,,,,,,,


In [533]:
# Write the merged field-player stats/salary table to 'fp_combo2.csv'
fp_combo_df.to_csv('fp_combo2.csv')

In [504]:
# Pull one goalkeeper's stat rows to inspect how they link to the salary data
is_guzan = gk_df['Player'] == 'Brad Guzan'
guzan_stats = gk_df.loc[is_guzan]
guzan_stats

Unnamed: 0,Player,Club,POS,GP,GS,MINS,SHTS,SV,GA,GAA,W,L,T,ShO,W%,Sv%,Year,Season,PKG,PKA
8,Brad Guzan,ATL,GK,27,27,2430,119,87,25,0.93,14,6,7,13,51.9,73.1,2007,REG,2,3
57,Brad Guzan,ATL,GK,15,15,1350,71,48,20,1.33,6,5,4,4,40.0,67.6,2008,REG,1,1
593,Brad Guzan,ATL,GK,14,14,1260,47,38,10,0.71,6,1,7,8,42.9,80.9,2017,REG,1,1


In [505]:
# Salary rows for the same player
is_guzan = salary_df['Player'] == 'Brad Guzan'
guzan_money = salary_df.loc[is_guzan]
guzan_money

Unnamed: 0,Club,Last Name,First Name,POS,Base Salary,Guaranteed Compensation,Date,Player,Year
216,CHV,Guzan,Brad,GK,52237.5,67237.5,2007-08-31,Brad Guzan,2007
656,CHV,Guzan,Brad,GK,88974.0,103974.38,2008-10-07,Brad Guzan,2008
5083,ATL,Guzan,Brad,GK,340008.0,400008.0,2017-09-15,Brad Guzan,2017


In [508]:
# Trial merge that also keys on 'Club'. Rows only link where the club code is
# the same in both tables.
guzan_stats.merge(
    guzan_money,
    how='outer',
    on=['Year', 'Club', 'Player'],
    suffixes=('', '_Dup'),
)

Unnamed: 0,Player,Club,POS,GP,GS,MINS,SHTS,SV,GA,GAA,...,Year,Season,PKG,PKA,Last Name,First Name,POS_Dup,Base Salary,Guaranteed Compensation,Date
0,Brad Guzan,ATL,GK,27.0,27.0,2430.0,119.0,87.0,25.0,0.93,...,2007,REG,2.0,3.0,,,,,,
1,Brad Guzan,ATL,GK,15.0,15.0,1350.0,71.0,48.0,20.0,1.33,...,2008,REG,1.0,1.0,,,,,,
2,Brad Guzan,ATL,GK,14.0,14.0,1260.0,47.0,38.0,10.0,0.71,...,2017,REG,1.0,1.0,Guzan,Brad,GK,340008.0,400008.0,2017-09-15
3,Brad Guzan,CHV,,,,,,,,,...,2007,,,,Guzan,Brad,GK,52237.5,67237.5,2007-08-31
4,Brad Guzan,CHV,,,,,,,,,...,2008,,,,Guzan,Brad,GK,88974.0,103974.38,2008-10-07


In [509]:
# Outer-merge stats with salaries on (Year, Player).
# NOTE(review): `merged_df` is not assigned at notebook level in any visible
# cell (it is only a local name inside merge_stats) — confirm which frame was
# intended before re-running this cell.
cols = ['Year', 'Player']
df = pd.merge(merged_df, salary_df, how='outer', left_on=cols, right_on=cols, suffixes=['', '_Dup'])

In [511]:
# Last row of the goalkeeper stats within each (Year, Club, Player) group
gk_df.groupby(['Year', 'Club', 'Player']).last()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,POS,GP,GS,MINS,SHTS,SV,GA,GAA,W,L,T,ShO,W%,Sv%,Season,PKG,PKA
Year,Club,Player,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2007,ATL,Brad Guzan,GK,27,27,2430,119,87,25,0.93,14,6,7,13,51.9,73.1,REG,2,3
2007,CHI,Jon Busch,GK,3,3,270,20,15,5,1.67,1,2,0,1,33.3,75.0,REG,2,2
2007,CHI,Matt Pickens,GK,27,27,2430,137,102,31,1.15,9,8,10,10,33.3,74.5,REG,4,4
2007,CHI,Nick Noble,GK,0,0,0,0,0,0,0.00,0,0,0,0,0.0,0.0,REG,0,0
2007,CHV,Justin Myers,GK,0,0,0,0,0,0,0.00,0,0,0,0,0.0,0.0,REG,0,0
2007,CHV,Preston Burpo,GK,3,3,270,16,13,3,1.00,1,1,1,1,33.3,81.3,REG,0,0
2007,CLB,Andy Gruenebaum,GK,10,10,900,53,35,15,1.50,1,4,5,3,10.0,66.0,REG,0,0
2007,CLB,Bill Gaudette,GK,0,0,0,0,0,0,0.00,0,0,0,0,0.0,0.0,REG,0,0
2007,CLB,William Hesmer,GK,20,20,1800,99,71,29,1.45,8,7,5,5,40.0,71.7,REG,2,3
2007,COL,Bouna Coundoul,GK,30,30,2668,158,120,32,1.07,9,12,8,9,30.0,75.9,REG,2,2


In [246]:
# Write the merged stats/salary frame `df` to 'field_player.csv'
df.to_csv('field_player.csv')

#### Fix Player Name Association

In [249]:
import fuzzywuzzy as fwuzz
# Importing the package alone does not load its `fuzz` submodule, so load it
# explicitly; otherwise `fwuzz.fuzz` raises AttributeError in a fresh kernel.
import fuzzywuzzy.fuzz

In [251]:
# Sanity check: two identical strings should score 100
fwuzz.fuzz.ratio('Testing FuzzyWuzzy', 'Testing FuzzyWuzzy')

100

In [257]:
# Spelling variants of one player's name, used to compare the fuzzy scorers
str1, str2, str3, str4 = (
    'Fred da Silva Carreiro',
    'Fred Carreiro da Silva',
    'Fred Carreiro',
    'Fred',
)

In [261]:
# Summarise the merged frame: per-column non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6638 entries, 0 to 6637
Data columns (total 35 columns):
Player                     6638 non-null object
Club                       5312 non-null object
POS                        5316 non-null object
GP                         5312 non-null object
GS                         5312 non-null object
MINS                       5312 non-null object
G                          5312 non-null object
A                          5312 non-null object
SHTS                       5312 non-null object
SOG                        5312 non-null object
GWG                        5312 non-null object
PKG/A                      5316 non-null object
HmG                        5312 non-null object
RdG                        5312 non-null object
G/90min                    5316 non-null object
SC%                        5316 non-null object
Year                       6638 non-null int64
Season                     5316 non-null object
GWA                        531

In [258]:
# Plain ratio for each pair: (str1, str2), (str1, str3), (str2, str3)
for left, right in itertools.combinations((str1, str2, str3), 2):
    print(fwuzz.fuzz.ratio(left, right))

59
74
74


In [259]:
# Partial ratio for each pair: (str1, str2), (str1, str3), (str2, str3)
for left, right in itertools.combinations((str1, str2, str3), 2):
    print(fwuzz.fuzz.partial_ratio(left, right))

59
69
100


In [260]:
# Token-set ratio for each pair: (str1, str2), (str1, str3), (str2, str3)
for left, right in itertools.combinations((str1, str2, str3), 2):
    print(fwuzz.fuzz.token_set_ratio(left, right))

100
100
100
