## Linking Stat Data

### Imports

In [271]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from datetime import datetime 
from dateutil import parser
import os
import time
import itertools
import re
import pickle
import locale

### Stats Data

In [445]:
goals_df = pickle.load(open('./data/goals_df.pkl', 'rb'))
assists_df = pickle.load(open('./data/assists_df.pkl', 'rb'))
shots_df = pickle.load(open('./data/shots_df.pkl', 'rb'))
fouls_df = pickle.load(open('./data/fouls_df.pkl', 'rb'))
goalkeeping_df = pickle.load(open('./data/goalkeeping_df.pkl', 'rb'))

#### Merge Field Player Stats

In [446]:
def merge_stats(df1, df2):
    cols = list(set(df1.columns).intersection(df2.columns))
    merged_df = pd.merge(df1, df2, how='outer', left_on=cols, right_on=cols, suffixes=['', '_Dup'])
    return merged_df

In [457]:
fp_df = merge_stats(goals_df, assists_df)
fp_df = merge_stats(fp_df, shots_df)
fp_df = merge_stats(fp_df, fouls_df)

#### Clean Field Player Stats

In [461]:
# Replace null values with 0
fp_df = fp_df.replace(np.nan, '0')

In [462]:
# Split PKG/A column into to and drop
fp_df[['PKG', 'PKA']] = fp_df['PKG/A'].str.split('/', expand=True)
fp_df.drop('PKG/A', axis=1, inplace=True)
fp_df = fp_df.replace('', '0')

##### Set Column Dtypes

In [464]:
int_cols = ['GP', 'GS', 'MINS', 'G', 'A', 'SHTS', 'SOG', 'GWG', 
            'HmG', 'RdG', 'Year', 'GWA', 'HmA', 'RdA', 'FC', 'FS', 
            'OFF', 'YC', 'RC', 'PKG', 'PKA']
for col in int_cols:
    fp_df[col] = fp_df[col].astype(int)

In [465]:
float_cols = ['G/90min', 'SC%', 'A/90min', 'SOG%']
for col in float_cols:
    fp_df[col] = fp_df[col].astype(float)

In [466]:
fp_df.rename(columns={'G/90min': 'Gp90', 
                      'SC%': 'SCpct', 
                      'A/90min': 'Ap90', 
                      'SOG%': 'SOGpct'}, inplace=True)

In [None]:
fieldplayer_df.to_csv('fieldplayer.csv')

#### Merge Field Player Stats with Goalkeeping Stats

In [215]:
stats_df = merge_stats(fieldplayer_df, goalkeeping_df)

In [171]:
stats_df.to_csv('stats.csv')

### Salary Data

In [375]:
salary_df = pickle.load(open('./data/salary_df.pkl', 'rb'))

#### Clean Salary Dataframe

In [376]:
# Create Player column for linking
salary_df['Last Name'] = salary_df['Last Name'].str.strip()
salary_df['First Name'] = salary_df['First Name'].str.strip()
salary_df['Player'] = salary_df['First Name'] + ' ' + salary_df['Last Name']

# Rename Pos to POS for linking
salary_df.rename(columns={'Pos': 'POS'}, inplace=True)

# Create Year Column
salary_df['Year'] = salary_df.Date.map(lambda x: x.year)

In [377]:
# Convert numeric str to float
def convert_money(money_str):
    money_str = money_str.strip()
    # replace empty str with 0
    if len(money_str) == 0:
        money_str = re.sub('', '0', money_str)
    return float(re.sub('[\$,]', '', re.sub('-', '0', money_str)))

In [378]:
# Convert Salary Columns
salary_df['Base Salary'] = salary_df['Base Salary'].map(lambda x: convert_money(x))
salary_df['Guaranteed Compensation'] = salary_df['Guaranteed Compensation'].map(lambda x: convert_money(x))

In [374]:
# salary_df = salary_df.replace('', 0)
# salary_df = salary_df.replace('-', 0)
# salary_df['Base Salary'] = salary_df['Base Salary'].replace('[\$,]', '', regex=True).astype(float)
# salary_df['Guaranteed Compensation'] = salary_df['Guaranteed Compensation'].replace('[\$,]', '', regex=True).astype(float)

In [393]:
def subset_latest(salary_df):
    new_df = pd.DataFrame()
    for year in np.unique(salary_df['Year']):
        max_date = max(salary_df[salary_df['Year'] == year]['Date'])
        subset_df = salary_df[salary_df['Date'] == max_date]
        new_df = pd.concat([new_df, subset_df], axis=0, ignore_index=True)
    return new_df

In [394]:
salary_df = subset_latest(salary_df)

### Merge Stats with Salary Data

In [395]:
cols = ['Year', 'Player']
df = pd.merge(merged_df, salary_df, how='outer', left_on=cols, right_on=cols, suffixes=['', '_Dup'])

In [396]:
df

Unnamed: 0,Player,Club,POS,GP,GS,MINS,G,A,SHTS,SOG,...,OFF,YC,RC,Club_Dup,Last Name,First Name,POS_Dup,Base Salary,Guaranteed Compensation,Date
0,Luciano Emilio,DC,F,29,28,2410,20,1,79,47,...,21,2,0,DC,Emilio,Luciano,F,265000.00,293125.00,2007-08-31
1,Juan Pablo Angel,NY,F,24,24,2125,19,5,97,53,...,42,2,1,NY,Angel,Juan Pablo,F,1500000.00,1593750.00,2007-08-31
2,Taylor Twellman,NE,F,26,25,2283,16,3,90,55,...,21,2,0,NE,Twellman,Taylor,F,325008.00,350008.00,2007-08-31
3,Eddie Johnson,KC,F,24,24,2149,15,6,75,43,...,50,6,0,KC,Johnson,Eddie,F,750000.00,875000.00,2007-08-31
4,Maykel Galindo,CHV,F,28,24,2021,12,5,55,28,...,30,6,0,CHV,Galindo,Maykel,F,72500.00,72500.00,2007-08-31
5,Ante Razov,CHV,F,26,24,2041,11,8,85,42,...,30,2,0,CHV,Razov,Ante,F,245000.00,248750.00,2007-08-31
6,Christian Gomez,DC,M,27,27,2272,10,9,82,44,...,10,6,0,,,,,,,
7,Jozy Altidore,TOR,F,22,15,1399,9,4,43,20,...,12,4,0,,,,,,,
8,Landon Donovan,LA,M-F,25,24,2191,8,13,44,20,...,11,3,0,LA,Donovan,Landon,F,900000.00,900000.00,2007-08-31
9,Robbie Findley,LA,F,25,14,1353,8,0,31,16,...,14,0,0,,,,,,,


In [246]:
df.to_csv('field_player.csv')

#### Fix Player Name Association

In [249]:
import fuzzywuzzy as fwuzz

In [251]:
fwuzz.fuzz.ratio('Testing FuzzyWuzzy', 'Testing FuzzyWuzzy')

100

In [257]:
str1 = 'Fred da Silva Carreiro'
str2 = 'Fred Carreiro da Silva'
str3 = 'Fred Carreiro'
str4 = 'Fred'

In [261]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6638 entries, 0 to 6637
Data columns (total 35 columns):
Player                     6638 non-null object
Club                       5312 non-null object
POS                        5316 non-null object
GP                         5312 non-null object
GS                         5312 non-null object
MINS                       5312 non-null object
G                          5312 non-null object
A                          5312 non-null object
SHTS                       5312 non-null object
SOG                        5312 non-null object
GWG                        5312 non-null object
PKG/A                      5316 non-null object
HmG                        5312 non-null object
RdG                        5312 non-null object
G/90min                    5316 non-null object
SC%                        5316 non-null object
Year                       6638 non-null int64
Season                     5316 non-null object
GWA                        531

In [258]:
print(fwuzz.fuzz.ratio(str1, str2))
print(fwuzz.fuzz.ratio(str1, str3))
print(fwuzz.fuzz.ratio(str2, str3))

59
74
74


In [259]:
print(fwuzz.fuzz.partial_ratio(str1, str2))
print(fwuzz.fuzz.partial_ratio(str1, str3))
print(fwuzz.fuzz.partial_ratio(str2, str3))

59
69
100


In [260]:
print(fwuzz.fuzz.token_set_ratio(str1, str2))
print(fwuzz.fuzz.token_set_ratio(str1, str3))
print(fwuzz.fuzz.token_set_ratio(str2, str3))

100
100
100
