## Linking Stat Data

### Imports

In [5]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from datetime import datetime 
from dateutil import parser
import os
import time
import itertools
import re
import pickle

### Load Data

In [54]:
def _load_pickle(path):
    """Unpickle and return the object stored at ``path``.

    The file is opened in a ``with`` block so the handle is closed as soon as
    loading finishes instead of being left open for the rest of the session.
    """
    # NOTE(security): pickle.load can execute arbitrary code while loading;
    # only point this at files produced by this project.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Load each pickled table from ./data into its own module-level name.
goals_df = _load_pickle('./data/goals_df.pkl')
assists_df = _load_pickle('./data/assists_df.pkl')
shots_df = _load_pickle('./data/shots_df.pkl')
fouls_df = _load_pickle('./data/fouls_df.pkl')
goalkeeping_df = _load_pickle('./data/goalkeeping_df.pkl')
salary_df = _load_pickle('./data/salary_df.pkl')

In [199]:
def merge_stats(df1, df2):
    """Outer-join two tables on every column name they have in common.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        The tables to combine.

    Returns
    -------
    pandas.DataFrame
        The outer merge of ``df1`` and ``df2``. Columns present in both
        inputs are used as join keys and therefore appear once in the result.
    """
    # Derive the join keys from the arguments themselves rather than from the
    # notebook-level goals/assists frames, and keep df1's column order so the
    # key list is deterministic (a set intersection has no stable order).
    cols = [col for col in df1.columns if col in df2.columns]
    merged_df = pd.merge(df1, df2, how='outer',
                         on=cols,
                         suffixes=['', '_Dup'])
    return merged_df

In [201]:
# Outer-join the goals and assists tables with merge_stats and keep the result.
merged_df = merge_stats(goals_df, assists_df)

In [202]:
# Preview the first rows of the merged table.
merged_df.head()

Unnamed: 0,Player,Club,POS,GP,GS,MINS,G,A,SHTS,SOG,...,HmG,RdG,G/90min,SC%,Year,Season,GWA,HmA,RdA,A/90min
0,Luciano Emilio,DC,F,29,28,2410,20,1,79,47,...,12,8,0.75,25.3,2007,REG,1,1,0,0.04
1,Juan Pablo Angel,NY,F,24,24,2125,19,5,97,53,...,13,6,0.8,19.6,2007,REG,2,3,2,0.21
2,Taylor Twellman,NE,F,26,25,2283,16,3,90,55,...,7,9,0.63,17.8,2007,REG,1,2,1,0.12
3,Eddie Johnson,KC,F,24,24,2149,15,6,75,43,...,8,7,0.63,20.0,2007,REG,2,3,3,0.25
4,Maykel Galindo,CHV,F,28,24,2021,12,5,55,28,...,8,4,0.53,21.8,2007,REG,3,3,2,0.22


In [179]:
# Fold the shots table into the running merge with an outer join keyed on
# fp_cols; non-key column names that clash take a '_Dup' suffix on the
# shots_df side.
# NOTE(review): fp_cols is not assigned in any cell visible in this notebook,
# so this cell presumably relies on leftover kernel state -- confirm and
# define it explicitly before this cell.
merged_df = merged_df.merge(
    shots_df,
    on=fp_cols,
    how='outer',
    suffixes=['', '_Dup'],
)

In [180]:
# Outer-join the fouls table onto the running merge, matching rows on
# Year, Club and Player only; any other column name shared with fouls_df
# gets a '_Dup' suffix on the fouls_df side.
merged_df = merged_df.merge(
    fouls_df,
    on=['Year', 'Club', 'Player'],
    how='outer',
    suffixes=['', '_Dup'],
)

In [181]:
# Display the merged table after the shots and fouls joins.
merged_df

Unnamed: 0,Player,Club,POS,GP,GS,MINS,G,A,SHTS,SOG,...,G_Dup,A_Dup,SHTS_Dup,SOG_Dup,FC,FS,OFF,YC,RC,Season_Dup
0,Luciano Emilio,DC,F,29,28,2410,20,1,79,47,...,20,1,79,47,35,39,21,2,0,REG
1,Juan Pablo Angel,NY,F,24,24,2125,19,5,97,53,...,19,5,97,53,31,20,42,2,1,REG
2,Taylor Twellman,NE,F,26,25,2283,16,3,90,55,...,16,3,90,55,13,41,21,2,0,REG
3,Eddie Johnson,KC,F,24,24,2149,15,6,75,43,...,15,6,75,43,21,57,50,6,0,REG
4,Maykel Galindo,CHV,F,28,24,2021,12,5,55,28,...,12,5,55,28,44,53,30,6,0,REG
5,Ante Razov,CHV,F,26,24,2041,11,8,85,42,...,11,8,85,42,37,33,30,2,0,REG
6,Christian Gomez,DC,M,27,27,2272,10,9,82,44,...,10,9,82,44,34,43,10,6,0,REG
7,Jozy Altidore,TOR,F,22,15,1399,9,4,43,20,...,9,4,43,20,38,43,12,4,0,REG
8,Landon Donovan,LA,M-F,25,24,2191,8,13,44,20,...,8,13,44,20,19,40,11,3,0,REG
9,Robbie Findley,LA,F,25,14,1353,8,0,31,16,...,8,0,31,16,15,27,14,0,0,REG


In [37]:
# Group the merged table by (Year, Club, POS) and show GroupBy.last() for each
# group. The result is only displayed, not stored.
merged_df.groupby(by=['Year', 'Club', 'POS']).last()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,A,A/90min,FC,FS,G,G/90min,GP,GS,GWA,GWG,...,RC,RdA,RdG,SC%,SC%_Dup,SHTS,SOG,SOG%,Season,YC
Year,Club,POS,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
2007,ATL,D,0,0.00,5,10,1,0.04,25,25,0,0,...,0,0,1,100.0,100.0,1,1,100.0,REG,0
2007,ATL,F,0,0.00,8,21,1,0.08,24,10,0,0,...,0,0,1,4.2,4.2,24,10,41.7,REG,1
2007,ATL,M,4,0.15,61,30,3,0.11,28,28,1,1,...,0,1,2,10.7,10.7,28,9,32.1,REG,5
2007,CHI,D,0,0.00,31,21,0,0.00,29,29,0,0,...,0,0,0,0.0,0.0,7,2,28.6,REG,7
2007,CHI,F,0,0.00,1,5,0,0.00,6,2,0,0,...,0,0,0,0.0,0.0,1,1,100.0,REG,0
2007,CHI,M,0,0.00,7,9,0,0.00,8,5,0,0,...,0,0,0,0.0,0.0,8,3,37.5,REG,0
2007,CHV,D,0,0.00,19,10,0,0.00,23,23,0,0,...,1,0,0,0.0,0.0,1,1,100.0,REG,2
2007,CHV,F,0,0.00,2,0,0,0.00,2,0,0,0,...,0,0,0,0.0,0.0,2,0,0.0,REG,0
2007,CHV,M,0,0.00,5,3,0,0.00,4,4,0,0,...,0,0,0,0.0,0.0,9,4,44.4,REG,0
2007,CLB,D,0,0.00,9,7,0,0.00,6,6,0,0,...,0,0,0,0.0,0.0,7,2,28.6,REG,0


In [169]:
# Outer-join the goalkeeping table onto the merged outfield stats, keyed on
# cols; non-key column names that clash take a '_Dup' suffix on the
# goalkeeping_df side.
# NOTE(review): no module-level `cols` is assigned in any cell visible in this
# notebook (it only exists as a local inside merge_stats), so this cell
# presumably relies on leftover kernel state -- confirm and define it
# explicitly before this cell.
merged_df2 = merged_df.merge(
    goalkeeping_df,
    on=cols,
    how='outer',
    suffixes=['', '_Dup'],
)

In [170]:
# Display the table produced by joining in the goalkeeping data.
merged_df2

Unnamed: 0,Player,Club,POS,GP,GS,MINS,G,A,SHTS,SOG,...,GA,GAA,PKG/A_Dup,W,L,T,ShO,W%,Sv%,Season_Dup
0,Luciano Emilio,DC,F,29,28,2410,20,1,79,47,...,,,,,,,,,,
1,Juan Pablo Angel,NY,F,24,24,2125,19,5,97,53,...,,,,,,,,,,
2,Taylor Twellman,NE,F,26,25,2283,16,3,90,55,...,,,,,,,,,,
3,Eddie Johnson,KC,F,24,24,2149,15,6,75,43,...,,,,,,,,,,
4,Maykel Galindo,CHV,F,28,24,2021,12,5,55,28,...,,,,,,,,,,
5,Ante Razov,CHV,F,26,24,2041,11,8,85,42,...,,,,,,,,,,
6,Christian Gomez,DC,M,27,27,2272,10,9,82,44,...,,,,,,,,,,
7,Jozy Altidore,TOR,F,22,15,1399,9,4,43,20,...,,,,,,,,,,
8,Landon Donovan,LA,M-F,25,24,2191,8,13,44,20,...,,,,,,,,,,
9,Robbie Findley,LA,F,25,14,1353,8,0,31,16,...,,,,,,,,,,


In [171]:
# Write merged_df2 to 'merged_df2.csv' (path is relative to the working directory).
merged_df2.to_csv('merged_df2.csv')

In [165]:
# Display the goalkeeping table for reference.
goalkeeping_df

Unnamed: 0,Player,Club,POS,GP,GS,MINS,SHTS,SV,GA,GAA,PKG/A,W,L,T,ShO,W%,Sv%,Year,Season
0,Bouna Coundoul,COL,GK,30,30,2668,158,120,32,1.07,2/2,9,12,8,9,30.0,75.9,2007,REG
1,Kevin Hartman,KC,GK,30,30,2700,159,110,45,1.50,6/7,11,12,7,5,36.7,69.2,2007,REG
2,Matt Reis,NE,GK,30,30,2700,169,120,43,1.43,3/3,14,8,8,10,46.7,71.0,2007,REG
3,Joe Cannon,LA,GK,29,29,2610,171,119,46,1.59,4/5,9,13,7,5,31.0,69.6,2007,REG
4,Troy Perkins,DC,GK,29,29,2610,155,117,32,1.10,1/3,16,6,7,8,55.2,75.5,2007,REG
5,Pat Onstad,HOU,GK,27,27,2418,109,85,22,0.81,1/2,13,8,6,11,48.1,78.0,2007,REG
6,Nick Rimando,RSL,GK,27,27,2430,191,146,37,1.37,4/6,6,13,8,7,22.2,76.4,2007,REG
7,Matt Pickens,CHI,GK,27,27,2430,137,102,31,1.15,4/4,9,8,10,10,33.3,74.5,2007,REG
8,Brad Guzan,ATL,GK,27,27,2430,119,87,25,0.93,2/3,14,6,7,13,51.9,73.1,2007,REG
9,William Hesmer,CLB,GK,20,20,1800,99,71,29,1.45,2/3,8,7,5,5,40.0,71.7,2007,REG


In [45]:
# Outer-join goals and assists matching only on Year, Club and Player. Every
# other column the two tables share appears twice in the result, with the
# assists_df copy carrying a '_Dup' suffix. The result is displayed, not stored.
pd.merge(
    goals_df,
    assists_df,
    on=['Year', 'Club', 'Player'],
    how='outer',
    suffixes=['', '_Dup'],
)

Unnamed: 0,Player,Club,POS,GP,GS,MINS,G,A,SHTS,SOG,...,Season,POS_Dup,GP_Dup,GS_Dup,A_Dup,GWA,HmA,RdA,A/90min,Season_Dup
0,Luciano Emilio,DC,F,29,28,2410,20,1,79,47,...,REG,F,29,28,1,1,1,0,0.04,REG
1,Juan Pablo Angel,NY,F,24,24,2125,19,5,97,53,...,REG,F,24,24,5,2,3,2,0.21,REG
2,Taylor Twellman,NE,F,26,25,2283,16,3,90,55,...,REG,F,26,25,3,1,2,1,0.12,REG
3,Eddie Johnson,KC,F,24,24,2149,15,6,75,43,...,REG,F,24,24,6,2,3,3,0.25,REG
4,Maykel Galindo,CHV,F,28,24,2021,12,5,55,28,...,REG,F,28,24,5,3,3,2,0.22,REG
5,Ante Razov,CHV,F,26,24,2041,11,8,85,42,...,REG,F,26,24,8,4,5,3,0.35,REG
6,Christian Gomez,DC,M,27,27,2272,10,9,82,44,...,REG,M,27,27,9,5,7,2,0.36,REG
7,Jozy Altidore,TOR,F,22,15,1399,9,4,43,20,...,REG,F,22,15,4,2,2,2,0.26,REG
8,Landon Donovan,LA,M-F,25,24,2191,8,13,44,20,...,REG,M-F,25,24,13,3,10,3,0.53,REG
9,Robbie Findley,LA,F,25,14,1353,8,0,31,16,...,REG,F,25,14,0,0,0,0,0.00,REG


In [50]:
# Same goals/assists combination expressed with DataFrame.join: assists_df is
# re-indexed by (Year, Club, Player) and outer-joined against those columns of
# goals_df. Overlapping column names from assists_df get the '_Dup' suffix.
joined_df = goals_df.join(assists_df.set_index(['Year', 'Club', 'Player']), 
                          how='outer',                          
                          on=['Year', 'Club', 'Player'],
                          rsuffix='_Dup')

In [51]:
# Display the result of the join-based combination.
joined_df

Unnamed: 0,Player,Club,POS,GP,GS,MINS,G,A,SHTS,SOG,...,Season,POS_Dup,GP_Dup,GS_Dup,A_Dup,GWA,HmA,RdA,A/90min,Season_Dup
0,Luciano Emilio,DC,F,29,28,2410,20,1,79,47,...,REG,F,29,28,1,1,1,0,0.04,REG
1,Juan Pablo Angel,NY,F,24,24,2125,19,5,97,53,...,REG,F,24,24,5,2,3,2,0.21,REG
2,Taylor Twellman,NE,F,26,25,2283,16,3,90,55,...,REG,F,26,25,3,1,2,1,0.12,REG
3,Eddie Johnson,KC,F,24,24,2149,15,6,75,43,...,REG,F,24,24,6,2,3,3,0.25,REG
4,Maykel Galindo,CHV,F,28,24,2021,12,5,55,28,...,REG,F,28,24,5,3,3,2,0.22,REG
5,Ante Razov,CHV,F,26,24,2041,11,8,85,42,...,REG,F,26,24,8,4,5,3,0.35,REG
6,Christian Gomez,DC,M,27,27,2272,10,9,82,44,...,REG,M,27,27,9,5,7,2,0.36,REG
7,Jozy Altidore,TOR,F,22,15,1399,9,4,43,20,...,REG,F,22,15,4,2,2,2,0.26,REG
8,Landon Donovan,LA,M-F,25,24,2191,8,13,44,20,...,REG,M-F,25,24,13,3,10,3,0.53,REG
9,Robbie Findley,LA,F,25,14,1353,8,0,31,16,...,REG,F,25,14,0,0,0,0,0.00,REG


In [None]:
# Build the full player name from its two parts. Missing parts are treated as
# empty strings and surrounding whitespace is stripped, so a single-name player
# with a blank First Name comes out as 'Kaka' rather than ' Kaka' or NaN.
salary_df['Player'] = (
    salary_df['First Name'].fillna('') + ' ' + salary_df['Last Name'].fillna('')
).str.strip()

In [161]:
# DataFrame.rename returns a new frame, so assign it back; otherwise the
# 'Pos' -> 'POS' rename is only shown in this cell's output and never persists.
# Re-running is harmless: renaming a column that is absent is a no-op.
salary_df = salary_df.rename(columns={'Pos': 'POS'})
salary_df

Unnamed: 0,Club,Last Name,First Name,POS,Base Salary,Guaranteed Compensation,Date,Player,Year
0,TFC,Attakora-Gyan,Nana,D,"$12,900.00","$12,900.00",2007-08-31,Nana Attakora-Gyan,2007
1,CHI,Banner,Michael,M,"$12,900.00","$12,900.00",2007-08-31,Michael Banner,2007
2,DAL,Botero,Juan Sebastian,M,"$12,900.00","$12,900.00",2007-08-31,Juan Sebastian Botero,2007
3,CHV,Brooks,Desmond,D,"$12,900.00","$12,900.00",2007-08-31,Desmond Brooks,2007
4,CHV,Burling,Bobby,F-D,"$12,900.00","$12,900.00",2007-08-31,Bobby Burling,2007
5,NY,Caccavale,Salvatore,M,"$12,900.00","$12,900.00",2007-08-31,Salvatore Caccavale,2007
6,NY,Camp,Blake,M,"$12,900.00","$12,900.00",2007-08-31,Blake Camp,2007
7,DC,Carroll,Jeff,M-D,"$12,900.00","$12,900.00",2007-08-31,Jeff Carroll,2007
8,LA,Caso,Mike,M,"$12,900.00","$12,900.00",2007-08-31,Mike Caso,2007
9,NY,Cepero,Danny,GK,"$12,900.00","$12,900.00",2007-08-31,Danny Cepero,2007


In [94]:
# salary_df['Year'] = list(map(lambda x : x.year, salary_df['Date']))

In [116]:
# Store the `.year` attribute of each Date value in a Year column.
salary_df['Year'] = salary_df['Date'].apply(lambda release_date: release_date.year)

In [124]:
# Display the salary table.
salary_df

Unnamed: 0,Club,Last Name,First Name,Pos,Base Salary,Guaranteed Compensation,Date,Player,Year,Year Day
0,ORL,Kaka,,M,"$6,660,000.00","$7,167,500.00",2017-09-15,Kaka,2017,258
1,TOR,Giovinco,Sebastian,F,"$5,600,000.00","$7,115,555.67",2017-09-15,Sebastian Giovinco,2017,258
2,TOR,Bradley,Michael,M,"$6,000,000.00","$6,500,000.00",2017-09-15,Michael Bradley,2017,258
3,NYCFC,Pirlo,Andrea,M,"$5,600,000.00","$5,915,690.00",2017-09-15,Andrea Pirlo,2017,258
4,NYCFC,Villa,David,F,"$5,610,000.00","$5,610,000.00",2017-09-15,David Villa,2017,258
5,LA,Dos Santos,Giovani,F,"$3,750,000.00","$5,500,000.00",2017-09-15,Giovani Dos Santos,2017,258
6,CHI,Schweinsteiger,Bastian,M,"$5,400,000.00","$5,400,000.00",2017-09-15,Bastian Schweinsteiger,2017,258
7,TOR,Altidore,Jozy,F,"$4,875,000.00","$4,875,000.00",2017-09-15,Jozy Altidore,2017,258
8,SEA,Dempsey,Clint,F,"$3,200,000.00","$3,892,933.50",2017-09-15,Clint Dempsey,2017,258
9,POR,Valeri,Diego,M,"$2,227,500.00","$2,607,500.00",2017-09-15,Diego Valeri,2017,258


In [157]:
def subset_on_latest_salary_release_date(salary_df):
    """Keep only the rows belonging to each year's most recent release date.

    For every distinct value in the ``Year`` column, the latest ``Date`` among
    that year's rows is determined, and all rows of ``salary_df`` carrying
    that ``Date`` are kept.

    Parameters
    ----------
    salary_df : pandas.DataFrame
        Must contain ``Year`` and ``Date`` columns.

    Returns
    -------
    pandas.DataFrame
        The selected rows, ordered by ascending year (original row order
        within a year), with a fresh 0..n-1 index. For empty input an empty
        frame with the same columns is returned.
    """
    if salary_df.empty:
        # Nothing to select; keep the schema rather than a column-less frame.
        return salary_df.iloc[0:0].reset_index(drop=True)

    # One grouped pass yields each year's latest date, sorted by year.
    latest_dates = salary_df.groupby('Year')['Date'].max()

    # Collect the per-year slices and concatenate once; growing a DataFrame
    # with concat inside the loop recopies the accumulated rows every time.
    subsets = [salary_df[salary_df['Date'] == max_date]
               for max_date in latest_dates]
    if not subsets:
        # Every Year value was missing, so there is no group to select from.
        return salary_df.iloc[0:0].reset_index(drop=True)
    return pd.concat(subsets, axis=0, ignore_index=True)

In [160]:
# Replace salary_df with only the rows from each year's latest release date.
salary_df = subset_on_latest_salary_release_date(salary_df)