In [2]:
%load_ext autoreload

%autoreload 2

import configparser
import os
import time
import random
import pandas as pd
import requests
import openpyxl
import sqlite3

from datetime import datetime
from io import StringIO
from pathlib import Path
from bs4 import BeautifulSoup
from alive_progress import alive_bar

# some stuff I set up in a config file so I don't have to keep updating certain
# variables in every script
config = configparser.ConfigParser()
config.read('../src/config.ini')
output = Path(config['paths']['output'])

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [70]:
def combine_player_data(start_year:int, end_year:int) -> pd.DataFrame:
    # initialize empty dataframe
    all_data = pd.DataFrame()
    
    # combine all data into a df that is used for cleaning
    for year in range(start_year, end_year+1):
        player_data = pd.read_csv(config['data'][f'all_player_data_{year}'])
        all_data = pd.concat([all_data, player_data], ignore_index=True)
    
    all_data = adjust_initial_df(all_data)
    
    # switch to MLS data
    all_data.replace('On matchday squad, but did not play', 0, inplace=True)
    
    return all_data[all_data['Comp'] == 'MLS']

def adjust_initial_df(df)->pd.DataFrame:
    correct_col_names = ['Date', 'Day', 'Comp', 'Round', 'Venue', 'Result', 'Squad', 'Opponent',
       'Start', 'Pos', 'Min', 'Gls', 'Ast', 'PK', 'PKatt', 'Sh', 'SoT', 'CrdY',
       'CrdR', 'Touches', 'Tkl', 'Int', 'Blocks', 'xG', 'npxG', 'xAG', 'SCA',
       'GCA', 'Cmp', 'Att', 'Cmp%', 'PrgP', 'Carries', 'PrgC', 'Att_TakeOn', 'Succ',
       'Match Report', 'player_url']
    df.columns = correct_col_names
    game_keys = []
    for index, player in df.iterrows():
        if player['Venue'] == 'Home':
            game_keys.append(f'{player['Date']} {player['Squad']} vs {player['Opponent']}')
        else:
            game_keys.append(f'{player['Date']} {player['Opponent']} vs {player['Squad']}')
    df['game_key'] = game_keys
    return df

def create_match_data(player_data_df):
    player_data_df['game_key'] = player_data_df.apply(create_game_key, axis=1)
    create_dependent_variables(player_data_df)
    
    desired_cols = ['Date', 'Day', 'Round', 'Squad', 'Opponent', 'Result', 'OverallResult', 'home_score', 'home_penalties', 'away_score', 'away_penalties', 'game_key']
    updated_names = ['game_date', 'day', 'round', 'home_team', 'away_team', 'result', 'overall_result', 'home_score', 'home_penalties', 'away_score', 'away_penalties', 'game_key']
    
    player_data_df = player_data_df[desired_cols]
    player_data_df.columns = updated_names
    player_data_df.drop_duplicates(inplace=True)
    
    return player_data_df.sort_values(by=['game_date'])
    
def create_game_key(row):
    if row['Venue'] == 'Home':
        return f'{row['Date']} {row['Squad']} vs {row['Opponent']}'
    else:
        return f'{row['Date']} {row['Opponent']} vs {row['Squad']}'

def create_dependent_variables(df):
    df['Results_Raw'] = df['Result'].apply(lambda x: x.replace(' ', '–').replace('(', '').replace(')', '').split('–'))
    df['OverallResult'] = df['Results_Raw'].apply(lambda x: x[0])
    df['home_score'] = df['Results_Raw'].apply(lambda x: int(x[1]))
    df['home_penalties' ] = df['Results_Raw'].apply(lambda x: int(x[2]) if len(x)==5 else 0)
    df['away_score'] = df['Results_Raw'].apply(lambda x: int(x[3]) if len(x)==5 else int(x[2]))
    df['away_penalties' ] = df['Results_Raw'].apply(lambda x: int(x[4]) if len(x)==5 else 0)
    df['game_key'] = df.apply(lambda x: f'{x['Date']} {x['Squad']} vs {x['Opponent']}', axis=1)
    
def clean_data_for_modeling(player_data, match_data):
    players_numeric_columns = ['Min', 'Gls', 'Ast', 'PK', 'PKatt', 'Sh', 'SoT', 'CrdY',
       'CrdR', 'Touches', 'Tkl', 'Int', 'Blocks', 'xG', 'npxG', 'xAG', 'SCA',
       'GCA', 'Cmp', 'Att', 'Cmp%', 'PrgP', 'Carries', 'PrgC', 'Att_TakeOn',
       'Succ']
    
    
    for col in players_numeric_columns:
        player_data[col] = player_data[col].astype(float)
    
    cleaned_df = pd.DataFrame()

    for match in match_data['game_key']:
        current_match_data = match_data[match_data['game_key'] == match].copy().reset_index()
        current_match_date = current_match_data['game_date'][0]
        current_match_year = int(current_match_data['game_date'][0][-4:])
        home_team = current_match_data['home_team'][0]
        away_team = current_match_data['away_team'][0]
        

        prior_home_stats = player_data.query(f'Date<"{current_match_date}" and Date > "{current_match_year-1}-12-31" and Squad=="{home_team}" and Venue=="Home" and Start!="N"')
        prior_home_stats = prior_home_stats[players_numeric_columns]
        prior_away_stats = player_data.query(f'Date<"{current_match_date}" and Date > "{current_match_year-1}-12-31" and Squad=="{away_team}" and Venue=="Home" and Start!="N"')
        prior_away_stats = prior_away_stats[players_numeric_columns]
        
        prior_home_stats.columns = [f'home_prior_{x}' for x in prior_home_stats.columns]
        prior_away_stats.columns = [f'away_prior_{x}' for x in prior_away_stats.columns]
        
        if prior_home_stats.shape[0]==0:
            temp = prior_home_stats.columns
            prior_home_stats = pd.DataFrame([0]*len(prior_home_stats.columns)).T
            prior_home_stats.columns = temp
            
        if prior_away_stats.shape[0]==0:
            temp = prior_away_stats.columns
            prior_away_stats = pd.DataFrame([0]*len(prior_away_stats.columns)).T
            prior_away_stats.columns = temp

        prior_home_stats = pd.DataFrame(prior_home_stats.describe().T['mean']).T.reset_index()
        prior_away_stats = pd.DataFrame(prior_away_stats.describe().T['mean']).T.reset_index()
        
        final_row = pd.concat([current_match_data, prior_home_stats, prior_away_stats], axis=1)
        
        cleaned_df = pd.concat([cleaned_df, final_row], axis=0)
        
    return cleaned_df

In [71]:
player_data = combine_player_data(2022, 2024)

In [72]:
player_data.head()

Unnamed: 0,Date,Day,Comp,Round,Venue,Result,Squad,Opponent,Start,Pos,...,Att,Cmp%,PrgP,Carries,PrgC,Att_TakeOn,Succ,Match Report,player_url,game_key
0,2/26/2022,Sat,MLS,Regular Season,Home,D 1–1,Philadelphia Union,Minnesota Utd,N,AM,...,7,85.7,2,6,1,0,0,Match Report,https://fbref.com/en/players/672f4300/matchlog...,2/26/2022 Philadelphia Union vs Minnesota Utd
1,3/5/2022,Sat,MLS,Regular Season,Away,W 2–1,Philadelphia Union,CF Montréal,N,0,...,0,0.0,0,0,0,0,0,Match Report,https://fbref.com/en/players/672f4300/matchlog...,3/5/2022 CF Montréal vs Philadelphia Union
2,3/12/2022,Sat,MLS,Regular Season,Home,W 2–0,Philadelphia Union,SJ Earthquakes,N,FW,...,4,100.0,1,7,1,1,0,Match Report,https://fbref.com/en/players/672f4300/matchlog...,3/12/2022 Philadelphia Union vs SJ Earthquakes
3,3/19/2022,Sat,MLS,Regular Season,Away,W 2–0,Philadelphia Union,NYCFC,N,0,...,0,0.0,0,0,0,0,0,Match Report,https://fbref.com/en/players/672f4300/matchlog...,3/19/2022 NYCFC vs Philadelphia Union
4,4/2/2022,Sat,MLS,Regular Season,Home,W 2–0,Philadelphia Union,Charlotte,N,CM,...,0,,0,0,0,0,0,Match Report,https://fbref.com/en/players/672f4300/matchlog...,4/2/2022 Philadelphia Union vs Charlotte


In [73]:
matches_df = create_match_data(player_data)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  player_data_df.drop_duplicates(inplace=True)


In [74]:
matches_df.tail()

Unnamed: 0,game_date,day,round,home_team,away_team,result,overall_result,home_score,home_penalties,away_score,away_penalties,game_key
27105,9/9/2023,Sat,Regular Season,NE Revolution,Minnesota Utd,D 1–1,D,1,0,1,0,9/9/2023 NE Revolution vs Minnesota Utd
26699,9/9/2023,Sat,Regular Season,SJ Earthquakes,D.C. United,D 0–0,D,0,0,0,0,9/9/2023 SJ Earthquakes vs D.C. United
30414,9/9/2023,Sat,Regular Season,LAFC,Portland Timbers,L 0–2,L,0,0,2,0,9/9/2023 LAFC vs Portland Timbers
28373,9/9/2023,Sat,Regular Season,Portland Timbers,LAFC,W 2–0,W,2,0,0,0,9/9/2023 Portland Timbers vs LAFC
26997,9/9/2023,Sat,Regular Season,D.C. United,SJ Earthquakes,D 0–0,D,0,0,0,0,9/9/2023 D.C. United vs SJ Earthquakes


In [75]:
cleaned_df = clean_data_for_modeling(player_data, matches_df)

In [76]:
cleaned_df

Unnamed: 0,index,game_date,day,round,home_team,away_team,result,overall_result,home_score,home_penalties,...,away_prior_SCA,away_prior_GCA,away_prior_Cmp,away_prior_Att,away_prior_Cmp%,away_prior_PrgP,away_prior_Carries,away_prior_PrgC,away_prior_Att_TakeOn,away_prior_Succ
0,935,10/1/2022,Sat,Regular Season,NY Red Bulls,Columbus Crew,L 1–2,L,1,0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
0,1157,10/1/2022,Sat,Regular Season,Columbus Crew,NY Red Bulls,W 2–1,W,2,0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
0,4881,10/1/2022,Sat,Regular Season,FC Dallas,Colorado Rapids,L 0–1,L,0,0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
0,764,10/1/2022,Sat,Regular Season,Atlanta Utd,NE Revolution,L 1–2,L,1,0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
0,4601,10/1/2022,Sat,Regular Season,Austin,Vancouver W'caps,L 0–2,L,0,0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,27105,9/9/2023,Sat,Regular Season,NE Revolution,Minnesota Utd,D 1–1,D,1,0,...,2.022945,0.177820,29.862333,38.512428,76.026960,3.732314,25.636711,1.374761,1.673040,0.850860
0,26699,9/9/2023,Sat,Regular Season,SJ Earthquakes,D.C. United,D 0–0,D,0,0,...,1.895161,0.197581,27.614919,36.411290,74.541129,2.973790,23.379032,1.191532,1.205645,0.572581
0,30414,9/9/2023,Sat,Regular Season,LAFC,Portland Timbers,L 0–2,L,0,0,...,2.164557,0.291139,32.343882,40.588608,77.979114,3.626582,28.040084,1.402954,1.603376,0.767932
0,28373,9/9/2023,Sat,Regular Season,Portland Timbers,LAFC,W 2–0,W,2,0,...,2.361396,0.285421,33.960986,42.158111,78.806160,3.932238,29.983573,1.687885,1.839836,0.895277
