In [1]:
%load_ext autoreload

%autoreload 2

import configparser
import os
import time
import random
import pandas as pd
import requests
import openpyxl
import sqlite3

from datetime import datetime
from io import StringIO
from pathlib import Path
from bs4 import BeautifulSoup
from alive_progress import alive_bar

# some stuff I set up in a config file so I don't have to keep updating certain
# variables in every script
config = configparser.ConfigParser()
config.read('../src/config.ini')
output = Path(config['paths']['output'])

In [None]:
def combine_player_data(start_year:int, end_year:int) -> pd.DataFrame:
    # initialize empty dataframe
    all_data = pd.DataFrame()
    
    # combine all data into a df that is used for cleaning
    for year in range(start_year, end_year+1):
        player_data = pd.read_csv(config['data'][f'all_player_data_{year}'])
        all_data = pd.concat([all_data, player_data], ignore_index=True)

    all_data.drop(columns = "Unnamed: 0", inplace=True)

    all_data = adjust_initial_df(all_data)

    # switch to MLS data
    all_data.replace('On matchday squad, but did not play', 0, inplace=True)
    
    return all_data[all_data['Comp'] == 'MLS']

def adjust_initial_df(df)->pd.DataFrame:
    correct_col_names = ['Date', 'Day', 'Comp', 'Round', 'Venue', 'Result', 'Squad', 'Opponent',
       'Start', 'Pos', 'Min', 'Gls', 'Ast', 'PK', 'PKatt', 'Sh', 'SoT', 'CrdY',
       'CrdR', 'Touches', 'Tkl', 'Int', 'Blocks', 'xG', 'npxG', 'xAG', 'SCA',
       'GCA', 'Cmp', 'Att', 'Cmp%', 'PrgP', 'Carries', 'PrgC', 'Att_TakeOn', 'Succ',
       'Match Report', 'player_url']
    df.columns = correct_col_names
    game_keys = []
    for index, player in df.iterrows():
        if player['Venue'] == 'Home':
            game_keys.append(f'{player['Date']} {player['Squad']} vs {player['Opponent']}')
        else:
            game_keys.append(f'{player['Date']} {player['Opponent']} vs {player['Squad']}')
    df['game_key'] = game_keys
    return df

def create_match_data(player_data_df):
    player_data_df['game_key'] = player_data_df.apply(create_game_key, axis=1)
    create_dependent_variables(player_data_df)
    
    desired_cols = ['Date', 'Day', 'Round', 'Squad', 'Opponent', 'Result', 'OverallResult', 'home_score', 'home_penalties', 'away_score', 'away_penalties', 'game_key']
    updated_names = ['game_date', 'day', 'round', 'home_team', 'away_team', 'result', 'overall_result', 'home_score', 'home_penalties', 'away_score', 'away_penalties', 'game_key']
    
    player_data_df = player_data_df[desired_cols]
    player_data_df.columns = updated_names
    player_data_df.drop_duplicates(inplace=True)
    
    return player_data_df.sort_values(by=['game_date'])
    
def create_game_key(row):
    if row['Venue'] == 'Home':
        return f'{row['Date']} {row['Squad']} vs {row['Opponent']}'
    else:
        return f'{row['Date']} {row['Opponent']} vs {row['Squad']}'

def create_dependent_variables(df):
    df['Results_Raw'] = df['Result'].apply(lambda x: x.replace(' ', '–').replace('(', '').replace(')', '').split('–'))
    df['OverallResult'] = df['Results_Raw'].apply(lambda x: x[0])
    df['home_score'] = df['Results_Raw'].apply(lambda x: int(x[1]))
    df['home_penalties' ] = df['Results_Raw'].apply(lambda x: int(x[2]) if len(x)==5 else 0)
    df['away_score'] = df['Results_Raw'].apply(lambda x: int(x[3]) if len(x)==5 else int(x[2]))
    df['away_penalties' ] = df['Results_Raw'].apply(lambda x: int(x[4]) if len(x)==5 else 0)
    df['game_key'] = df.apply(lambda x: f'{x['Date']} {x['Squad']} vs {x['Opponent']}', axis=1)
    
def clean_data_for_modeling(player_data, match_data):
    players_numeric_columns = ['Min', 'Gls', 'Ast', 'PK', 'PKatt', 'Sh', 'SoT', 'CrdY',
       'CrdR', 'Touches', 'Tkl', 'Int', 'Blocks', 'xG', 'npxG', 'xAG', 'SCA',
       'GCA', 'Cmp', 'Att', 'Cmp%', 'PrgP', 'Carries', 'PrgC', 'Att_TakeOn',
       'Succ']
    
    
    for col in players_numeric_columns:
        player_data[col] = player_data[col].astype(float)
    
    cleaned_df = pd.DataFrame()

    for match in match_data['game_key']:
        current_match_data = match_data[match_data['game_key'] == match].copy().reset_index()
        current_match_date = current_match_data['game_date'][0]
        current_match_year = int(current_match_data['game_date'][0][:4])
        home_team = current_match_data['home_team'][0]
        away_team = current_match_data['away_team'][0]
        

        prior_home_stats = player_data.query(f'Date<"{current_match_date}" and Date > "{current_match_year-1}-12-31" and Squad=="{home_team}" and Venue=="Home" and Start!="N"')
        prior_home_stats = prior_home_stats[players_numeric_columns]
        prior_away_stats = player_data.query(f'Date<"{current_match_date}" and Date > "{current_match_year-1}-12-31" and Squad=="{away_team}" and Venue=="Home" and Start!="N"')
        prior_away_stats = prior_away_stats[players_numeric_columns]
        
        prior_home_stats.columns = [f'home_prior_{x}' for x in prior_home_stats.columns]
        prior_away_stats.columns = [f'away_prior_{x}' for x in prior_away_stats.columns]
        
        if prior_home_stats.shape[0]==0:
            temp = prior_home_stats.columns
            prior_home_stats = pd.DataFrame([0]*len(prior_home_stats.columns)).T
            prior_home_stats.columns = temp
            
        if prior_away_stats.shape[0]==0:
            temp = prior_away_stats.columns
            prior_away_stats = pd.DataFrame([0]*len(prior_away_stats.columns)).T
            prior_away_stats.columns = temp

        prior_home_stats = pd.DataFrame(prior_home_stats.describe().T['mean']).T.reset_index()
        prior_away_stats = pd.DataFrame(prior_away_stats.describe().T['mean']).T.reset_index()
        
        final_row = pd.concat([current_match_data, prior_home_stats, prior_away_stats], axis=1)
        
        cleaned_df = pd.concat([cleaned_df, final_row], axis=0)
        
    return cleaned_df

In [3]:
player_data = combine_player_data(2022, 2024)

In [4]:
player_data.head()

Unnamed: 0,Date,Day,Comp,Round,Venue,Result,Squad,Opponent,Start,Pos,...,Att,Cmp%,PrgP,Carries,PrgC,Att_TakeOn,Succ,Match Report,player_url,game_key
1,2022-02-26,Sat,MLS,Regular Season,Home,D 1–1,Philadelphia Union,Minnesota Utd,N,AM,...,7,85.7,2,6,1,0,0,Match Report,https://fbref.com/en/players/672f4300/matchlog...,2022-02-26 Philadelphia Union vs Minnesota Utd
2,2022-03-05,Sat,MLS,Regular Season,Away,W 2–1,Philadelphia Union,CF Montréal,N,0,...,0,0.0,0,0,0,0,0,Match Report,https://fbref.com/en/players/672f4300/matchlog...,2022-03-05 CF Montréal vs Philadelphia Union
3,2022-03-12,Sat,MLS,Regular Season,Home,W 2–0,Philadelphia Union,SJ Earthquakes,N,FW,...,4,100.0,1,7,1,1,0,Match Report,https://fbref.com/en/players/672f4300/matchlog...,2022-03-12 Philadelphia Union vs SJ Earthquakes
4,2022-03-19,Sat,MLS,Regular Season,Away,W 2–0,Philadelphia Union,NYCFC,N,0,...,0,0.0,0,0,0,0,0,Match Report,https://fbref.com/en/players/672f4300/matchlog...,2022-03-19 NYCFC vs Philadelphia Union
5,2022-04-02,Sat,MLS,Regular Season,Home,W 2–0,Philadelphia Union,Charlotte,N,CM,...,0,,0,0,0,0,0,Match Report,https://fbref.com/en/players/672f4300/matchlog...,2022-04-02 Philadelphia Union vs Charlotte


In [5]:
matches_df = create_match_data(player_data)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  player_data_df.drop_duplicates(inplace=True)


In [6]:
matches_df.tail()

Unnamed: 0,game_date,day,round,home_team,away_team,result,overall_result,home_score,home_penalties,away_score,away_penalties,game_key
52880,2024-11-09,Sat,Round One,Charlotte,Orlando City,D 1 (1)–1 (4),D,1,1,1,4,2024-11-09 Charlotte vs Orlando City
53891,2024-11-09,Sat,Round One,Orlando City,Charlotte,D 1 (4)–1 (1),D,1,4,1,1,2024-11-09 Orlando City vs Charlotte
54962,2024-11-09,Sat,Round One,NYCFC,FC Cincinnati,D 0 (6)–0 (5),D,0,6,0,5,2024-11-09 NYCFC vs FC Cincinnati
52929,2024-11-09,Sat,Round One,Atlanta Utd,Inter Miami,W 3–2,W,3,0,2,0,2024-11-09 Atlanta Utd vs Inter Miami
53472,2024-11-09,Sat,Round One,Inter Miami,Atlanta Utd,L 2–3,L,2,0,3,0,2024-11-09 Inter Miami vs Atlanta Utd


In [7]:
cleaned_df = clean_data_for_modeling(player_data, matches_df)

In [8]:
cleaned_df

Unnamed: 0,index,game_date,day,round,home_team,away_team,result,overall_result,home_score,home_penalties,...,away_prior_SCA,away_prior_GCA,away_prior_Cmp,away_prior_Att,away_prior_Cmp%,away_prior_PrgP,away_prior_Carries,away_prior_PrgC,away_prior_Att_TakeOn,away_prior_Succ
0,1,2022-02-26,Sat,Regular Season,Philadelphia Union,Minnesota Utd,D 1–1,D,1,0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
0,1539,2022-02-26,Sat,Regular Season,FC Dallas,Toronto FC,D 1–1,D,1,0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
0,1126,2022-02-26,Sat,Regular Season,Columbus Crew,Vancouver W'caps,W 4–0,W,4,0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
0,902,2022-02-26,Sat,Regular Season,NY Red Bulls,SJ Earthquakes,W 3–1,W,3,0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
0,37,2022-02-26,Sat,Regular Season,Colorado Rapids,LAFC,L 0–3,L,0,0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,52880,2024-11-09,Sat,Round One,Charlotte,Orlando City,D 1 (1)–1 (4),D,1,1,...,2.090909,0.207071,37.106061,43.934343,83.000505,4.010101,30.257576,1.479798,1.126263,0.515152
0,53891,2024-11-09,Sat,Round One,Orlando City,Charlotte,D 1 (4)–1 (1),D,1,4,...,1.803030,0.166667,31.439394,39.247475,78.186869,3.454545,25.888889,1.474747,0.974747,0.469697
0,54962,2024-11-09,Sat,Round One,NYCFC,FC Cincinnati,D 0 (6)–0 (5),D,0,6,...,2.255924,0.251185,36.028436,44.109005,80.095261,3.530806,30.919431,1.530806,1.748815,0.734597
0,52929,2024-11-09,Sat,Round One,Atlanta Utd,Inter Miami,W 3–2,W,3,0,...,2.095000,0.385000,41.430000,48.960000,83.267000,3.835000,32.875000,1.265000,1.455000,0.700000


In [9]:
cleaned_df.to_csv(output / 'cleaned_df.csv', index=False)
matches_df.to_csv(output / 'matches_df.csv', index=False)