## Retrieval of Game Week Wise Player Match Statistics (2024-2025) from GitHub URL

In [None]:
import os
from dotenv import load_dotenv
import requests
import json
import pandas as pd
import numpy as np
import pickle

In [None]:
# GitHub repository that hosts the raw FPL data.
owner = "olbauday"
repo = "FPL-Elo-Insights"
branch = "main"

# The personal access token is read from a local .env file and never hard-coded.
load_dotenv(dotenv_path="C:/PROJECT/.env")
GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")

if not GITHUB_TOKEN:
    raise RuntimeError("Personal Access Token is not set")

# 2024-2025 season: repository paths and the local folders they are saved to.
pms_path_24 = "data/2024-2025/playermatchstats"
pms_save_folder_24 = r"C:\exp1\Player Gameweek Stats 24"
players_path_24 = "data/2024-2025/players"
matches_path_24 = "data/2024-2025/matches"
players_save_folder_24 = r"C:\exp1\Player Stats 24"
matches_save_folder_24 = r"C:\exp1\Match Stats 24"

# 2025-2026 season: one repository path; files are routed by name via file_dir_map.
path_25 = "data/2025-2026/By Tournament/Premier League"
# A raw string must not escape its backslashes: r"C:\\exp1\\" holds literal
# doubled separators. os.path.join adds the separator, so none is needed here.
save_folder_25 = r"C:\exp1"
file_dir_map = {
    "playermatchstats.csv": os.path.join(save_folder_25, "Player Gameweek Stats 25"),
    "players.csv": os.path.join(save_folder_25, "Player Stats 25"),
    "matches.csv": os.path.join(save_folder_25, "Match Stats 25"),
}

In [None]:
# Placeholders for every frame loaded below; a loader cell only reads from disk
# while its placeholder is still None.
pms_df_24 = players_df_24 = matches_df_24 = teams_df_24 = None
pms_df_25 = players_df_25 = matches_df_25 = teams_df_25 = None

In [None]:
def downloadGitHubData(owner, repo, branch, base_path, local_save_folder):
    """Mirror the CSV files below a GitHub directory into a local folder.

    Lists ``base_path`` through the GitHub contents API, recurses into
    sub-directories whose name starts with ``GW`` and downloads every ``.csv``
    file that does not already exist locally. Failures are reported with
    ``print``; the function always returns ``None``.

    Args:
        owner: GitHub account that owns the repository.
        repo: Repository name.
        branch: Branch (``ref``) to read from.
        base_path: Directory path inside the repository.
        local_save_folder: Local directory that mirrors ``base_path``.
    """
    # Without a timeout requests.get can block forever on a stalled connection.
    request_timeout = 30
    headers = {
        "Authorization": f"token {GITHUB_TOKEN}",
        "Accept": "application/vnd.github.v3+json"
    }
    url = f"https://api.github.com/repos/{owner}/{repo}/contents/{base_path}?ref={branch}"
    print(f"Retrieving directory contents from GitHub: {base_path}")
    try:
        response = requests.get(url, headers=headers, timeout=request_timeout)
    except requests.exceptions.RequestException as exc:
        print(f"Error: Could not retrieve contents for {base_path}. {exc}")
        return
    if response.status_code != 200:
        print(f"Error: Could not retrieve contents for {base_path}. Status code: {response.status_code}")
        print("Please check the Owner / Repository / Branch / Base Path.")
        return
    try:
        contents = response.json()
    except ValueError:
        # Every JSON decode error raised by response.json() is a ValueError,
        # whichever requests version is installed.
        print("Error: Failed to decode JSON response")
        return
    if not isinstance(contents, list):
        # The loop below indexes each entry by 'name' and 'type', which only
        # works when the payload is a list of entries.
        print(f"Error: {base_path} is not a directory listing")
        return
    for item in contents:
        local_path = os.path.join(local_save_folder, item['name'])
        if item['type'] == 'dir' and item['name'].startswith('GW'):
            if os.path.isdir(local_path):
                print(f"{item['name']} exists.")
            else:
                # Create the folder so the message below is actually true.
                os.makedirs(local_path, exist_ok=True)
                print(f"{item['name']} created.")
            new_path = f"{base_path}/{item['name']}"
            downloadGitHubData(owner, repo, branch, new_path, local_path)
        elif item['type'] == 'file' and item['name'].endswith('.csv'):
            download_url = item.get('download_url')
            if not download_url:
                print(f"No URL found for downloading {item['name']}")
                continue
            if os.path.exists(local_path):
                # Already downloaded on an earlier run.
                continue
            print(f"Downloading: {item['name']}")
            os.makedirs(os.path.dirname(local_path), exist_ok=True)
            try:
                file_content_response = requests.get(download_url, timeout=request_timeout)
            except requests.exceptions.RequestException as exc:
                print(f"Error downloading {item['name']}. {exc}")
                continue
            if file_content_response.status_code == 200:
                with open(local_path, 'wb') as f:
                    f.write(file_content_response.content)
                print(f"{item['name']} has been saved successfully.")
            else:
                print(f"Error downloading {item['name']}. Status code: {file_content_response.status_code}")
    print(f"Finished processing contents of: {base_path}")

In [None]:
def downloadGitHubData_25(owner, repo, branch, base_path, local_save_folder, gw_min=17, gw_max=21):
    """Download selected game-week CSV files and route them by file name.

    Lists ``base_path`` through the GitHub contents API. ``GW<n>`` directories
    with ``gw_min <= n <= gw_max`` are entered recursively; inside them, each
    ``.csv`` file whose name is a key of the module-level ``file_dir_map`` is
    saved to ``<file_dir_map[name]>/<GW folder>/<name>`` unless it already
    exists. Failures are reported with ``print``; the function returns ``None``.

    Args:
        owner: GitHub account that owns the repository.
        repo: Repository name.
        branch: Branch (``ref``) to read from.
        base_path: Directory path inside the repository.
        local_save_folder: Sub-folder placed between the mapped root and the
            file name. Recursive calls pass the ``GW<n>`` directory name here.
            NOTE(review): the top-level caller passes an absolute path, which
            would replace the mapped root for any CSV sitting directly in
            ``base_path`` - confirm no such files exist.
        gw_min: First game week (inclusive) to download. Defaults to 17.
        gw_max: Last game week (inclusive) to download. Defaults to 21.
    """
    # Without a timeout requests.get can block forever on a stalled connection.
    request_timeout = 30
    headers = {
        "Authorization": f"token {GITHUB_TOKEN}",
        "Accept": "application/vnd.github.v3+json"
    }
    url = f"https://api.github.com/repos/{owner}/{repo}/contents/{base_path}?ref={branch}"

    try:
        response = requests.get(url, headers=headers, timeout=request_timeout)
    except requests.exceptions.RequestException as exc:
        print(f"Error: Could not retrieve contents for {base_path}. {exc}")
        return
    if response.status_code != 200:
        print(f"Error: Could not retrieve contents for {base_path}. Status code: {response.status_code}")
        print("Please check the Owner / Repository / Branch / Base Path.")
        return
    try:
        contents = response.json()
    except ValueError:
        # Every JSON decode error raised by response.json() is a ValueError,
        # whichever requests version is installed.
        print("Error: Failed to decode JSON response")
        return
    if not isinstance(contents, list):
        # The loop below indexes each entry by 'name' and 'type', which only
        # works when the payload is a list of entries.
        print(f"Error: {base_path} is not a directory listing")
        return
    for item in contents:
        item_name = item['name']
        if item['type'] == 'dir' and item_name.startswith('GW'):
            try:
                gw_number = int(item_name[2:])
            except ValueError:
                # Not of the form GW<number>; ignore it.
                continue
            if not (gw_min <= gw_number <= gw_max):
                print(f"SKIP: {item_name}")
                continue
            print(f"Entering directory: {item_name}")
            new_path = f"{base_path}/{item_name}"
            # The GW folder name becomes the sub-folder for the files found inside it.
            downloadGitHubData_25(owner, repo, branch, new_path, item_name, gw_min, gw_max)
        elif item['type'] == 'file' and item_name.endswith('.csv'):
            if item_name not in file_dir_map:
                print(f"SKIP: {item_name}")
                continue
            local_path = os.path.join(file_dir_map[item_name], local_save_folder, item_name)
            download_url = item.get('download_url')
            if not download_url:
                print(f"No url found for downloading {item_name}")
                continue
            if os.path.exists(local_path):
                print(f"{item_name} exists. Skipping.")
                continue
            print(f"Downloading file: {item_name} to {local_path}")
            os.makedirs(os.path.dirname(local_path), exist_ok=True)
            try:
                file_content_response = requests.get(download_url, timeout=request_timeout)
            except requests.exceptions.RequestException as exc:
                print(f"Error downloading {item_name}. {exc}")
                continue
            if file_content_response.status_code == 200:
                with open(local_path, 'wb') as f:
                    f.write(file_content_response.content)
                print(f"{item_name} has been saved successfully.")
            else:
                print(f"Error downloading {item_name}. Status code: {file_content_response.status_code}")
    print(f"Finished processing contents of: {base_path}")

## Loading Players Match, Players, Match Stats (2024-2025)

### 1. Players Match Stats 

In [None]:
# downloadGitHubData(owner, repo, branch, pms_path_24, pms_save_folder_24)

In [None]:
# Build the 2024-2025 player-match frame from one CSV per game week (GW1..GW38),
# unless it has already been loaded.
root = os.path.join("C:\\PROJECT\\data\\raw-data", "player-match-data-24")
if pms_df_24 is None:
    all_gw_data = []
    for idx in range(1, 39):
        gw_folder = f"GW{idx}"
        csv_file = os.path.join(root, gw_folder, "playermatchstats.csv")
        if not os.path.exists(csv_file):
            print(f"File not found for: {gw_folder}")
            continue
        print(f"Loading Player Data from: {gw_folder}")
        temp_df = pd.read_csv(csv_file)
        temp_df['Game Week'] = idx  # remember which game week the rows came from
        all_gw_data.append(temp_df)
    if not all_gw_data:
        print("No files were found or loaded")
    else:
        pms_df_24 = pd.concat(all_gw_data, ignore_index=True)
        print("Succesfully combined all Game Week's Players Data into One single Data frame!")
        print(f"Shape of the dataframe = {pms_df_24.shape}")
else:
    print(f"The final combined dataframe already exists with {pms_df_24.shape[0]} rows and {pms_df_24.shape[1]} columns")

### 2. Players Stats

In [None]:
# downloadGitHubData(owner, repo, branch, players_path_24, players_save_folder_24)

In [None]:
# Load the 2024-2025 players table once; later runs reuse the frame in memory.
root = os.path.join("C:\\PROJECT\\data\\raw-data", "player-data-24")
if players_df_24 is None:
    csv_file = os.path.join(root, "players.csv")
    if not os.path.exists(csv_file):
        print(f"File not found at: {csv_file}")
    else:
        print("Loading Players Data")
        players_df_24 = pd.read_csv(csv_file)
        print("Loaded successfully")
        print(f"Shape of the dataframe: {players_df_24.shape}")
else:
    print(f"The dataframe already exists with {players_df_24.shape[0]} rows and {players_df_24.shape[1]} columns")

### 3. Match Stats 

In [None]:
# downloadGitHubData(owner, repo, branch, matches_path_24, matches_save_folder_24)

In [None]:
# Load the 2024-2025 matches table once; later runs reuse the frame in memory.
root = os.path.join("C:\\PROJECT\\data\\raw-data", "match-data-github-24")
if matches_df_24 is None:
    csv_file = os.path.join(root, "matches.csv")
    if not os.path.exists(csv_file):
        print(f"File not found at: {csv_file}")
    else:
        print("Loading Matches Data")
        matches_df_24 = pd.read_csv(csv_file)
        print("Loaded successfully")
        print(f"Shape of the dataframe: {matches_df_24.shape}")
else:
    print(f"The dataframe already exists with {matches_df_24.shape[0]} rows and {matches_df_24.shape[1]} columns")

In [None]:
# Parse kickoff times, make the game week an integer, then order the matches
# by game week and kickoff time.
matches_df_24 = (
    matches_df_24
    .assign(
        kickoff_time=lambda df: pd.to_datetime(df['kickoff_time'], format="mixed"),
        gameweek=lambda df: df['gameweek'].astype(int),
    )
    .sort_values(by=['gameweek', 'kickoff_time'])
    .reset_index(drop=True)
)

### 4. Teams Stats

In [None]:
# Load the 2024-2025 teams table once; later runs reuse the frame in memory.
root = os.path.join("C:\\PROJECT\\data\\raw-data", "team-data-24")
if teams_df_24 is None:
    csv_file = os.path.join(root, "teams24.csv")
    if not os.path.exists(csv_file):
        print(f"File not found at: {csv_file}")
    else:
        print("Loading Teams Data")
        teams_df_24 = pd.read_csv(csv_file)
        print("Loaded successfully")
        print(f"Shape of the dataframe: {teams_df_24.shape}")
else:
    print(f"The dataframe already exists with {teams_df_24.shape[0]} rows and {teams_df_24.shape[1]} columns")

In [None]:
# Sanity check: print the (rows, columns) of every 2024-2025 frame.
for label, frame in (
    ("Player Match Statistics", pms_df_24),
    ("Player Statistics", players_df_24),
    ("Match Statistics", matches_df_24),
    ("Team Statistics", teams_df_24),
):
    print(f"{label}: {frame.shape}")

## Loading Players Match, Players, Match Stats, Team Stats (2025-2026)

### 1. Players Match Stats

In [None]:
# Fetch the 2025-2026 Premier League CSVs into the folders listed in file_dir_map.
downloadGitHubData_25(
    owner=owner,
    repo=repo,
    branch=branch,
    base_path=path_25,
    local_save_folder=save_folder_25,
)

In [None]:
# Build the 2025-2026 player-match frame from one CSV per game week (GW1..GW6),
# unless it has already been loaded.
root = os.path.join("C:\\PROJECT\\data\\raw-data", "player-match-data-25")
if pms_df_25 is None:
    all_gw_data = []
    for idx in range(1, 7):
        gw_folder = f"GW{idx}"
        csv_file = os.path.join(root, gw_folder, "playermatchstats.csv")
        if not os.path.exists(csv_file):
            print(f"File not found for: {gw_folder}")
            continue
        print(f"Loading Player Data from: {gw_folder}")
        temp_df = pd.read_csv(csv_file)
        temp_df['Game Week'] = idx  # remember which game week the rows came from
        all_gw_data.append(temp_df)
    if not all_gw_data:
        print("No files were found or loaded")
    else:
        pms_df_25 = pd.concat(all_gw_data, ignore_index=True)
        print("Succesfully combined all Game Week's Players Data into One single Data frame!")
        print(f"Shape of the dataframe = {pms_df_25.shape}")
else:
    print(f"The final combined dataframe already exists with {pms_df_25.shape[0]} rows and {pms_df_25.shape[1]} columns")

### 2. Players Stats

In [None]:
# Build the 2025-2026 players frame from one CSV per game week (GW1..GW6),
# unless it has already been loaded.
root = os.path.join("C:\\PROJECT\\data\\raw-data", "player-data-25")
if players_df_25 is None:
    all_gw_data = []
    for idx in range(1, 7):
        gw_folder = f"GW{idx}"
        csv_file = os.path.join(root, gw_folder, "players.csv")
        if not os.path.exists(csv_file):
            print(f"File not found for: {gw_folder}")
            continue
        print(f"Loading Player Data from: {gw_folder}")
        temp_df_25 = pd.read_csv(csv_file)
        temp_df_25['Game Week'] = idx  # remember which game week the rows came from
        all_gw_data.append(temp_df_25)
    if not all_gw_data:
        print("No files were found or loaded")
    else:
        players_df_25 = pd.concat(all_gw_data, ignore_index=True)
        print("Succesfully combined all Players Data into One single Data frame!")
        print(f"Shape of the dataframe = {players_df_25.shape}")
else:
    print(f"The final combined dataframe already exists with {players_df_25.shape[0]} rows and {players_df_25.shape[1]} columns")

### 3. Match Stats

In [None]:
# Build the 2025-2026 matches frame from one CSV per game week (GW1..GW6),
# unless it has already been loaded.
root = os.path.join("C:\\PROJECT\\data\\raw-data", "match-data-github-25")
if matches_df_25 is None:
    all_gw_data = []
    for idx in range(1, 7):
        gw_folder = f"GW{idx}"
        csv_file = os.path.join(root, gw_folder, "matches.csv")
        if not os.path.exists(csv_file):
            print(f"File not found for: {gw_folder}")
            continue
        print(f"Loading Match Data from: {gw_folder}")
        temp_df_25 = pd.read_csv(csv_file)
        temp_df_25['Game Week'] = idx  # remember which game week the rows came from
        all_gw_data.append(temp_df_25)
    if not all_gw_data:
        print("No files were found or loaded")
    else:
        matches_df_25 = pd.concat(all_gw_data, ignore_index=True)
        print("Succesfully combined all Matches Data into One single Data frame!")
        print(f"Shape of the dataframe = {matches_df_25.shape}")
else:
    print(f"The final combined dataframe already exists with {matches_df_25.shape[0]} rows and {matches_df_25.shape[1]} columns")

In [None]:
# Parse kickoff times, make the game week an integer, then order the matches
# by game week and kickoff time.
matches_df_25 = (
    matches_df_25
    .assign(
        kickoff_time=lambda df: pd.to_datetime(df['kickoff_time'], format="mixed"),
        gameweek=lambda df: df['gameweek'].astype(int),
    )
    .sort_values(by=['gameweek', 'kickoff_time'])
    .reset_index(drop=True)
)

### 4. Teams Stats

In [None]:
# Load the 2025-2026 teams table once; later runs reuse the frame in memory.
root = os.path.join("C:\\PROJECT\\data\\raw-data", "team-data-25")
if teams_df_25 is None:
    csv_file = os.path.join(root, "teams25.csv")
    if not os.path.exists(csv_file):
        print(f"File not found at: {csv_file}")
    else:
        print("Loading Teams Data")
        teams_df_25 = pd.read_csv(csv_file)
        print("Loaded successfully")
        print(f"Shape of the dataframe: {teams_df_25.shape}")
else:
    print(f"The dataframe already exists with {teams_df_25.shape[0]} rows and {teams_df_25.shape[1]} columns")

### Dimensionality check of all dataframes gathered

1. 2024-2025 Season

In [None]:
# Print the (rows, columns) of every 2024-2025 frame.
for label, frame in (
    ("Player Match Statistics", pms_df_24),
    ("Player Statistics", players_df_24),
    ("Match Statistics", matches_df_24),
    ("Team Statistics", teams_df_24),
):
    print(f"{label}: {frame.shape}")

2. 2025-2026 season

In [None]:
# Print the (rows, columns) of every 2025-2026 frame.
for label, frame in (
    ("Player Match Statistics", pms_df_25),
    ("Player Statistics", players_df_25),
    ("Match Statistics", matches_df_25),
    ("Team Statistics", teams_df_25),
):
    print(f"{label}: {frame.shape}")

## Data Merging & Concatenation

### I. Players Match Statistics

In [None]:
# Stack both seasons of player-match rows; columns present in only one season
# are kept and filled with NaN for the other.
pms_24_columns, pms_25_columns = set(pms_df_24.columns), set(pms_df_25.columns)
pms_different_columns = pms_25_columns - pms_24_columns  # columns only in 2025-2026
seasons_pms = [pms_df_24, pms_df_25]
combined_pms = pd.concat(seasons_pms, ignore_index=True)
print(f"Shape of combined PMS data: {combined_pms.shape}")

### II. Players Statistics

In [None]:
# Stack both seasons of player rows, drop columns that exist only in 2025-2026,
# and keep the first row seen for each player_id (the 2024-2025 rows come first).
players_24_columns, players_25_columns = set(players_df_24.columns), set(players_df_25.columns)
players_different_columns = players_25_columns - players_24_columns
combined_players = pd.concat([players_df_24, players_df_25], ignore_index=True)
combined_players = combined_players.drop(columns=players_different_columns, errors='ignore')
combined_players = combined_players.drop_duplicates(subset=['player_id'], keep='first')
print(f"Shape of combined Players data: {combined_players.shape}")

### III. Matches Statistics

In [None]:
# Columns that exist only in 2025-2026 are computed before 'season' is added,
# so 'season' itself is never among the dropped columns.
matches_24_columns, matches_25_columns = set(matches_df_24.columns), set(matches_df_25.columns)
matches_different_columns = matches_25_columns - matches_24_columns
for season_frame, season_year in ((matches_df_24, 2024), (matches_df_25, 2025)):
    season_frame['season'] = season_year
combined_matches = pd.concat([matches_df_24, matches_df_25], ignore_index=True)
combined_matches = combined_matches.drop(columns=matches_different_columns, errors='ignore')
print(f"Shape of combined Matches data: {combined_matches.shape}")

### IV. Teams Statistics

In [None]:
# Order the 2025-2026 teams by id and renumber the rows from zero.
teams_df_25 = teams_df_25.sort_values('id', ignore_index=True)

In [None]:
# Stack both seasons of team rows and tag each row with its season.
teams_stats = pd.concat([teams_df_24, teams_df_25], ignore_index=True).drop(columns=['fotmob_name'], errors='ignore')
# Label the seasons from the actual frame lengths *before* de-duplicating.
# Assigning by position afterwards (first 20 rows = 2024) mislabels rows as soon
# as drop_duplicates removes one or a season does not have exactly 20 teams.
teams_stats['season'] = [2024] * len(teams_df_24) + [2025] * len(teams_df_25)
teams_stats = teams_stats.drop_duplicates()
print(f"Shape of Overall Teams data: {teams_stats.shape}")

## Working with the PMS data

#### It is evident that the features: 'player_id' & 'match_id' are the foreign keys and refer to the Players data and Match data respectively

In [None]:
# Columns that appear in both the player-match data and the players data.
combined_pms_columns, combined_players_columns = set(combined_pms.columns), set(combined_players.columns)
pms_players_same = combined_pms_columns & combined_players_columns
print(f"Same columns from PMS and Players data: {pms_players_same}")

In [None]:
# Attach each player's position and team code to every player-match row.
# The left join keeps all player-match rows, including those with no player entry.
player_lookup = combined_players[['player_id', 'position', 'team_code']]
pms_players = pd.merge(
    combined_pms,
    player_lookup,
    how='left',
    on='player_id',
    suffixes=('_pms', '_static'),
)

In [None]:
# Compare the shapes before and after the merge.
for label, frame in (
    ("Shape of combined PMS data", combined_pms),
    ("Shape of combined Players data", combined_players),
    ("Shape after merging PMS and Players data", pms_players),
):
    print(f"{label}: {frame.shape}")

### Divide the dataset on the basis of the positions of different players 

In [None]:
# Show how many merged rows carry each raw position label.
pms_players['position'].value_counts()

In [None]:
# Translate the long position labels into short codes, then take an
# independent copy of the rows for each of the four playing positions.
position_map = dict(
    Goalkeeper='GK',
    Defender='DEF',
    Midfielder='MID',
    Forward='FWD',
    Unknown='NA',
)
pms_players['position_group'] = pms_players['position'].map(position_map)


def _rows_for_position(code):
    """Return a copy of the pms_players rows whose position_group equals code."""
    return pms_players.loc[pms_players['position_group'] == code].copy()


pms_players_gk = _rows_for_position('GK')
pms_players_def = _rows_for_position('DEF')
pms_players_mid = _rows_for_position('MID')
pms_players_fwd = _rows_for_position('FWD')

In [None]:
# Column selections per position; each *_stats frame is an independent copy.

# goalkeeper stats
gk_columns = [
    'player_id', 'team_code', 'match_id', 'Game Week', 'minutes_played',
    'gk_accurate_passes', 'gk_accurate_long_balls',
    'saves', 'saves_inside_box',
    'goals_conceded', 'team_goals_conceded',
    'xgot_faced', 'goals_prevented',
    'sweeper_actions', 'high_claim',
]
gk_stats = pms_players_gk.loc[:, gk_columns].copy()

# defender stats
def_columns = [
    'player_id', 'match_id', 'team_code', 'Game Week', 'minutes_played', 'xg', 'xa',
    'accurate_passes', 'accurate_long_balls', 'final_third_passes',
    'tackles_won', 'interceptions', 'recoveries', 'blocks', 'clearances',
    'headed_clearances', 'dribbled_past', 'duels_won',
    'ground_duels_won', 'aerial_duels_won', 'was_fouled', 'fouls_committed',
    'tackles', 'distance_covered', 'defensive_contributions',
]
def_stats = pms_players_def.loc[:, def_columns].copy()
# 'tackles' is only needed to derive the success ratio, so it is dropped afterwards.
def_stats['tackles_won_percentage'] = def_stats['tackles_won'].div(def_stats['tackles'])
def_stats = def_stats.drop(columns='tackles', errors='ignore')

# midfielder stats
mid_columns = [
    'player_id', 'match_id', 'team_code', 'Game Week', 'minutes_played',
    'goals', 'assists', 'xg', 'xa',
    'accurate_passes', 'accurate_crosses', 'accurate_long_balls', 'final_third_passes',
    'total_shots', 'shots_on_target',
    'chances_created', 'touches',
    'successful_dribbles', 'corners',
    'penalties_scored', 'penalties_missed',
    'tackles_won', 'interceptions', 'recoveries', 'blocks', 'clearances',
    'dribbled_past', 'duels_won', 'ground_duels_won', 'aerial_duels_won',
    'was_fouled', 'fouls_committed',
    'distance_covered', 'defensive_contributions',
]
mid_stats = pms_players_mid.loc[:, mid_columns].copy()

# forward stats
fwd_columns = [
    'player_id', 'match_id', 'team_code', 'Game Week', 'minutes_played',
    'goals', 'assists', 'xg', 'xa', 'xgot',
    'accurate_passes', 'final_third_passes',
    'total_shots', 'shots_on_target',
    'chances_created', 'big_chances_missed', 'touches', 'touches_opposition_box',
    'successful_dribbles', 'corners', 'offsides',
    'penalties_scored', 'penalties_missed',
    'duels_won', 'ground_duels_won', 'aerial_duels_won',
    'was_fouled', 'fouls_committed', 'dispossessed',
]
fwd_stats = pms_players_fwd.loc[:, fwd_columns].copy()

for label, frame in (("GK", gk_stats), ("DEF", def_stats), ("MID", mid_stats), ("FWD", fwd_stats)):
    print(f"Shape of {label} stats: {frame.shape}")

## Data Cleaning: Cleaning the data separately for each dataset

In [None]:
# Report the goalkeeper columns that contain missing values, with their counts.
print(f"Shape of GK stats: {gk_stats.shape}")
null_gk_stats_dict = {}
for key, value in dict(gk_stats.isnull().sum()).items():
    if value > 0:
        null_gk_stats_dict[key] = value
print(f"Columns with their NaN values: {null_gk_stats_dict}")

In [None]:
# Report the defender columns that contain missing values, with their counts.
print(f"Shape of DEF stats: {def_stats.shape}")
null_def_stats_dict = {}
for key, value in dict(def_stats.isnull().sum()).items():
    if value > 0:
        null_def_stats_dict[key] = value
print(f"Columns with their NaN values: {null_def_stats_dict}")

In [None]:
# Report the midfielder columns that contain missing values, with their counts.
print(f"Shape of MID stats: {mid_stats.shape}")
null_mid_stats_dict = {}
for key, value in dict(mid_stats.isnull().sum()).items():
    if value > 0:
        null_mid_stats_dict[key] = value
print(f"Columns with their NaN values: {null_mid_stats_dict}")

In [None]:
# Report the forward columns that contain missing values, with their counts.
# The label now says FWD; it previously said MID while printing the forward frame.
print(f"Shape of FWD stats: {fwd_stats.shape}")
null_fwd_stats_dict = {key: value for key, value in dict(fwd_stats.isnull().sum()).items() if value > 0}
print(f"Columns with their NaN values: {null_fwd_stats_dict}")

In [None]:
# features to be dropped entirely from the dataset
def_mid_drop_columns = ['defensive_contributions', 'distance_covered']
fwd_drop_columns = ['dispossessed']

def_stats, mid_stats = (
    frame.drop(columns=def_mid_drop_columns, errors='ignore')
    for frame in (def_stats, mid_stats)
)
fwd_stats = fwd_stats.drop(columns=fwd_drop_columns, errors='ignore')

# features to be filled and imputed with 0
for frame, column in (
    (gk_stats, 'saves_inside_box'),
    (def_stats, 'tackles_won_percentage'),
    (mid_stats, 'corners'),  # check before & after imputing
    (fwd_stats, 'corners'),
):
    frame[column] = frame[column].fillna(0)

In [None]:
# Shapes of the four position frames after dropping and imputing columns.
for label, frame in (("GK", gk_stats), ("DEF", def_stats), ("MID", mid_stats), ("FWD", fwd_stats)):
    print(f"Shape of {label} stats: {frame.shape}")

## Feature Engineering: Introducing Rolling Features

In [None]:
# Keep only appearances of at least 60 minutes, so every remaining row is a
# substantial contribution to the match.
min_mins_played = 60
genuine_gk_stats = gk_stats.loc[gk_stats['minutes_played'].ge(min_mins_played)].copy()
genuine_def_stats = def_stats.loc[def_stats['minutes_played'].ge(min_mins_played)].copy()
genuine_mid_stats = mid_stats.loc[mid_stats['minutes_played'].ge(min_mins_played)].copy()
genuine_fwd_stats = fwd_stats.loc[fwd_stats['minutes_played'].ge(min_mins_played)].copy()
for label, frame in (
    ("GK", genuine_gk_stats),
    ("DEF", genuine_def_stats),
    ("MID", genuine_mid_stats),
    ("FWD", genuine_fwd_stats),
):
    print(f"Shape of {label} stats: {frame.shape}")

In [None]:
def _sorted_with_chron_idx(frame):
    """Sort by player and game week, renumber the rows, and store the row number as 'chron_idx'."""
    ordered = frame.sort_values(by=['player_id', 'Game Week']).reset_index(drop=True)
    ordered['chron_idx'] = ordered.index
    return ordered


# NOTE(review): these frames hold two seasons stacked together and 'Game Week'
# restarts at 1 each season, so this ordering presumably interleaves the
# seasons for a given player - confirm whether a season key should be added.
genuine_gk_stats = _sorted_with_chron_idx(genuine_gk_stats)
genuine_def_stats = _sorted_with_chron_idx(genuine_def_stats)
genuine_mid_stats = _sorted_with_chron_idx(genuine_mid_stats)
genuine_fwd_stats = _sorted_with_chron_idx(genuine_fwd_stats)

print("All datasets are now sorted and contain the 'chron_idx' merge key without using inplace=True.")

In [None]:
# preparing the list of columns which will be engineered and transformed into rolling features (last 5)
# Each list names the per-match columns of one position frame; the identifier
# columns (player_id, match_id, team_code, Game Week, minutes_played) are left out.

# goalkeeper columns
gk_rolling = ['gk_accurate_passes', 'gk_accurate_long_balls', 
              'saves', 'saves_inside_box', 
              'goals_conceded', 'team_goals_conceded', 
              'xgot_faced', 'goals_prevented' ,
              'sweeper_actions', 'high_claim']
# defender columns (includes the derived 'tackles_won_percentage')
def_rolling = ['xg', 'xa', 'accurate_passes', 'accurate_long_balls', 'final_third_passes',
               'tackles_won', 'interceptions', 'recoveries', 'blocks', 'clearances',
               'headed_clearances', 'dribbled_past', 'duels_won', 'ground_duels_won',
               'aerial_duels_won', 'was_fouled', 'fouls_committed',
               'tackles_won_percentage']
# midfielder columns
mid_rolling = ['goals', 'assists', 'xg', 'xa', 
               'accurate_passes', 'accurate_crosses', 'accurate_long_balls','final_third_passes', 
               'total_shots', 'shots_on_target',
               'chances_created', 'touches', 'successful_dribbles', 'corners',
               'penalties_scored', 'penalties_missed', 'tackles_won', 'interceptions',
               'recoveries', 'blocks', 'clearances', 'dribbled_past', 'duels_won',
               'ground_duels_won', 'aerial_duels_won', 'was_fouled', 'fouls_committed']
# forward columns
fwd_rolling = ['goals', 'assists', 'xg', 'xa', 'xgot', 
               'accurate_passes', 'final_third_passes', 
               'total_shots', 'shots_on_target', 'chances_created', 'big_chances_missed', 
               'touches', 'touches_opposition_box', 'successful_dribbles', 'corners', 'offsides',
               'penalties_scored', 'penalties_missed', 'duels_won', 'ground_duels_won',
               'aerial_duels_won', 'was_fouled', 'fouls_committed']

def rollingFeatures(frame, rolling, groupedby='player_id', prefix='L5_Avg_', window=5):
    """Append lagged rolling-mean features computed inside each group.

    For every column in ``rolling`` a column named ``prefix + column`` is
    added. Its value on a row is the mean of that column over the up-to-
    ``window`` rows that come *before* the row within the same ``groupedby``
    group, in the order the rows appear in ``frame``. The current row is never
    part of its own average, and the first row of each group gets NaN.

    Args:
        frame: Input DataFrame, already ordered chronologically within groups.
        rolling: Names of the numeric columns to average.
        groupedby: Column name (or list of names) identifying a group.
        prefix: Prefix for the generated column names.
        window: Maximum number of preceding rows in each average.

    Returns:
        A new DataFrame with a fresh 0..n-1 index, the original columns (minus
        the helper column ``chron_idx`` when present) and the new feature
        columns appended. ``frame`` itself is not modified.
    """
    rolling = list(rolling)
    keys = list(groupedby) if isinstance(groupedby, (list, tuple)) else [groupedby]
    key_levels = list(range(len(keys)))

    # Work on a positionally indexed copy so the result never depends on the
    # caller's index (or on a 'chron_idx' column mirroring it).
    work = frame.reset_index(drop=True)

    # Trailing mean that still includes the current row; index = keys + row position.
    trailing_mean = (
        work.groupby(keys)[rolling]
        .rolling(window=window, min_periods=1)
        .mean()
    )
    # Shift inside each group so a row only sees earlier rows of the same
    # group. Nothing leaks across group boundaries and the first row is NaN.
    lagged = trailing_mean.groupby(level=key_levels).shift(1)
    # Drop the group-key levels to get back to the row positions of `work`.
    lagged = lagged.droplevel(key_levels)
    lagged.columns = [prefix + col for col in rolling]

    return work.join(lagged).drop(columns='chron_idx', errors='ignore')

In [None]:
# Add the rolling-average features to each position frame.
fe_gk_stats = rollingFeatures(frame=genuine_gk_stats, rolling=gk_rolling)
fe_def_stats = rollingFeatures(frame=genuine_def_stats, rolling=def_rolling)
fe_mid_stats = rollingFeatures(frame=genuine_mid_stats, rolling=mid_rolling)
fe_fwd_stats = rollingFeatures(frame=genuine_fwd_stats, rolling=fwd_rolling)
for label, frame in (
    ("GK", fe_gk_stats),
    ("DEF", fe_def_stats),
    ("MID", fe_mid_stats),
    ("FWD", fe_fwd_stats),
):
    print(f"Shape of Feature Engineered {label} stats: {frame.shape}")

### Merging Teams and Matches data

In [None]:
home_strength_columns = ['name', 'code', 'season', 'strength', 'strength_overall_home', 'strength_attack_home', 'strength_defence_home', 'elo']
away_strength_columns = ['name', 'code', 'season', 'strength', 'strength_overall_away', 'strength_attack_away', 'strength_defence_away', 'elo']


def _attach_team_side(matches, columns, tag, team_key):
    """Left-join the team columns onto matches for one side, prefixing them with tag.

    The join keys ('code', 'season') are not prefixed; 'code' is dropped after
    the join because team_key already carries the same value.
    """
    join_keys = ('code', 'season')
    renamed = {col: f'{tag}_{col}' for col in columns if col not in join_keys}
    side = teams_stats[columns].rename(columns=renamed)
    merged = matches.merge(
        side,
        left_on=[team_key, 'season'],
        right_on=['code', 'season'],
        how='left',
    )
    return merged.drop(columns='code', errors='ignore').copy()


# Home-team attributes first (HT_ prefix), then away-team attributes (AT_ prefix).
teams_matches = _attach_team_side(combined_matches, home_strength_columns, 'HT', 'home_team')
teams_matches = _attach_team_side(teams_matches, away_strength_columns, 'AT', 'away_team')

### Feature Reduction and Data Cleaning

In [None]:
# Re-type the ordering columns, sort the matches by game week and kickoff
# time, and drop the 'fotmob_id' column when it is present.
teams_matches = (
    teams_matches
    .assign(
        kickoff_time=lambda df: pd.to_datetime(df['kickoff_time'], format="mixed"),
        gameweek=lambda df: df['gameweek'].astype(int),
    )
    .sort_values(by=['gameweek', 'kickoff_time'])
    .reset_index(drop=True)
    .drop(columns='fotmob_id', errors='ignore')
)

#### Identifying Null Columns

In [None]:
# Columns of teams_matches that contain missing values, mapped to their NaN counts.
{key: value for key, value in dict(teams_matches.isnull().sum()).items() if value > 0}

In [None]:
# Fill missing match ELO ratings with the column median, derive the home-minus-
# away difference, then rename the two ELO columns.
ht_elo_median, at_elo_median = (
    teams_matches[elo_column].median()
    for elo_column in ('home_team_elo', 'away_team_elo')
)
teams_matches = teams_matches.assign(
    home_team_elo=teams_matches['home_team_elo'].fillna(ht_elo_median),
    away_team_elo=teams_matches['away_team_elo'].fillna(at_elo_median),
)
teams_matches['elo_diff'] = teams_matches['home_team_elo'].sub(teams_matches['away_team_elo'])
elo_renames = {'home_team_elo': 'ht_match_elo', 'away_team_elo': 'at_match_elo'}
teams_matches = teams_matches.rename(columns=elo_renames)
print(f"ELO ratings of Home Team imputed with {ht_elo_median:.2f} and Away Team imputed with {at_elo_median:.2f}")

In [None]:
# Fill missing possession and tackle-success values with each column's median.
median_cols = [
    'home_possession', 'away_possession',
    'home_tackles_won_pct', 'away_tackles_won_pct',
]
for col in median_cols:
    column_values = teams_matches[col]
    median = column_values.median()
    missing_count = column_values.isnull().sum()  # counted before the fill
    print(f"Filled {missing_count} NaNs in '{col}' with median: {median:.2f}")
    teams_matches[col] = column_values.fillna(median)

In [None]:
# Re-check after imputation: columns of teams_matches that still contain NaNs.
{key: value for key, value in dict(teams_matches.isnull().sum()).items() if value > 0}

In [None]:
# List every column name of the merged teams + matches frame.
print(list(teams_matches.columns))

In [None]:
# (rows, columns) of the merged teams + matches frame.
teams_matches.shape

In [None]:
# Column index of the merged player-match + players frame.
pms_players.columns

## Save all datasets as pickle files for Data Integration

1. Position-wise Players Match Data

In [None]:
# Persist the four feature-engineered position frames as pickle files.
pickle_dir = 'C:/exp1/Pickle Files'
# to_pickle does not create missing folders, so make sure the target exists.
os.makedirs(pickle_dir, exist_ok=True)
for pickle_name, frame in {
    'fe_gk_stats': fe_gk_stats,
    'fe_def_stats': fe_def_stats,
    'fe_mid_stats': fe_mid_stats,
    'fe_fwd_stats': fe_fwd_stats,
}.items():
    frame.to_pickle(f'{pickle_dir}/{pickle_name}.pkl')

2. Teams + Match Data

In [None]:
# Persist the merged teams + matches frame as a pickle file.
# to_pickle does not create missing folders, so make sure the target exists.
os.makedirs('C:/exp1/Pickle Files', exist_ok=True)
teams_matches.to_pickle('C:/exp1/Pickle Files/teams_matches.pkl')

In [None]:
# Data type of every column in the saved teams + matches frame.
teams_matches.dtypes

In [None]:
# Final (rows, columns) of the teams + matches frame.
teams_matches.shape