# Import Libraries

In [2]:
import pandas as pd
import os
import requests
import time
from bs4 import BeautifulSoup
from tqdm import tqdm


# Data Preparation

In [2]:
# Folder that holds the per-league, per-season injury CSV files.
# Built with os.path.join so the path also resolves on non-Windows systems.
data_folder = os.path.join("Data", "InjuryDataPerLeaguePerSeason")

# One DataFrame per input file; they are stacked into a single frame below
dataframes = []

# os.listdir returns entries in arbitrary order, so sort the names to make
# the processing order (and the log output) reproducible between runs.
for file in sorted(os.listdir(data_folder)):
    # Only process files named player_injury_data_<League>_<Season>.csv
    if not (file.startswith("player_injury_data_") and file.endswith(".csv")):
        continue

    file_path = os.path.join(data_folder, file)
    print("Reading: ", file_path)
    df = pd.read_csv(file_path)

    # The first three '_'-separated parts are the fixed "player_injury_data"
    # prefix and the last part is "<Season>.csv"; everything in between is the
    # league name, re-joined in case the league name itself contains '_'.
    parts = file.split('_')
    league_name = '_'.join(parts[3:-1])
    season_year = parts[-1].split('.')[0]  # kept as a string, e.g. "2021"

    # Tag every row with the league and season it came from
    df['League'] = league_name
    df['Season Year'] = season_year

    dataframes.append(df)

# Fail with an actionable message instead of pandas' generic
# "No objects to concatenate" when the folder has no matching files.
if not dataframes:
    raise ValueError(
        f"No 'player_injury_data_*.csv' files found in {data_folder!r}"
    )

# Stack the per-file frames row-wise (this is a concatenation, not a
# key-based merge) and renumber the index from 0.
merged_df = pd.concat(dataframes, ignore_index=True)

# Order rows by player, then season, then league. Rebinding the name instead
# of sorting in place keeps the cell safe to re-run.
merged_df = merged_df.sort_values(by=['Player ID', 'Season Year', 'League'])

# # Save the merged dataset to a CSV file
# output_path = os.path.join(data_folder, 'merged_player_injury_data.csv')
# merged_df.to_csv(output_path, index=False)

# print(f"Merged dataset saved to {output_path}")

Reading:  Data\InjuryDataPerLeaguePerSeason\player_injury_data_Bundesliga_2021.csv
Reading:  Data\InjuryDataPerLeaguePerSeason\player_injury_data_Bundesliga_2022.csv
Reading:  Data\InjuryDataPerLeaguePerSeason\player_injury_data_Bundesliga_2023.csv
Reading:  Data\InjuryDataPerLeaguePerSeason\player_injury_data_Bundesliga_2024.csv
Reading:  Data\InjuryDataPerLeaguePerSeason\player_injury_data_La Liga_2021.csv
Reading:  Data\InjuryDataPerLeaguePerSeason\player_injury_data_La Liga_2022.csv
Reading:  Data\InjuryDataPerLeaguePerSeason\player_injury_data_La Liga_2023.csv
Reading:  Data\InjuryDataPerLeaguePerSeason\player_injury_data_La Liga_2024.csv
Reading:  Data\InjuryDataPerLeaguePerSeason\player_injury_data_Premier League_2021.csv
Reading:  Data\InjuryDataPerLeaguePerSeason\player_injury_data_Premier League_2022.csv
Reading:  Data\InjuryDataPerLeaguePerSeason\player_injury_data_Premier League_2023.csv
Reading:  Data\InjuryDataPerLeaguePerSeason\player_injury_data_Premier League_2024.csv


In [6]:
# Load the injury records and the per-season statistics from their CSV files
df_injuries = pd.read_csv('df_injuries.csv')
df_season_stats = pd.read_csv('df_season_stats.csv')

# Left join on player name, player id and season: every injury row is kept,
# and stats columns stay empty where no matching stats row exists.
# Non-key columns present in both frames are disambiguated with the
# "_injuries" / "_stats" suffixes.
df_merged = df_injuries.merge(
    df_season_stats,
    how="left",
    on=["Player", "Player ID", "Season"],
    suffixes=("_injuries", "_stats"),
)

# Write the combined dataset to disk without the row index
df_merged.to_csv('dataset.csv', index=False)


In [7]:
# Display the merged frame; the recorded output below shows only the first
# and last rows, with the middle elided.
df_merged

Unnamed: 0,Player,Player ID,Season,Injury,From,Until,Days Out,Games Missed,Club_injuries,Competition,Appearances,Club_stats,Minutes Played
0,Eldin Jakupović,2857,22/23,Leg injury,"Apr 13, 2023","May 8, 2023",25 days,5,Los Angeles FC,CONCACAF Champions Cup,-,CONCACAF CL,-
1,Eldin Jakupović,2857,22/23,Leg injury,"Apr 13, 2023","May 8, 2023",25 days,5,Los Angeles FC,Premier League,-,Premier League,-
2,James Milner,3333,24/25,Hamstring injury,"Aug 31, 2024","Sep 14, 2024",14 days,2,Brighton & Hove Albion,Premier League,3,Premier League,172'
3,James Milner,3333,23/24,Muscle injury,"Jan 31, 2024","Jun 12, 2024",133 days,19,Brighton & Hove Albion,Premier League,15,Premier League,778'
4,James Milner,3333,23/24,Muscle injury,"Jan 31, 2024","Jun 12, 2024",133 days,19,Brighton & Hove Albion,FA Cup,-,FA Cup,-
...,...,...,...,...,...,...,...,...,...,...,...,...,...
98333,Berhan Deniz,1196380,23/24,Inguinal hernia,"Apr 26, 2024","May 27, 2024",31 days,5,Samsunspor,U19 Süper Lig,11,U19 Süper Lig,770'
98334,Berhan Deniz,1196380,23/24,Inguinal hernia,"Apr 26, 2024","May 27, 2024",31 days,5,Samsunspor,Süper Lig,1,Süper Lig,3'
98335,Berhan Deniz,1196380,23/24,Inguinal hernia,"Apr 26, 2024","May 27, 2024",31 days,5,Samsunspor,Türkiye Kupasi,3,Türkiye Kupasi,98'
98336,Erdem Güleç,1230230,24/25,Broken tibia,"Jul 20, 2024",,134 days,14,Sivasspor,,,,


In [8]:
# Count the missing (NaN) values in each column of the merged dataset
df_merged.isna().sum()

Player              0
Player ID           0
Season              0
Injury              0
From               12
Until             712
Days Out            0
Games Missed        0
Club_injuries       0
Competition       706
Appearances       702
Club_stats        702
Minutes Played    702
dtype: int64

In [9]:
# Number of distinct injury labels in the merged dataset
df_merged['Injury'].nunique()

323

In [10]:
# List the distinct injury labels to inspect how they are spelled and
# capitalised (the labels are free text with mixed casing).
df_merged['Injury'].unique()

array(['Leg injury', 'Hamstring injury', 'Muscle injury', 'concussion',
       'unknown injury', 'Calf injury', 'Hamstring strain', 'Hip injury',
       'bruise', 'Knee surgery', 'Knee problems',
       'Achilles tendon problems', 'Achilles tendon irritation',
       'Knee injury', 'Muscle fatigue', 'Adductor injury', 'Corona virus',
       'Calf muscle tear', 'influenza', 'Thigh problems', 'surgery',
       'strain', 'Cruciate ligament tear', 'muscular problems',
       'Broken hand', 'Ill', 'Acromioclavicular joint dislocation',
       'Torn muscle fiber', 'Torn muscle bundle',
       'Internal ligament tear', 'Ankle injury', 'Knee bruise',
       'Ligament injury', 'Abdominal problems', 'minor knock', 'Knock',
       'Broken nose bone', 'Back problems', 'Broken finger', 'Virus',
       'Rest', 'Hip flexor problems', 'Arm injury', 'Adductor pain',
       'Metacarpal fracture', 'Internal ligament strain',
       'Dental surgery', 'flesh wound', 'Bursitis', 'Quarantine',
       'Rib fr