# Injury Prediction Modeling Notebook

# Step 1: Load Libraries

In [36]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier


# Step 2: Load Data

In [44]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [45]:
print(f"Number of null values in train set:{df_train.isna().sum()}")

Number of null values in train set:Player                   0
Player ID                0
Season                   0
Injury                   0
From                     0
Days Out                 0
Games Missed             0
Club                     0
Appearances              0
League                   0
Minutes Played           0
is_injurydata_missing    0
target                   0
dtype: int64


In [46]:
print(f"Number of null values in test set:{df_test.isna().sum()}")

Number of null values in test set:Player                   0
Player ID                0
Season                   0
Injury                   0
From                     0
Days Out                 0
Games Missed             0
Club                     0
Appearances              0
League                   0
Minutes Played           0
is_injurydata_missing    0
target                   0
dtype: int64


In [47]:
print("Days Out:", df_train.loc[df_train['Days Out']<0].shape)
print("Games Missed:", df_train.loc[df_train['Games Missed']<0].shape)
print("target:", df_train.loc[df_train['target']<0].shape)
print("Appearances:", df_train.loc[df_train['Appearances']<0].shape)
print("is_injurydata_missing':", df_train.loc[df_train['is_injurydata_missing']==1].shape)


Days Out: (0, 13)
Games Missed: (0, 13)
target: (0, 13)
Appearances: (131, 13)
is_injurydata_missing': (131, 13)


# Step 3: Feature Engineering

In [48]:
def target_encoding(df, col, target_col='target', min_samples=1):
    encoding_map = df.groupby(col)[target_col].agg(['mean', 'count'])
    encoding_map = encoding_map.loc[encoding_map['count'] >= min_samples, 'mean'].to_dict()
    return df[col].map(encoding_map).fillna(df[target_col].mean())


In [75]:
def feature_engineering(df, apply_target_encoding=True):
    df['From'] = pd.to_datetime(df['From'], errors='coerce')

    # Create temporal features
    df['Year'] = df['From'].dt.year
    df['Month'] = df['From'].dt.month
    df['Day'] = df['From'].dt.day
    df['Weekday'] = df['From'].dt.weekday

    # Duration ratios
    df['Days_per_Game'] = df['Days Out'] / (df['Games Missed'] + 1)
    df['Minutes_per_Appearance'] = df['Minutes Played'] / (df['Appearances'] + 2)

    # Time since last game
    df['Days_Since_Last_Game'] = (pd.to_datetime('today') - df['From']).dt.days

    # Injury-related features
    df['Injury_Severity'] = df['Days Out'] / (df['Appearances'] + 2)

    # Cumulative injury history
    df['Injuries_in_League'] = df.groupby('League')['target'].cumsum() - df['target']
    df['Previous_Total_Injuries'] = df.groupby('Player ID')['target'].cumsum() - df['target']
    df['Previous_Season_Injuries'] = df[df['Season'] < df['Season'].max()].groupby('Player ID')['target'].transform('sum')

    # Target and label encoding for categorical features
    categorical_cols = ['Club', 'League', 'Season']
    encoder = LabelEncoder()
    for col in categorical_cols:
        df[f'{col}_LE'] = encoder.fit_transform(df[col].astype(str))
        if col in ['Club', 'League'] and apply_target_encoding:
            df[f'{col}_TE'] = target_encoding(df, col)

    df.drop(columns=categorical_cols, inplace=True)
    return df

In [80]:
# Apply Feature Engineering
train = feature_engineering(df_train.copy(), apply_target_encoding=True)
test = feature_engineering(df_test.copy(), apply_target_encoding=True)

# Step 4: Prepare Data for Modeling
train_sorted = train.sort_values(by=['From', 'Player ID']).reset_index(drop=True)
X_train = train_sorted.drop(['target', 'Injury', 'Player', 'Player ID', 'From'], axis=1)
y_train = train_sorted['target']
X_test = test.drop(['target', 'Injury', 'Player', 'Player ID', 'From'], axis=1)
y_test = test['target']

In [83]:
X_train['date'] = X_train

Unnamed: 0,Days Out,Games Missed,Appearances,Minutes Played,is_injurydata_missing,Year,Month,Day,Weekday,Days_per_Game,...,Days_Since_Last_Game,Injury_Severity,Injuries_in_League,Previous_Total_Injuries,Previous_Season_Injuries,Club_LE,Club_TE,League_LE,League_TE,Season_LE
0,52.0,8.0,22.0,1.0,False,2019,1,25,4,5.777778,...,2151,2.166667,1,0,0.0,458,0.000000,89,0.027273,0
1,52.0,8.0,1.0,23.0,False,2019,1,25,4,5.777778,...,2151,17.333333,1,0,0.0,458,0.000000,63,0.011765,0
2,153.0,19.0,16.0,936.0,False,2019,4,28,6,7.650000,...,2058,8.500000,12,0,0.0,388,0.022321,158,0.018889,0
3,153.0,19.0,4.0,316.0,False,2019,4,28,6,7.650000,...,2058,25.500000,4,0,0.0,388,0.022321,77,0.008011,0
4,195.0,36.0,8.0,720.0,False,2019,5,6,0,5.270270,...,2050,19.500000,0,0,3.0,25,1.000000,216,0.200000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15460,95.0,23.0,2.0,76.0,False,2024,6,27,3,3.958333,...,171,23.750000,7,0,,415,0.000000,68,0.016327,2
15461,95.0,23.0,3.0,173.0,False,2024,6,27,3,3.958333,...,171,19.000000,3,0,,415,0.000000,77,0.008011,2
15462,95.0,23.0,1.0,70.0,False,2024,6,27,3,3.958333,...,171,31.666667,0,0,,415,0.000000,69,0.025641,2
15463,31.0,5.0,-1.0,-1.0,True,2024,7,13,5,5.166667,...,155,31.000000,2,0,,244,0.000000,280,0.106870,2


In [None]:
# Step 5: Cross-Validation with Time-Series Split
def cross_validate_model(model, X, y, dates, cv_method='time_series', num_splits=3):
    if cv_method == 'time_series':
        cv = TimeSeriesSplit(n_splits=num_splits)
    
    for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
        X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
        y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]
        val_dates = dates.iloc[val_idx]
        train_dates = dates.iloc[train_idx]

        # Calculate class distribution in the validation set
        pos_ratio = y_val_fold.sum() / len(y_val_fold)
        neg_ratio = 1 - pos_ratio
        print(f"Fold {fold + 1}: Positive Ratio = {pos_ratio:.2f}, Negative Ratio = {neg_ratio:.2f}")
        print(f"Date Range in Validation: {val_dates.min().date()} to {val_dates.max().date()}")
        print(f"Date Range in Training: {train_dates.min().date()} to {train_dates.max().date()}")

        model.fit(X_train_fold, y_train_fold)
        y_pred_val = model.predict(X_val_fold)
        evaluate_model(y_val_fold, y_pred_val)
    
# Evaluation Function
def evaluate_model(y_true, y_pred):
    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))
    print("Classification Report:\n", classification_report(y_true, y_pred))
    print("Accuracy Score:", accuracy_score(y_true, y_pred))

# Example Model Evaluation
model = RandomForestClassifier(random_state=42)
cross_validate_model(model, X_train, y_train, train_sorted['From'], num_splits=3)

# Final Evaluation on Test Set
y_pred = model.predict(X_test)
evaluate_model(y_test, y_pred)


Fold 1: Positive Ratio = 0.02, Negative Ratio = 0.98
Date Range in Validation: 2020-06-18 to 2023-02-04
Date Range in Training: 2019-01-25 to 2020-06-18
Confusion Matrix:
 [[3788    3]
 [   4   71]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      3791
           1       0.96      0.95      0.95        75

    accuracy                           1.00      3866
   macro avg       0.98      0.97      0.98      3866
weighted avg       1.00      1.00      1.00      3866

Accuracy Score: 0.9981893429901707
Fold 2: Positive Ratio = 0.04, Negative Ratio = 0.96
Date Range in Validation: 2023-02-04 to 2023-10-31
Date Range in Training: 2019-01-25 to 2023-02-04
Confusion Matrix:
 [[3730    0]
 [  48   88]]
Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      0.99      3730
           1       1.00      0.65      0.79       136

    accuracy                

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
