In [191]:
import os
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from collections import Counter
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, balanced_accuracy_score, jaccard_score
from sklearn.linear_model import RidgeClassifier as RC
from sklearn.neural_network import MLPClassifier as MLP

from common import load_data


In [192]:
# --- Load the dataset and check distributions --- #
main_df = load_data('exp2_data.csv', verbose=False)


In [193]:
# --- Prepare dataset --- #
# one-hot encoding of categorical features
# main_df = pd.get_dummies(main_df, columns=['Position', 'StadiumType', 'FieldType', 'Weather'], dtype=int)

# drop uneeded columns
main_df = main_df.drop(columns=['GameID', 'Position', 'StadiumType', 'FieldType', 'Weather', 'GameDist'])

# make new target variable for injury occurence
main_df['InjurySeverity'] = main_df['InjurySeverity'].apply(lambda x: 1 if x > 0 else 0)
main_df.rename(columns={'InjurySeverity': 'Injury'}, inplace=True)

# create dataset with more balanced classes
injury_players_df = main_df[main_df['Injury'] == 1]['PlayerKey'].unique()
main_df = main_df[main_df['PlayerKey'].isin(injury_players_df)]

injury_df = main_df[main_df['Injury'] == 1]
non_injury_df = main_df[main_df['Injury'] == 0]

random_samples = non_injury_df.sample(n=injury_df.shape[0], random_state=42)
main_df = pd.concat([injury_df, random_samples], axis=0)

print('Num of injuries: {}'.format(injury_df.shape[0]))
print('Num of unique players: {}'.format(len(main_df['PlayerKey'].unique())))
print('Num of columns: {}'.format(len(main_df.columns)))
print('Num of rows: {}'.format(len(main_df)))
print(main_df.head(5))


Num of injuries: 105
Num of unique players: 100
Num of columns: 4
Num of rows: 210
     PlayerKey  PlayerGame  CumulativeDist  Injury
118      31070           3         4433.92       1
167      31933          20        36877.01       1
210      33337           2         3258.37       1
216      33337           8        12653.17       1
228      33474          19        41041.52       1


In [194]:
# --- Split data for training and testing --- #
train_data, test_data = train_test_split(main_df, test_size=0.2, random_state=42, shuffle=True)

target_feature = 'Injury'
y_train = train_data[target_feature]
y_test = test_data[target_feature]
x_train = train_data.drop([target_feature], axis=1)
x_test = test_data.drop([target_feature], axis=1)


In [195]:
# --- Train and Test Ridge Classifier --- #
ridge = RC(alpha=1, random_state=42)
ridge.fit(x_train, y_train)
y_pred = ridge.predict(x_test)
print('Accuracy: {:.5f}'.format(ridge.score(x_test, y_test)))
print('Precision: {:.5f}'.format(precision_score(y_test, y_pred)))
print('Recall: {:.5f}'.format(recall_score(y_test, y_pred)))
print('F1 Score: {:.5f}'.format(f1_score(y_test, y_pred)))
print('Balanced Accuracy: {:.5f}'.format(balanced_accuracy_score(y_test, y_pred)))
print('Jaccard Score: {:.5f}'.format(jaccard_score(y_test, y_pred)))


Accuracy: 0.61905
Precision: 0.55556
Recall: 0.78947
F1 Score: 0.65217
Balanced Accuracy: 0.63387
Jaccard Score: 0.48387


In [196]:
# --- Train and Test Neural Network --- #
mlp = MLP(hidden_layer_sizes=(64, 32, 16), max_iter=1000, random_state=42, verbose=True)
mlp.fit(x_train, y_train)
y_pred = mlp.predict(x_test)
print('Accuracy: {:.5f}'.format(mlp.score(x_test, y_test)))
print('Precision: {:.5f}'.format(precision_score(y_test, y_pred)))
print('Recall: {:.5f}'.format(recall_score(y_test, y_pred)))
print('F1 Score: {:.5f}'.format(f1_score(y_test, y_pred)))
print('Balanced Accuracy: {:.5f}'.format(balanced_accuracy_score(y_test, y_pred)))
print('Jaccard Score: {:.5f}'.format(jaccard_score(y_test, y_pred)))


Iteration 1, loss = 17.59275716
Iteration 2, loss = 17.59275713
Iteration 3, loss = 17.59275708
Iteration 4, loss = 15.77703222
Iteration 5, loss = 19.31405837
Iteration 6, loss = 18.45093914
Iteration 7, loss = 18.45093908
Iteration 8, loss = 18.45093902
Iteration 9, loss = 18.45093895
Iteration 10, loss = 14.88141617
Iteration 11, loss = 16.81139500
Iteration 12, loss = 15.65556441
Iteration 13, loss = 16.46425688
Iteration 14, loss = 15.93476518
Iteration 15, loss = 16.60890908
Iteration 16, loss = 16.63410803
Iteration 17, loss = 15.84778364
Iteration 18, loss = 14.58459233
Iteration 19, loss = 15.51692019
Iteration 20, loss = 18.45093815
Iteration 21, loss = 18.40540021
Iteration 22, loss = 15.45319227
Iteration 23, loss = 16.97089454
Iteration 24, loss = 17.87321622
Iteration 25, loss = 15.51770786
Iteration 26, loss = 15.99664757
Iteration 27, loss = 14.91611542
Iteration 28, loss = 11.06208231
Iteration 29, loss = 13.08653547
Iteration 30, loss = 14.69522365
Iteration 31, loss 