In [1]:
# Importing data

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

In [2]:
# Loading three CSV files into pandas DataFrames
injury_data = pd.read_csv('InjuryRecord.csv')
play_data = pd.read_csv('PlayList.csv')
track_data = pd.read_csv('PlayerTrackData.csv')

In [3]:
# Merging Datasets: 
combined_data = pd.merge(play_data, track_data, on=['PlayKey'], how='inner')
# Adding a new column indicating weather or not an injury occured
combined_data['InjuryOccurred'] = combined_data['PlayKey'].isin(injury_data['PlayKey']).astype(int)

In [14]:
# Taking a subset of the data (100000 random rows of data) to use
random_sampled_data = combined_data.sample(n=100000, random_state=42)
# Reseting index
random_sampled_data = random_sampled_data.reset_index(drop=True)

In [15]:
# Data Cleanup

# One-hot encoding for Weather data
weather_encoded = pd.get_dummies(random_sampled_data['Weather'], prefix='Weather')
# Combine the encoded columns with the original DataFrame
random_sampled_data = pd.concat([random_sampled_data, weather_encoded], axis=1)
# Drop the original 'Weather' column if no longer needed
random_sampled_data.drop(columns=['Weather'], inplace=True)

# Group definitions for weather categories
weather_groups = {
    'Sunny_Weather': [
        'Weather_Sunny', 'Weather_Sunny Skies', 'Weather_Sunny and clear',
        'Weather_Sunny and cold', 'Weather_Sunny and warm', 'Weather_Mostly sunny', 
        'Weather_Partly Sunny', 'Weather_Partly sunny', 
        'Weather_Sunny, Windy', 'Weather_Sunny, highs to upper 80s',  'Weather_Mostly Sunny', 'Weather_Mostly Sunny Skies'
    ],
    'Clear_Weather': [
        'Weather_Clear',
        'Weather_Clear Skies', 'Weather_Clear and Cool',
        'Weather_Clear and Sunny', 'Weather_Clear and cold',
        'Weather_Clear and sunny', 'Weather_Clear and warm',
        'Weather_Clear skies', 'Weather_Clear to Partly Cloudy', 'Weather_Partly clear'
    ],
    'Cloudy_Weather': [
        'Weather_Cloudy', 'Weather_Cloudy and Cool', 'Weather_Cloudy and cold',
        'Weather_Cloudy with periods of rain, thunder possible. Winds shifting to WNW, 10-20 mph.',
        'Weather_Cloudy, 50% change of rain', 'Weather_Cloudy, Rain',
        'Weather_Cloudy, chance of rain', 'Weather_cloudy', 'Weather_Overcast', 
        'Weather_Cloudy, fog started developing in 2nd quarter',
        'Weather_Cloudy, light snow accumulating 1-3"', 'Weather_Coudy',
        'Weather_Mostly Cloudy', 'Weather_Mostly Coudy', 'Weather_Mostly cloudy'
    ],
    'Low_Rain_Weather': [
        'Weather_10% Chance of Rain', 'Weather_30% Chance of Rain', 'Weather_Light Rain'
    ],
    'Rainy_Weather': [
        'Weather_Rain', 'Weather_Rain Chance 40%', 'Weather_Rain likely, temps in low 40s.',
        'Weather_Rain shower', 'Weather_Rainy', 'Weather_Scattered Showers',
        'Weather_Showers'
    ],
    'Indoor_Weather': [
        'Weather_Indoor', 'Weather_Indoors', 'Weather_Controlled Climate',
        'Weather_N/A (Indoors)', 'Weather_N/A Indoor'
    ],
    'Partly_Cloudy_Weather': [
        'Weather_Partly Cloudy', 'Weather_Partly Clouidy', 'Weather_Partly cloudy',
        'Weather_Party Cloudy', 'Weather_Sun & clouds'
    ],
    'Snowy_Weather': [
        'Weather_Snow', 'Weather_Heavy lake effect snow'
    ],
    'Other_Weather': [
        'Weather_Fair', 'Weather_Hazy', 'Weather_Heat Index 95'
    ]
}

for group_name, columns in weather_groups.items():
    # Check if the columns exist in the DataFrame
    columns_present = [col for col in columns if col in random_sampled_data.columns]
    # Combine the columns
    random_sampled_data[group_name] = random_sampled_data[columns_present].max(axis=1)
    # Optionally drop the old columns
    random_sampled_data.drop(columns=columns_present, inplace=True)

field_type_encoded = pd.get_dummies(random_sampled_data['FieldType'], prefix='FieldType')
# Combine the encoded columns with the original DataFrame
random_sampled_data = pd.concat([random_sampled_data, field_type_encoded], axis=1)
# Drop the original 'FieldType' column if no longer needed
random_sampled_data.drop(columns=['FieldType'], inplace=True)

stadium_type_encoded = pd.get_dummies(random_sampled_data['StadiumType'], prefix='StadiumType')
# Combine the encoded columns with the original DataFrame
random_sampled_data = pd.concat([random_sampled_data, stadium_type_encoded], axis=1)
# Drop the original 'FieldType' column if no longer needed
random_sampled_data.drop(columns=['StadiumType'], inplace=True)

stadium_type_groups = {
    'Indoor_Stadium': [
        'StadiumType_Closed Dome', 'StadiumType_Dome', 'StadiumType_Dome, closed',
        'StadiumType_Domed', 'StadiumType_Indoor', 'StadiumType_Indoor, Open Roof',
        'StadiumType_Indoor, Roof Closed', 'StadiumType_Indoors'
    ],
    'Outdoor_Stadium': [
        'StadiumType_Bowl', 'StadiumType_Cloudy', 'StadiumType_Heinz Field',
        'StadiumType_Open', 'StadiumType_Oudoor', 'StadiumType_Ourdoor',
        'StadiumType_Outddors', 'StadiumType_Outdoor', 'StadiumType_Outdoors',
        'StadiumType_Outdor', 'StadiumType_Outside'
    ],
    'Retractable_Roof_Stadium': [
        'StadiumType_Domed, Open', 'StadiumType_Domed, open',
        'StadiumType_Outdoor Retr Roof-Open', 'StadiumType_Retr. Roof - Closed',
        'StadiumType_Retr. Roof - Open', 'StadiumType_Retr. Roof Closed', 'StadiumType_Domed, closed',
        'StadiumType_Retr. Roof-Closed', 'StadiumType_Retractable Roof', 'StadiumType_Retr. Roof-Open'
    ]
}
# Iterate through stadium type groups and combine columns
for group_name, columns in stadium_type_groups.items():
    # Check if the columns exist in the DataFrame
    columns_present = [col for col in columns if col in random_sampled_data.columns]
    # Combine the columns
    random_sampled_data[group_name] = random_sampled_data[columns_present].max(axis=1)
    # Optionally drop the old columns
    random_sampled_data.drop(columns=columns_present, inplace=True)

columns_of_interest  = [
    'Sunny_Weather', 'Clear_Weather', 'Cloudy_Weather', 'Low_Rain_Weather', 
    'Rainy_Weather', 'Indoor_Weather', 'Partly_Cloudy_Weather', 'Snowy_Weather', 
    'Other_Weather', 'FieldType_Natural', 'FieldType_Synthetic', 'Indoor_Stadium', 
    'Outdoor_Stadium', 'Retractable_Roof_Stadium'
]

# Apply the replacement for each column
for column in columns_of_interest:
    random_sampled_data[column] = random_sampled_data[column].replace({True: 1, False: 0})


  random_sampled_data[column] = random_sampled_data[column].replace({True: 1, False: 0})
