### Project goal
The goal of this project is to predict the number of injuries per game using factors like weather, field type, and environmental metrics.

### EDA
Next, we will load the datasets and explore their structure to understand the data better.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [1]:
injury_data = pd.read_csv("../data/raw/InjuryRecord.csv")
play_list = pd.read_csv("../data/raw/PlayList.csv")
tracking_data = pd.read_csv("../data/processed/tracking_aggregated.csv")

NameError: name 'pd' is not defined

In [None]:
injury_data["InjurySeverity"] = injury_data[["DM_M1", "DM_M7", "DM_M28", "DM_M42"]].mul([1, 7, 28, 42]).max(axis=1)
injury_data.drop(columns=["DM_M1", "DM_M7", "DM_M28", "DM_M42"], inplace=True)

In [None]:
injury_data.head(10)

Unnamed: 0,PlayerKey,GameID,PlayKey,BodyPart,Surface,InjurySeverity
0,39873,39873-4,39873-4-32,Knee,Synthetic,42
1,46074,46074-7,46074-7-26,Knee,Natural,7
2,36557,36557-1,36557-1-70,Ankle,Synthetic,42
3,46646,46646-3,46646-3-30,Ankle,Natural,1
4,43532,43532-5,43532-5-69,Ankle,Synthetic,42
5,41145,41145-2,41145-2-60,Knee,Natural,1
6,46014,46014-10,46014-10-22,Ankle,Synthetic,42
7,44860,44860-5,44860-5-52,Knee,Natural,7
8,44806,44806-7,44806-7-61,Knee,Synthetic,1
9,45962,45962-8,45962-8-40,Ankle,Synthetic,7


In [None]:
combined_data = pd.merge(play_list,injury_data, how="left", on=['PlayKey', 'GameID', 'PlayerKey'])
combined_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 267006 entries, 0 to 267005
Data columns (total 17 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   PlayerKey       267006 non-null  int64  
 1   GameID          267006 non-null  object 
 2   PlayKey         267006 non-null  object 
 3   RosterPosition  267006 non-null  object 
 4   PlayerDay       267006 non-null  int64  
 5   PlayerGame      267006 non-null  int64  
 6   StadiumType     250096 non-null  object 
 7   FieldType       267006 non-null  object 
 8   Temperature     267006 non-null  int64  
 9   Weather         248315 non-null  object 
 10  PlayType        266639 non-null  object 
 11  PlayerGamePlay  267006 non-null  int64  
 12  Position        267006 non-null  object 
 13  PositionGroup   267006 non-null  object 
 14  BodyPart        77 non-null      object 
 15  Surface         77 non-null      object 
 16  InjurySeverity  77 non-null      float64
dtypes: float64

In [None]:
tracking_and_injury_data = pd.merge(combined_data, tracking_data, how="left", on=['PlayKey'])
tracking_and_injury_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 267006 entries, 0 to 267005
Data columns (total 23 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   PlayerKey          267006 non-null  int64  
 1   GameID             267006 non-null  object 
 2   PlayKey            267006 non-null  object 
 3   RosterPosition     267006 non-null  object 
 4   PlayerDay          267006 non-null  int64  
 5   PlayerGame         267006 non-null  int64  
 6   StadiumType        250096 non-null  object 
 7   FieldType          267006 non-null  object 
 8   Temperature        267006 non-null  int64  
 9   Weather            248315 non-null  object 
 10  PlayType           266639 non-null  object 
 11  PlayerGamePlay     267006 non-null  int64  
 12  Position           267006 non-null  object 
 13  PositionGroup      267006 non-null  object 
 14  BodyPart           77 non-null      object 
 15  Surface            77 non-null      object 
 16  In

In [None]:
for col in ['BodyPart', 'Surface', 'RosterPosition', 'StadiumType', 'FieldType', 'Weather', 'Position', 'PlayType']:
  print(f"Unique values in {col}: {tracking_and_injury_data[col].unique()}\n")

Unique values in BodyPart: [nan 'Knee' 'Foot' 'Ankle']

Unique values in Surface: [nan 'Natural' 'Synthetic']

Unique values in RosterPosition: ['Quarterback' 'Wide Receiver' 'Linebacker' 'Running Back'
 'Defensive Lineman' 'Tight End' 'Safety' 'Cornerback' 'Offensive Lineman'
 'Kicker']

Unique values in StadiumType: ['Outdoor' 'Indoors' 'Oudoor' 'Outdoors' 'Open' 'Closed Dome'
 'Domed, closed' nan 'Dome' 'Indoor' 'Domed' 'Retr. Roof-Closed'
 'Outdoor Retr Roof-Open' 'Retractable Roof' 'Ourdoor'
 'Indoor, Roof Closed' 'Retr. Roof - Closed' 'Bowl' 'Outddors'
 'Retr. Roof-Open' 'Dome, closed' 'Indoor, Open Roof' 'Domed, Open'
 'Domed, open' 'Heinz Field' 'Cloudy' 'Retr. Roof - Open'
 'Retr. Roof Closed' 'Outdor' 'Outside']

Unique values in FieldType: ['Synthetic' 'Natural']

Unique values in Weather: ['Clear and warm' 'Mostly Cloudy' 'Sunny' 'Clear' 'Cloudy'
 'Cloudy, fog started developing in 2nd quarter' 'Rain' 'Partly Cloudy'
 'Mostly cloudy' 'Cloudy and cold' 'Cloudy and Cool' 'Rai

### Standardizing the StadiumType column

In [None]:
stadium_mapping = {
    'Indoors': 'Indoors', 'Indoor': 'Indoors', 'Indoor, Roof Closed': 'Indoors',
    'Dome': 'Closed', 'Closed Dome': 'Closed', 'Domed, closed': 'Closed',
    'Retractable Roof': 'Closed', 'Retr. Roof-Closed': 'Closed', 'Retr. Roof - Closed': 'Closed',
    'Retr. Roof - Open': 'Open', 'Indoor, Open Roof': 'Open',
    'Open': 'Open', 'Outdoor': 'Open', 'Outdoors': 'Open', 'Outddors': 'Open', 'Oudoor': 'Open'
}

tracking_and_injury_data['StadiumType'] = tracking_and_injury_data['StadiumType'].map(stadium_mapping)
tracking_and_injury_data['StadiumType']

0         Open
1         Open
2         Open
3         Open
4         Open
          ... 
267001    Open
267002    Open
267003    Open
267004    Open
267005    Open
Name: StadiumType, Length: 267006, dtype: object

### Replacing outliers in the Weather column


In [None]:
outliers = tracking_and_injury_data['Temperature'] == -999
print(f"Number of outliers: {outliers.sum()}")

tracking_and_injury_data['Temperature'] = tracking_and_injury_data['Temperature'].replace(-999, np.nan)
tracking_and_injury_data.loc[outliers, 'Temperature']

# Replace outliers based on the temperature mapping from the play_list
temperature_mapping = play_list.groupby('PlayKey')['Temperature'].first()
tracking_and_injury_data['Temperature'] = tracking_and_injury_data['Temperature'].fillna(tracking_and_injury_data['PlayKey'].map(temperature_mapping))
combined_data['Temperature'].isnull().sum()

Number of outliers: 24170


np.int64(0)

### Standardizing the Weather column

In [None]:
weather_mapping = {
  'Clear': 'Clear',
  'Clear Skies': 'Clear',
  'Clear skies': 'Clear',
  'Fair': 'Clear',
  'Clear and warm': 'Clear',
  'Partly Cloudy': 'Partly Cloudy',
  'Sun & clouds': 'Partly Cloudy',
  'Mostly Sunny': 'Partly Cloudy',
  'Mostly sunny': 'Partly Cloudy',
  'Cloudy': 'Cloudy',
  'Coudy': 'Cloudy',
  'Mostly cloudy': 'Cloudy',
  'Cloudy and Cool': 'Cloudy',
  'Rain': 'Rainy',
  'Light Rain': 'Rainy',
  'Rain shower': 'Rainy',
  'Cloudy with periods of rain, thunder possible. Winds shifting to WNW, 10-20 mph.': 'Rainy',
  'Cloudy, 50% change of rain': 'Rainy',
  'Indoor': 'Indoor',
  'Indoors': 'Indoor',
  'Controlled Climate': 'Indoor',
  'Cold': 'Cold',
  'Sunny': 'Sunny'
}

tracking_and_injury_data['Weather'] = tracking_and_injury_data['Weather'].map(weather_mapping)
tracking_and_injury_data['Weather'].value_counts()

Weather
Cloudy           64207
Sunny            51728
Partly Cloudy    33118
Clear            31047
Indoor           12854
Rainy            11683
Cold               549
Name: count, dtype: int64

In [None]:
tracking_and_injury_data['StadiumType'].isnull().sum()

np.int64(25548)

In [None]:
tracking_and_injury_data['StadiumType'] = tracking_and_injury_data['StadiumType'].fillna(tracking_and_injury_data['StadiumType'].mode()[0])
tracking_and_injury_data['StadiumType'].isnull().sum()

np.int64(0)

In [None]:
tracking_and_injury_data['Weather'].isnull().sum()

np.int64(61820)

In [None]:
tracking_and_injury_data['Weather'] = tracking_and_injury_data['Weather'].fillna(tracking_and_injury_data['Weather'].mode()[0])
tracking_and_injury_data['Weather'].isnull().sum()

np.int64(0)

In [None]:
tracking_and_injury_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 267006 entries, 0 to 267005
Data columns (total 23 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   PlayerKey          267006 non-null  int64  
 1   GameID             267006 non-null  object 
 2   PlayKey            267006 non-null  object 
 3   RosterPosition     267006 non-null  object 
 4   PlayerDay          267006 non-null  int64  
 5   PlayerGame         267006 non-null  int64  
 6   StadiumType        267006 non-null  object 
 7   FieldType          267006 non-null  object 
 8   Temperature        267006 non-null  float64
 9   Weather            267006 non-null  object 
 10  PlayType           266639 non-null  object 
 11  PlayerGamePlay     267006 non-null  int64  
 12  Position           267006 non-null  object 
 13  PositionGroup      267006 non-null  object 
 14  BodyPart           77 non-null      object 
 15  Surface            77 non-null      object 
 16  In

In [None]:
tracking_and_injury_data = tracking_and_injury_data.dropna(subset=['s_mean'])
tracking_and_injury_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 266961 entries, 0 to 267005
Data columns (total 23 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   PlayerKey          266961 non-null  int64  
 1   GameID             266961 non-null  object 
 2   PlayKey            266961 non-null  object 
 3   RosterPosition     266961 non-null  object 
 4   PlayerDay          266961 non-null  int64  
 5   PlayerGame         266961 non-null  int64  
 6   StadiumType        266961 non-null  object 
 7   FieldType          266961 non-null  object 
 8   Temperature        266961 non-null  float64
 9   Weather            266961 non-null  object 
 10  PlayType           266594 non-null  object 
 11  PlayerGamePlay     266961 non-null  int64  
 12  Position           266961 non-null  object 
 13  PositionGroup      266961 non-null  object 
 14  BodyPart           77 non-null      object 
 15  Surface            77 non-null      object 
 16  InjuryS

In [None]:
tracking_and_injury_data['InjurySeverity'].fillna(0, inplace=True)
tracking_and_injury_data.drop(columns=['Surface'], inplace=True)
tracking_and_injury_data['BodyPart'].fillna('None', inplace=True)
tracking_and_injury_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 266961 entries, 0 to 267005
Data columns (total 22 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   PlayerKey          266961 non-null  int64  
 1   GameID             266961 non-null  object 
 2   PlayKey            266961 non-null  object 
 3   RosterPosition     266961 non-null  object 
 4   PlayerDay          266961 non-null  int64  
 5   PlayerGame         266961 non-null  int64  
 6   StadiumType        266961 non-null  object 
 7   FieldType          266961 non-null  object 
 8   Temperature        266961 non-null  float64
 9   Weather            266961 non-null  object 
 10  PlayType           266594 non-null  object 
 11  PlayerGamePlay     266961 non-null  int64  
 12  Position           266961 non-null  object 
 13  PositionGroup      266961 non-null  object 
 14  BodyPart           266961 non-null  object 
 15  InjurySeverity     266961 non-null  float64
 16  s_mean 

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  tracking_and_injury_data['InjurySeverity'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  tracking_and_injury_data['BodyPart'].fillna('None', inplace=True)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

In [None]:
combined_data_with_tracking_and_dummies = pd.get_dummies(tracking_and_injury_data,
    columns=['StadiumType', 'FieldType', 'Weather', 'PositionGroup', 'BodyPart', 'PlayType'],
    drop_first=True
)
combined_data_with_tracking_and_dummies.head(10)

Unnamed: 0,PlayerKey,GameID,PlayKey,RosterPosition,PlayerDay,PlayerGame,Temperature,PlayerGamePlay,Position,InjurySeverity,...,PlayType_Extra Point,PlayType_Field Goal,PlayType_Kickoff,PlayType_Kickoff Not Returned,PlayType_Kickoff Returned,PlayType_Pass,PlayType_Punt,PlayType_Punt Not Returned,PlayType_Punt Returned,PlayType_Rush
0,26624,26624-1,26624-1-1,Quarterback,1,1,63.0,1,QB,0.0,...,False,False,False,False,False,True,False,False,False,False
1,26624,26624-1,26624-1-2,Quarterback,1,1,63.0,2,QB,0.0,...,False,False,False,False,False,True,False,False,False,False
2,26624,26624-1,26624-1-3,Quarterback,1,1,63.0,3,QB,0.0,...,False,False,False,False,False,False,False,False,False,True
3,26624,26624-1,26624-1-4,Quarterback,1,1,63.0,4,QB,0.0,...,False,False,False,False,False,False,False,False,False,True
4,26624,26624-1,26624-1-5,Quarterback,1,1,63.0,5,QB,0.0,...,False,False,False,False,False,True,False,False,False,False
5,26624,26624-1,26624-1-6,Quarterback,1,1,63.0,6,QB,0.0,...,False,False,False,False,False,False,False,False,False,True
6,26624,26624-1,26624-1-7,Quarterback,1,1,63.0,7,QB,0.0,...,False,False,False,False,False,True,False,False,False,False
7,26624,26624-1,26624-1-8,Quarterback,1,1,63.0,8,QB,0.0,...,False,False,False,False,False,True,False,False,False,False
8,26624,26624-1,26624-1-9,Quarterback,1,1,63.0,9,QB,0.0,...,False,False,False,False,False,False,False,False,False,True
9,26624,26624-1,26624-1-10,Quarterback,1,1,63.0,10,QB,0.0,...,False,False,False,False,False,True,False,False,False,False


In [None]:
drop_cols = ["PlayerKey", "GameID", "PlayKey", "RosterPosition", "Position"]  # Adjust as needed
training_data = combined_data_with_tracking_and_dummies.drop(columns=drop_cols)

In [None]:
X = training_data.drop(columns=["InjurySeverity"], axis=1)
y = training_data["InjurySeverity"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
scaler = StandardScaler()

# Fit the scaler on the training set, then transform both train and test
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
lr = LinearRegression()
lr.fit(X_train_scaled, y_train)


In [None]:
y_pred = lr.predict(X_test_scaled)

# Compute evaluation metrics
rmse = mean_squared_error(y_test, y_pred)  # Root Mean Squared Error
r2 = r2_score(y_test, y_pred)  # R-squared

print("RMSE: ", rmse)
print("R2: ", r2)
print("Intercept: ", lr.intercept_)
print("Coefficients:", lr.coef_)

RMSE:  0.06072676915563853
R2:  0.7032224784010478
Intercept:  0.00525827839376696
Coefficients: [-6.20071774e-04  2.80716793e-04  7.71350174e-04  9.56122933e-04
 -1.89123747e-03  3.66104727e-03 -2.17233182e-03  7.35233041e-04
  7.16024952e-05 -5.40647862e-04 -3.49762428e-05 -7.00781425e-04
  1.76364946e-03 -9.40098361e-04 -1.88021993e-03 -1.40268908e-03
  3.53416730e-04 -1.48157387e-03  1.05502271e-04  1.52555473e-04
 -7.04594246e-04 -7.73223412e-04 -2.01032540e-04  9.80533132e-04
  2.83698556e-05 -5.77369918e-04 -6.07534541e-04  8.63157399e-02
  3.18075152e-02 -2.60722469e-01  2.96952880e-04  2.15035263e-04
  1.39618976e-03  1.62760241e-04 -3.31737103e-04  2.47831001e-04
 -6.55108366e-05 -5.78831699e-05  6.43670870e-04  4.95284671e-04]


In [None]:
from sklearn.model_selection import cross_val_score

lr_cv = LinearRegression()
scores = cross_val_score(lr_cv, X_train_scaled, y_train, cv=5, scoring='neg_root_mean_squared_error')
print("Cross-Validation RMSE (5 folds):", -scores.mean())

Cross-Validation RMSE (5 folds): 0.29197809129667457


In [None]:
from sklearn.metrics import r2_score

def regression_accuracy(y_true, y_pred):
    r2 = r2_score(y_true, y_pred)
    # Ensure we don't have negative accuracy
    accuracy = max(0, r2) * 100
    return accuracy

# Assuming y_test and y_pred are defined
accuracy_percentage = regression_accuracy(y_test, y_pred)
print("Model Accuracy: {:.2f}%".format(accuracy_percentage))

Model Accuracy: 70.32%
