In [None]:
import lightgbm as lgb
import pandas as pd
import sklearn
import numpy as np

In [None]:

# Location of the merged dataset, relative to the notebook's working directory.
file_path = 'finalMergedData.csv'

# Read the CSV into the working frame used by every later cell.
df = pd.read_csv(filepath_or_buffer=file_path)

# Show the column labels of the freshly loaded frame as the cell output.
df.columns

In [None]:
# Columns removed from the working frame before any further processing.
# (The earlier bare `df.columns` expression was not the cell's last statement,
# so it produced no output and is omitted.)
columns = [
    'gameId',
    'ballCarrierId',
    'ballCarrierDisplayName',
    'playDescription',
    'passResult',
    'passLength',
    'penaltyYards',
    'prePenaltyPlayResult',
    'playNullifiedByPenalty',
    'homeTeamWinProbabilityAdded',
    'visitorTeamWinProbilityAdded',
    'expectedPointsAdded',
    'foulNFLId1',
    'foulNFLId2',
    'frameId',
    'Full Name',
    'teamId',
    'foulName1',
    'foulName2',
]

# Rebinds `df`; re-running this cell without reloading the CSV raises KeyError
# because the columns are already gone.
df = df.drop(labels=columns, axis=1)

In [None]:
# This data-processing code is adapted from another author's Kaggle notebook:
# https://www.kaggle.com/code/mansooralam559/classification-model-for-nfl-big-data-bowl-2024
def convert_height_to_meters(height):
    """Convert a 'feet-inches' height string (e.g. '6-2') to meters.

    Parameters
    ----------
    height : str
        Two integers separated by a single '-': feet, then inches.

    Returns
    -------
    float
        The height in meters.

    Raises
    ------
    ValueError
        If the string does not split into exactly two integer parts.
    """
    feet_part, inch_part = height.split('-')
    total_inches = int(feet_part) * 12 + int(inch_part)
    # inches -> centimeters (x 2.54) -> meters (/ 100)
    return total_inches * 2.54 / 100
# Replace the 'feet-inches' strings in height_x with heights in meters, element by element.
df['height_x'] = df['height_x'].map(convert_height_to_meters)

def gameClockConverter(clock):
    """Convert a 'minutes:seconds' clock string (e.g. '12:34') to total seconds.

    Parameters
    ----------
    clock : str
        Two integers separated by a single ':': minutes, then seconds.

    Returns
    -------
    int
        minutes * 60 + seconds.

    Raises
    ------
    ValueError
        If the string does not split into exactly two integer parts.
    """
    minutes_part, seconds_part = clock.split(':')
    return int(minutes_part) * 60 + int(seconds_part)
# Replace the 'minutes:seconds' strings in gameClock with total seconds, element by element.
df['gameClock'] = df['gameClock'].map(gameClockConverter)

# The open-source Kaggle notebook also suggests calculating each player's BMI. This could be a useful feature because it gives a more
# intuitive and quantitative measure of a player's physique.

In [None]:

# Per-column replacement for missing values:
#   - the two text columns get the literal placeholder 'none'
#   - the two numeric columns get their own column mean
fill_values = {
    'yardlineSide': 'none',
    'offenseFormation': 'none',
    'defendersInTheBox': df['defendersInTheBox'].mean(),
    'passProbability': df['passProbability'].mean(),
}
df = df.fillna(value=fill_values)

# Remaining missing-value count per column, shown as the cell output.
df.isna().sum()

In [None]:
#EDA
import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise correlations, restricted to the numeric columns of the frame.
numerical_df = df.select_dtypes(include=[np.number])
correlation_matrix = numerical_df.corr()

# Annotated heatmap of the correlation matrix, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title('Correlation Heatmap of Numerical Features')
plt.show()

In [None]:
# NOTE: this cast is persistent — from here on playId is a categorical column of df,
# so later cells that select numeric columns will no longer pick it up.
df['playId'] = df['playId'].astype('category')

# Bar plot of playResult for each playId.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x='playId', y='playResult', data=df, ax=ax)
ax.set_title('Play Result by Play ID')
ax.set_xlabel('Play ID')
ax.set_ylabel('Play Result')
# Turn the x tick labels vertical so they stay readable.
plt.setp(ax.get_xticklabels(), rotation=90)
plt.show()



In [None]:
# Density-normalised histogram of the target with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(df['playResult'], kde=True, stat='density', ax=ax)
ax.set_title('Probability Density Function of playResult')
ax.set_xlabel('playResult')
ax.set_ylabel('Density')
plt.show()
# Author's reading of the plot: playResult looks approximately normally distributed
# (a visual impression only; no formal normality test is run here).

In [None]:
#scaling
from sklearn.preprocessing import StandardScaler

# Names of every numeric column except the target, which must stay unscaled.
numerical_cols = df.select_dtypes(include=['number']).columns.tolist()
numerical_cols.remove('playResult')

# Split the frame into the numeric block (to be scaled) and everything else
# that is neither numeric nor the target.
X_numerical = df[numerical_cols]
X_categorical = df.drop(columns=numerical_cols + ['playResult'])

# NOTE(review): the scaler is fit on the full dataset, i.e. before the train/val/test
# split performed later, so test-set statistics leak into the features. Fitting on the
# training rows only would be cleaner — left as-is to keep downstream cells unchanged.
scaler = StandardScaler()
X_numerical_scaled = scaler.fit_transform(X_numerical)

# fit_transform returns a bare ndarray. Re-attach the ORIGINAL row index: pd.concat(axis=1)
# aligns on index labels, so a fresh 0..n-1 index would silently misalign rows (and
# introduce NaNs) whenever df's index is not already the default RangeIndex.
X_numerical_scaled_df = pd.DataFrame(
    X_numerical_scaled,
    columns=X_numerical.columns,
    index=X_numerical.index,
)

# Scaled numeric features + untouched non-numeric features + unscaled target.
df_standardized = pd.concat([X_numerical_scaled_df, X_categorical, df['playResult']], axis=1)

# Show which columns were standardized.
numerical_cols

In [None]:
# Scatter of the target (x axis) against the overall_rating column (y axis).
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(df['playResult'], df['overall_rating'])
ax.set_title('Scatter Plot between playResult and overallRating')
ax.set_xlabel('playResult')
ax.set_ylabel('overallRating')
ax.grid(True)
plt.show()

In [None]:
# Scatter of the target (x axis) against the passProbability column (y axis).
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(df['playResult'], df['passProbability'])
ax.set_title('Scatter Plot between playResult and pass probability')
ax.set_xlabel('playResult')
ax.set_ylabel('pass probability')
ax.grid(True)
plt.show()

In [None]:
# Bare expression: renders the standardized frame as this cell's output for inspection.
df_standardized

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
numeric_df = df_standardized.select_dtypes(include=['number'])

# VIF measures collinearity among the PREDICTORS, so the target must not be part of
# the design matrix: leaving playResult in would inflate the VIF of every feature
# that is merely predictive of it. errors='ignore' keeps the cell runnable if the
# target column is absent.
vif_features = numeric_df.drop(columns=['playResult'], errors='ignore')
vif_matrix = vif_features.values  # materialise once instead of once per column

# One row per predictor: its name and its variance inflation factor.
vif_data = pd.DataFrame()
vif_data["feature"] = vif_features.columns
vif_data["VIF"] = [variance_inflation_factor(vif_matrix, i) for i in range(vif_matrix.shape[1])]
vif_data

In [None]:
# Remove preSnapVisitorTeamWinProbability from the modelling frame
# (presumably because of the VIF table computed earlier — TODO confirm).
# Re-running this cell on its own raises KeyError once the column is gone.
df_standardized = df_standardized.drop('preSnapVisitorTeamWinProbability', axis=1)

In [None]:
# object_columns = df_standardized.select_dtypes(include=['object']).columns
# df_standardized[object_columns] = df_standardized[object_columns].astype('category')

# df_standardized = pd.get_dummies(df_standardized)

# df_standardized

In [None]:
# Cast every object-dtype column to pandas' category dtype in one astype call.
object_columns = df_standardized.select_dtypes(include=['object']).columns
df_standardized = df_standardized.astype({name: 'category' for name in object_columns})

# Collect the names of all category-dtype columns (including any that were
# already categorical before this cell ran).
categorical_features = df_standardized.select_dtypes(include=['category']).columns.tolist()

# Print the list of categorical feature names
print("Categorical Features:", categorical_features)


In [None]:
from sklearn.model_selection import train_test_split

y = df_standardized['playResult']  # Target
X = df_standardized.drop(columns=['playResult'])  # Features

# First split: hold out 20% of all rows as the test set; 80% remain.
X_train_temp, X_test, y_train_temp, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Second split: 25% of the remaining 80% becomes validation,
# giving an overall 60% train / 20% validation / 20% test partition.
X_train, X_val, y_train, y_val = train_test_split(X_train_temp, y_train_temp, test_size=0.25, random_state=42)

# Give every partition a clean 0..n-1 index.
X_train, X_val, X_test = (part.reset_index(drop=True) for part in (X_train, X_val, X_test))
y_train, y_val, y_test = (part.reset_index(drop=True) for part in (y_train, y_val, y_test))

print("Training set shape:", X_train.shape)
print("Validation set shape:", X_val.shape)
print("Testing set shape:", X_test.shape)
# print(X_train['playId'].nunique())  # author's note: this is a lot of different plays
X_train.columns

In [None]:
# Baseline LightGBM regressor trained with the native API.
# Author's note on the results: "lightgbm does not seem to work".
from sklearn.metrics import mean_squared_error

params = {
    'objective': 'regression',  # Regression task
    'metric': 'l2',             # Mean squared error (MSE) as the evaluation metric
    'num_leaves': 31,            # Number of leaves in each tree
    'learning_rate': 0.05,       # Learning rate
    'feature_fraction': 0.2,     # Fraction of features sampled for each tree
    'bagging_fraction': 1.0,     # 1.0 = use every row, so bagging_freq below has no effect
    'bagging_freq': 5,           # Frequency for bagging
    'verbose': 0                 # Verbosity
}
train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=categorical_features)
val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)
num_round = 100

# Early stopping is what populates bst.best_iteration. Without this callback the
# validation set was only monitored and best_iteration was never set, so the
# `num_iteration=bst.best_iteration` below silently fell back to using all trees.
bst = lgb.train(
    params,
    train_data,
    num_boost_round=num_round,
    valid_sets=[val_data],
    callbacks=[lgb.early_stopping(stopping_rounds=10, verbose=False)],
)

# Predict with the iteration that scored best on the validation set.
y_pred = bst.predict(X_test, num_iteration=bst.best_iteration)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)


In [None]:
# Exploratory lookup: show the scorer names scikit-learn exposes
# (presumably to choose a valid `scoring=` string for a search — TODO confirm).
sklearn.metrics.get_scorer_names()

In [None]:
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error
import warnings

# Suppress all warnings (process-wide; this also hides warnings in later cells)
warnings.filterwarnings('ignore')

# Define the parameter grid.
# Size: 3 * 4 * 4 * 4 * 3 * 3 = 1728 combinations, each fitted cv=5 times.
param_grid = {
    'objective': ['regression'],
    'metric': ['l2'],
    'num_leaves': [31, 50, 80],
    'min_data_in_leaf': [10, 20, 50, 100],
    'max_depth': [15, 20, 25, 30],
    'lambda_l2': [0.0, 0.01, 0.03, 0.06],
    'learning_rate': [0.05, 0.03, 0.04],
    'feature_fraction': [0.5, 0.6, 0.7],
    'bagging_fraction': [1.0],
    'bagging_freq': [10],
    'verbose': [0],
    # One thread per LightGBM fit: GridSearchCV already runs fits in parallel on all
    # cores, and asking every worker to also use all cores oversubscribes the CPU.
    'n_jobs': [1]
}

# Create the LightGBM regressor model (single-threaded; parallelism comes from the search)
model = lgb.LGBMRegressor(n_jobs=1)

# Perform hyperparameter tuning with GridSearchCV: 5-fold CV on the training
# partition only, candidates ranked by negative mean squared error.
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Get the best model (refit by GridSearchCV on all of X_train)
best_model = grid_search.best_estimator_

# Make predictions on the validation data
y_pred = best_model.predict(X_val)

# Evaluate the model using Mean Absolute Error (MAE).
# Note the selection metric above is MSE while the reported metric is MAE.
mae = mean_absolute_error(y_val, y_pred)
print("Mean Absolute Error:", mae)



In [None]:
# Score the tuned model on the held-out test partition.
y_pred = best_model.predict(X_test)

# Evaluate the model using Mean Absolute Error (MAE)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
print("Mean absolute error:", mae)

In [None]:
# Spread (max - min) of the test-set target values, as a scale reference for the errors.
y_test_max = y_test.max()
y_test_min = y_test.min()
feature_range = y_test_max - y_test_min

print("Range of the feature:", feature_range)

In [None]:

# Naive baseline: "predict" each test row with a random draw from a normal distribution
# that has the mean and standard deviation of playResult, then score it with MAE.
# A seeded generator makes the baseline reproducible across kernel restarts
# (the original unseeded draw gave a different number on every run).
baseline_rng = np.random.default_rng(42)
samples = baseline_rng.normal(
    loc=df_standardized['playResult'].mean(),
    scale=df_standardized['playResult'].std(),  # same value as sqrt(var())
    size=y_test.shape[0],
)
# Conventional argument order: (y_true, y_pred). MAE is symmetric, so the value is unaffected.
# NOTE: this rebinds `mae`, overwriting the model's test MAE computed earlier.
mae = mean_absolute_error(y_test, samples)
mae