In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error, explained_variance_score
from sklearn.linear_model import LinearRegression, Lasso, ElasticNet
from sklearn import svm
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer

# Print NumPy floating-point values with 4 digits of precision.
np.set_printoptions(precision=4)

In [None]:
# The pickled workouts table lives in the "data" directory next to the
# notebook's working directory (i.e. under the parent of the cwd).
data_file = Path.cwd().parent.joinpath("data", "workouts.pkl")

# NOTE(review): unpickling can execute arbitrary code, so only load files
# that were produced by this project.
data = pd.read_pickle(data_file)

# Preview the first rows of the loaded table.
data.head()

In [None]:
# Work on a copy so the loaded frame stays untouched.
df = data.copy()

# Target is the attendance column; the features are every other column
# except 'overflow'.
y = df['attendance']
x = df.drop(columns=['attendance', 'overflow'])

In [None]:
# Split the object-dtype feature columns by cardinality:
#   * fewer than 15 distinct values -> treated as categorical (one-hot encoded later)
#   * 15 or more distinct values    -> collected in columns_to_drop
# Columns of any other dtype end up in neither list.
object_columns = [col for col in x.columns if x[col].dtype == "object"]

categorical_columns = [col for col in object_columns if x[col].nunique() < 15]
columns_to_drop = [col for col in object_columns if x[col].nunique() >= 15]

print(categorical_columns)
print(columns_to_drop)

In [None]:
# One-hot encode the low-cardinality categorical columns. drop='first' omits
# the first level of every encoded column and sparse_output=False yields a
# dense array that can be wrapped in a DataFrame.
onehot_encoder = OneHotEncoder(drop='first', sparse_output=False)

# Encode the categorical columns; every other column is passed through as-is.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', onehot_encoder, categorical_columns)
    ],
    remainder='passthrough'  # Keep other columns unchanged
)

# Actually remove the high-cardinality text columns gathered in
# columns_to_drop. Without this step 'passthrough' would forward their raw
# strings into the feature matrix and the regressors could not be fitted.
x_features = x.drop(columns=columns_to_drop)

x_transformed = preprocessor.fit_transform(x_features)

# ColumnTransformer prefixes names with "<transformer>__"; keep only the part
# after the prefix.
new_column_names = [name.split('__')[-1] for name in preprocessor.get_feature_names_out()]

# Rebuild a DataFrame on the original row index so x keeps the same labels as y.
x = pd.DataFrame(x_transformed, columns=new_column_names, index=x_features.index) # type: ignore - False positive for pandas interface.

# Columns that came back as float64 but only hold whole numbers (e.g. the
# one-hot indicators) are cast to int.
x = x.astype({col: 'int' for col in x.columns if x[col].dtype == 'float64' and x[col].apply(float.is_integer).all()})

x.head()


In [None]:
# Hold out a test set using train_test_split's default test fraction.
# random_state is fixed so the split, and therefore every metric computed
# below, is reproducible after a kernel restart.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1)

# NOTE: the features are not standardised before fitting, so every model
# below is trained on the raw feature values.

In [None]:
# Accumulator for the model comparison: one list per result column. Every
# fitted configuration appends exactly one entry to each list.
metric_columns = (
    'model',
    'params',
    'R^2',
    'MAE',
    'MSE',
    'RMSE',
    'MAPE',
    'Explained Variance',
)
comparison_dict = {column: [] for column in metric_columns}

In [None]:
# Random Forest Regressor: exhaustive sweep over the split criterion, the
# feature sub-sampling rule and the maximum tree depth.

params = {
    'criterion': ['squared_error', 'absolute_error', 'friedman_mse', 'poisson'],
    'max_features': ['sqrt', 'log2'],
    # Depths 5, 7, ..., 55, followed by None.
    'max_depth': [int(depth) for depth in np.linspace(5, 55, 26)] + [None],
}

for criterion in params['criterion']:
    for max_features in params['max_features']:
        for max_depth in params['max_depth']:
            model_params = (criterion, max_features, max_depth)
            model = RandomForestRegressor(
                criterion=criterion,
                max_features=max_features,
                max_depth=max_depth,
                random_state=1,
            )
            model.fit(x_train, y_train)

            # Score the fitted forest on the held-out split.
            y_pred = model.predict(x_test)
            mse = mean_squared_error(y_test, y_pred)
            results = {
                'model': 'random_forest_regressor',
                'params': model_params,
                'R^2': model.score(x_test, y_test),
                'MAE': mean_absolute_error(y_test, y_pred),
                'MSE': mse,
                'RMSE': np.sqrt(mse),
                'MAPE': mean_absolute_percentage_error(y_test, y_pred),
                'Explained Variance': explained_variance_score(y_test, y_pred),
            }

            # One entry per result column for this configuration.
            for column, value in results.items():
                comparison_dict[column].append(value)

In [None]:
## Support Vector Machines: sweep gamma, C and the kernel of an SVR.

params = {
    'gamma': np.logspace(-4, -1, 10),   # 10 values from 1e-4 to 1e-1
    'C': np.logspace(-2, 1, 10),        # 10 values from 1e-2 to 1e1
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
}

for gamma in params['gamma']:
    for c in params['C']:
        for kernel in params['kernel']:
            model_params = (gamma, c, kernel)
            model = svm.SVR(gamma=gamma, C=c, kernel=kernel)
            model.fit(x_train, y_train)

            # Score the fitted regressor on the held-out split.
            y_pred = model.predict(x_test)
            mse = mean_squared_error(y_test, y_pred)
            results = {
                'model': 'svr_regressor',
                'params': model_params,
                'R^2': model.score(x_test, y_test),
                'MAE': mean_absolute_error(y_test, y_pred),
                'MSE': mse,
                'RMSE': np.sqrt(mse),
                'MAPE': mean_absolute_percentage_error(y_test, y_pred),
                'Explained Variance': explained_variance_score(y_test, y_pred),
            }

            # One entry per result column for this configuration.
            for column, value in results.items():
                comparison_dict[column].append(value)

In [None]:
# Neural network (MLPRegressor): sweep the hidden-layer layout, activation,
# solver and the alpha regularisation strength.

params={'hidden_layer_sizes': [(80,20,40,5), (75,30,50,10,3)], 
        'activation': ['identity', 'relu','logistic', 'tanh',], 
        'solver': ['lbfgs','sgd', 'adam'], 
        'alpha': np.logspace(-4,1,20)} 

for hidden_layer_sizes in params['hidden_layer_sizes']:
    for activation in params['activation']:
        for solver in params['solver']:
            for alpha in params['alpha']:
                model_params = (hidden_layer_sizes, activation, solver, alpha )
                model = MLPRegressor(hidden_layer_sizes = hidden_layer_sizes,
                                      activation = activation, solver = solver, alpha = alpha, random_state = 1)
                model.fit(x_train, y_train)

                # Evaluation must sit inside the alpha loop: one level
                # further out, only the last alpha of each
                # (layout, activation, solver) combination would be scored
                # and recorded, although every alpha is fitted.

                # Predictions
                y_pred = model.predict(x_test)

                # Compute scores
                r2_score = model.score(x_test, y_test)
                mae = mean_absolute_error(y_test, y_pred)
                mse = mean_squared_error(y_test, y_pred)
                rmse = np.sqrt(mse)
                mape = mean_absolute_percentage_error(y_test, y_pred)
                explained_var = explained_variance_score(y_test, y_pred)

                # Record the results
                comparison_dict['model'].append('mlp_regressor')
                comparison_dict['params'].append(model_params)
                comparison_dict['R^2'].append(r2_score)
                comparison_dict['MAE'].append(mae)
                comparison_dict['MSE'].append(mse)
                comparison_dict['RMSE'].append(rmse)
                comparison_dict['MAPE'].append(mape)
                comparison_dict['Explained Variance'].append(explained_var)

In [None]:
# Lasso Regression: fit with and without an intercept; all other settings
# are left at the estimator's defaults.

params={'fit_intercept': [True, False]}

for fit_intercept in params['fit_intercept']:
    # Trailing comma makes this a one-element tuple, matching the tuple-valued
    # 'params' entries recorded for the other models; `(fit_intercept)` is
    # just the bare bool.
    model_params = (fit_intercept,)
    model = Lasso(fit_intercept = fit_intercept)
    model.fit(x_train, y_train)

    # Predictions on the held-out split
    y_pred = model.predict(x_test)

    # Compute scores
    r2_score = model.score(x_test, y_test)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mape = mean_absolute_percentage_error(y_test, y_pred)
    explained_var = explained_variance_score(y_test, y_pred)

    # Record the results
    comparison_dict['model'].append('lasso')
    comparison_dict['params'].append(model_params)
    comparison_dict['R^2'].append(r2_score)
    comparison_dict['MAE'].append(mae)
    comparison_dict['MSE'].append(mse)
    comparison_dict['RMSE'].append(rmse)
    comparison_dict['MAPE'].append(mape)
    comparison_dict['Explained Variance'].append(explained_var)

In [None]:
# ElasticNet Regression: fit with and without an intercept; all other
# settings are left at the estimator's defaults.

params={'fit_intercept': [True, False]}

for fit_intercept in params['fit_intercept']:
    # Trailing comma makes this a one-element tuple, matching the tuple-valued
    # 'params' entries recorded for the other models; `(fit_intercept)` is
    # just the bare bool.
    model_params = (fit_intercept,)
    model = ElasticNet(fit_intercept = fit_intercept)
    model.fit(x_train, y_train)

    # Predictions on the held-out split
    y_pred = model.predict(x_test)

    # Compute scores
    r2_score = model.score(x_test, y_test)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mape = mean_absolute_percentage_error(y_test, y_pred)
    explained_var = explained_variance_score(y_test, y_pred)

    # Record the results
    comparison_dict['model'].append('elastic_net')
    comparison_dict['params'].append(model_params)
    comparison_dict['R^2'].append(r2_score)
    comparison_dict['MAE'].append(mae)
    comparison_dict['MSE'].append(mse)
    comparison_dict['RMSE'].append(rmse)
    comparison_dict['MAPE'].append(mape)
    comparison_dict['Explained Variance'].append(explained_var)

In [None]:
# Linear Regression: fit with and without an intercept.
# NOTE: `model` still holds the last fit (fit_intercept=False) when this loop
# ends, and the coefficient cells further down read `model.coef_` from it.

params={'fit_intercept': [True, False]}

for fit_intercept in params['fit_intercept']:
    # Trailing comma makes this a one-element tuple, matching the tuple-valued
    # 'params' entries recorded for the other models; `(fit_intercept)` is
    # just the bare bool.
    model_params = (fit_intercept,)
    model = LinearRegression(fit_intercept = fit_intercept)
    model.fit(x_train, y_train)

    # Predictions on the held-out split
    y_pred = model.predict(x_test)

    # Compute scores
    r2_score = model.score(x_test, y_test)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mape = mean_absolute_percentage_error(y_test, y_pred)
    explained_var = explained_variance_score(y_test, y_pred)

    # Record the results
    comparison_dict['model'].append('linear_regression')
    comparison_dict['params'].append(model_params)
    comparison_dict['R^2'].append(r2_score)
    comparison_dict['MAE'].append(mae)
    comparison_dict['MSE'].append(mse)
    comparison_dict['RMSE'].append(rmse)
    comparison_dict['MAPE'].append(mape)
    comparison_dict['Explained Variance'].append(explained_var)

In [None]:
# Rank every feature by the magnitude of its coefficient.
# `model` is whichever estimator was assigned last; when the notebook is run
# top to bottom that is the LinearRegression fitted with fit_intercept=False.
coefficients = model.coef_

# Pair each feature name with its coefficient, add the absolute value as a
# sort key, and order the rows from largest to smallest magnitude.
coef_df = (
    pd.DataFrame({'Feature': x.columns, 'Coefficient': coefficients})
    .assign(**{'Absolute Coefficient': lambda frame: frame['Coefficient'].abs()})
    .sort_values(by='Absolute Coefficient', ascending=False)
)

print(coef_df)

In [None]:
# Horizontal bar chart of every feature's signed coefficient, in coef_df's
# row order.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(coef_df['Feature'], coef_df['Coefficient'], color='skyblue')
ax.set_xlabel('Coefficient')
ax.set_ylabel('Feature')
ax.set_title('Feature Importance in Linear Regression')
ax.invert_yaxis()  # draw the first row of coef_df at the top
plt.show()

In [None]:
# Coefficients of the coach indicator features only.
# `model` is whichever estimator was assigned last (see the regression cells).
coefficients = model.coef_

# Features whose name starts with 'coach_', with the coefficient found at the
# same position in x.columns.
coach_columns = [name for name in x.columns if name.startswith('coach_')]
coach_coefficients = [coefficients[x.columns.get_loc(name)] for name in coach_columns]

# Strip the 'coach_' prefix for display, add the absolute value as a sort key,
# and order the rows from largest to smallest magnitude.
coach_coef_df = (
    pd.DataFrame({
        'Coach': [name.replace('coach_', '') for name in coach_columns],
        'Coefficient': coach_coefficients,
    })
    .assign(**{'Absolute Coefficient': lambda frame: frame['Coefficient'].abs()})
    .sort_values(by='Absolute Coefficient', ascending=False)
)

print(coach_coef_df)

In [None]:
# Horizontal bar chart of the coach coefficients, in coach_coef_df's row order.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(coach_coef_df['Coach'], coach_coef_df['Coefficient'], color='skyblue')
ax.set_xlabel('Coefficient')
ax.set_ylabel('Coach')
ax.set_title('Ranking of Coaches by Feature Importance in Linear Regression')
ax.invert_yaxis()  # draw the first row of coach_coef_df at the top
plt.show()

In [None]:
# Coefficients of the time indicator features only.
# `model` is whichever estimator was assigned last (see the regression cells).
coefficients = model.coef_

# Features whose name starts with 'time_', with the coefficient found at the
# same position in x.columns.
time_columns = [name for name in x.columns if name.startswith('time_')]
time_coefficients = [coefficients[x.columns.get_loc(name)] for name in time_columns]

# Strip the 'time_' prefix for display, add the absolute value as a sort key,
# and order the rows from largest to smallest magnitude.
time_coef_df = (
    pd.DataFrame({
        'Time': [name.replace('time_', '') for name in time_columns],
        'Coefficient': time_coefficients,
    })
    .assign(**{'Absolute Coefficient': lambda frame: frame['Coefficient'].abs()})
    .sort_values(by='Absolute Coefficient', ascending=False)
)

print(time_coef_df)

In [None]:
# Horizontal bar chart of the time coefficients, in time_coef_df's row order.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(time_coef_df['Time'], time_coef_df['Coefficient'], color='skyblue')
ax.set_xlabel('Coefficient')
ax.set_ylabel('Time')
ax.set_title('Ranking of Times by Feature Importance in Linear Regression')
ax.invert_yaxis()  # draw the first row of time_coef_df at the top
plt.show()

In [None]:
# Collect every recorded configuration into one table.
comparison_df = pd.DataFrame(comparison_dict)

# One bar chart per metric, grouped by model name:
# (metric column, palette, title, y-axis label).
metric_plots = [
    ('R^2', 'viridis', 'R^2 Scores Across Different Models', 'R^2 Score'),
    ('MAE', 'magma', 'MAE Across Different Models', 'Mean Absolute Error (MAE)'),
    ('RMSE', 'coolwarm', 'RMSE Across Different Models', 'Root Mean Squared Error (RMSE)'),
]

for metric, palette, title, y_label in metric_plots:
    plt.figure(figsize=(12, 6))
    sns.barplot(x='model', y=metric, data=comparison_df, palette=palette)
    plt.title(title)
    plt.xlabel('Model')
    plt.ylabel(y_label)
    plt.show()

In [None]:
# Rescale each metric relative to the best value observed across all
# configurations: R^2 is divided by the largest R^2, while for the error
# metrics the smallest error is divided by each configuration's error.
# NOTE(review): the R^2 ratio only ranks sensibly when the largest R^2 is
# positive - confirm that holds for the recorded results.
best_r2 = comparison_df['R^2'].max()
lowest_mae = comparison_df['MAE'].min()
lowest_rmse = comparison_df['RMSE'].min()

comparison_df['normalized_r2'] = comparison_df['R^2'] / best_r2
comparison_df['normalized_mae'] = lowest_mae / comparison_df['MAE']
comparison_df['normalized_rmse'] = lowest_rmse / comparison_df['RMSE']

# Composite score (higher is better): plain average of the three ratios.
normalized_total = (
    comparison_df['normalized_r2']
    + comparison_df['normalized_mae']
    + comparison_df['normalized_rmse']
)
comparison_df['composite_score'] = normalized_total / 3

# Row with the highest composite score.
best_model_index = comparison_df['composite_score'].idxmax()
best_model = comparison_df.loc[best_model_index]

# Report the winning configuration and all of its metrics.
print("Best Model based on Composite Score:")
print(f"Model: {best_model['model']}")
print(f"Parameters: {best_model['params']}")
print(f"Composite Score: {best_model['composite_score']}")
print(f"R^2 Score: {best_model['R^2']}")
print(f"MAE: {best_model['MAE']}")
print(f"MSE: {best_model['MSE']}")
print(f"RMSE: {best_model['RMSE']}")
print(f"MAPE: {best_model['MAPE']}")
print(f"Explained Variance: {best_model['Explained Variance']}")

In [None]:
# Rank every recorded configuration by composite score, best first.
sorted_comparison_df = comparison_df.sort_values(by='composite_score', ascending=False)

# Give each of the ten best configurations its own label (rank, model name and
# hyper-parameters). Plotting against the bare model name would merge all
# configurations of the same model into a single aggregated bar, so the chart
# would not actually show ten entries.
top_models = sorted_comparison_df.head(10).copy()
top_models['label'] = [
    f"{rank}. {model_name} {model_params}"
    for rank, (model_name, model_params) in enumerate(
        zip(top_models['model'], top_models['params']), start=1
    )
]

# Horizontal bars keep the long configuration labels readable.
# (matplotlib and seaborn are already imported at the top of the notebook.)
plt.figure(figsize=(12, 6))
sns.barplot(x='composite_score', y='label', data=top_models, palette='viridis')
plt.title('Top 10 Models by Composite Score')
plt.xlabel('Composite Score')
plt.ylabel('Model')
plt.show()