In [3]:
import pandas as pd
import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from scipy.stats import mannwhitneyu
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler


df = pd.read_csv("DS 340 Final Dataset.csv")


# Separate by Independent and Dependent Variables
X = df.drop(['Team', 'Playoff'], axis=1)
y = df['Playoff']


# Normalize
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)



# NOVELTY IMPLEMENTATION: MANN-WHITNEY U TEST
playoff_teams = df[df['Playoff'] == 1]
non_playoff_teams = df[df['Playoff'] == 0]

mannWhitneyU_results = pd.DataFrame(columns=['Variables', "U-Stat", "P-Value"])

for variable in X_scaled_df.columns:
  playoff_value = playoff_teams[variable]
  non_playoff_value = non_playoff_teams[variable]

  u_stat, p_value = mannwhitneyu(playoff_value, non_playoff_value)

  mannWhitneyU_results = mannWhitneyU_results._append({
      'Variable': variable,
      'U-Stat': u_stat,
      'P-Value': p_value}, ignore_index = True)

mannWhitneyU_results['Significant'] = mannWhitneyU_results['P-Value'] < 0.01

mannWhitneyU_results = mannWhitneyU_results.drop(columns = "Variables")

mannWhitneyU_results_sorted = mannWhitneyU_results.sort_values('P-Value')

significant_variables = mannWhitneyU_results[mannWhitneyU_results['Significant']]['Variable'].tolist()
significant_variables.remove("L")
significant_variables.remove("W")

print(mannWhitneyU_results_sorted)

print("\nVariables with Significant Difference Among Playoff Teams vs Non-Playoff Teams:")
print(significant_variables)


# Discriminant Analysis Implementation
lda = LinearDiscriminantAnalysis()

# PART OF NOVELTY IMPLEMENTATION
X_discrim = df[significant_variables]

X_discrim_scaled = scaler.fit_transform(X_discrim)


# Model Fitting
lda.fit(X_discrim_scaled, y)


# Make Predictions
y_pred_lda = lda.predict(X_discrim_scaled)


# Performance Evaluation by Analyzing the Coefficients
print("\nDiscriminant Analysis Results:")
print("Coefficients:")
for i, coef in enumerate(lda.coef_[0]):

    print(f"{X_discrim.columns[i]}: {coef:.4f}")


print("\nTop 4 variables that affect a team's playoff chances according to my discriminant analysis: ")
print("OwnPointsPerGame, OpponentPointsPerGame, OpponentTotalReboundsPerGame, OwnThreePointPercentage")
print("\nTop 4 variables that affect a team's playoff chances according to research paper's discriminant analysis: ")
print("OwnFieldGoalPercentage, OpponentPointsPerGame, OpponentFieldGoalPercentage, OpponentAssistsPerGame")


# Confusion Matrix
print("\nConfusion Matrix (Discriminant Analysis):")
conf_matrix_lda = confusion_matrix(y, y_pred_lda)
print(conf_matrix_lda)


# Accuracy Calculation using Confusion Matrix
accuracy_lda = (conf_matrix_lda[0,0] + conf_matrix_lda[1,1]) / conf_matrix_lda.sum()
print(f"Accuracy: {accuracy_lda:.4f}")


# Logistic Regression Implementation
X_logit = df[['OwnPointsPerGame', 'OpponentFieldGoalPercentage']]
X_logit_scaled = scaler.fit_transform(X_logit)


# Model Fitting
logit = LogisticRegression(random_state=42)
logit.fit(X_logit_scaled, y)


# Make Predictions
y_pred_logit = logit.predict(X_logit_scaled)


# Performance Evaluation by Analyzing Coefficients
print("\nLogistic Regression Results:")
print("Coefficients:")
for i, coef in enumerate(logit.coef_[0]):
    print(f"{X_logit.columns[i]}: {coef:.4f}")


# Confusion Matrix
print("\nConfusion Matrix (Logistic Regression):")
conf_matrix_logit = confusion_matrix(y, y_pred_logit)
print(conf_matrix_logit)


# Accuracy Calculation using Confusion Matrix
accuracy_logit = (conf_matrix_logit[0,0] + conf_matrix_logit[1,1]) / conf_matrix_logit.sum()
print(f"Accuracy: {accuracy_logit:.4f}")


# Calculating and showing teams that were false positives and false negatives
team_results = pd.DataFrame({
    'Team': df['Team'],
    'Actual': df['Playoff'],
    'LDA_Prediction': y_pred_lda,
    'Logistic_Prediction': y_pred_logit
})

print("\nTeams misclassified by Discriminant Analysis:")
print(team_results[team_results['Actual'] != team_results['LDA_Prediction']])

print("\nTeams misclassified by Logistic Regression:")
print(team_results[team_results['Actual'] != team_results['Logistic_Prediction']])

# Comparing Accuracies
print("\nComparison of methods:")
print(f"Discriminant Analysis accuracy: {accuracy_lda:.2%}")
print(f"Logistic Regression accuracy: {accuracy_logit:.2%}")

    U-Stat   P-Value                               Variable  Significant
2      2.0  0.000005                                      L         True
1    221.5  0.000005                                      W         True
15    32.0  0.000934           OpponentTotalReboundsPerGame         True
6     37.0  0.001934            OpponentFieldGoalPercentage         True
16    39.0  0.002562                 OpponentAssistsPerGame         True
4    181.5  0.004113                       OwnPointsPerGame         True
21    46.0  0.006466                  OpponentPointsPerGame         True
7    176.5  0.007775                OwnThreePointPercentage         True
5    175.5  0.008768                 OwnFieldGoalPercentage         True
12    56.0  0.020860            OpponentFreeThrowPercentage        False
14   161.0  0.042753                       OwnStealsPerGame        False
20   161.0  0.043314                          OpponentFouls        False
19    68.5  0.073209                  OpponentBlock

  mannWhitneyU_results = mannWhitneyU_results._append({
