In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
df = pd.read_csv('Basic_Stats.csv')
print(df.shape)
df['Tournament App?'] = df['School'].str.contains('NCAA', case=True)*1 # 1 if tournament appearance
df.head()

(9802, 33)


Unnamed: 0,School,Games Played,Wins,Losses,W-L%,Simple Rating System,Strength of Schedule,Conf. Wins,Conf. Losses,Home Wins,...,FT%,ORB,TRB,AST,STL,BLK,TOV,PF,Year,Tournament App?
0,Air Force,28,5,23,0.179,-5.62,5.05,1.0,17.0,5.0,...,0.628,,1040.0,385.0,154.0,79.0,403.0,555.0,1996,0
1,Akron,26,3,23,0.115,-11.16,2.52,0.0,18.0,2.0,...,0.695,,782.0,307.0,147.0,60.0,372.0,454.0,1996,0
2,Alabama,32,19,13,0.594,6.99,6.11,9.0,7.0,12.0,...,0.663,,1252.0,408.0,197.0,196.0,440.0,567.0,1996,0
3,Alabama State,27,9,18,0.333,-23.42,-12.79,5.0,9.0,5.0,...,0.644,,834.0,289.0,225.0,51.0,326.0,482.0,1996,0
4,Alcorn State,25,10,15,0.4,-18.2,-9.2,7.0,7.0,7.0,...,0.658,,1009.0,354.0,233.0,90.0,472.0,558.0,1996,0


In [4]:
# Introduce per game stats
for per_game_col in df.columns[13:32]:
    if '%' in per_game_col:
        continue
    else:
        df[per_game_col+' Per Game'] = df[per_game_col]/df['Games Played']
df.head()

Unnamed: 0,School,Games Played,Wins,Losses,W-L%,Simple Rating System,Strength of Schedule,Conf. Wins,Conf. Losses,Home Wins,...,3PA Per Game,FTM Per Game,FTA Per Game,ORB Per Game,TRB Per Game,AST Per Game,STL Per Game,BLK Per Game,TOV Per Game,PF Per Game
0,Air Force,28,5,23,0.179,-5.62,5.05,1.0,17.0,5.0,...,14.785714,13.214286,21.035714,,37.142857,13.75,5.5,2.821429,14.392857,19.821429
1,Akron,26,3,23,0.115,-11.16,2.52,0.0,18.0,2.0,...,13.884615,14.115385,20.307692,,30.076923,11.807692,5.653846,2.307692,14.307692,17.461538
2,Alabama,32,19,13,0.594,6.99,6.11,9.0,7.0,12.0,...,21.875,11.3125,17.0625,,39.125,12.75,6.15625,6.125,13.75,17.71875
3,Alabama State,27,9,18,0.333,-23.42,-12.79,5.0,9.0,5.0,...,18.037037,14.481481,22.481481,,30.888889,10.703704,8.333333,1.888889,12.074074,17.851852
4,Alcorn State,25,10,15,0.4,-18.2,-9.2,7.0,7.0,7.0,...,16.12,15.16,23.04,,40.36,14.16,9.32,3.6,18.88,22.32


In [5]:
# Let's focus the analysis to 2019-2010 to not have to bother with NaN entries for now
df = df[(df['Year'] >= 2010) & (df['Year'] <= 2019)]
df.columns

Index(['School', 'Games Played', 'Wins', 'Losses', 'W-L%',
       'Simple Rating System', 'Strength of Schedule', 'Conf. Wins',
       'Conf. Losses', 'Home Wins', 'Home Losses', 'Away Wins', 'Away Losses',
       'Total Pts Scored', 'Total Pts Suffered', 'Minutes Played', 'FGM',
       'FGA', 'FG%', '3PM', '3PA', '3P%', 'FTM', 'FTA', 'FT%', 'ORB', 'TRB',
       'AST', 'STL', 'BLK', 'TOV', 'PF', 'Year', 'Tournament App?',
       'Total Pts Scored Per Game', 'Total Pts Suffered Per Game',
       'Minutes Played Per Game', 'FGM Per Game', 'FGA Per Game',
       '3PM Per Game', '3PA Per Game', 'FTM Per Game', 'FTA Per Game',
       'ORB Per Game', 'TRB Per Game', 'AST Per Game', 'STL Per Game',
       'BLK Per Game', 'TOV Per Game', 'PF Per Game'],
      dtype='object')

In [6]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

df = df.dropna()

X_train, X_test, y_train, y_test = train_test_split(df[['Minutes Played Per Game', 'FGM Per Game', 'FGA Per Game',
       '3PM Per Game', '3PA Per Game', 'FTM Per Game', 'FTA Per Game',
       'ORB Per Game', 'TRB Per Game', 'AST Per Game', 'STL Per Game',
       'BLK Per Game', 'TOV Per Game', 'PF Per Game']], df['Tournament App?'], test_size=0.2, random_state=40)

model = LogisticRegression(max_iter=1000)
rfe = RFE(model, n_features_to_select=3)
rfe = rfe.fit(X_train, y_train)
model.fit(X_train, y_train)

print("Logist Reg w/ All Pred: {:.2f}%".format(100*model.score(X_test, y_test)))
print("Logist Reg w/ All Pred Conf Mat: {0}".format(confusion_matrix(y_test, model.predict(X_test))))
X_test_rfemod = X_test.iloc[:, list(np.where(rfe.ranking_ == 1)[0])]
print("Logist Reg w/ 3 Most Relevant Pred: {:.2f}%".format(100*rfe.estimator_.score(X_test_rfemod, y_test)))
print("Logist Reg w/ 3 Most Relevant Pred Conf Mat: {0}".format(confusion_matrix(y_test, rfe.estimator_.predict(X_test_rfemod))))

Logist Reg w/ All Pred: 86.00%
Logist Reg w/ All Pred Conf Mat: [[545  20]
 [ 77  51]]
Logist Reg w/ 3 Most Relevant Pred: 83.41%
Logist Reg w/ 3 Most Relevant Pred Conf Mat: [[548  17]
 [ 98  30]]




In [7]:
# not used in final presentation

from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

lasso = Lasso(alpha=0.01, max_iter=1000) # This only works with CV
lasso.fit(X_train_scaled, y_train)

# selected_features = np.where(lasso.coef_ != 0)[0]

print("Lasso Reg: {:.2f}%".format(100*lasso.score(X_test_scaled, y_test)))

Lasso Reg: 27.57%


In [8]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100, random_state=40)
rf.fit(X_train, y_train)
importances = rf.feature_importances_
indices = np.argsort(importances)[::-1]
top_features = indices[:3]

print("Random Forest w/ All Pred: {:.2f}%".format(100*rf.score(X_test, y_test)))
rf_top3 = RandomForestClassifier(n_estimators=100, random_state=40)
rf_top3.fit(X_train.iloc[:, top_features], y_train)
print("Random Forest w/ 3 Most Relevant Pred: {:.2f}%".format(100*rf_top3.score(X_test.iloc[:, top_features], y_test)))
print(X_train.columns[top_features[0]], X_train.columns[top_features[1]], X_train.columns[top_features[2]])


Random Forest w/ All Pred: 85.43%
Random Forest w/ 3 Most Relevant Pred: 78.07%
TOV Per Game AST Per Game PF Per Game
