### Import Libraries

In [1]:
import pandas as pd
import numpy as np
import sklearn as sk
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score,f1_score, mean_absolute_error
from sklearn.impute import KNNImputer
from sklearn import ensemble
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
import matplotlib.pyplot as plt
import joblib

### Importing .csv file into the panda dataframe.

In [2]:
df = pd.read_csv("ncaa.csv") #Read the .CSV file into a panda dataframe.

### Removing the following columns from the dataframe.

These columns were removed from the data frame because I wanted to evaluate a teams success based off specific performance categories.

In [3]:
del df['TEAM'] 
del df['CONF'] 
del df['ADJOE'] 
del df['ADJDE']
del df['BARTHAG']
del df['WAB']
del df['SEED']
del df['YEAR']
del df['G']
del df['W']
del df['EFG_O']
del df['EFG_D']

Checking all columns for null values. The "POSTSEASON" column contains 1979 null values. These null values represent the teams that didn't make it into the tournament. 

In [4]:
df.isnull().sum()

TOR              0
TORD             0
ORB              0
DRB              0
FTR              0
FTRD             0
2P_O             0
2P_D             0
3P_O             0
3P_D             0
ADJ_T            0
POSTSEASON    1979
dtype: int64

### Format the POSTSEASON outcomes
The dependant variable in my model will become a binary value.
1 = the team took part in the NCAA basketball tournament.
0 = The team did not make it into the tournament.  

In [5]:
df['POSTSEASON'].replace(['Champions','2ND','F4','E8','S16','R32','R64','R68',],[1,1,1,1,1,1,1,1], inplace=True)
df

Unnamed: 0,TOR,TORD,ORB,DRB,FTR,FTRD,2P_O,2P_D,3P_O,3P_D,ADJ_T,POSTSEASON
0,15.4,18.2,40.7,30.0,32.3,30.4,53.9,44.6,32.7,36.2,71.7,1.0
1,12.4,15.8,32.1,23.7,36.2,22.4,54.8,44.7,36.5,37.5,59.3,1.0
2,14.0,19.5,25.5,24.9,30.7,30.0,54.7,46.8,35.2,33.2,65.9,1.0
3,17.7,22.8,27.4,28.7,32.9,36.6,52.8,41.9,36.5,29.7,67.5,1.0
4,16.2,17.1,30.0,26.2,39.0,26.9,56.3,40.0,38.2,29.0,71.5,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
2450,20.8,19.2,36.1,27.6,36.6,32.4,50.4,44.3,34.1,30.1,64.4,1.0
2451,19.5,19.8,35.0,26.7,37.4,32.9,50.6,43.4,37.1,35.8,66.8,1.0
2452,21.4,22.0,35.8,27.2,38.4,33.3,49.1,44.9,33.3,33.4,69.2,1.0
2453,17.1,21.3,29.0,34.2,31.3,28.5,49.3,50.6,37.7,30.2,66.0,1.0


In [6]:
df['POSTSEASON'] = df['POSTSEASON'].fillna(0)

In [7]:
df.isnull().sum()

TOR           0
TORD          0
ORB           0
DRB           0
FTR           0
FTRD          0
2P_O          0
2P_D          0
3P_O          0
3P_D          0
ADJ_T         0
POSTSEASON    0
dtype: int64

### Splitting the dataframe into x and y values. X = independant variable  and y = dependant variable

In [8]:
y = df['POSTSEASON']
X = df.drop('POSTSEASON', axis=1)

### The data will be split into training and test data. 80% to test the model and 20% to test the model.

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, shuffle=True, random_state=1)

### Fitting the model with the Gradient Boosting Classifier using the training data.

In [10]:
gb_model = GradientBoostingClassifier(n_estimators=200, learning_rate=1.0,max_depth=1, random_state=1)

gb_model.fit(X_train, y_train)
joblib.dump(gb_model, 'trainedBasketballModel.pkl')

['trainedBasketballModel.pkl']

In [11]:
gb_model.score(X_test, y_test)

0.879837067209776

In [12]:
print('Model performance Results Are:')
y_pred = gb_model.predict(X_test)
print("\n""algorithm Accuracy: {:.3f}%\n\t\t\t\t" "\n" "          F1-Score: {:.5f}"\
.format(accuracy_score(y_test, y_pred)*100, f1_score(y_test, y_pred)))

Model performance Results Are:

algorithm Accuracy: 87.984%
				
          F1-Score: 0.62420


In [13]:
y_pred = gb_model.predict(X_test)


In [15]:
feature_importance = gb_model.feature_importances_
print(feature_importance)

    


[0.20589722 0.04662086 0.08793696 0.01340298 0.03090161 0.03534728
 0.15577098 0.25734129 0.05869591 0.07652902 0.03155588]
0.20589721741442374
0.2525180763437952
0.34045503472108657
0.3538580187706835
0.3847596327672589
0.4201069159214752
0.5758778977020926
0.8332191899044288
0.891915102583626
0.9684441188489707
0.9999999999999999


In [None]:
y_test.head(30)

In [None]:

model = joblib.load('trainedBasketballModel.pkl')
#model = gb_model

Season_Path = [

19.3,     # Turnover Percentage Allowed  **Offense
15.8,     # Turnover Percentage Committed  **Defense
33.6,     # Offense Rebounds Rate  **Offense
31.6,     # Offense Rebounds Rate Allowed  **Defense
49.3,     # Free Throw Rate  **Offense
29.8,     # Free Throw Rate Allowed  **Defense
55.7,     # 2-Point Shooting Percentage  **Offense
48.9,     # 2-Point Shooting Percentage Allowed **Defense
35.8,     # 3-Point Shooting Percentage  **Offense
35.6,     # 3-Point Shooting Percentage Allowed  **Defense
67.6      # Adjusted Tempo(quantity of possesions per 40 minutes)
    
]

outlooks = [
    Season_Path
]

#19.3	15.8	33.6	31.2	49.3	29.8	50.7	48.9	35.8	35.6	67.6
predicted_season_outlook = model.predict(outlooks)

predicted_outlook = predicted_season_outlook[0]


print('**************Playoff Possibility: %.1f******************' % predicted_outlook)