In [None]:
## Uses 2015-2018 NCAA tournament birth information to predict future/current year births
## using key statistics from the season in a logistic regression model

In [1]:
## Gather API data for current data

from sportsreference.ncaab.teams import Teams
import pandas
import numpy
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings('ignore')
pandas.options.display.max_columns = None

sport_ref_data = pandas.DataFrame()

for team in Teams():
    team_data = team.dataframe
    team_data = pandas.DataFrame(team_data)
    sport_ref_data = pandas.concat([sport_ref_data,team_data],axis=0,sort=True)    

def set_value(row_number, assigned_value): 
    return assigned_value[row_number] 
  
event_dictionary ={'sec' : 2, 'pac-12' : 2, 'big-12' : 2, 'big-ten' : 2, 'acc' : 2,
                  'southland' : 0, 'mwc' : 1, 'mac' : 0, 'swac' : 0, 'cusa' : 0,
                  'america-east' : 0, 'patriot' : 0, 'sun-belt' : 0, 'ovc' : 0, 'meac' : 0,
                  'mvc' : 0, 'wcc' : 1, 'ivy' : 0, 'northeast' : 0, 'big-east' : 1,
                  'big-west' : 0, 'wac' : 0, 'big-south' : 0, 'maac' : 0, 'aac' : 1,
                  'southern' : 0, 'horizon' : 0, 'colonial' : 0, 'atlantic-10' : 1, 'summit' : 0,
                  'big-sky' : 0, 'atlantic-sun' : 0, 'independent' : 0} 
  
sport_ref_data['major'] = sport_ref_data['conference'].apply(set_value, args =(event_dictionary, )) 

In [None]:
# sport_ref_data.to_csv('sf_data.csv')

In [2]:
## Transform datasets and limit columns

def tourney(row):
    if pandas.isna(row['seed']) == True:
        val = 0
    else:
        val = 1
    return val

sport_ref_training_data = pandas.read_csv("sf_data_test_2015_to_2019.csv")
sport_ref_training_data = pandas.DataFrame(sport_ref_training_data)
sport_ref_training_data['tourney'] = sport_ref_training_data.apply(tourney, axis=1)

tourney_training_data = sport_ref_training_data[['assist_percentage','away_losses',
       'away_wins', 'block_percentage', 'conference_losses', 'conference_wins',
        'effective_field_goal_percentage','field_goal_percentage','free_throw_attempt_rate',
        'free_throw_percentage','free_throws_per_field_goal_attempt','home_wins', 'losses', 'net_rating',
       'offensive_rating', 'offensive_rebound_percentage',
       'opp_assist_percentage','opp_block_percentage','opp_effective_field_goal_percentage',
       'opp_field_goal_percentage','opp_free_throw_attempt_rate','opp_free_throw_percentage',
       'opp_free_throws_per_field_goal_attempt', 'opp_offensive_rating','opp_offensive_rebound_percentage',
       'opp_steal_percentage','opp_three_point_attempt_rate','opp_three_point_field_goal_percentage',
       'opp_total_rebound_percentage','opp_true_shooting_percentage', 'opp_turnover_percentage',
       'opp_two_point_field_goal_percentage','pace','simple_rating_system','steal_percentage',
        'strength_of_schedule','three_point_attempt_rate','three_point_field_goal_percentage',
       'total_rebound_percentage','true_shooting_percentage', 'turnover_percentage',
        'two_point_field_goal_percentage','win_percentage', 'wins', 'major','tourney','auto']]
tourney_training_data = tourney_training_data[numpy.logical_and(tourney_training_data['wins']>16,
                                                                tourney_training_data['auto']==0)]
sport_ref_data = sport_ref_data[['assist_percentage','away_losses',
       'away_wins', 'block_percentage', 'conference_losses', 'conference_wins',
        'effective_field_goal_percentage','field_goal_percentage','free_throw_attempt_rate',
        'free_throw_percentage','free_throws_per_field_goal_attempt','home_wins', 'losses', 'net_rating',
       'offensive_rating', 'offensive_rebound_percentage',
       'opp_assist_percentage','opp_block_percentage','opp_effective_field_goal_percentage',
       'opp_field_goal_percentage','opp_free_throw_attempt_rate','opp_free_throw_percentage',
       'opp_free_throws_per_field_goal_attempt', 'opp_offensive_rating','opp_offensive_rebound_percentage',
       'opp_steal_percentage','opp_three_point_attempt_rate','opp_three_point_field_goal_percentage',
       'opp_total_rebound_percentage','opp_true_shooting_percentage', 'opp_turnover_percentage',
       'opp_two_point_field_goal_percentage','pace','simple_rating_system','steal_percentage',
        'strength_of_schedule','three_point_attempt_rate','three_point_field_goal_percentage',
       'total_rebound_percentage','true_shooting_percentage', 'turnover_percentage',
        'two_point_field_goal_percentage','win_percentage', 'wins', 'major']]
# sport_ref_data = sport_ref_data[sport_ref_data['wins']>16]

In [21]:
## Logistic Regression: predict tourney birth, training data results
## Training model confusion matrix for accuracy

import sklearn as sk
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import preprocessing

y_logr = tourney_training_data.iloc[:,45]
x_logr = tourney_training_data.iloc[:,[35,12,33,4,11,42,2]]

scaler = preprocessing.StandardScaler()
x_logr_scaled = scaler.fit_transform(x_logr)

x_train_logr, x_test_logr, y_train_logr, y_test_logr = train_test_split(x_logr_scaled, y_logr, test_size=.3, random_state=42)
logR = LogisticRegression()

logR.fit(x_train_logr, y_train_logr)

predictions_logr = logR.predict(x_test_logr)
prediction_strength_logr = logR.predict_proba(x_test_logr)
score_logr = logR.score(x_test_logr, y_test_logr)

cm_logr = metrics.confusion_matrix(y_test_logr, predictions_logr, labels = [0,1])
cmtx_logr = pandas.DataFrame(
    cm_logr, 
    index=['true:no', 'true:yes'], 
    columns=['pred:no', 'pred:yes'])
print(cmtx_logr)

          pred:no  pred:yes
true:no       170         6
true:yes        1        45


In [None]:
## Figures out most important features to use

from sklearn.feature_selection import RFE,RFECV

rfecv = RFECV(estimator=logR, step=1, scoring='accuracy')
rfecv.fit(x_logr_scaled, y_logr)

import matplotlib.pyplot as plt

plt.figure(figsize=(16, 9))
plt.title('Recursive Feature Elimination with Cross-Validation', fontsize=18, fontweight='bold', pad=20)
plt.xlabel('Number of features selected', fontsize=14, labelpad=20)
plt.ylabel('% Correct Classification', fontsize=14, labelpad=20)
plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_, color='#303F9F', linewidth=3)

rankings = pandas.DataFrame()
rankings['attribute'] = x_logr.columns
selector = RFE(logR,n_features_to_select=1)
selector = selector.fit(x_logr_scaled,y_logr)
rankings['rank'] = selector.ranking_

plt.show()
print(rankings.sort_values('rank'))

In [9]:
## Predicting tourney bid probability for current ongoing season

result_x = sport_ref_data.iloc[:,[35,12,33,4,11,42,2]]

result_x_scaled = scaler.fit_transform(result_x)

predictions_results = logR.predict(result_x_scaled)
prediction_strength_results = logR.predict_proba(result_x_scaled)
sport_ref_data['prediction'] = predictions_results
sport_ref_data['confidence'] = prediction_strength_results[:,1]
preds_current_season_append = sport_ref_data

preds_current_season_append.sort_values('confidence',ascending=False).head(50)

Unnamed: 0,assist_percentage,away_losses,away_wins,block_percentage,conference_losses,conference_wins,effective_field_goal_percentage,field_goal_percentage,free_throw_attempt_rate,free_throw_percentage,free_throws_per_field_goal_attempt,home_wins,losses,net_rating,offensive_rating,offensive_rebound_percentage,opp_assist_percentage,opp_block_percentage,opp_effective_field_goal_percentage,opp_field_goal_percentage,opp_free_throw_attempt_rate,opp_free_throw_percentage,opp_free_throws_per_field_goal_attempt,opp_offensive_rating,opp_offensive_rebound_percentage,opp_steal_percentage,opp_three_point_attempt_rate,opp_three_point_field_goal_percentage,opp_total_rebound_percentage,opp_true_shooting_percentage,opp_turnover_percentage,opp_two_point_field_goal_percentage,pace,simple_rating_system,steal_percentage,strength_of_schedule,three_point_attempt_rate,three_point_field_goal_percentage,total_rebound_percentage,true_shooting_percentage,turnover_percentage,two_point_field_goal_percentage,win_percentage,wins,major,prediction,confidence
KANSAS,53.1,1,5,14.6,1,5,0.555,0.495,0.341,0.66,0.225,7,3,23.2,110.3,32.8,54.3,6.9,0.441,0.375,0.227,0.654,0.149,87.1,25.6,9.8,0.448,0.297,44.8,0.465,17.8,0.437,69.0,26.49,11.9,11.19,0.331,0.36,55.2,0.574,16.9,0.562,0.833,15,2,1,0.999989
SETON-HALL,55.6,2,5,15.9,0,6,0.515,0.448,0.358,0.708,0.254,7,4,14.4,104.7,29.0,48.7,10.9,0.441,0.39,0.334,0.681,0.228,90.3,28.6,9.8,0.334,0.302,48.7,0.478,18.1,0.434,72.2,18.87,10.8,8.48,0.392,0.341,51.3,0.549,16.7,0.517,0.778,14,1,1,0.999818
MICHIGAN-STATE,68.7,1,2,13.9,1,6,0.524,0.463,0.347,0.746,0.259,9,4,20.8,111.0,35.3,57.0,9.8,0.426,0.374,0.279,0.672,0.188,90.2,25.3,8.4,0.368,0.281,43.3,0.459,14.3,0.428,70.5,22.58,7.0,7.91,0.363,0.338,56.7,0.561,15.5,0.534,0.778,14,2,1,0.999773
WEST-VIRGINIA,50.4,3,3,11.3,2,4,0.487,0.44,0.416,0.64,0.266,9,3,17.7,102.8,39.7,45.5,6.5,0.411,0.362,0.374,0.695,0.26,85.1,27.6,10.8,0.384,0.256,43.8,0.459,19.3,0.428,71.2,21.23,11.2,8.62,0.304,0.307,56.2,0.518,17.5,0.499,0.833,15,2,1,0.999725
BAYLOR,52.8,0,3,13.1,0,6,0.492,0.428,0.306,0.702,0.214,9,1,21.0,107.4,37.3,54.0,13.6,0.435,0.38,0.274,0.66,0.181,86.4,28.7,8.6,0.352,0.315,46.0,0.465,19.5,0.415,67.7,19.69,12.0,5.45,0.372,0.346,54.0,0.523,14.5,0.476,0.941,16,2,1,0.999635
VILLANOVA,58.6,2,2,8.0,1,5,0.525,0.443,0.268,0.79,0.211,10,3,12.4,110.0,28.8,48.7,11.7,0.491,0.442,0.222,0.684,0.152,97.6,25.2,8.7,0.321,0.31,47.9,0.513,15.8,0.504,67.6,15.75,8.9,7.36,0.462,0.352,52.1,0.559,14.4,0.522,0.833,15,1,1,0.999335
OREGON,51.9,2,3,9.2,2,4,0.545,0.475,0.299,0.671,0.2,10,4,14.1,111.3,33.0,49.5,9.9,0.456,0.392,0.287,0.729,0.209,97.2,31.3,6.0,0.414,0.311,48.0,0.493,15.8,0.449,67.0,18.43,10.5,8.74,0.365,0.381,52.0,0.565,15.3,0.529,0.789,15,2,1,0.999213
DUKE,53.6,1,4,15.2,2,6,0.544,0.485,0.336,0.672,0.226,9,3,25.8,112.8,36.4,48.9,8.8,0.455,0.413,0.287,0.645,0.185,87.0,26.6,10.1,0.273,0.307,44.8,0.482,20.2,0.453,73.0,26.02,12.7,7.02,0.319,0.366,55.2,0.566,15.5,0.542,0.842,16,2,1,0.999184
COLORADO,55.6,2,2,8.7,2,3,0.502,0.435,0.389,0.73,0.284,8,4,13.7,102.6,31.7,46.4,10.0,0.451,0.4,0.248,0.719,0.178,88.9,24.5,10.9,0.318,0.316,45.6,0.483,18.8,0.44,69.2,17.04,9.0,7.48,0.37,0.362,54.4,0.544,17.6,0.478,0.778,14,2,1,0.999182
ARIZONA,58.0,3,0,11.1,2,3,0.546,0.482,0.363,0.737,0.268,10,5,22.2,113.9,30.8,55.1,7.0,0.443,0.383,0.369,0.718,0.265,91.7,27.2,6.4,0.377,0.317,46.9,0.49,17.9,0.423,71.2,22.6,9.1,6.71,0.334,0.382,53.1,0.58,14.2,0.533,0.722,13,2,1,0.998768


In [None]:
## Accuracy measure to use after season is over to check prediction results vs actual

# result_y = sport_ref_data.iloc[:,45]
# score_results = logR.score(result_x_scaled, result_y)

# cm_results = metrics.confusion_matrix(result_y, predictions_results, labels = [0,1])
# cmtx_results = pandas.DataFrame(
#     cm_results, 
#     index=['true:no', 'true:yes'], 
#     columns=['pred:no', 'pred:yes'])
# print(cmtx_results)
# preds_current_season_append[preds_current_season_append['TOURNEY']!=preds_current_season_append['prediction']].sort_values('confidence',ascending=False)

In [None]:
## Other unrelated stuff below ##

In [None]:
## OUTDATED DATASETS
# Initialize dataset and transform

# import pandas
# import numpy
# import matplotlib.pyplot as plt
# import seaborn as sns
# import warnings
# warnings.filterwarnings('ignore')
# pandas.options.display.max_columns = None

# def tourney(row):
#     if pandas.isna(row['POSTSEASON']) == True:
#         val = 0
#     else:
#         val = 1
#     return val

# data = pandas.read_csv('cbb.csv')
# data = data.rename(columns={'ADJOE':'offeff','ADJDE':'defeff','EFG_O':'FGpercent','EFG_D':'defFGpercent','TOR':'turnoverrate','TORD':'stealrate','ORB':'offREB',
#           'DRB':'defREB','FTR':'FTrate','FTRD':'defFTrate','2P_O':'FGpercenttwo','2P_D':'defFGpercenttwo','3P_O':'FGpercentthree',
#           '3P_D':'defFGpercentthree','ADJ_T':'pace','TEAM':'team','CONF':'conf','G':'games','W':'wins','WAB':'winsabovebubble','AUTO':'auto',
#                            'MAJOR':'major'})
# data['TOURNEY'] = data.apply(tourney, axis=1)
# data1 = data[numpy.logical_and(numpy.logical_and(data['YEAR'].isin([2015,2016,2017,2018]),data['auto']==0),data['wins']>16)]
# data2 = data[numpy.logical_and(numpy.logical_and(data['YEAR'].isin([2019]),data['auto']==0),data['wins']>16)]
# tourney_data = data1[['wins', 'offeff', 'defeff',
#        'FGpercent', 'defFGpercent', 'turnoverrate', 'stealrate', 'offREB',
#        'defREB', 'FTrate', 'defFTrate', 'FGpercenttwo', 'defFGpercenttwo',
#        'FGpercentthree', 'defFGpercentthree', 'pace','major','TOURNEY']]
# result_data = data2[['wins', 'offeff', 'defeff',
#        'FGpercent', 'defFGpercent', 'turnoverrate', 'stealrate', 'offREB',
#        'defREB', 'FTrate', 'defFTrate', 'FGpercenttwo', 'defFGpercenttwo',
#        'FGpercentthree', 'defFGpercentthree', 'pace','major','TOURNEY']]

In [None]:
# Simple scatter plot graph with legend

ax = plt.subplot(111)

play_x = data[data['YEAR']==2019]['defFGpercentthree']
play_y = data[data['YEAR']==2019]['winsabovebubble']
play_classes = data[data['YEAR']==2019]['TOURNEY']
    
play_scatter = sns.scatterplot(x=play_x, y=play_y, hue=play_classes, alpha=0.6)

box = ax.get_position()
ax.legend(loc='center left',bbox_to_anchor=(1,0.5))

plt.show()

In [None]:
# Multiple Linear Regression: x are numeric features, y is the label to predict
# Data is split and scaled
# Coefficients show importance to the label, 1 point change in variable causes x effect on the label

# List of columns:
# data[['team', 'conf', 'games', 'wins', 'offeff', 'defeff', 'BARTHAG',
#        'FGpercent', 'defFGpercent', 'turnoverrate', 'stealrate', 'offREB',
#        'defREB', 'FTrate', 'defFTrate', 'FGpercenttwo', 'defFGpercenttwo',
#        'FGpercentthree', 'defFGpercentthree', 'pace', 'winsabovebubble',
#        'POSTSEASON', 'SEED', 'YEAR', 'TOURNEY']]

x_mlr = data[['offeff','defFGpercent','stealrate','defREB',
          'FGpercent','defFTrate','defeff','pace',
          'offREB','turnoverrate','FTrate','defFGpercenttwo',
          'FGpercenttwo','FGpercentthree']]
y_mlr = data['wins']

from sklearn.model_selection import train_test_split
X_train_mlr, X_test_mlr, y_train_mlr, y_test_mlr = train_test_split(x_mlr, y_mlr, test_size=0.2, random_state=42)

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train_mlr)

X_train_mlr = scaler.transform(X_train_mlr)
X_test_mlr = scaler.transform(X_test_mlr)

from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train_mlr, y_train_mlr)
# coeff_df = pandas.DataFrame(regressor.coef_, x.columns, columns=['Coefficient'])
regressor.coef_

In [None]:
# Compare actuals and predictions

y_pred_mlr = regressor.predict(X_test_mlr)
accuracy_results_mlr = pandas.DataFrame({'Actual': y_test_mlr, 'Predicted': y_pred_mlr})
accuracy_results_mlr

In [None]:
# Test for model efficiency

from sklearn import metrics
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test_mlr, y_pred_mlr))
print('Mean Squared Error:', metrics.mean_squared_error(y_test_mlr, y_pred_mlr))
print('Root Mean Squared Error:', numpy.sqrt(metrics.mean_squared_error(y_test_mlr, y_pred_mlr)))
print('Mean:', data['wins'].mean())
print('Model Efficiency Difference:',data['wins'].mean()*.1 - numpy.sqrt(metrics.mean_squared_error(y_test_mlr, y_pred_mlr)))
if data['wins'].mean()*.1 < numpy.sqrt(metrics.mean_squared_error(y_test_mlr, y_pred_mlr)):
    print('Model does not predict well enough')
else:
    print('Model predicts well enough')

In [None]:
# Loops through all features and optimizes best r^2 list

import statsmodels.formula.api as smf

def forward_selected(data, response):
    """Linear model designed by forward selection.

    Parameters:
    -----------
    data : pandas DataFrame with all possible predictors and response

    response: string, name of response column in data

    Returns:
    --------
    model: an "optimal" fitted statsmodels linear model
           with an intercept
           selected by forward selection
           evaluated by adjusted R-squared
    """
    remaining = set(data.columns)
    remaining.remove(response)
    selected = []
    current_score, best_new_score = 0.0, 0.0
    while remaining and current_score == best_new_score:
        scores_with_candidates = []
        for candidate in remaining:
            formula = "{} ~ {} + 1".format(response,
                                           ' + '.join(selected + [candidate]))
            score = smf.ols(formula, data).fit().rsquared_adj
            scores_with_candidates.append((score, candidate))
        scores_with_candidates.sort()
        best_new_score, best_candidate = scores_with_candidates.pop()
        if current_score < best_new_score:
            remaining.remove(best_candidate)
            selected.append(best_candidate)
            current_score = best_new_score
    formula = "{} ~ {} + 1".format(response,
                                   ' + '.join(selected))
    model = smf.ols(formula, data).fit()
    return model

# Include all numeric features and the label
model_data_mlr = data[['wins', 'offeff', 'defeff',
       'FGpercent', 'defFGpercent', 'turnoverrate', 'stealrate', 'offREB',
       'defREB', 'FTrate', 'defFTrate', 'FGpercenttwo', 'defFGpercenttwo',
       'FGpercentthree', 'defFGpercentthree', 'pace']]
model_mlr = forward_selected(model_data_mlr,'wins')

print(model_mlr.model.formula)
print(model_mlr.rsquared_adj)