In [None]:
## Uses 2015-2018 NCAA tournament birth information to predict future/current year births
## using key statistics from the season in a logistic regression model
pandas.set_option('display.max_columns', 500)
pandas.set_option('display.max_rows', 500)

In [None]:
## Gather API data for current data

from sportsreference.ncaab.teams import Teams
import pandas
import numpy
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings('ignore')
pandas.options.display.max_columns = None

sport_ref_data = pandas.DataFrame()

for team in Teams():
    team_data = team.dataframe
    team_data = pandas.DataFrame(team_data)
    sport_ref_data = pandas.concat([sport_ref_data,team_data],axis=0,sort=True)    

def set_value(row_number, assigned_value): 
    return assigned_value[row_number] 
  
event_dictionary ={'sec' : 2, 'pac-12' : 2, 'big-12' : 2, 'big-ten' : 2, 'acc' : 2,
                  'southland' : 0, 'mwc' : 1, 'mac' : 0, 'swac' : 0, 'cusa' : 0,
                  'america-east' : 0, 'patriot' : 0, 'sun-belt' : 0, 'ovc' : 0, 'meac' : 0,
                  'mvc' : 0, 'wcc' : 1, 'ivy' : 0, 'northeast' : 0, 'big-east' : 1,
                  'big-west' : 0, 'wac' : 0, 'big-south' : 0, 'maac' : 0, 'aac' : 1,
                  'southern' : 0, 'horizon' : 0, 'colonial' : 0, 'atlantic-10' : 1, 'summit' : 0,
                  'big-sky' : 0, 'atlantic-sun' : 0, 'independent' : 0} 
  
sport_ref_data['major'] = sport_ref_data['conference'].apply(set_value, args =(event_dictionary, )) 

In [None]:
sport_ref_data.to_csv('sf_data.csv')

In [None]:
## Transform datasets and limit columns

def tourney(row):
    if pandas.isna(row['seed']) == True:
        val = 0
    else:
        val = 1
    return val

sport_ref_training_data = pandas.read_csv("sf_data_test_2015_to_2019.csv")
sport_ref_training_data = pandas.DataFrame(sport_ref_training_data)
sport_ref_training_data['tourney'] = sport_ref_training_data.apply(tourney, axis=1)

tourney_training_data = sport_ref_training_data[['assist_percentage','away_losses',
       'away_wins', 'block_percentage', 'conference_losses', 'conference_wins',
        'effective_field_goal_percentage','field_goal_percentage','free_throw_attempt_rate',
        'free_throw_percentage','free_throws_per_field_goal_attempt','home_wins', 'losses', 'net_rating',
       'offensive_rating', 'offensive_rebound_percentage',
       'opp_assist_percentage','opp_block_percentage','opp_effective_field_goal_percentage',
       'opp_field_goal_percentage','opp_free_throw_attempt_rate','opp_free_throw_percentage',
       'opp_free_throws_per_field_goal_attempt', 'opp_offensive_rating','opp_offensive_rebound_percentage',
       'opp_steal_percentage','opp_three_point_attempt_rate','opp_three_point_field_goal_percentage',
       'opp_total_rebound_percentage','opp_true_shooting_percentage', 'opp_turnover_percentage',
       'opp_two_point_field_goal_percentage','pace','simple_rating_system','steal_percentage',
        'strength_of_schedule','three_point_attempt_rate','three_point_field_goal_percentage',
       'total_rebound_percentage','true_shooting_percentage', 'turnover_percentage',
        'two_point_field_goal_percentage','win_percentage', 'wins', 'major','tourney','auto']]
tourney_training_data = tourney_training_data[numpy.logical_and(tourney_training_data['wins']>16,
                                                                tourney_training_data['auto']==0)]
sport_ref_data = sport_ref_data[['assist_percentage','away_losses',
       'away_wins', 'block_percentage', 'conference_losses', 'conference_wins',
        'effective_field_goal_percentage','field_goal_percentage','free_throw_attempt_rate',
        'free_throw_percentage','free_throws_per_field_goal_attempt','home_wins', 'losses', 'net_rating',
       'offensive_rating', 'offensive_rebound_percentage',
       'opp_assist_percentage','opp_block_percentage','opp_effective_field_goal_percentage',
       'opp_field_goal_percentage','opp_free_throw_attempt_rate','opp_free_throw_percentage',
       'opp_free_throws_per_field_goal_attempt', 'opp_offensive_rating','opp_offensive_rebound_percentage',
       'opp_steal_percentage','opp_three_point_attempt_rate','opp_three_point_field_goal_percentage',
       'opp_total_rebound_percentage','opp_true_shooting_percentage', 'opp_turnover_percentage',
       'opp_two_point_field_goal_percentage','pace','simple_rating_system','steal_percentage',
        'strength_of_schedule','three_point_attempt_rate','three_point_field_goal_percentage',
       'total_rebound_percentage','true_shooting_percentage', 'turnover_percentage',
        'two_point_field_goal_percentage','win_percentage', 'wins', 'major']]
# sport_ref_data = sport_ref_data[sport_ref_data['wins']>16]

In [None]:
## Logistic Regression: predict tourney birth, training data results
## Training model confusion matrix for accuracy

import sklearn as sk
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import preprocessing

y_logr = tourney_training_data.iloc[:,45]
x_logr = tourney_training_data.iloc[:,[35,12,33,4,11,42,2]]

scaler = preprocessing.StandardScaler()
x_logr_scaled = scaler.fit_transform(x_logr)

x_train_logr, x_test_logr, y_train_logr, y_test_logr = train_test_split(x_logr_scaled, y_logr, test_size=.3, random_state=42)
logR = LogisticRegression()

logR.fit(x_train_logr, y_train_logr)

predictions_logr = logR.predict(x_test_logr)
prediction_strength_logr = logR.predict_proba(x_test_logr)
score_logr = logR.score(x_test_logr, y_test_logr)

cm_logr = metrics.confusion_matrix(y_test_logr, predictions_logr, labels = [0,1])
cmtx_logr = pandas.DataFrame(
    cm_logr, 
    index=['true:no', 'true:yes'], 
    columns=['pred:no', 'pred:yes'])
print(cmtx_logr)

In [None]:
## Figures out most important features to use

from sklearn.feature_selection import RFE,RFECV

rfecv = RFECV(estimator=logR, step=1, scoring='accuracy')
rfecv.fit(x_logr_scaled, y_logr)

import matplotlib.pyplot as plt

plt.figure(figsize=(16, 9))
plt.title('Recursive Feature Elimination with Cross-Validation', fontsize=18, fontweight='bold', pad=20)
plt.xlabel('Number of features selected', fontsize=14, labelpad=20)
plt.ylabel('% Correct Classification', fontsize=14, labelpad=20)
plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_, color='#303F9F', linewidth=3)

rankings = pandas.DataFrame()
rankings['attribute'] = x_logr.columns
selector = RFE(logR,n_features_to_select=1)
selector = selector.fit(x_logr_scaled,y_logr)
rankings['rank'] = selector.ranking_

plt.show()
print(rankings.sort_values('rank'))

In [26]:
## Predicting tourney bid probability for current ongoing season

result_x = sport_ref_data.iloc[:,[35,12,33,4,11,42,2]]

result_x_scaled = scaler.fit_transform(result_x)

predictions_results = logR.predict(result_x_scaled)
prediction_strength_results = logR.predict_proba(result_x_scaled)
sport_ref_data['prediction'] = predictions_results
sport_ref_data['confidence'] = prediction_strength_results[:,1]
preds_current_season_append = sport_ref_data

preds_current_season_append.sort_values('confidence',ascending=False).head(75).iloc[:,[35,12,33,4,11,42,2,46]]

Unnamed: 0,strength_of_schedule,losses,simple_rating_system,conference_losses,home_wins,win_percentage,away_wins,confidence
KANSAS,11.34,3,25.54,1,9,0.857,6,0.999984
BAYLOR,6.34,1,20.49,0,10,0.95,5,0.999763
WEST-VIRGINIA,9.1,4,21.05,3,11,0.81,3,0.999625
DUKE,7.51,3,25.7,2,10,0.857,5,0.999624
SETON-HALL,8.34,5,17.44,1,9,0.762,5,0.999435
VILLANOVA,8.5,4,16.11,2,10,0.81,4,0.999385
MARYLAND,8.69,4,18.74,3,12,0.81,2,0.999351
MICHIGAN-STATE,8.56,6,22.47,3,10,0.727,3,0.999242
ARIZONA,7.38,6,21.9,3,10,0.714,2,0.998597
FLORIDA,8.19,8,14.19,3,7,0.619,2,0.998346


In [None]:
## Accuracy measure to use after season is over to check prediction results vs actual

# result_y = sport_ref_data.iloc[:,45]
# score_results = logR.score(result_x_scaled, result_y)

# cm_results = metrics.confusion_matrix(result_y, predictions_results, labels = [0,1])
# cmtx_results = pandas.DataFrame(
#     cm_results, 
#     index=['true:no', 'true:yes'], 
#     columns=['pred:no', 'pred:yes'])
# print(cmtx_results)
# preds_current_season_append[preds_current_season_append['TOURNEY']!=preds_current_season_append['prediction']].sort_values('confidence',ascending=False)

In [None]:
## Other unrelated stuff below ##

In [None]:
## OUTDATED DATASETS
# Initialize dataset and transform

# import pandas
# import numpy
# import matplotlib.pyplot as plt
# import seaborn as sns
# import warnings
# warnings.filterwarnings('ignore')
# pandas.options.display.max_columns = None

# def tourney(row):
#     if pandas.isna(row['POSTSEASON']) == True:
#         val = 0
#     else:
#         val = 1
#     return val

# data = pandas.read_csv('cbb.csv')
# data = data.rename(columns={'ADJOE':'offeff','ADJDE':'defeff','EFG_O':'FGpercent','EFG_D':'defFGpercent','TOR':'turnoverrate','TORD':'stealrate','ORB':'offREB',
#           'DRB':'defREB','FTR':'FTrate','FTRD':'defFTrate','2P_O':'FGpercenttwo','2P_D':'defFGpercenttwo','3P_O':'FGpercentthree',
#           '3P_D':'defFGpercentthree','ADJ_T':'pace','TEAM':'team','CONF':'conf','G':'games','W':'wins','WAB':'winsabovebubble','AUTO':'auto',
#                            'MAJOR':'major'})
# data['TOURNEY'] = data.apply(tourney, axis=1)
# data1 = data[numpy.logical_and(numpy.logical_and(data['YEAR'].isin([2015,2016,2017,2018]),data['auto']==0),data['wins']>16)]
# data2 = data[numpy.logical_and(numpy.logical_and(data['YEAR'].isin([2019]),data['auto']==0),data['wins']>16)]
# tourney_data = data1[['wins', 'offeff', 'defeff',
#        'FGpercent', 'defFGpercent', 'turnoverrate', 'stealrate', 'offREB',
#        'defREB', 'FTrate', 'defFTrate', 'FGpercenttwo', 'defFGpercenttwo',
#        'FGpercentthree', 'defFGpercentthree', 'pace','major','TOURNEY']]
# result_data = data2[['wins', 'offeff', 'defeff',
#        'FGpercent', 'defFGpercent', 'turnoverrate', 'stealrate', 'offREB',
#        'defREB', 'FTrate', 'defFTrate', 'FGpercenttwo', 'defFGpercenttwo',
#        'FGpercentthree', 'defFGpercentthree', 'pace','major','TOURNEY']]

In [None]:
# Simple scatter plot graph with legend

ax = plt.subplot(111)

play_x = data[data['YEAR']==2019]['defFGpercentthree']
play_y = data[data['YEAR']==2019]['winsabovebubble']
play_classes = data[data['YEAR']==2019]['TOURNEY']
    
play_scatter = sns.scatterplot(x=play_x, y=play_y, hue=play_classes, alpha=0.6)

box = ax.get_position()
ax.legend(loc='center left',bbox_to_anchor=(1,0.5))

plt.show()

In [None]:
# Multiple Linear Regression: x are numeric features, y is the label to predict
# Data is split and scaled
# Coefficients show importance to the label, 1 point change in variable causes x effect on the label

# List of columns:
# data[['team', 'conf', 'games', 'wins', 'offeff', 'defeff', 'BARTHAG',
#        'FGpercent', 'defFGpercent', 'turnoverrate', 'stealrate', 'offREB',
#        'defREB', 'FTrate', 'defFTrate', 'FGpercenttwo', 'defFGpercenttwo',
#        'FGpercentthree', 'defFGpercentthree', 'pace', 'winsabovebubble',
#        'POSTSEASON', 'SEED', 'YEAR', 'TOURNEY']]

x_mlr = data[['offeff','defFGpercent','stealrate','defREB',
          'FGpercent','defFTrate','defeff','pace',
          'offREB','turnoverrate','FTrate','defFGpercenttwo',
          'FGpercenttwo','FGpercentthree']]
y_mlr = data['wins']

from sklearn.model_selection import train_test_split
X_train_mlr, X_test_mlr, y_train_mlr, y_test_mlr = train_test_split(x_mlr, y_mlr, test_size=0.2, random_state=42)

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train_mlr)

X_train_mlr = scaler.transform(X_train_mlr)
X_test_mlr = scaler.transform(X_test_mlr)

from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train_mlr, y_train_mlr)
# coeff_df = pandas.DataFrame(regressor.coef_, x.columns, columns=['Coefficient'])
regressor.coef_

In [None]:
# Compare actuals and predictions

y_pred_mlr = regressor.predict(X_test_mlr)
accuracy_results_mlr = pandas.DataFrame({'Actual': y_test_mlr, 'Predicted': y_pred_mlr})
accuracy_results_mlr

In [None]:
# Test for model efficiency

from sklearn import metrics
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test_mlr, y_pred_mlr))
print('Mean Squared Error:', metrics.mean_squared_error(y_test_mlr, y_pred_mlr))
print('Root Mean Squared Error:', numpy.sqrt(metrics.mean_squared_error(y_test_mlr, y_pred_mlr)))
print('Mean:', data['wins'].mean())
print('Model Efficiency Difference:',data['wins'].mean()*.1 - numpy.sqrt(metrics.mean_squared_error(y_test_mlr, y_pred_mlr)))
if data['wins'].mean()*.1 < numpy.sqrt(metrics.mean_squared_error(y_test_mlr, y_pred_mlr)):
    print('Model does not predict well enough')
else:
    print('Model predicts well enough')

In [None]:
# Loops through all features and optimizes best r^2 list

import statsmodels.formula.api as smf

def forward_selected(data, response):
    """Linear model designed by forward selection.

    Parameters:
    -----------
    data : pandas DataFrame with all possible predictors and response

    response: string, name of response column in data

    Returns:
    --------
    model: an "optimal" fitted statsmodels linear model
           with an intercept
           selected by forward selection
           evaluated by adjusted R-squared
    """
    remaining = set(data.columns)
    remaining.remove(response)
    selected = []
    current_score, best_new_score = 0.0, 0.0
    while remaining and current_score == best_new_score:
        scores_with_candidates = []
        for candidate in remaining:
            formula = "{} ~ {} + 1".format(response,
                                           ' + '.join(selected + [candidate]))
            score = smf.ols(formula, data).fit().rsquared_adj
            scores_with_candidates.append((score, candidate))
        scores_with_candidates.sort()
        best_new_score, best_candidate = scores_with_candidates.pop()
        if current_score < best_new_score:
            remaining.remove(best_candidate)
            selected.append(best_candidate)
            current_score = best_new_score
    formula = "{} ~ {} + 1".format(response,
                                   ' + '.join(selected))
    model = smf.ols(formula, data).fit()
    return model

# Include all numeric features and the label
model_data_mlr = data[['wins', 'offeff', 'defeff',
       'FGpercent', 'defFGpercent', 'turnoverrate', 'stealrate', 'offREB',
       'defREB', 'FTrate', 'defFTrate', 'FGpercenttwo', 'defFGpercenttwo',
       'FGpercentthree', 'defFGpercentthree', 'pace']]
model_mlr = forward_selected(model_data_mlr,'wins')

print(model_mlr.model.formula)
print(model_mlr.rsquared_adj)