In [None]:
## Uses 2015-2018 NCAA tournament birth information to predict future/current year births
## using key statistics from the season in a logistic regression model

In [14]:
# Initialize dataset and transform

import pandas
import numpy
import matplotlib.pyplot as plt
import seaborn as sns

def f(row):
    if pandas.isna(row['POSTSEASON']) == True:
        val = 0
    else:
        val = 1
    return val

data = pandas.read_csv('cbb.csv')
data = data.rename(columns={'ADJOE':'offeff','ADJDE':'defeff','EFG_O':'FGpercent','EFG_D':'defFGpercent','TOR':'turnoverrate','TORD':'stealrate','ORB':'offREB',
          'DRB':'defREB','FTR':'FTrate','FTRD':'defFTrate','2P_O':'FGpercenttwo','2P_D':'defFGpercenttwo','3P_O':'FGpercentthree',
          '3P_D':'defFGpercentthree','ADJ_T':'pace','TEAM':'team','CONF':'conf','G':'games','W':'wins','WAB':'winsabovebubble','AUTO':'auto'})
data['TOURNEY'] = data.apply(f, axis=1)
data1 = data[numpy.logical_and(data['YEAR'].isin([2015,2016,2017,2018]),data['auto']==0)]
data2 = data[numpy.logical_and(data['YEAR'].isin([2019]),data['auto']==0)]
tourney_data = data1[['wins', 'offeff', 'defeff',
       'FGpercent', 'defFGpercent', 'turnoverrate', 'stealrate', 'offREB',
       'defREB', 'FTrate', 'defFTrate', 'FGpercenttwo', 'defFGpercenttwo',
       'FGpercentthree', 'defFGpercentthree', 'pace','TOURNEY']]
result_data = data2[['wins', 'offeff', 'defeff',
       'FGpercent', 'defFGpercent', 'turnoverrate', 'stealrate', 'offREB',
       'defREB', 'FTrate', 'defFTrate', 'FGpercenttwo', 'defFGpercenttwo',
       'FGpercentthree', 'defFGpercentthree', 'pace','TOURNEY']]

In [15]:
#Logistic Regression: predict tourney birth, training data results

import sklearn as sk
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics

y_logr = tourney_data.iloc[:,16]
x_logr = tourney_data.iloc[:,:16]

x_train_logr, x_test_logr, y_train_logr, y_test_logr = train_test_split(x_logr, y_logr, test_size=0.3, random_state=42)
logR = LogisticRegression(solver='lbfgs')

logR.fit(x_train_logr, y_train_logr)

predictions_logr = logR.predict(x_test_logr)
prediction_strength_logr = logR.predict_proba(x_test_logr)
score_logr = logR.score(x_test_logr, y_test_logr)

cm_logr = metrics.confusion_matrix(y_test_logr, predictions_logr, labels = [0,1])
cmtx_logr = pandas.DataFrame(
    cm_logr, 
    index=['true:no', 'true:yes'], 
    columns=['pred:no', 'pred:yes'])
cmtx_logr



Unnamed: 0,pred:no,pred:yes
true:no,328,12
true:yes,4,39


In [16]:
#Logistic Regression: predict tourney birth, result data results

result_y = result_data.iloc[:,16]
result_x = result_data.iloc[:,:16]

predictions_results = logR.predict(result_x)
prediction_strength_results = logR.predict_proba(result_x)
score_results = logR.score(result_x, result_y)

cm_results = metrics.confusion_matrix(result_y, predictions_results, labels = [0,1])
cmtx_results = pandas.DataFrame(
    cm_results, 
    index=['true:no', 'true:yes'], 
    columns=['pred:no', 'pred:yes'])
cmtx_results

Unnamed: 0,pred:no,pred:yes
true:no,280,5
true:yes,8,29


In [17]:
#Logistic Regression: predict tourney birth, incorrect predictions

data2['prediction'] = predictions_results
data2['confidence'] = prediction_strength_results[:,1]
preds_2019_append = data2

preds_2019_append[preds_2019_append['TOURNEY']!=preds_2019_append['prediction']].sort_values('confidence',ascending=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,team,conf,games,wins,offeff,defeff,BARTHAG,FGpercent,defFGpercent,turnoverrate,...,defFGpercentthree,pace,winsabovebubble,POSTSEASON,SEED,YEAR,auto,TOURNEY,prediction,confidence
1484,Texas,B12,37,21,113.3,93.7,0.899,50.9,48.7,16.2,...,35.4,66.1,-0.5,,,2019,0,0,1,0.977126
1431,Clemson,ACC,34,20,107.2,90.3,0.8773,50.7,47.6,19.7,...,35.0,67.4,0.1,,,2019,0,0,1,0.721815
1485,TCU,B12,37,23,109.8,94.5,0.8486,52.1,48.4,19.2,...,30.4,70.1,1.1,,,2019,0,0,1,0.612343
1472,Penn St.,B10,32,14,109.9,92.8,0.8755,47.5,49.9,18.1,...,35.4,68.9,-2.7,,,2019,0,0,1,0.601542
1475,Nebraska,B10,35,19,112.5,98.1,0.8285,49.2,48.1,14.3,...,33.8,68.0,0.0,,,2019,0,0,1,0.538016
1649,Washington,P12,36,27,105.6,93.1,0.8097,52.1,47.5,19.8,...,33.4,67.3,3.1,R32,9.0,2019,0,1,0,0.499147
1637,Belmont,OVC,32,27,114.9,101.0,0.8138,57.6,48.7,15.2,...,34.4,72.5,1.2,R64,11.0,2019,0,1,0,0.417244
1698,Mississippi,SEC,33,20,113.3,97.9,0.8433,53.1,51.0,18.2,...,37.4,69.7,0.9,R64,8.0,2019,0,1,0,0.36408
1650,Arizona St.,P12,34,23,108.7,97.6,0.775,50.6,48.5,18.7,...,33.4,71.9,0.1,R64,11.0,2019,0,1,0,0.239249
1480,Baylor,B12,34,20,114.6,99.2,0.84,51.2,49.1,19.7,...,34.5,66.6,0.9,R32,9.0,2019,0,1,0,0.133608


In [None]:
## Other unrelated stuff below

In [None]:
# Simple scatter plot graph with legend

ax = plt.subplot(111)

play_x = data[data['YEAR']==2019]['defFGpercentthree']
play_y = data[data['YEAR']==2019]['winsabovebubble']
play_classes = data[data['YEAR']==2019]['TOURNEY']
    
play_scatter = sns.scatterplot(x=play_x, y=play_y, hue=play_classes, alpha=0.6)

box = ax.get_position()
ax.legend(loc='center left',bbox_to_anchor=(1,0.5))

plt.show()

In [None]:
# Multiple Linear Regression: x are numeric features, y is the label to predict
# Data is split and scaled
# Coefficients show importance to the label, 1 point change in variable causes x effect on the label

# List of columns:
# data[['team', 'conf', 'games', 'wins', 'offeff', 'defeff', 'BARTHAG',
#        'FGpercent', 'defFGpercent', 'turnoverrate', 'stealrate', 'offREB',
#        'defREB', 'FTrate', 'defFTrate', 'FGpercenttwo', 'defFGpercenttwo',
#        'FGpercentthree', 'defFGpercentthree', 'pace', 'winsabovebubble',
#        'POSTSEASON', 'SEED', 'YEAR', 'TOURNEY']]

x_mlr = data[['offeff','defFGpercent','stealrate','defREB',
          'FGpercent','defFTrate','defeff','pace',
          'offREB','turnoverrate','FTrate','defFGpercenttwo',
          'FGpercenttwo','FGpercentthree']]
y_mlr = data['wins']

from sklearn.model_selection import train_test_split
X_train_mlr, X_test_mlr, y_train_mlr, y_test_mlr = train_test_split(x_mlr, y_mlr, test_size=0.2, random_state=42)

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train_mlr)

X_train_mlr = scaler.transform(X_train_mlr)
X_test_mlr = scaler.transform(X_test_mlr)

from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train_mlr, y_train_mlr)
# coeff_df = pandas.DataFrame(regressor.coef_, x.columns, columns=['Coefficient'])
regressor.coef_

In [None]:
# Compare actuals and predictions

y_pred_mlr = regressor.predict(X_test_mlr)
accuracy_results_mlr = pandas.DataFrame({'Actual': y_test_mlr, 'Predicted': y_pred_mlr})
accuracy_results_mlr

In [None]:
# Test for model efficiency

from sklearn import metrics
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test_mlr, y_pred_mlr))
print('Mean Squared Error:', metrics.mean_squared_error(y_test_mlr, y_pred_mlr))
print('Root Mean Squared Error:', numpy.sqrt(metrics.mean_squared_error(y_test_mlr, y_pred_mlr)))
print('Mean:', data['wins'].mean())
print('Model Efficiency Difference:',data['wins'].mean()*.1 - numpy.sqrt(metrics.mean_squared_error(y_test_mlr, y_pred_mlr)))
if data['wins'].mean()*.1 < numpy.sqrt(metrics.mean_squared_error(y_test_mlr, y_pred_mlr)):
    print('Model does not predict well enough')
else:
    print('Model predicts well enough')

In [None]:
# Loops through all features and optimizes best r^2 list

import statsmodels.formula.api as smf

def forward_selected(data, response):
    """Linear model designed by forward selection.

    Parameters:
    -----------
    data : pandas DataFrame with all possible predictors and response

    response: string, name of response column in data

    Returns:
    --------
    model: an "optimal" fitted statsmodels linear model
           with an intercept
           selected by forward selection
           evaluated by adjusted R-squared
    """
    remaining = set(data.columns)
    remaining.remove(response)
    selected = []
    current_score, best_new_score = 0.0, 0.0
    while remaining and current_score == best_new_score:
        scores_with_candidates = []
        for candidate in remaining:
            formula = "{} ~ {} + 1".format(response,
                                           ' + '.join(selected + [candidate]))
            score = smf.ols(formula, data).fit().rsquared_adj
            scores_with_candidates.append((score, candidate))
        scores_with_candidates.sort()
        best_new_score, best_candidate = scores_with_candidates.pop()
        if current_score < best_new_score:
            remaining.remove(best_candidate)
            selected.append(best_candidate)
            current_score = best_new_score
    formula = "{} ~ {} + 1".format(response,
                                   ' + '.join(selected))
    model = smf.ols(formula, data).fit()
    return model

# Include all numeric features and the label
model_data_mlr = data[['wins', 'offeff', 'defeff',
       'FGpercent', 'defFGpercent', 'turnoverrate', 'stealrate', 'offREB',
       'defREB', 'FTrate', 'defFTrate', 'FGpercenttwo', 'defFGpercenttwo',
       'FGpercentthree', 'defFGpercentthree', 'pace']]
model_mlr = forward_selected(model_data_mlr,'wins')

print(model_mlr.model.formula)
print(model_mlr.rsquared_adj)