In [None]:
# Initialize dataset and transform

import pandas
import numpy
import matplotlib.pyplot as plt
import seaborn as sns

def f(row):
    """Flag whether a row records a postseason result.

    Parameters
    ----------
    row : mapping or pandas Series
        Must expose a ``'POSTSEASON'`` entry.

    Returns
    -------
    int
        0 when ``row['POSTSEASON']`` is missing according to
        ``pandas.isna``, otherwise 1.
    """
    # pandas.isna already yields a boolean for a scalar, so it can be used
    # directly as the condition.
    return 0 if pandas.isna(row['POSTSEASON']) else 1

# Load the raw table and replace the abbreviated stat headers with
# descriptive column names used by the rest of the notebook.
data = pandas.read_csv('cbb.csv')
data = data.rename(columns={
    'ADJOE': 'offeff',
    'ADJDE': 'defeff',
    'EFG_O': 'FGpercent',
    'EFG_D': 'defFGpercent',
    'TOR': 'turnoverrate',
    'TORD': 'stealrate',
    'ORB': 'offREB',
    'DRB': 'defREB',
    'FTR': 'FTrate',
    'FTRD': 'defFTrate',
    '2P_O': 'FGpercenttwo',
    '2P_D': 'defFGpercenttwo',
    '3P_O': 'FGpercentthree',
    '3P_D': 'defFGpercentthree',
    'ADJ_T': 'pace',
    'TEAM': 'team',
    'CONF': 'conf',
    'G': 'games',
    'W': 'wins',
    'WAB': 'winsabovebubble',
})
# TOURNEY is 1 where POSTSEASON is present and 0 where it is missing.
# Computed on the whole column at once: same values as applying `f` to
# every row, without one Python call per row.
data['TOURNEY'] = data['POSTSEASON'].notna().astype('int64')
# Example inspection: 2017 tournament teams ordered by wins.
# data[numpy.logical_and(data['YEAR']==2017,data['TOURNEY']==1)].sort_values('wins',ascending=False)

In [None]:
# Simple scatter plot graph with legend

# Single-axes figure; the scatter below is drawn onto this axes explicitly.
ax = plt.subplot(111)

# Filter to the 2019 season once and take the plotted columns from that slice.
season_2019 = data[data['YEAR'] == 2019]
play_x = season_2019['defFGpercentthree']
play_y = season_2019['winsabovebubble']
play_classes = season_2019['TOURNEY']

# Points are coloured by the TOURNEY flag.
play_scatter = sns.scatterplot(
    x=play_x,
    y=play_y,
    hue=play_classes,
    alpha=0.6,
    ax=ax,
)

# The axes position is captured here but not used again in this cell.
box = ax.get_position()
# Anchor the legend's left-centre point to the right edge of the axes.
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

plt.show()

In [None]:
# Multiple linear regression: `x` holds the numeric features, `y` the label
# to predict (wins).
# The rows are split 80/20 into train and test sets and the features are
# standardized with a scaler fitted on the training split only.
# Because the features are standardized, each coefficient is the change in
# predicted wins for a one-standard-deviation change in that feature (not a
# one-point change), which makes the coefficients comparable to each other.

# Columns available in `data`:
# data[['team', 'conf', 'games', 'wins', 'offeff', 'defeff', 'BARTHAG',
#        'FGpercent', 'defFGpercent', 'turnoverrate', 'stealrate', 'offREB',
#        'defREB', 'FTrate', 'defFTrate', 'FGpercenttwo', 'defFGpercenttwo',
#        'FGpercentthree', 'defFGpercentthree', 'pace', 'winsabovebubble',
#        'POSTSEASON', 'SEED', 'YEAR', 'TOURNEY']]

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

feature_columns = [
    'offeff', 'defFGpercent', 'stealrate', 'defREB',
    'FGpercent', 'defFTrate', 'defeff', 'pace',
    'offREB', 'turnoverrate', 'FTrate', 'defFGpercenttwo',
    'FGpercenttwo', 'FGpercentthree',
]
x = data[feature_columns]
y = data['wins']

X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# Learn the scaling statistics from the training split, then apply the same
# scaling to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

regressor = LinearRegression()
regressor.fit(X_train, y_train)

# One row per feature, labelled with the feature's column name.
coeff_df = pandas.DataFrame(
    regressor.coef_, index=x.columns, columns=['Coefficient']
)

In [None]:
# Predict on the held-out features and place the predictions next to the
# true labels (rows keep the index of `y_test`).
y_pred = regressor.predict(X_test)
accuracy_results = pandas.DataFrame({'Actual': y_test}).assign(Predicted=y_pred)

In [None]:
# Test for model efficiency

from sklearn import metrics

# Fraction of the mean win total that the RMSE must stay under for the model
# to be reported as predicting well enough.
RMSE_TOLERANCE = .1

# Compute every quantity once and reuse it for both the report and the verdict.
mean_abs_error = metrics.mean_absolute_error(y_test, y_pred)
mean_sq_error = metrics.mean_squared_error(y_test, y_pred)
root_mean_sq_error = numpy.sqrt(mean_sq_error)
mean_wins = data['wins'].mean()
rmse_budget = mean_wins * RMSE_TOLERANCE

print('Mean Absolute Error:', mean_abs_error)
print('Mean Squared Error:', mean_sq_error)
print('Root Mean Squared Error:', root_mean_sq_error)
print('Mean:', mean_wins)
# Positive when the RMSE is inside the budget, negative when it exceeds it.
print('Model Efficiency Difference:', rmse_budget - root_mean_sq_error)
if rmse_budget < root_mean_sq_error:
    print('Model does not predict well enough')
else:
    print('Model predicts well enough')

In [None]:
# Forward feature selection: repeatedly adds the feature that most improves adjusted R^2

import statsmodels.formula.api as smf

def forward_selected(data, response):
    """Greedy forward selection of an OLS model by adjusted R-squared.

    Starting with no predictors and a score of 0.0, every round fits one
    candidate model per unused column (the terms selected so far plus that
    column, with an intercept) and keeps the column whose model has the
    highest adjusted R-squared. Selection stops as soon as the best
    candidate does not strictly improve on the current score, or when no
    columns are left.

    Parameters
    ----------
    data : pandas DataFrame
        Holds the response column and every candidate predictor column.
        Column names are inserted into the formula string as-is.
    response : str
        Name of the response column in ``data``.

    Returns
    -------
    model
        Fitted statsmodels linear model for the formula
        ``response ~ <selected terms> + 1``. When no predictor was
        selected the formula is ``response ~ 1``.

    Raises
    ------
    KeyError
        If ``response`` is not one of ``data.columns``.
    """
    def build_formula(terms):
        # The explicit "1" keeps the intercept and keeps the formula
        # well-formed even when `terms` is empty.
        return "{} ~ {}".format(response, ' + '.join(terms + ['1']))

    remaining = set(data.columns)
    remaining.remove(response)
    selected = []
    current_score = 0.0
    while remaining:
        # Sorting (score, name) pairs makes the pick independent of the
        # set's iteration order: highest score wins, ties go to the
        # greatest column name.
        ranked = sorted(
            (smf.ols(build_formula(selected + [candidate]), data).fit().rsquared_adj,
             candidate)
            for candidate in remaining
        )
        best_new_score, best_candidate = ranked[-1]
        # Stop on a tie as well as on a drop. A round that exactly ties the
        # current score changes nothing, so continuing would repeat the same
        # round forever.
        if not best_new_score > current_score:
            break
        remaining.remove(best_candidate)
        selected.append(best_candidate)
        current_score = best_new_score
    return smf.ols(build_formula(selected), data).fit()

# Table handed to forward selection: the label ('wins') plus the numeric
# predictor columns it may choose from.
selection_columns = [
    'wins',
    'offeff', 'defeff',
    'FGpercent', 'defFGpercent',
    'turnoverrate', 'stealrate',
    'offREB', 'defREB',
    'FTrate', 'defFTrate',
    'FGpercenttwo', 'defFGpercenttwo',
    'FGpercentthree', 'defFGpercentthree',
    'pace',
]
model_data = data[selection_columns]
model = forward_selected(model_data, 'wins')

# Report the selected formula and its adjusted R-squared.
print(model.model.formula)
print(model.rsquared_adj)