In [None]:
## Packages and initial data set ##

from sportsreference.ncaab.schedule import Schedule
from sportsreference.ncaab.teams import Teams
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import pandas
import numpy
import warnings
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn import metrics
from sklearn.linear_model import LinearRegression


# NOTE(review): this silences every warning for the rest of the session,
# including pandas deprecation notices. Kept so existing output is unchanged.
warnings.filterwarnings('ignore')

pandas.set_option('display.max_columns', 500)
pandas.set_option('display.max_rows', 500)

# Number of trailing CSV columns that are kept; everything before them is discarded.
N_TRAILING_COLUMNS = 47

# .copy() makes `data` an independent frame, so the column assignments below
# are not chained assignments on a slice of the raw read_csv result.
data = pandas.read_csv('ncaab_box_scores.csv').iloc[:, -N_TRAILING_COLUMNS:].copy()

# 1 when the rank value is greater than 0, otherwise 0
# (a missing rank compares False and therefore becomes 0).
data['ranked'] = numpy.where(data['team_rank'] > 0, 1, 0)
data['ranked_opp'] = numpy.where(data['opp_rank'] > 0, 1, 0)

# Replace team_home_away with one indicator column per distinct value,
# named team_home_away_<value>.
data = pandas.get_dummies(data, prefix=['team_home_away'], columns=['team_home_away'])

In [None]:
## Determine matchup variables ##

# Columns removed before averaging: game identifiers and every opp_* statistic.
matchup_drops = ['unique_game','opponent', 'opp_assist_pct',
       'opp_block_pct', 'opp_def_rating', 'opp_def_reb_pct', 'opp_eff_fg_pct',
       'opp_fg_pct', 'opp_ft_rate', 'opp_ft_pct', 'opp_off_rating',
       'opp_off_reb_pct', 'opp_pts', 'opp_rank', 'opp_stl_pct', 'opp_3pt_rate',
       'opp_3pt_pct', 'opp_reb_pct', 'opp_true_shoot_pct', 'opp_to_pct',
       'opp_2pt_rate', 'opp_2pt_pct','game_date']

# `columns=` instead of a positional axis: the positional form was removed in pandas 2.0.
matchup_data = data.drop(columns=matchup_drops).dropna()

# Suffixes shared by both sides of the matchup. They are applied positionally
# to the averaged columns, so the order here must match the column order of
# matchup_data -- NOTE(review): that order comes from the CSV; confirm it.
profile_fields = ['assist_pct', 'block_pct', 'def_rating',
       'def_reb_pct', 'eff_fg_pct', 'fg_pct', 'ft_rate',
       'ft_pct', 'off_rating', 'off_reb_pct', 'pts',
       'rank', 'stl_pct', '3pt_rate', '3pt_pct',
       'reb_pct', 'true_shoot_pct', 'to_pct', '2pt_rate',
       '2pt_pct', 'pace', 'win_pct', 'ranked_pct', 'ranked_opp_pct',
       'Away', 'Home']
team1cols = ['team1_' + field for field in profile_fields] + ['team1']
team2cols = ['team2_' + field for field in profile_fields] + ['team2']

## Determine the right scale of variables and filter situations accordingly ##

def _average_profile(games, label_col, team_name, columns):
    """Collapse `games` into a one-row frame of column means.

    games      -- rows of matchup_data already filtered to one situation
    label_col  -- name of the extra column that stores the team name
    team_name  -- value written into `label_col`
    columns    -- final column names, assigned positionally

    numeric_only=True skips the string `team` column explicitly; without it
    pandas >= 2.0 raises instead of silently dropping non-numeric columns.
    """
    profile = games.mean(numeric_only=True).to_frame().transpose()
    profile[label_col] = team_name
    profile.columns = columns
    return profile

# team1: PENN-STATE rows where the home indicator is 1 and ranked_opp is 0.
team1_games = matchup_data[(matchup_data['team'] == 'PENN-STATE')
                           & (matchup_data['team_home_away_Home'] == 1)
                           & (matchup_data['ranked_opp'] == 0)]
# team2: ILLINOIS rows where the home indicator is 0 and ranked_opp is above 0.
team2_games = matchup_data[(matchup_data['team'] == 'ILLINOIS')
                           & (matchup_data['team_home_away_Home'] == 0)
                           & (matchup_data['ranked_opp'] > 0)]

team1 = _average_profile(team1_games, 'team1', 'PENN-STATE', team1cols)
team2 = _average_profile(team2_games, 'team2', 'ILLINOIS', team2cols)

# Both frames have the single row label 0, so this yields one wide row.
matchup = pandas.concat([team1,team2],axis=1,sort=True)

In [None]:
# ## Logistic regression on team1_win_pct (cell currently disabled) ##

# drops = ['team','unique_game','opponent','game_date']
# log_data = data.drop(drops, 1).dropna()
# datacols = ['team1_assist_pct', 'team1_block_pct', 'team1_def_rating',
#        'team1_def_reb_pct', 'team1_eff_fg_pct', 'team1_fg_pct', 'team1_ft_rate',
#        'team1_ft_pct', 'team1_off_rating', 'team1_off_reb_pct', 'team1_pts',
#        'team1_rank', 'team1_stl_pct', 'team1_3pt_rate', 'team1_3pt_pct',
#        'team1_reb_pct', 'team1_true_shoot_pct', 'team1_to_pct', 'team1_2pt_rate',
#        'team1_2pt_pct','team2_assist_pct', 'team2_block_pct', 'team2_def_rating',
#        'team2_def_reb_pct', 'team2_eff_fg_pct', 'team2_fg_pct', 'team2_ft_rate',
#        'team2_ft_pct', 'team2_off_rating', 'team2_off_reb_pct', 'team2_pts',
#        'team2_rank', 'team2_stl_pct', 'team2_3pt_rate', 'team2_3pt_pct',
#        'team2_reb_pct', 'team2_true_shoot_pct', 'team2_to_pct', 'team2_2pt_rate',
#        'team2_2pt_pct','team1_pace', 'team1_win_pct', 'team1_ranked_pct', 'team1_ranked_opp_pct',
#        'team1_Away', 'team1_Home']
# log_data.columns = datacols

# ## Determine which fields to drop for the model ##

# x = log_data.drop(['team1_win_pct','team1_pts','team2_pts','team1_off_rating',
#                   'team1_def_rating','team2_off_rating','team2_def_rating','team1_rank','team2_rank'], 1)
# y = log_data['team1_win_pct']

# # scaler = preprocessing.StandardScaler()
# # x_scaled = scaler.fit_transform(x)
# x_scaled = x

# x_train, x_test, y_train, y_test = train_test_split(x_scaled, y, test_size=.3, random_state=42)

# logR = LogisticRegression()
# logR.fit(x_train, y_train)

# predictions_logr = logR.predict(x_test)
# prediction_strength_logr = logR.predict_proba(x_test)
# score_logr = logR.score(x_test, y_test)

# cm_logr = metrics.confusion_matrix(y_test, predictions_logr, labels = [0,1])
# cmtx_logr = pandas.DataFrame(
#     cm_logr, 
#     index=['true:no', 'true:yes'], 
#     columns=['pred:no', 'pred:yes'])

# ## View accuracy scores ##

# print(cmtx_logr)
# print(score_logr)

In [None]:
# ## Figures out most important features to use
# import matplotlib.pyplot as plt
# from sklearn.feature_selection import RFE,RFECV

# rfecv = RFECV(estimator=logR, step=1, scoring='accuracy')
# rfecv.fit(x_scaled, y)

# plt.figure(figsize=(16, 9))
# plt.title('Recursive Feature Elimination with Cross-Validation', fontsize=18, fontweight='bold', pad=20)
# plt.xlabel('Number of features selected', fontsize=14, labelpad=20)
# plt.ylabel('% Correct Classification', fontsize=14, labelpad=20)
# plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_, color='#303F9F', linewidth=3)

# rankings = pandas.DataFrame()
# rankings['attribute'] = x.columns
# selector = RFE(logR,n_features_to_select=1)
# selector = selector.fit(x_scaled,y)
# rankings['rank'] = selector.ranking_

# plt.show()
# print(rankings.sort_values('rank'))

In [None]:
# ## Get prediction and append to the matchup dataset ##

# result_x = matchup.drop(['team2_pace', 'team2_win_pct', 'team2_ranked_pct',
#        'team2_ranked_opp_pct', 'team2_Away', 'team2_Home', 'team2','team1',
#         'team1_win_pct','team1_pts','team2_pts','team1_off_rating',
#         'team1_def_rating','team2_off_rating','team2_def_rating','team1_rank','team2_rank'],1)

# # result_x_scaled = scaler.fit_transform(result_x)
# result_x_scaled = result_x

# predictions_results = logR.predict(result_x_scaled)
# prediction_strength_results = logR.predict_proba(result_x_scaled)
# matchup['prediction'] = predictions_results
# matchup['confidence'] = prediction_strength_results[:,1]
# preds_matchup = matchup

In [None]:
# ## Train linear regression model twice for team points and opp points ##

# x_lin = log_data.drop(['team1_win_pct','team1_pts','team2_pts','team1_off_rating',
#                   'team1_def_rating','team2_off_rating','team2_def_rating','team1_rank','team2_rank'], 1)
# y_lin_team1 = log_data['team1_pts']
# y_lin_team2 = log_data['team2_pts']

# # x_lin_scaled = scaler.fit_transform(x_lin)
# x_lin_scaled = x_lin

# x_lin_train_team1, x_lin_test_team1, y_lin_train_team1, y_lin_test_team1 = train_test_split(x_lin_scaled, y_lin_team1, test_size=.3, random_state=42)
# x_lin_train_team2, x_lin_test_team2, y_lin_train_team2, y_lin_test_team2 = train_test_split(x_lin_scaled, y_lin_team2, test_size=.3, random_state=42)

# linreg_team1 = LinearRegression()
# linreg_team2 = LinearRegression()

# model_team1 = linreg_team1.fit(x_lin_train_team1, y_lin_train_team1)
# model_team2 = linreg_team2.fit(x_lin_train_team2, y_lin_train_team2)

# ## Determine r^2 accuracy ##

# print(model_team1.score(x_lin_train_team1, y_lin_train_team1))
# print(model_team2.score(x_lin_train_team2, y_lin_train_team2))

# ## Predict scores and append to matchup prediction ##

# y_predict_team1 = model_team1.predict(result_x_scaled)
# y_predict_team2 = model_team2.predict(result_x_scaled)

# preds_matchup['team1_pts'] = y_predict_team1
# preds_matchup['team2_pts'] = y_predict_team2

In [None]:
# preds_matchup[['team1','team2','prediction','confidence','team1_pts','team2_pts']]

In [None]:
## Build the points-model training frame and the matching matchup row ##
## (work in progress from this cell onward) ##

# Removed before modeling: identifier columns plus the rating, rank and
# field-goal-percentage columns for both sides.
drops = ['team','unique_game','opponent','game_date','team_def_rating','opp_def_rating','team_off_rating',
        'opp_off_rating','team_rank','opp_rank','team_eff_fg_pct', 'team_fg_pct','opp_eff_fg_pct', 'opp_fg_pct']

# `columns=` instead of a positional axis: the positional form was removed in pandas 2.0.
log_data = data.drop(columns=drops).dropna()

# Statistic suffixes that appear once for team1 and once for team2.
box_stats = ['assist_pct', 'block_pct',
       'def_reb_pct', 'ft_rate',
       'ft_pct', 'off_reb_pct', 'pts',
       'stl_pct', '3pt_rate', '3pt_pct',
       'reb_pct', 'true_shoot_pct', 'to_pct', '2pt_rate',
       '2pt_pct']
datacols = (['team1_' + stat for stat in box_stats]
            + ['team2_' + stat for stat in box_stats]
            + ['team1_pace', 'team1_win_pct', 'team1_ranked_pct', 'team1_ranked_opp_pct',
               'team1_Away', 'team1_Home'])

# Positional rename; pandas raises if the number of names does not match.
# NOTE(review): assumes the remaining columns are ordered team stats,
# opponent stats, then the six trailing fields -- confirm against the CSV.
log_data.columns = datacols

# .copy() so that columns added to matchup_2 later never write into a slice of `matchup`.
matchup_2 = matchup[datacols].copy()

In [None]:
## Fit one linear regression per points column and score the matchup row ##

target_cols = ['team1_pts', 'team2_pts']

# `columns=` instead of a positional axis: the positional form was removed in pandas 2.0.
x = log_data.drop(columns=target_cols)
# x = preprocessing.scale(x)
y = log_data['team1_pts']
y_2 = log_data['team2_pts']

# Select exactly the training features, in training order. Unlike dropping the
# two target columns, this stays correct when the cell is re-run after the
# *_pts_est columns have been added to matchup_2.
new_x = matchup_2[list(x.columns)]
# new_x = preprocessing.scale(new_x)

# Both estimators are created here so the cell runs on a fresh kernel;
# previously linreg_team2 was used without being defined in any active cell.
linreg_team1 = LinearRegression()
linreg_team2 = LinearRegression()
model_team1 = linreg_team1.fit(x, y)
model_team2 = linreg_team2.fit(x, y_2)

# R^2 on the same rows the models were fitted on (no held-out split here).
print(model_team1.score(x, y))
print(model_team2.score(x, y_2))

y_predict_matchup = model_team1.predict(new_x)
y_predict_matchup_2 = model_team2.predict(new_x)

# assign() returns a new frame, so nothing is written into a possible slice.
matchup_2 = matchup_2.assign(team1_pts_est=y_predict_matchup,
                             team2_pts_est=y_predict_matchup_2)

In [None]:
## Inspect the fitted team1 points model ##
print(model_team1.intercept_)

# One row per feature. The earlier DataFrame(x.columns, coef_) call passed the
# coefficients as the index and the feature names as the data.
pandas.DataFrame({'feature': x.columns, 'coefficient': model_team1.coef_})

In [153]:
# Show the estimated points for both sides of the matchup.
estimate_cols = ['team1_pts_est', 'team2_pts_est']
matchup_2.loc[:, estimate_cols]

Unnamed: 0,team1_pts_est,team2_pts_est
0,85.455909,75.265751
