In [None]:
## Packages and initial data set ##

from sportsreference.ncaab.schedule import Schedule
from sportsreference.ncaab.teams import Teams
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import pandas
import numpy
import warnings
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn import metrics
from sklearn.linear_model import LinearRegression


# Suppress every warning for the remainder of the session.
warnings.filterwarnings('ignore')

# Allow up to 500 columns / rows before pandas truncates a displayed frame.
pandas.set_option('display.max_columns', 500)
pandas.set_option('display.max_rows', 500)

# Build the working frame in one chain:
#   1. read the box-score CSV and keep only its last 47 columns,
#   2. add 0/1 flags that are 1 when the team / opponent rank is greater than 0,
#   3. one-hot encode 'team_home_away' into 'team_home_away_<value>' columns.
data = (
    pandas.read_csv('ncaab_box_scores.csv')
    .iloc[:, -47:]
    .assign(
        ranked=lambda frame: numpy.where(frame['team_rank'] > 0, 1, 0),
        ranked_opp=lambda frame: numpy.where(frame['opp_rank'] > 0, 1, 0),
    )
    .pipe(pandas.get_dummies, prefix=['team_home_away'], columns=['team_home_away'])
)

In [None]:
## Create specific matchup variables ##

# Columns removed before averaging: game identifiers, the game date and every
# opponent-side statistic.
matchup_drops = ['unique_game','opponent', 'opp_assist_pct',
       'opp_block_pct', 'opp_def_rating', 'opp_def_reb_pct', 'opp_eff_fg_pct',
       'opp_fg_pct', 'opp_ft_rate', 'opp_ft_pct', 'opp_off_rating',
       'opp_off_reb_pct', 'opp_pts', 'opp_rank', 'opp_stl_pct', 'opp_3pt_rate',
       'opp_3pt_pct', 'opp_reb_pct', 'opp_true_shoot_pct', 'opp_to_pct',
       'opp_2pt_rate', 'opp_2pt_pct','game_date']

# Use the `columns=` keyword: passing the axis positionally (`drop(labels, 1)`)
# is no longer accepted by pandas 2.0.
matchup_data = data.drop(columns=matchup_drops).dropna()

# Labels applied POSITIONALLY to each averaged profile; the trailing entry is the
# team-name column appended after averaging.
# NOTE(review): this assumes the averaged columns come out in exactly this order
# -- confirm against matchup_data.columns.
_profile_stats = ['assist_pct', 'block_pct', 'def_rating',
       'def_reb_pct', 'eff_fg_pct', 'fg_pct', 'ft_rate',
       'ft_pct', 'off_rating', 'off_reb_pct', 'pts',
       'rank', 'stl_pct', '3pt_rate', '3pt_pct',
       'reb_pct', 'true_shoot_pct', 'to_pct', '2pt_rate',
       '2pt_pct', 'pace', 'win_pct', 'ranked_pct', 'ranked_opp_pct',
       'Away', 'Home']
team1cols = ['team1_' + stat for stat in _profile_stats] + ['team1']
team2cols = ['team2_' + stat for stat in _profile_stats] + ['team2']

## Choose which matchup specifics to model and filter situations accordingly ##

TEAM1_NAME = 'PENN-STATE'
TEAM2_NAME = 'ILLINOIS'


def _situational_profile(games, team, at_home, opp_ranked):
    """Average a team's numeric columns over one game situation.

    Parameters
    ----------
    games : pandas.DataFrame
        Frame with 'team', 'team_home_away_Home' and 'ranked_opp' columns.
    team : str
        Value to match in the 'team' column.
    at_home : bool
        True keeps rows with team_home_away_Home == 1, False rows with == 0.
    opp_ranked : bool
        True keeps rows with ranked_opp == 1, False rows with == 0.

    Returns
    -------
    pandas.DataFrame
        A single row (index 0) holding the mean of every numeric column.

    Raises
    ------
    ValueError
        If no game matches the requested situation (the mean would be all NaN).
    """
    selected = games[
        (games['team'] == team)
        & (games['team_home_away_Home'] == int(at_home))
        & (games['ranked_opp'] == int(opp_ranked))
    ]
    if selected.empty:
        raise ValueError(
            'no games found for {} (at_home={}, opp_ranked={})'.format(team, at_home, opp_ranked)
        )
    # numeric_only=True skips the string 'team' column explicitly; pandas 2.0
    # raises instead of silently dropping non-numeric columns.
    return selected.mean(numeric_only=True).to_frame().transpose()


# Team 1: home games against unranked opponents.
# Team 2: non-home games against ranked opponents ('ranked_opp' is a 0/1 flag).
team1 = _situational_profile(matchup_data, TEAM1_NAME, at_home=True, opp_ranked=False)
team2 = _situational_profile(matchup_data, TEAM2_NAME, at_home=False, opp_ranked=True)
team1['team1'] = TEAM1_NAME
team2['team2'] = TEAM2_NAME
team1.columns = team1cols
team2.columns = team2cols

# Side-by-side single-row frame describing the matchup.
matchup = pandas.concat([team1,team2],axis=1,sort=True)

In [None]:
## Choose variables to include in linear regression ##

# Identifier columns plus the rating / rank / field-goal columns left out of the model.
drops = ['team','unique_game','opponent','game_date','team_def_rating','opp_def_rating','team_off_rating',
        'opp_off_rating','team_rank','opp_rank','team_eff_fg_pct', 'team_fg_pct','opp_eff_fg_pct', 'opp_fg_pct']
# Use the `columns=` keyword: passing the axis positionally (`drop(labels, 1)`)
# is no longer accepted by pandas 2.0.
log_data = data.drop(columns=drops).dropna()
datacols = ['team1_assist_pct', 'team1_block_pct',
       'team1_def_reb_pct', 'team1_ft_rate',
       'team1_ft_pct', 'team1_off_reb_pct', 'team1_pts',
       'team1_stl_pct', 'team1_3pt_rate', 'team1_3pt_pct',
       'team1_reb_pct', 'team1_true_shoot_pct', 'team1_to_pct', 'team1_2pt_rate',
       'team1_2pt_pct','team2_assist_pct', 'team2_block_pct',
       'team2_def_reb_pct', 'team2_ft_rate',
       'team2_ft_pct', 'team2_off_reb_pct', 'team2_pts',
       'team2_stl_pct', 'team2_3pt_rate', 'team2_3pt_pct',
       'team2_reb_pct', 'team2_true_shoot_pct', 'team2_to_pct', 'team2_2pt_rate',
       'team2_2pt_pct','team1_pace', 'team1_win_pct', 'team1_ranked_pct', 'team1_ranked_opp_pct',
       'team1_Away', 'team1_Home']
# Positional relabel: the i-th remaining column receives the i-th name above.
# NOTE(review): this assumes the remaining columns are ordered team stats,
# opponent stats, then pace/win/ranked/home-away -- confirm against data.columns.
log_data.columns = datacols

# Matchup row restricted to the model's columns. Take an explicit copy so later
# column assignments do not write into a slice of `matchup`. (The empty
# placeholder frame that used to precede this line was dead code.)
matchup_2 = matchup[datacols].copy()

In [None]:
## Choose x y variables for linear regression and the matchup to apply model to ##

# Features are every modelling column except the two point totals, which are the targets.
x = log_data.drop(columns=['team1_pts','team2_pts'])
y = log_data['team1_pts']
y_2 = log_data['team2_pts']
# Select exactly the training features, in training order. Unlike dropping the
# two pts columns, this stays correct when the cell is re-run after the
# *_pts_est columns have been added to matchup_2.
new_x = matchup_2[x.columns]

# One estimator per target. `linreg_team2` was previously never created, so this
# cell raised NameError on a fresh kernel.
linreg_team1 = LinearRegression()
linreg_team2 = LinearRegression()
model_team1 = linreg_team1.fit(x, y)
model_team2 = linreg_team2.fit(x, y_2)
# R^2 on the same rows the models were fit on (in-sample, not a held-out score).
print(model_team1.score(x, y))
print(model_team2.score(x, y_2))

## Get point predictions for the matchup ##

y_predict_matchup = model_team1.predict(new_x)
y_predict_matchup_2 = model_team2.predict(new_x)
# assign() returns a new frame, so the estimates are never written into a
# possible view of `matchup`, and re-running the cell just overwrites them.
matchup_2 = matchup_2.assign(
    team1_pts_est=y_predict_matchup,
    team2_pts_est=y_predict_matchup_2,
)

In [None]:
## Model calculations ##

print(model_team1.intercept_)
# One row per feature, indexed by feature name. The previous call passed the
# coefficients as the index and the feature names as the data, which made the
# table impossible to look up or sort by feature.
pandas.DataFrame({'coefficient': model_team1.coef_}, index=x.columns)

In [153]:
## Matchup predictions ##

# Show only the two estimated point totals for the matchup row.
matchup_2.loc[:, ['team1_pts_est', 'team2_pts_est']]

Unnamed: 0,team1_pts_est,team2_pts_est
0,85.455909,75.265751
