In [31]:
## Import packages and initial data ##

from sklearn.model_selection import train_test_split
import pandas
import numpy
import warnings
from sklearn import preprocessing
from sklearn import metrics
from sklearn.linear_model import LinearRegression

# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')

# Let displayed frames show up to 500 columns and 500 rows.
for display_option in ('display.max_columns', 'display.max_rows'):
    pandas.set_option(display_option, 500)

# Load the box scores and keep only the last 47 columns of the file.
data = pandas.read_csv('ncaab_box_scores.csv')
keep_cols = data.columns[-47:]
data = data[keep_cols]

# 0/1 flags: 1 where the corresponding rank column is greater than 0, else 0.
for flag_col, rank_col in (('ranked', 'team_rank'), ('ranked_opp', 'opp_rank')):
    data[flag_col] = numpy.where(data[rank_col] > 0, 1, 0)

# Parse the game date strings into datetimes.
data['game_date'] = pandas.to_datetime(data['game_date'])

# Replace the home/away column with one indicator column per distinct value,
# each named 'team_home_away_<value>'.
venue_col = 'team_home_away'
data = pandas.get_dummies(data, prefix=[venue_col], columns=[venue_col])

## Transform and order columns ##

# Columns excluded from the modelling frame.
drops = ['unique_game', 'opponent', 'team_def_rating', 'opp_def_rating', 'team_off_rating',
         'opp_off_rating', 'team_rank', 'opp_rank', 'team_eff_fg_pct', 'team_fg_pct',
         'opp_eff_fg_pct', 'opp_fg_pct']

# Name the axis through `columns=`: the positional form drop(labels, 1) was
# deprecated in pandas 1.3 and raises TypeError on pandas 2.x.
# Rows with any missing value are then discarded.
data_transform = data.drop(columns=drops).dropna()

# Assigning `.columns` renames purely by POSITION, so this list must contain
# exactly one name per remaining column, in the frame's existing order
# (a length mismatch raises ValueError).
# NOTE(review): presumably the last four names land on the 'ranked' /
# 'ranked_opp' flags and the two home/away indicator columns appended during
# loading -- confirm against the CSV layout.
datacols = ['team', 'team1_assist_pct', 'team1_block_pct',
            'team1_def_reb_pct', 'team1_ft_rate',
            'team1_ft_pct', 'team1_off_reb_pct', 'team1_pts',
            'team1_stl_pct', 'team1_3pt_rate', 'team1_3pt_pct',
            'team1_reb_pct', 'team1_true_shoot_pct', 'team1_to_pct', 'team1_2pt_rate',
            'team1_2pt_pct', 'team2_assist_pct', 'team2_block_pct',
            'team2_def_reb_pct', 'team2_ft_rate',
            'team2_ft_pct', 'team2_off_reb_pct', 'team2_pts',
            'team2_stl_pct', 'team2_3pt_rate', 'team2_3pt_pct',
            'team2_reb_pct', 'team2_true_shoot_pct', 'team2_to_pct', 'team2_2pt_rate',
            'team2_2pt_pct', 'team1_pace', 'team1_win_pct', 'game_date',
            'team1_ranked_pct', 'team1_ranked_opp_pct',
            'team1_Away', 'team1_Home']
data_transform.columns = datacols

## Create list of matchups to predict on ##
## Order of each tuple: home team name, 1 for home, 1 for ranked opp,
## away team name, 0 for away, 1 for ranked opp ##
# TODO: add a date filter here now that game_date is carried in data_transform.
# NOTE: despite its name, matchup_dict is a list of 6-tuples, not a dict.
matchup_dict = [('PENN-STATE', 1, 0, 'ILLINOIS', 0, 1),
                ('BAYLOR', 1, 1, 'KANSAS', 0, 1),
                ('NORTH-CAROLINA-STATE', 1, 1, 'DUKE', 0, 0)]

# A team's own box-score columns as they are named in data_transform.
own_stat_cols = ['team1_assist_pct', 'team1_block_pct',
                 'team1_def_reb_pct', 'team1_ft_rate',
                 'team1_ft_pct', 'team1_off_reb_pct', 'team1_pts',
                 'team1_stl_pct', 'team1_3pt_rate', 'team1_3pt_pct',
                 'team1_reb_pct', 'team1_true_shoot_pct', 'team1_to_pct',
                 'team1_2pt_rate', 'team1_2pt_pct']
# Extra per-team columns that are kept for the first team of a matchup only.
team1_context_cols = ['team1_pace', 'team1_win_pct', 'team1_ranked_pct',
                      'team1_ranked_opp_pct', 'team1_Away', 'team1_Home']
# Final column layout of the matchup table: first team's stats and context,
# its name, the second team's own stats under 'team2_' names, then its name.
matchup_concat_cols = (own_stat_cols + team1_context_cols + ['team1']
                       + [col.replace('team1_', 'team2_', 1) for col in own_stat_cols]
                       + ['team2'])


def _team_profile(team_name, is_home, ranked_opp):
    """Average the numeric columns of data_transform over one team's games.

    Only rows where 'team' equals team_name, 'team1_Home' equals is_home and
    'team1_ranked_opp_pct' equals ranked_opp are averaged.

    Returns a Series indexed by column name. Every value is NaN when no row
    matches the three conditions.
    """
    selected = data_transform[(data_transform['team'] == team_name)
                              & (data_transform['team1_Home'] == is_home)
                              & (data_transform['team1_ranked_opp_pct'] == ranked_opp)]
    # numeric_only=True skips the string 'team' and datetime 'game_date'
    # columns explicitly; a bare .mean() raises TypeError on them in pandas 2.x.
    return selected.mean(numeric_only=True)


## Loop through list to get matchups in correct form ##

matchup_rows = []
for teamname1, home1, rankopp1, teamname2, home2, rankopp2 in matchup_dict:
    profile1 = _team_profile(teamname1, home1, rankopp1)
    profile2 = _team_profile(teamname2, home2, rankopp2)

    # Pick values by column name rather than by position in a concatenated
    # frame, so a change in the upstream column set cannot silently shift them.
    row = {col: profile1[col] for col in own_stat_cols + team1_context_cols}
    row['team1'] = teamname1
    # The second team's OWN averages are stored under the 'team2_' names.
    row.update({col.replace('team1_', 'team2_', 1): profile2[col]
                for col in own_stat_cols})
    row['team2'] = teamname2
    matchup_rows.append(row)

# Build the table once from the collected rows. Rows get a fresh 0..n-1 index
# instead of every row carrying the same label 0.
matchup_concat = pandas.DataFrame(matchup_rows, columns=matchup_concat_cols)

# Columns of the regression frame, in order: team1 box-score stats, team1
# context columns, then the same box-score stats for team2.
_box_stats = ['assist_pct', 'block_pct', 'def_reb_pct', 'ft_rate', 'ft_pct',
              'off_reb_pct', 'pts', 'stl_pct', '3pt_rate', '3pt_pct',
              'reb_pct', 'true_shoot_pct', 'to_pct', '2pt_rate', '2pt_pct']

regcols = ['team1_' + stat for stat in _box_stats]
regcols += ['team1_pace', 'team1_win_pct', 'team1_ranked_pct',
            'team1_ranked_opp_pct', 'team1_Away', 'team1_Home']
regcols += ['team2_' + stat for stat in _box_stats]

# Select those columns (all rows) from the transformed game data.
regdata = data_transform.loc[:, regcols]

## Prepare LinReg variables ##

# Columns that are not model inputs: the two targets plus team1's win
# percentage, ranking and home/away columns.
non_feature_cols = ['team1_pts', 'team2_pts', 'team1_win_pct', 'team1_ranked_pct',
                    'team1_ranked_opp_pct', 'team1_Away', 'team1_Home']

# Use `columns=` instead of the positional axis argument: drop(labels, 1) was
# deprecated in pandas 1.3 and raises TypeError on pandas 2.x.
x = regdata.drop(columns=non_feature_cols)
y = regdata['team1_pts']
y_2 = regdata['team2_pts']

# Select the matchup features by the training column names, so the prediction
# frame always has the same columns in the same order as the training frame.
new_x = matchup_concat[x.columns]

# One independent model per target (team1 points, team2 points).
linreg_team1 = LinearRegression()
linreg_team2 = LinearRegression()

## Fit the models ##

# fit() returns the estimator itself, so model_teamN is the same object as
# linreg_teamN.
model_team1 = linreg_team1.fit(x, y)
model_team2 = linreg_team2.fit(x, y_2)

## Get point predictions for the matchup ##

y_predict_matchup = model_team1.predict(new_x)
matchup_concat['team1_pts_est'] = y_predict_matchup
y_predict_matchup_2 = model_team2.predict(new_x)
matchup_concat['team2_pts_est'] = y_predict_matchup_2

# Bare expression: displays the table when run as the last line of a notebook cell.
matchup_concat

Unnamed: 0,team1_assist_pct,team1_block_pct,team1_def_reb_pct,team1_ft_rate,team1_ft_pct,team1_off_reb_pct,team1_pts,team1_stl_pct,team1_3pt_rate,team1_3pt_pct,team1_reb_pct,team1_true_shoot_pct,team1_to_pct,team1_2pt_rate,team1_2pt_pct,team1_pace,team1_win_pct,team1_ranked_pct,team1_ranked_opp_pct,team1_Away,team1_Home,team1,team2_assist_pct,team2_block_pct,team2_def_reb_pct,team2_ft_rate,team2_ft_pct,team2_off_reb_pct,team2_pts,team2_stl_pct,team2_3pt_rate,team2_3pt_pct,team2_reb_pct,team2_true_shoot_pct,team2_to_pct,team2_2pt_rate,team2_2pt_pct,team2,team1_pts_est,team2_pts_est
0,0.544923,0.147308,0.783615,0.319385,0.690769,0.293077,77.076923,0.121231,0.395154,0.296538,0.540615,0.534692,0.113846,0.604846,0.549077,73.215385,0.846154,0.461538,0.0,0.0,1.0,PENN-STATE,0.49775,0.0385,0.69325,0.26325,0.7015,0.27125,62.0,0.06175,0.326,0.2495,0.47325,0.46575,0.14,0.674,0.46025,ILLINOIS,83.350379,78.127792
0,0.56925,0.16075,0.70725,0.296,0.64975,0.33175,68.25,0.10725,0.3805,0.346,0.51725,0.51925,0.1275,0.6195,0.476,68.7,1.0,1.0,1.0,0.0,1.0,BAYLOR,0.516,0.148333,0.702667,0.389667,0.560667,0.247667,59.666667,0.143333,0.242333,0.320333,0.508667,0.505667,0.166667,0.757667,0.493,KANSAS,78.515722,59.195538
0,0.3,0.114,0.708,0.279,0.706,0.278,57.0,0.1,0.377,0.217,0.45,0.413,0.11,0.623,0.395,70.2,0.0,0.0,1.0,0.0,1.0,NORTH-CAROLINA-STATE,0.508625,0.09575,0.7295,0.371,0.731625,0.327625,82.75,0.1065,0.301875,0.32625,0.53075,0.56875,0.12125,0.698125,0.554,DUKE,56.351571,84.446635
