In [None]:
 ## Import packages and initial data ##

from sklearn.model_selection import train_test_split
import pandas
import numpy
import warnings
from sklearn import preprocessing
from sklearn import metrics
from sklearn.linear_model import LinearRegression

# NOTE: this silences every warning for the rest of the session, including
# pandas/scikit-learn deprecation warnings.
warnings.filterwarnings('ignore')

# Allow wide/long frames to be displayed in full.
for display_option in ('display.max_columns', 'display.max_rows'):
    pandas.set_option(display_option, 500)

# Box scores: only the last 47 columns of the file are kept.
data = pandas.read_csv('ncaab_box_scores.csv')
data = data[data.columns[-47:]]

# Team clusters: keep the columns at positions 1, 86 and 87 and rename them.
# NOTE(review): positions are hard-coded; confirm they still line up with the CSV.
cluster_data = pandas.read_csv('college_hoops_team_clusters.csv')
cluster_data = cluster_data.iloc[:, [1, 86, 87]]
cluster_data.columns = ['opponent', 'opp_major', 'opp_cluster']

# New columns are appended in this order (ranked, ranked_opp); game_date and
# game_pace are replaced in place, so the overall column order is unchanged.
data = data.assign(
    ranked=numpy.where(data['team_rank'] > 0, 1, 0),        # 1 when team_rank > 0
    ranked_opp=numpy.where(data['opp_rank'] > 0, 1, 0),     # 1 when opp_rank > 0
    game_date=pandas.to_datetime(data['game_date']),
    game_pace=data['game_pace'] / 100,
)

# One-hot encode the home/away indicator; the dummy columns land at the end.
data = pandas.get_dummies(data, columns=['team_home_away'], prefix=['team_home_away'])

## Transform and order columns ##

# Columns removed before the remaining ones are renamed by position below.
drops = ['unique_game','team_def_rating','opp_def_rating','team_off_rating',
        'opp_off_rating','team_rank','opp_rank','team_eff_fg_pct', 'team_fg_pct','opp_eff_fg_pct', 'opp_fg_pct']

# Use the `columns=` keyword: passing the axis positionally (`drop(drops, 1)`)
# was deprecated in pandas 1.3 and raises a TypeError from pandas 2.0 on.
# Rows with any missing value are discarded.
data_transform = data.drop(columns=drops).dropna()

# Recency rank per team: 1 = that team's most recent game (by game_date).
# The result is aligned back onto data_transform by index.
data_transform['r'] = (
    data_transform.sort_values('game_date', ascending=False)
    .groupby(['team'])
    .cumcount() + 1
)

# The rename below is purely POSITIONAL: the i-th remaining column receives the
# i-th name. It raises if the number of columns differs from len(datacols) (40).
# NOTE(review): assumes the CSV column order matches this list - confirm when
# the input file changes.
datacols = ['team','team1_assist_pct', 'team1_block_pct',
       'team1_def_reb_pct', 'team1_ft_rate',
       'team1_ft_pct', 'team1_off_reb_pct', 'team1_pts',
       'team1_stl_pct', 'team1_3pt_rate', 'team1_3pt_pct',
       'team1_reb_pct', 'team1_true_shoot_pct', 'team1_to_pct', 'team1_2pt_rate',
       'team1_2pt_pct','opponent','team2_assist_pct', 'team2_block_pct',
       'team2_def_reb_pct', 'team2_ft_rate',
       'team2_ft_pct', 'team2_off_reb_pct', 'team2_pts',
       'team2_stl_pct', 'team2_3pt_rate', 'team2_3pt_pct',
       'team2_reb_pct', 'team2_true_shoot_pct', 'team2_to_pct', 'team2_2pt_rate',
       'team2_2pt_pct','team1_pace', 'team1_win_pct','game_date', 'team1_ranked_pct', 'team1_ranked_opp_pct',
       'team1_Away', 'team1_Home','r']
data_transform.columns = datacols

## Recent-form features: per-team averages over the rows with r < 6 ##

# (name of the per-row "recent" column, source column it averages)
recent_feature_sources = [
    ('true_shoot_recent', 'team1_true_shoot_pct'),
    ('team_reb_recent', 'team1_reb_pct'),
    ('pace_recent', 'team1_pace'),
    ('win_recent', 'team1_win_pct'),
    ('pts_recent', 'team1_pts'),
]

# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of data_transform (chained assignment).
data_transform_recent = data_transform[data_transform['r'] < 6].copy()

for recent_col, source_col in recent_feature_sources:
    # Per-team mean of the source column, broadcast back to every recent row.
    data_transform_recent[recent_col] = (
        data_transform_recent.groupby('team')[source_col].transform('mean')
    )

    # Collapse to one value per team. The value is constant within a team, so
    # the min of the absolute values is simply |team mean|.
    recent_avg = (
        data_transform_recent[recent_col].abs()
        .groupby(data_transform_recent['team'])
        .min()
        .rename(recent_col + '_avg')
        .to_frame()
    )

    # Inner merge on the team name (index level of recent_avg); appends
    # `<recent_col>_avg` as a new column on every game row of that team.
    data_transform = data_transform.merge(recent_avg, on='team')

# Attach the opponent's conference flag and cluster; opponents missing from
# cluster_data keep their rows (left join) with NaN in the new columns.
data_transform = pandas.merge(data_transform, cluster_data, how='left', on='opponent')
cluster_data

In [None]:
## Matchups to predict on ##
## Despite its name, matchup_dict is a list of 8-tuples (one per game), built
## from a (home side, away side) pair of 4-tuples.
## Each side is: team name, 1 for home / 0 for away, 0-2 for opp conf, 1-8 for opp cluster
_matchup_sides = [
    (('VILLANOVA', 1, 1, 4), ('PROVIDENCE', 0, 1, 6)),
    (('TEXAS-TECH', 1, 2, 6), ('TEXAS', 0, 2, 1)),
    (('IOWA', 1, 2, 1), ('PENN-STATE', 0, 2, 1)),
    (('KANSAS-STATE', 1, 2, 1), ('KANSAS', 0, 2, 7)),
    (('CLEMSON', 1, 2, 1), ('FLORIDA-STATE', 0, 2, 6)),
    (('TEXAS-CHRISTIAN', 1, 2, 1), ('BAYLOR', 0, 2, 7)),
    (('MARQUETTE', 1, 1, 1), ('SETON-HALL', 0, 1, 1)),
    (('WEST-VIRGINIA', 1, 2, 6), ('OKLAHOMA', 0, 2, 1)),
    (('VIRGINIA', 1, 2, 1), ('DUKE', 0, 2, 6)),
    (('PEPPERDINE', 1, 1, 3), ('BRIGHAM-YOUNG', 0, 1, 5)),
    (('MARYLAND', 1, 2, 1), ('MICHIGAN-STATE', 0, 2, 1)),
    (('KENTUCKY', 1, 2, 1), ('AUBURN', 0, 2, 1)),
    (('NEVADA', 1, 1, 6), ('SAN-DIEGO-STATE', 0, 1, 3)),
    (('GONZAGA', 1, 1, 3), ('SAINT-MARYS-CA', 0, 1, 3)),
]

# Flatten each (home, away) pair into the 8-field tuple the matchup loop unpacks.
matchup_dict = [home_side + away_side for home_side, away_side in _matchup_sides]

## Loop through list to get matchups in correct form ##

def _matchup_side_profile(team_name, is_home, opp_conf, opp_cluster, label):
    """Return a one-row frame of a team's average numeric stats in comparable games.

    Comparable games are the rows of ``data_transform`` where ``team`` equals
    ``team_name``, ``team1_Home`` equals ``is_home``, ``opp_major`` equals
    ``opp_conf`` and ``opp_cluster`` equals ``opp_cluster``. The team name is
    appended as a final column called ``label``. When no row matches, every
    average is NaN.
    """
    comparable_games = data_transform[
        (data_transform['team'] == team_name)
        & (data_transform['team1_Home'] == is_home)
        & (data_transform['opp_major'] == opp_conf)
        & (data_transform['opp_cluster'] == opp_cluster)
    ]
    # numeric_only=True: the frame also holds string (team, opponent) and
    # datetime (game_date) columns; pandas >= 2.0 raises on them instead of
    # silently skipping them as older versions did.
    profile = comparable_games.mean(numeric_only=True).to_frame().transpose()
    profile[label] = team_name
    return profile


# Collect the per-matchup rows and concatenate once, instead of growing a
# DataFrame inside the loop.
matchup_rows = []
for teamname1, home1, oppconf1, oppcluster1, teamname2, home2, oppconf2, oppcluster2 in matchup_dict:
    team1 = _matchup_side_profile(teamname1, home1, oppconf1, oppcluster1, 'team1')
    team2 = _matchup_side_profile(teamname2, home2, oppconf2, oppcluster2, 'team2')
    matchup = pandas.concat([team1, team2], axis=1)
    matchup_rows.append(matchup)
matchup_concat = pandas.concat(matchup_rows, axis=0)

# The selection below is positional and addresses columns 0..89, i.e. it
# expects 45 columns per side (44 numeric averages + the team label).
# Fail loudly if the layout differs rather than silently picking wrong columns.
expected_width = 90
if matchup_concat.shape[1] != expected_width:
    raise ValueError(
        'matchup frame has %d columns, expected %d; the positional column '
        'selection no longer lines up' % (matchup_concat.shape[1], expected_width))

# Per side: 15 box-score stats, pace + win pct, Away/Home, 5 recent averages
# and the team label (team2's block is offset by 45).
matchup_concat = matchup_concat.iloc[:,numpy.r_[0:15,30:32,34:36,37:42,44:60,75:77,79:81,82:87,89:90]]

# Column names for the matchup frame. Both sides share the same field layout,
# so the names are generated per side: '<side>_<field>' for every field,
# followed by the bare side label ('team1' / 'team2') that holds the team name.
_matchup_side_fields = [
    # box-score stats
    'assist_pct', 'block_pct', 'def_reb_pct', 'ft_rate', 'ft_pct',
    'off_reb_pct', 'pts', 'stl_pct', '3pt_rate', '3pt_pct',
    'reb_pct', 'true_shoot_pct', 'to_pct', '2pt_rate', '2pt_pct',
    # pace / win pct and venue dummies
    'pace', 'win_pct', 'Away', 'Home',
    # recent-form averages
    'true_shoot_recent_avg', 'team_reb_recent_avg', 'pace_recent_avg',
    'win_recent_avg', 'pts_recent_avg',
]
matchup_concat_cols = [
    column_name
    for side in ('team1', 'team2')
    for column_name in [side + '_' + field for field in _matchup_side_fields] + [side]
]
# Positional rename of the matchup frame (lengths must match).
matchup_concat.columns = matchup_concat_cols

# Per-team box-score stats that appear once with a 'team1_' prefix and once
# with a 'team2_' prefix in the regression frame.
_box_score_stats = [
    'assist_pct', 'block_pct', 'def_reb_pct', 'ft_rate', 'ft_pct',
    'off_reb_pct', 'pts', 'stl_pct', '3pt_rate', '3pt_pct',
    'reb_pct', 'true_shoot_pct', 'to_pct', '2pt_rate', '2pt_pct',
]

# Columns of data_transform kept for regression, in this order:
# team1 stats, team1 context columns, team2 stats, recent-form averages.
regcols = (
    ['team1_' + stat for stat in _box_score_stats]
    + ['team1_pace', 'team1_win_pct', 'team1_ranked_pct', 'team1_ranked_opp_pct',
       'team1_Away', 'team1_Home']
    + ['team2_' + stat for stat in _box_score_stats]
    + ['true_shoot_recent_avg', 'team_reb_recent_avg', 'pace_recent_avg',
       'win_recent_avg', 'pts_recent_avg']
)

regdata = data_transform[regcols]

In [None]:
## Testing correlations and multicolinearity to determine which features to keep for LinReg ##
# data_transform.to_csv('feature_eng.csv')

# Correlate numeric/bool columns only: data_transform also carries string
# (team, opponent) and datetime (game_date) columns, which DataFrame.corr()
# rejects on pandas >= 2.0. `columns=` replaces the positional axis argument
# (`drop(label, 1)`), which pandas 2.0 no longer accepts.
numeric_features = (
    data_transform.drop(columns='team1_win_pct')
    .select_dtypes(include=['number', 'bool'])
)
cor = numeric_features.corr()

# Correlation of every remaining feature with team1_pts, strongest positive first.
cor_target = cor['team1_pts']
relevant_features = pandas.DataFrame(cor_target).sort_values('team1_pts', ascending=False)
print(relevant_features)

In [80]:
## Prepare LinReg variables ##

# Features used by both models, named as in the training frame (regdata).
feature_cols = ['pace_recent_avg', 'team1_assist_pct', 'team1_ft_rate', 'team1_Home', 'team1_pace',
                'team1_reb_pct', 'team1_stl_pct', 'team1_to_pct', 'team1_true_shoot_pct',
                'team1_win_pct', 'true_shoot_recent_avg']

# The matchup frame stores the same features with a 'team1_' prefix on every
# column, so the two un-prefixed recent-form names gain the prefix there.
matchup_feature_cols = [col if col.startswith('team1_') else 'team1_' + col
                        for col in feature_cols]

x = regdata[feature_cols]
y = regdata['team1_pts']
y_2 = regdata['team2_pts']

# Rename the matchup columns to the names seen during fit: scikit-learn checks
# DataFrame feature names at predict time and rejects mismatches in recent
# versions (the original passed 'team1_pace_recent_avg' /
# 'team1_true_shoot_recent_avg' against 'pace_recent_avg' / 'true_shoot_recent_avg').
new_x = matchup_concat[matchup_feature_cols].rename(
    columns=dict(zip(matchup_feature_cols, feature_cols)))

linreg_team1 = LinearRegression()
linreg_team2 = LinearRegression()

## Fit the models ##

# Both models share the same inputs; they differ only in the target
# (team1_pts vs team2_pts).
model_team1 = linreg_team1.fit(x, y)
model_team2 = linreg_team2.fit(x, y_2)

## Get point predictions for the matchup ##

# Predictions are rounded to whole points. astype(int) raises if a prediction
# is NaN, and predict itself raises if new_x contains NaN.
y_predict_matchup = model_team1.predict(new_x)
matchup_concat['team1_pts_est'] = numpy.round(y_predict_matchup).astype(int)
y_predict_matchup_2 = model_team2.predict(new_x)
matchup_concat['team2_pts_est'] = numpy.round(y_predict_matchup_2).astype(int)

# R^2 of the team1 model on the same rows it was fitted on (in-sample score).
# NOTE(review): several features look like same-game box-score stats, which
# would inflate this number - confirm before relying on it.
print(model_team1.score(x,y))
matchup_concat[['team1','team1_pts_est','team2','team2_pts_est']]

0.862340757875038


Unnamed: 0,team1,team1_pts_est,team2,team2_pts_est
0,VILLANOVA,75,PROVIDENCE,64
0,TEXAS-TECH,76,TEXAS,57
0,IOWA,76,PENN-STATE,64
0,KANSAS-STATE,71,KANSAS,75
0,CLEMSON,79,FLORIDA-STATE,66
0,TEXAS-CHRISTIAN,61,BAYLOR,61
0,MARQUETTE,87,SETON-HALL,66
0,WEST-VIRGINIA,90,OKLAHOMA,57
0,VIRGINIA,55,DUKE,56
0,PEPPERDINE,76,BRIGHAM-YOUNG,82
