In [530]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.metrics import log_loss
%matplotlib inline
pd.set_option('display.max_columns', 500)
print("Seaborn version: ", sns.__version__)

Seaborn version:  0.9.0


In [531]:
#read in cumulative team data from seasons 2013 to 2017 
#Note season indicates start of season, e.g. 2013 to 2014, the corresponding tournament season is 1 greater than this value
# The tournament season for season 2013 is 2014
summary_data = pd.read_csv('season_sum_cumulative.csv')
print(summary_data.shape)

#display rows with NaNs
summary_data[summary_data.isnull().any(axis=1)]


(1755, 19)


Unnamed: 0,market,season,team_id,points_avg,opp_pts_avg,possesion_avg,fg_pct,allow_fg_pct,off_rebs_avg,allow_off_rebs_avg,def_rebs_avg,allow_def_rebs_avg,ft_att_avg,allow_ft_att_avg,ft_pct,turnover_avg,take_away_avg,win_pct,off_rating
1749,Portland State,2013,79965018-7f6f-4273-9eef-65c99b592b01,71.433333,73.433333,,,,,,,,,,,,,0.5,
1750,St. Francis (PA),2013,6637ac5f-59c2-4888-af34-a86d138ccfb3,61.806452,69.354839,,,,,,,,,,,,,0.322581,
1751,Montana State,2013,bec40585-b587-4548-93f5-b7c00b046aea,65.206897,70.068966,,,,,,,,,,,,,0.413793,
1752,UC Riverside,2013,6b170a32-8445-4cb1-8638-724996c3094b,66.892857,72.142857,,,,,0.0,,0.0,,0.0,,,0.0,0.285714,
1753,Central Michigan,2013,472c22e0-843d-472c-a871-f1593fb0725d,69.586207,75.689655,,,,,,,,,,,,,0.275862,
1754,Incarnate Word,2013,912671c7-19fd-451b-813e-885485427820,76.8,74.533333,,,,,,,,,,,,,0.6,


In [532]:
#drop records with NaNs
summary_data.dropna(inplace=True)
print(summary_data.shape)


(1749, 19)


In [533]:
# Read table of team names and associated team meta data
teams = pd.read_csv('D1_teams.csv')
teams.drop(columns=['code_ncaa','school_ncaa','turner_name','league_name','league_alias','conf_alias',
                    'conf_id','division_name','division_alias','division_id','venue_id'], inplace=True)
teams.info()
teams.set_index('id',inplace=True)
teams.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 351 entries, 0 to 350
Data columns (total 3 columns):
id                351 non-null object
kaggle_team_id    351 non-null int64
conf_name         351 non-null object
dtypes: int64(1), object(2)
memory usage: 8.3+ KB


Unnamed: 0_level_0,kaggle_team_id,conf_name
id,Unnamed: 1_level_1,Unnamed: 2_level_1
fe406882-9f22-495e-9df6-ef357a6803c6,1343,Ivy
ca478771-aa3d-4231-81e0-b70f519134fb,1463,Ivy
5c7bf63f-bc39-43c5-9907-73b50b7a6b34,1217,Ivy
d60357bd-1205-42e9-9092-d986a2843a34,1171,Ivy
88ff8c00-958e-4ccf-a21d-77fab9e93692,1165,Ivy


In [535]:
tourney_data = pd.read_csv('tournament_results.csv')
tourney_data.drop(columns=['days_from_epoch','day','num_ot','academic_year','win_region','win_alias','lose_region',
                           'lose_alias','lose_code_ncaa','win_school_ncaa','win_code_ncaa','win_name','lose_name',
                           'win_pts','win_kaggle_team_id','lose_school_ncaa','lose_kaggle_team_id','lose_pts'],inplace=True)

tourney_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2117 entries, 0 to 2116
Data columns (total 9 columns):
season          2117 non-null int64
round           2117 non-null int64
game_date       2117 non-null object
win_seed        2117 non-null int64
win_market      2117 non-null object
win_team_id     2117 non-null object
lose_seed       2117 non-null int64
lose_market     2117 non-null object
lose_team_id    2117 non-null object
dtypes: int64(4), object(5)
memory usage: 148.9+ KB


In [536]:
tourney_data.describe()['season']

count    2117.000000
mean     2001.193198
std         9.563883
min      1985.000000
25%      1993.000000
50%      2001.000000
75%      2010.000000
max      2017.000000
Name: season, dtype: float64

Note the tournament data is available for tournament years 1985 through 2017.

In [537]:
summary_data.describe()['season']

count    1749.000000
mean     2015.006861
std         1.412173
min      2013.000000
25%      2014.000000
50%      2015.000000
75%      2016.000000
max      2017.000000
Name: season, dtype: float64

The summary season data is available from 2013 through 2017.

So the intersection of season summary data with tournament data are seasons 2013-2014, 2014-2015, 2015-2016, 2016-2017.

In [538]:
summary_data = summary_data[(summary_data['season'] >= 2013) & (summary_data['season'] < 2017) ]
summary_data.describe()['season']

count    1398.000000
mean     2014.506438
std         1.116494
min      2013.000000
25%      2014.000000
50%      2015.000000
75%      2016.000000
max      2016.000000
Name: season, dtype: float64

In [539]:
tourney_data = tourney_data[tourney_data['season'] >= 2014]
tourney_data.describe()['season']

count     268.000000
mean     2015.500000
std         1.120126
min      2014.000000
25%      2014.750000
50%      2015.500000
75%      2016.250000
max      2017.000000
Name: season, dtype: float64

In [540]:
tourney_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 268 entries, 36 to 2034
Data columns (total 9 columns):
season          268 non-null int64
round           268 non-null int64
game_date       268 non-null object
win_seed        268 non-null int64
win_market      268 non-null object
win_team_id     268 non-null object
lose_seed       268 non-null int64
lose_market     268 non-null object
lose_team_id    268 non-null object
dtypes: int64(4), object(5)
memory usage: 20.9+ KB


In [541]:
#tourn_data_2016.reset_index(drop=True,inplace=True)
#tourn_data_2016.head()
tourney_data.reset_index()
tourney_data.head()

Unnamed: 0,season,round,game_date,win_seed,win_market,win_team_id,lose_seed,lose_market,lose_team_id
36,2016,64,2016-03-18,10,Virginia Commonwealth,c1c1e6df-a383-4fbd-ba7b-32d4f9ef9518,7,Oregon State,532d3874-b4b3-4c5c-acc6-749a6db26c8f
44,2017,64,2017-03-17,11,Rhode Island,7e42bca2-3cd7-4aca-aa46-f190fe5d8eb4,6,Creighton,7d797407-623e-476d-b299-46de4275414d
46,2015,64,2015-03-19,5,Utah,0d037a5d-827a-44dd-8b70-57603d671d5d,12,Stephen F. Austin,5ef64f01-86ae-4553-9834-c79cc0859eaf
48,2016,64,2016-03-17,3,Utah,0d037a5d-827a-44dd-8b70-57603d671d5d,14,Fresno State,dd8db4d8-d984-4cab-b7f6-22c6b8c2c45f
49,2015,32,2015-03-21,5,Utah,0d037a5d-827a-44dd-8b70-57603d671d5d,4,Georgetown,8736b67f-5924-400b-bf94-3bd804c36600


In [542]:
#games = tourn_data_2016.drop(columns=['win_pts','win_kaggle_team_id','lose_school_ncaa','lose_kaggle_team_id','lose_pts'])
#copy the tournament data
# recode the tourney data so that the teams are not marked with  win/lose status and the game result is encoded as a binary
tourney_data['game_result'] = 1
tourney_data.game_result = tourney_data.game_result.astype(int)
tourney_data.rename(columns={"win_seed":"team_seed","win_market":"team","win_team_id":"team_id"}, inplace=True)
tourney_data.rename(columns={"lose_seed":"opp_team_seed","lose_market":"opp_team","lose_team_id":"opp_team_id"}, inplace=True)
tourney_data['start_season'] = tourney_data['season'] -1
tourney_data


Unnamed: 0,season,round,game_date,team_seed,team,team_id,opp_team_seed,opp_team,opp_team_id,game_result,start_season
36,2016,64,2016-03-18,10,Virginia Commonwealth,c1c1e6df-a383-4fbd-ba7b-32d4f9ef9518,7,Oregon State,532d3874-b4b3-4c5c-acc6-749a6db26c8f,1,2015
44,2017,64,2017-03-17,11,Rhode Island,7e42bca2-3cd7-4aca-aa46-f190fe5d8eb4,6,Creighton,7d797407-623e-476d-b299-46de4275414d,1,2016
46,2015,64,2015-03-19,5,Utah,0d037a5d-827a-44dd-8b70-57603d671d5d,12,Stephen F. Austin,5ef64f01-86ae-4553-9834-c79cc0859eaf,1,2014
48,2016,64,2016-03-17,3,Utah,0d037a5d-827a-44dd-8b70-57603d671d5d,14,Fresno State,dd8db4d8-d984-4cab-b7f6-22c6b8c2c45f,1,2015
49,2015,32,2015-03-21,5,Utah,0d037a5d-827a-44dd-8b70-57603d671d5d,4,Georgetown,8736b67f-5924-400b-bf94-3bd804c36600,1,2014
72,2017,64,2017-03-17,3,Baylor,db6e1cab-3fa3-4a93-a673-8b2a358ff4bf,14,New Mexico State,5016fe1a-9571-4d10-bf5b-b9c1b496bd57,1,2016
73,2017,32,2017-03-19,3,Baylor,db6e1cab-3fa3-4a93-a673-8b2a358ff4bf,11,USC,3a000455-de7c-4ca8-880e-abdce7f21da9,1,2016
80,2014,64,2014-03-21,6,Baylor,db6e1cab-3fa3-4a93-a673-8b2a358ff4bf,11,Nebraska,f8b1bf00-5b73-4ac4-98c0-ec554027ae32,1,2013
81,2014,32,2014-03-23,6,Baylor,db6e1cab-3fa3-4a93-a673-8b2a358ff4bf,3,Creighton,7d797407-623e-476d-b299-46de4275414d,1,2013
82,2014,64,2014-03-21,14,Mercer,a14b0057-8eb5-43d2-a33b-666196da933e,3,Duke,faeb1160-5d15-4f26-99fc-c441cf21fc7f,1,2013


In [543]:
# We need to recode the game datga so that we have representation of the losing game result class.
# Swap the team and opp team data on the odd rows and reverse the sense of the game result.

#create some temporary buffer columns
tourney_data['copy_team'] = tourney_data['team']
tourney_data['copy_team_seed'] = tourney_data['team_seed']
tourney_data['copy_team_id'] = tourney_data['team_id']

#swap the team and opp team data
tourney_data.loc[1::2,'team'] = tourney_data.loc[1::2,'opp_team']
tourney_data.loc[1::2,'opp_team'] = tourney_data.loc[1::2,'copy_team']
tourney_data.loc[1::2,'team_seed'] = tourney_data.loc[1::2,'opp_team_seed']
tourney_data.loc[1::2,'opp_team_seed'] = tourney_data.loc[1::2,'copy_team_seed']
tourney_data.loc[1::2,'team_id'] = tourney_data.loc[1::2,'opp_team_id']
tourney_data.loc[1::2,'opp_team_id'] = tourney_data.loc[1::2,'copy_team_id']

# flip the game result
tourney_data.loc[1::2,'game_result'] = 0

#drop the temporary columns
tourney_data.drop(columns=['copy_team','copy_team_seed','copy_team_id'],inplace=True)
tourney_data.head()

Unnamed: 0,season,round,game_date,team_seed,team,team_id,opp_team_seed,opp_team,opp_team_id,game_result,start_season
36,2016,64,2016-03-18,7,Oregon State,532d3874-b4b3-4c5c-acc6-749a6db26c8f,10,Virginia Commonwealth,c1c1e6df-a383-4fbd-ba7b-32d4f9ef9518,0,2015
44,2017,64,2017-03-17,11,Rhode Island,7e42bca2-3cd7-4aca-aa46-f190fe5d8eb4,6,Creighton,7d797407-623e-476d-b299-46de4275414d,1,2016
46,2015,64,2015-03-19,12,Stephen F. Austin,5ef64f01-86ae-4553-9834-c79cc0859eaf,5,Utah,0d037a5d-827a-44dd-8b70-57603d671d5d,0,2014
48,2016,64,2016-03-17,3,Utah,0d037a5d-827a-44dd-8b70-57603d671d5d,14,Fresno State,dd8db4d8-d984-4cab-b7f6-22c6b8c2c45f,1,2015
49,2015,32,2015-03-21,4,Georgetown,8736b67f-5924-400b-bf94-3bd804c36600,5,Utah,0d037a5d-827a-44dd-8b70-57603d671d5d,0,2014


In [544]:
#summary_2015_data.head()

Unnamed: 0,market,season,team_id,points_avg,opp_pts_avg,possesion_avg,fg_pct,allow_fg_pct,off_rebs_avg,allow_off_rebs_avg,def_rebs_avg,allow_def_rebs_avg,ft_att_avg,allow_ft_att_avg,ft_pct,turnover_avg,take_away_avg,win_pct,off_rating
20,Duke,2015,faeb1160-5d15-4f26-99fc-c441cf21fc7f,81.111111,72.472222,67.725144,0.460422,0.444742,10.222222,11.027778,22.416667,22.527778,24.055556,15.777778,0.722864,9.666667,11.666667,0.694444,119.765137
30,Indiana,2015,c3f0a8ce-af67-497f-a750-3b859376b20a,82.6,69.914286,69.527923,0.501967,0.444281,10.4,8.742857,23.971429,18.514286,19.685714,17.571429,0.731495,13.085714,12.6,0.771429,118.80119
41,North Florida,2015,09920a5f-1b25-466c-b5ae-6167214f5ba9,82.903226,80.387097,70.475216,0.480064,0.47191,7.0,10.516129,23.935484,24.612903,17.419355,17.870968,0.725926,10.774194,10.064516,0.612903,117.634582
49,Saint Mary's,2015,b18f34af-a7f1-4659-a2e5-fc11a31cd316,73.411765,61.058824,62.603797,0.504381,0.409948,7.470588,8.235294,25.058824,19.323529,15.558824,13.529412,0.688091,9.323529,9.558824,0.823529,117.264077
57,Oakland,2015,aeaaef0d-5238-414e-ac04-c55a22cba208,86.371429,78.457143,73.820948,0.458371,0.434513,10.114286,10.285714,25.885714,23.685714,26.085714,20.714286,0.769989,11.314286,12.0,0.657143,117.001245


In [545]:

tourney_data = tourney_data.merge(summary_data, left_on=['start_season','team_id'], 
                                right_on=['season', 'team_id'],how='left',suffixes=('','_y'))

tourney_data.drop(columns=['season_y'],inplace=True)
#tourney_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 268 entries, 0 to 267
Data columns (total 28 columns):
season                268 non-null int64
round                 268 non-null int64
game_date             268 non-null object
team_seed             268 non-null int64
team                  268 non-null object
team_id               268 non-null object
opp_team_seed         268 non-null int64
opp_team              268 non-null object
opp_team_id           268 non-null object
game_result           268 non-null int32
start_season          268 non-null int64
market                268 non-null object
points_avg            268 non-null float64
opp_pts_avg           268 non-null float64
possesion_avg         268 non-null float64
fg_pct                268 non-null float64
allow_fg_pct          268 non-null float64
off_rebs_avg          268 non-null float64
allow_off_rebs_avg    268 non-null float64
def_rebs_avg          268 non-null float64
allow_def_rebs_avg    268 non-null float64
ft_att_avg

In [423]:
#tourney_data.head()

Unnamed: 0,season,round,game_date,game_result,team,team_seed,team_id,opp_team,opp_team_seed,opp_team_id,start_season,market,points_avg,opp_pts_avg,possesion_avg,fg_pct,allow_fg_pct,off_rebs_avg,allow_off_rebs_avg,def_rebs_avg,allow_def_rebs_avg,ft_att_avg,allow_ft_att_avg,ft_pct,turnover_avg,take_away_avg,win_pct,off_rating
0,2016,64,2016-03-18,1,Virginia Commonwealth,10,c1c1e6df-a383-4fbd-ba7b-32d4f9ef9518,Oregon State,7,532d3874-b4b3-4c5c-acc6-749a6db26c8f,2015,Virginia Commonwealth,77.25,67.777778,69.932659,0.451296,0.440627,10.666667,8.083333,23.583333,23.138889,20.027778,20.222222,0.692094,11.333333,14.722222,0.694444,110.463411
1,2016,64,2016-03-17,0,Fresno State,14,dd8db4d8-d984-4cab-b7f6-22c6b8c2c45f,Utah,3,0d037a5d-827a-44dd-8b70-57603d671d5d,2015,Fresno State,74.454545,71.424242,69.086585,0.43208,0.423374,9.69697,9.0,23.787879,24.939394,22.787879,25.484848,0.698138,10.424242,14.606061,0.69697,107.769902
2,2016,64,2016-03-18,1,Oregon,1,1da70895-f77f-44ef-b216-d63c02e696eb,Holy Cross,16,31aedd91-a77e-46c1-8bdc-80e9860c159d,2015,Oregon,78.432432,69.027027,68.820151,0.463695,0.427294,10.081081,9.081081,22.756757,21.405405,23.675676,18.378378,0.715753,10.972973,13.945946,0.810811,113.967249
3,2016,16,2016-03-24,0,Duke,4,faeb1160-5d15-4f26-99fc-c441cf21fc7f,Oregon,1,1da70895-f77f-44ef-b216-d63c02e696eb,2015,Duke,81.111111,72.472222,67.725144,0.460422,0.444742,10.222222,11.027778,22.416667,22.527778,24.055556,15.777778,0.722864,9.666667,11.666667,0.694444,119.765137
4,2016,32,2016-03-20,1,Oregon,1,1da70895-f77f-44ef-b216-d63c02e696eb,Saint Joseph's (PA),8,f9e4261e-d11d-46c4-bd33-c7bbc94ef0e8,2015,Oregon,78.432432,69.027027,68.820151,0.463695,0.427294,10.081081,9.081081,22.756757,21.405405,23.675676,18.378378,0.715753,10.972973,13.945946,0.810811,113.967249


In [548]:
tourney_data = tourney_data.merge(summary_data, left_on=['start_season','opp_team_id'], 
                                right_on=['season', 'team_id'], how='left',suffixes=('_t','_o'))
#tourney_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 268 entries, 0 to 267
Data columns (total 47 columns):
season_t                268 non-null int64
round                   268 non-null int64
game_date               268 non-null object
team_seed               268 non-null int64
team                    268 non-null object
team_id_t               268 non-null object
opp_team_seed           268 non-null int64
opp_team                268 non-null object
opp_team_id             268 non-null object
game_result             268 non-null int32
start_season            268 non-null int64
market_t                268 non-null object
points_avg_t            268 non-null float64
opp_pts_avg_t           268 non-null float64
possesion_avg_t         268 non-null float64
fg_pct_t                268 non-null float64
allow_fg_pct_t          268 non-null float64
off_rebs_avg_t          268 non-null float64
allow_off_rebs_avg_t    268 non-null float64
def_rebs_avg_t          268 non-null float64
allow_def_reb

In [549]:
tourney_data.head()

Unnamed: 0,season_t,round,game_date,team_seed,team,team_id_t,opp_team_seed,opp_team,opp_team_id,game_result,start_season,market_t,points_avg_t,opp_pts_avg_t,possesion_avg_t,fg_pct_t,allow_fg_pct_t,off_rebs_avg_t,allow_off_rebs_avg_t,def_rebs_avg_t,allow_def_rebs_avg_t,ft_att_avg_t,allow_ft_att_avg_t,ft_pct_t,turnover_avg_t,take_away_avg_t,win_pct_t,off_rating_t,market_o,season_o,team_id_o,points_avg_o,opp_pts_avg_o,possesion_avg_o,fg_pct_o,allow_fg_pct_o,off_rebs_avg_o,allow_off_rebs_avg_o,def_rebs_avg_o,allow_def_rebs_avg_o,ft_att_avg_o,allow_ft_att_avg_o,ft_pct_o,turnover_avg_o,take_away_avg_o,win_pct_o,off_rating_o
0,2016,64,2016-03-18,7,Oregon State,532d3874-b4b3-4c5c-acc6-749a6db26c8f,10,Virginia Commonwealth,c1c1e6df-a383-4fbd-ba7b-32d4f9ef9518,0,2015,Oregon State,71.83871,70.709677,67.269882,0.438724,0.431779,8.677419,9.741935,22.193548,23.870968,21.193548,22.0,0.680365,11.064516,13.193548,0.580645,106.791788,Virginia Commonwealth,2015,c1c1e6df-a383-4fbd-ba7b-32d4f9ef9518,77.25,67.777778,69.932659,0.451296,0.440627,10.666667,8.083333,23.583333,23.138889,20.027778,20.222222,0.692094,11.333333,14.722222,0.694444,110.463411
1,2017,64,2017-03-17,11,Rhode Island,7e42bca2-3cd7-4aca-aa46-f190fe5d8eb4,6,Creighton,7d797407-623e-476d-b299-46de4275414d,1,2016,Rhode Island,73.628571,65.371429,66.839573,0.453915,0.406183,10.457143,8.514286,23.771429,22.028571,22.6,24.142857,0.667509,10.628571,12.714286,0.714286,110.157154,Creighton,2016,7d797407-623e-476d-b299-46de4275414d,81.264706,72.941176,72.844691,0.504373,0.438316,6.647059,9.5,25.029412,23.823529,16.911765,17.794118,0.683478,12.294118,13.441176,0.705882,111.558859
2,2015,64,2015-03-19,12,Stephen F. Austin,5ef64f01-86ae-4553-9834-c79cc0859eaf,5,Utah,0d037a5d-827a-44dd-8b70-57603d671d5d,0,2014,Stephen F. Austin,78.032258,65.225806,67.621122,0.487962,0.442408,10.225806,8.0,21.064516,18.064516,23.032258,24.225806,0.733894,13.451613,16.258065,0.83871,115.396277,Utah,2014,0d037a5d-827a-44dd-8b70-57603d671d5d,70.911765,57.352941,62.749043,0.477472,0.384184,8.058824,8.029412,23.764706,18.529412,21.823529,17.970588,0.704852,11.117647,11.382353,0.735294,113.00852
3,2016,64,2016-03-17,3,Utah,0d037a5d-827a-44dd-8b70-57603d671d5d,14,Fresno State,dd8db4d8-d984-4cab-b7f6-22c6b8c2c45f,1,2015,Utah,76.085714,69.371429,67.8937,0.483704,0.421273,8.171429,8.685714,25.057143,20.457143,20.942857,14.171429,0.72442,11.971429,10.228571,0.742857,112.065942,Fresno State,2015,dd8db4d8-d984-4cab-b7f6-22c6b8c2c45f,74.454545,71.424242,69.086585,0.43208,0.423374,9.69697,9.0,23.787879,24.939394,22.787879,25.484848,0.698138,10.424242,14.606061,0.69697,107.769902
4,2015,32,2015-03-21,4,Georgetown,8736b67f-5924-400b-bf94-3bd804c36600,5,Utah,0d037a5d-827a-44dd-8b70-57603d671d5d,0,2014,Georgetown,70.909091,65.242424,65.337318,0.45579,0.40676,9.787879,8.575758,22.515152,19.939394,23.30303,22.909091,0.703511,12.393939,12.909091,0.666667,108.527704,Utah,2014,0d037a5d-827a-44dd-8b70-57603d671d5d,70.911765,57.352941,62.749043,0.477472,0.384184,8.058824,8.029412,23.764706,18.529412,21.823529,17.970588,0.704852,11.117647,11.382353,0.735294,113.00852


In [550]:
numeric_feature_to_scale = ['points_avg_t','opp_pts_avg_t',
                            'off_rebs_avg_t','allow_off_rebs_avg_t',
                            'def_rebs_avg_t','allow_def_rebs_avg_t',
                            'ft_att_avg_t','allow_ft_att_avg_t',
                            'turnover_avg_t','take_away_avg_t',
                            'off_rating_t',
                            'points_avg_o','opp_pts_avg_o',
                            'off_rebs_avg_o','allow_off_rebs_avg_o',
                            'def_rebs_avg_o','allow_def_rebs_avg_o',
                            'ft_att_avg_o','allow_ft_att_avg_o',
                            'turnover_avg_o','take_away_avg_o',
                            'off_rating_o']

game_data_scale = tourney_data[numeric_feature_to_scale].copy()
scaler =StandardScaler()
scaled_features = scaler.fit_transform(game_data_scale.values)
scaled_df = pd.DataFrame(scaled_features, index=game_data_scale.index, columns=game_data_scale.columns)

# Add the categorical columns and previously scaled numerics
scaled_df['team_seed'] = tourney_data['team_seed']
scaled_df['opp_team_seed'] = tourney_data['opp_team_seed']
scaled_df['fg_pct_t'] = tourney_data['fg_pct_t']
scaled_df['allow_fg_pct_t'] = tourney_data['allow_fg_pct_t']
scaled_df['ft_pct_t'] = tourney_data['ft_pct_t']
scaled_df['win_pct_t'] = tourney_data['win_pct_t']

scaled_df['fg_pct_o'] = tourney_data['fg_pct_o']
scaled_df['allow_fg_pct_o'] = tourney_data['allow_fg_pct_o']
scaled_df['ft_pct_o'] = tourney_data['ft_pct_o']
scaled_df['win_pct_o'] = tourney_data['win_pct_o']

#scaled_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 268 entries, 0 to 267
Data columns (total 32 columns):
points_avg_t            268 non-null float64
opp_pts_avg_t           268 non-null float64
off_rebs_avg_t          268 non-null float64
allow_off_rebs_avg_t    268 non-null float64
def_rebs_avg_t          268 non-null float64
allow_def_rebs_avg_t    268 non-null float64
ft_att_avg_t            268 non-null float64
allow_ft_att_avg_t      268 non-null float64
turnover_avg_t          268 non-null float64
take_away_avg_t         268 non-null float64
off_rating_t            268 non-null float64
points_avg_o            268 non-null float64
opp_pts_avg_o           268 non-null float64
off_rebs_avg_o          268 non-null float64
allow_off_rebs_avg_o    268 non-null float64
def_rebs_avg_o          268 non-null float64
allow_def_rebs_avg_o    268 non-null float64
ft_att_avg_o            268 non-null float64
allow_ft_att_avg_o      268 non-null float64
turnover_avg_o          268 non-null flo

In [551]:
X= scaled_df
y= tourney_data['game_result']


In [552]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state= 10)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)
X_train.head()

(214, 32)
(214,)
(54, 32)
(54,)


Unnamed: 0,points_avg_t,opp_pts_avg_t,off_rebs_avg_t,allow_off_rebs_avg_t,def_rebs_avg_t,allow_def_rebs_avg_t,ft_att_avg_t,allow_ft_att_avg_t,turnover_avg_t,take_away_avg_t,off_rating_t,points_avg_o,opp_pts_avg_o,off_rebs_avg_o,allow_off_rebs_avg_o,def_rebs_avg_o,allow_def_rebs_avg_o,ft_att_avg_o,allow_ft_att_avg_o,turnover_avg_o,take_away_avg_o,off_rating_o,team_seed,opp_team_seed,fg_pct_t,allow_fg_pct_t,ft_pct_t,win_pct_t,fg_pct_o,allow_fg_pct_o,ft_pct_o,win_pct_o
105,1.605049,-0.933918,-0.672657,0.435112,3.086391,-0.095555,0.550612,-0.746471,0.112397,-0.461651,0.701734,0.279859,2.265483,-0.94162,0.05524,0.509415,0.833726,0.283646,-0.652951,1.013981,-0.721266,-0.371549,1,16,0.503726,0.364735,0.716854,0.948718,0.450136,0.452746,0.765348,0.484848
236,-0.0778,0.925317,2.146482,0.775178,0.881945,-0.211293,1.765482,1.454562,0.541553,-0.665225,-0.901421,-0.164916,-0.247129,-1.130525,0.111619,-1.074031,-0.560755,-0.885981,-1.246556,-1.572975,-1.023532,1.602681,7,2,0.431981,0.401667,0.671067,0.685714,0.476791,0.444717,0.762663,0.756757
6,0.723543,1.413867,0.014398,0.853977,-0.284372,0.703829,0.180703,-1.235282,-0.253534,-0.166523,0.495876,-0.586992,-0.466716,0.887218,-0.176622,-0.342542,-1.848764,-0.837081,-0.56799,1.227589,-0.876612,-0.491109,11,3,0.45582,0.433794,0.740788,0.722222,0.46642,0.405561,0.71875,0.764706
55,0.308047,0.836344,0.843833,1.297569,-0.479102,-0.445911,0.724968,0.29048,1.340831,0.893627,-0.603287,-0.31901,0.132313,0.756565,1.617345,-1.507999,0.468907,0.841887,-0.193556,0.099742,0.898709,0.149234,8,9,0.473251,0.439037,0.647135,0.69697,0.414159,0.441696,0.665272,0.727273
189,-1.84693,-2.946305,0.014398,-1.754249,0.059434,-1.389986,-1.285311,-1.43155,-1.534988,-1.177206,-0.249921,-0.242242,0.812036,-1.05164,0.276578,-0.404681,-0.143467,-1.264483,-0.781661,1.558012,0.065237,-0.400895,2,15,0.457338,0.366902,0.726384,0.882353,0.474168,0.450191,0.691622,0.65625


In [553]:
logreg = LogisticRegression()

result = logreg.fit(X_train,y_train)

print("Coeffs ",logreg.coef_)
print("Intercept ", logreg.intercept_)

Coeffs  [[ 0.47445728  0.10057322 -0.00797054  0.20854324  0.11969116 -0.21373387
  -0.21835742 -0.28837125 -0.66766631  0.29244177 -0.63015907  0.27415409
   0.05404842 -0.37405637  0.01774592 -0.21059831  0.41388498  0.36928837
  -0.0495347   0.13536983 -0.43513301 -0.250198   -0.13790963  0.15948768
  -0.02807795  0.04304409  0.03913496 -0.02548998  0.01834588  0.05230875
   0.1340552  -0.39767799]]
Intercept  [0.00288434]


In [554]:
y_pred = logreg.predict(X_test)
print("LR model accuracy is %2.2f" % metrics.accuracy_score(y_test, y_pred))

LR model accuracy is 0.69


In [555]:
print("Log loss= ",log_loss(y_test,logreg.predict_proba(X_test)))

Log loss=  0.668671842544527
