In [1]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# display multiple outputs, default is 'last_expr' (last expression)
# http://ipython.readthedocs.io/en/stable/config/options/terminal.html
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

In [62]:
# Directory layout, relative to the notebook's working directory.
# Every path keeps its trailing slash because file names are appended
# to these values by plain string concatenation.
data_dir, proc_dir, out_dir = (
    './data/extracted/',   # raw input files
    './data/processed/',   # intermediate tables written by this notebook
    './output/',           # final outputs
)

## Process Tournament Results

In [13]:
# bring data in
df_tourney = pd.read_csv(data_dir + "NCAATourneyCompactResults.csv")

# these columns are used as labels, not quantities, so store them as strings
categorical = ['Season', 'DayNum', 'WTeamID', 'LTeamID', 'WLoc']
df_tourney[categorical] = df_tourney[categorical].astype(str)

# restructure data
# align team as the lower numbered team, and opponent as higher numbered team
# this is consistent with the submission file
#
# The IDs are strings at this point, so compare their numeric values: a plain
# string comparison is lexicographic ('999' > '1000') and would only be right
# while every ID happens to have the same number of digits.
winner_is_lower = (pd.to_numeric(df_tourney['WTeamID'])
                   < pd.to_numeric(df_tourney['LTeamID']))

# outcome == 1 means `team` (the lower ID) is the winner of the game
df_tourney['outcome'] = np.where(winner_is_lower, 1, 0)
df_tourney['team'] = np.where(winner_is_lower, df_tourney['WTeamID'], df_tourney['LTeamID'])
df_tourney['opponent'] = np.where(winner_is_lower, df_tourney['LTeamID'], df_tourney['WTeamID'])
df_tourney['score'] = np.where(winner_is_lower, df_tourney['WScore'], df_tourney['LScore'])
df_tourney['opponentScore'] = np.where(winner_is_lower, df_tourney['LScore'], df_tourney['WScore'])

In [56]:
# columns carried over from the oriented tournament frame
base_cols = ['Season', 'DayNum', 'WLoc',
             'NumOT', 'outcome', 'team',
             'opponent', 'score', 'opponentScore']

# first half: games exactly as oriented in df_tourney
tmp1 = df_tourney[base_cols].copy()

# reverse everything to stack data: each game is repeated from the other
# side, so the outcome is flipped and team/opponent (and their scores)
# trade labels
tmp2 = df_tourney[base_cols].copy()
tmp2['outcome'] = 1 - tmp2['outcome']
# swap the labels by name instead of by list position, so the swap stays
# correct even if base_cols is reordered or extended
swapped = {'team': 'opponent', 'opponent': 'team',
           'score': 'opponentScore', 'opponentScore': 'score'}
col_list = [swapped.get(col, col) for col in tmp2.columns]
tmp2.columns = col_list
# NOTE(review): WLoc is copied unchanged into the mirrored rows -- confirm
# that this is what downstream consumers expect.

# concatenate; pd.concat aligns on column names, so the relabelled columns
# of tmp2 line up with tmp1. drop=True discards the old row labels instead
# of materialising them as an 'index' column that has to be deleted again.
df_results = pd.concat([tmp1, tmp2]).reset_index(drop=True)
# explicit copy so that adding 'holdout' below modifies an independent frame
df_results = df_results[['Season', 'DayNum', 'team',
                         'opponent', 'outcome', 'score',
                         'opponentScore', 'NumOT', 'WLoc']].copy()

# create indicator for holdout: 1 for seasons 2014 through 2017, 0 otherwise
# (Season is stored as a string, so convert once before comparing)
season_num = pd.to_numeric(df_results['Season'])
df_results['holdout'] = np.where((season_num >= 2014) & (season_num < 2018), 1, 0)

In [61]:
# quick checks
# Invariants of the stacking step, enforced so that a broken concatenation
# stops the notebook instead of relying on someone reading the shapes:
#   * every game appears exactly twice (once per orientation)
#   * the mirrored rows flip the outcome, so wins must equal the game count
assert len(df_results) == len(tmp1) + len(tmp2), "stacked frame lost or gained rows"
assert df_results['outcome'].sum() == len(tmp1), "outcomes are not balanced after mirroring"

# visual inspection (all expressions are displayed because
# ast_node_interactivity is set to "all" in the setup cell)
tmp1.shape
tmp2.shape

df_results.head()
df_results.shape
# Season is stored as a string, hence the quoted '2014'
df_results.loc[df_results['Season'] == '2014'].head()

(2117, 9)

(2117, 9)

Unnamed: 0,Season,DayNum,team,opponent,outcome,score,opponentScore,NumOT,WLoc,holdout
0,1985,136,1116,1234,1,63,54,0,N,0
1,1985,136,1120,1345,1,59,58,0,N,0
2,1985,136,1207,1250,1,68,43,0,N,0
3,1985,136,1229,1425,1,58,55,0,N,0
4,1985,136,1242,1325,1,49,38,0,N,0


(4234, 10)

Unnamed: 0,Season,DayNum,team,opponent,outcome,score,opponentScore,NumOT,WLoc,holdout
1849,2014,134,1107,1291,1,71,64,0,N,1
1850,2014,134,1301,1462,1,74,59,0,N,1
1851,2014,135,1142,1411,1,81,69,0,N,1
1852,2014,135,1234,1397,0,65,78,1,N,1
1853,2014,136,1163,1386,1,89,81,1,N,1


In [63]:
# save to disk
# Create the target directory first: to_csv does not create missing parent
# directories, so a fresh checkout would otherwise fail here.
os.makedirs(proc_dir, exist_ok=True)
# index=False: the row index is a plain 0..n-1 counter and is not written out
df_results.to_csv(proc_dir + 'Proc_TourneyResults.csv', index=False)