In [132]:
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt

In [133]:
detailed_season_df = pd.read_csv("../data//Kaggle-Data/MDataFiles_Stage1/MRegularSeasonDetailedResults.csv")
detailed_season_df.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF
0,2003,10,1104,68,1328,62,N,0,27,58,...,10,16,22,10,22,8,18,9,2,20
1,2003,10,1272,70,1393,63,N,0,26,62,...,24,9,20,20,25,7,12,8,6,16
2,2003,11,1266,73,1437,61,N,0,24,58,...,26,14,23,31,22,9,12,2,5,23
3,2003,11,1296,56,1457,50,N,0,18,38,...,22,8,15,17,20,9,19,4,3,23
4,2003,11,1400,77,1208,71,N,0,30,61,...,16,17,27,21,15,12,10,7,1,14


In [134]:
detailed_season_df.columns

Index(['Season', 'DayNum', 'WTeamID', 'WScore', 'LTeamID', 'LScore', 'WLoc',
       'NumOT', 'WFGM', 'WFGA', 'WFGM3', 'WFGA3', 'WFTM', 'WFTA', 'WOR', 'WDR',
       'WAst', 'WTO', 'WStl', 'WBlk', 'WPF', 'LFGM', 'LFGA', 'LFGM3', 'LFGA3',
       'LFTM', 'LFTA', 'LOR', 'LDR', 'LAst', 'LTO', 'LStl', 'LBlk', 'LPF'],
      dtype='object')

### Calculation of Advanced Metrics per Game

In [135]:
advanved_detailed_season_df = detailed_season_df.copy()

# Drop DayNum and NumOT as irrelevant
advanved_detailed_season_df = advanved_detailed_season_df.drop(columns = ["DayNum", "NumOT"])

# Effective Field Goal Percentage
# Winning Team: WFGM + (0.5 * WFGM3) / WFGA
advanved_detailed_season_df['WEFGP'] = (advanved_detailed_season_df['WFGM'] + (0.5 * advanved_detailed_season_df['WFGM3'])) / advanved_detailed_season_df['WFGA']
# Losing Team: LFGM + (0.5 * LFGM3) / LFGA
advanved_detailed_season_df['LEFGP'] = (advanved_detailed_season_df['LFGM'] + (0.5 * advanved_detailed_season_df['LFGM3'])) / advanved_detailed_season_df['LFGA']

#Turnover Percentage
# Winning Team: WTO / (WFGA + (0.44 * WFTA) + WTO)
advanved_detailed_season_df['WTP'] = advanved_detailed_season_df['WTO'] / (advanved_detailed_season_df['WFGA'] + (0.44 * advanved_detailed_season_df['WFTA']) + advanved_detailed_season_df['WTO']) 
# Losing Team: LTO / (LFGA + (0.44 * LFTA) + LTO)
advanved_detailed_season_df['LTP'] = advanved_detailed_season_df['LTO'] / (advanved_detailed_season_df['LFGA'] + (0.44 * advanved_detailed_season_df['LFTA']) + advanved_detailed_season_df['LTO'])

# Offensive Rebound Percentage
# Winning Team: WOR / WOR + LDR
advanved_detailed_season_df['WORP'] = advanved_detailed_season_df['WOR'] / (advanved_detailed_season_df['WOR'] + advanved_detailed_season_df['LDR'])
#Losing Team: LOR / LOR + WDR
advanved_detailed_season_df['LORP'] = advanved_detailed_season_df['LOR'] / (advanved_detailed_season_df['LOR'] + advanved_detailed_season_df['WDR'])

# Defensive Rebound Percentage
# Winning Team: WDR / WDR + LOR
advanved_detailed_season_df['WDRP'] = advanved_detailed_season_df['WDR'] / (advanved_detailed_season_df['WDR'] + advanved_detailed_season_df['LOR'])
#Losing Team: LDR / LDR + WOR
advanved_detailed_season_df['LDRP'] = advanved_detailed_season_df['LDR'] / (advanved_detailed_season_df['LDR'] + advanved_detailed_season_df['WOR'])

# Rebound Percentage
advanved_detailed_season_df['WRP'] = (advanved_detailed_season_df['WDR'] + advanved_detailed_season_df['WOR']) / (advanved_detailed_season_df['WDR'] + advanved_detailed_season_df['WOR'] + advanved_detailed_season_df['LDR'] + advanved_detailed_season_df['LOR'])
advanved_detailed_season_df['LRP'] = (advanved_detailed_season_df['LDR'] + advanved_detailed_season_df['LOR']) / (advanved_detailed_season_df['WDR'] + advanved_detailed_season_df['WOR'] + advanved_detailed_season_df['LDR'] + advanved_detailed_season_df['LOR'])

# Field Goal Percentage
# Winning Team: WFGM / WFGA 
advanved_detailed_season_df['WFGPct'] = advanved_detailed_season_df['WFGM'] / advanved_detailed_season_df['WFGA']
# Losing Team: LFGM / LFGA 
advanved_detailed_season_df['LFGPct'] = advanved_detailed_season_df['LFGM'] / advanved_detailed_season_df['LFGA']

# 3 Point Percentage
# Winning Team: WFGM3 / WFGA3 
advanved_detailed_season_df['WFG3Pct'] = advanved_detailed_season_df['WFGM3'] / advanved_detailed_season_df['WFGA3']
# Losing Team: LFGM3 / LFGA3
advanved_detailed_season_df['LFG3Pct'] = advanved_detailed_season_df['LFGM3'] / advanved_detailed_season_df['LFGA3']

# Free Throw Rate
# Winning Team: WFTM / WFGA
advanved_detailed_season_df['WFTR'] = advanved_detailed_season_df['WFTM'] / advanved_detailed_season_df['WFGA'] 
# Losing Team: LFTM / LFGA
advanved_detailed_season_df['LFTR'] = advanved_detailed_season_df['LFTM'] / advanved_detailed_season_df['LFGA']

# Free Throw Attempt Rate
advanved_detailed_season_df['WFTAR'] = advanved_detailed_season_df['WFTA'] / advanved_detailed_season_df['WFGA'] 
advanved_detailed_season_df['LFTAR'] = advanved_detailed_season_df['LFTA'] / advanved_detailed_season_df['LFGA'] 

# True Shooting Percentage
# Winning Team: 100 * WScore / 2(WFGA + 0.44*WFTA)
advanved_detailed_season_df['WTSP'] = 100 * advanved_detailed_season_df['WScore'] / 2*(advanved_detailed_season_df['WFGA'] + 0.44 * advanved_detailed_season_df['WFTA'])
# Losing Team: 100 * LScore / 2(LFGA + 0.44*LFTA)
advanved_detailed_season_df['LTSP'] = 100 * advanved_detailed_season_df['LScore'] / 2*(advanved_detailed_season_df['LFGA'] + 0.44 * advanved_detailed_season_df['LFTA'])

# Possessions
# Winning Team: (WFGA - WOR) + WTO + (.436 x WFTA) 
advanved_detailed_season_df['WPos'] = advanved_detailed_season_df['WFGA'] -advanved_detailed_season_df['WOR'] + advanved_detailed_season_df['WTO'] + 0.436 * advanved_detailed_season_df['WFTA']
# Losing Team: (LFGA - LOR) + LTO + (.436 x LFTA) 
advanved_detailed_season_df['LPos'] = advanved_detailed_season_df['LFGA'] -advanved_detailed_season_df['LOR'] + advanved_detailed_season_df['LTO'] + 0.436 * advanved_detailed_season_df['LFTA']

# Offensive Points per Possession
# Winning Team: WScore / WPos
advanved_detailed_season_df['WOffPtsPos'] = advanved_detailed_season_df['WScore'] / advanved_detailed_season_df['WPos']
# Losing Team: LScore / LPos
advanved_detailed_season_df['LOffPtsPos'] = advanved_detailed_season_df['LScore'] / advanved_detailed_season_df['LPos']

# Defensive Points per Possession
# Winning Team: Losing Team's Offensive Points per Possession
advanved_detailed_season_df['WDefPtsPos'] = advanved_detailed_season_df['LOffPtsPos']
# Losing Team: Winning Team's Offensive Points per Possession
advanved_detailed_season_df['LDefPtsPos'] = advanved_detailed_season_df['WOffPtsPos']

# Net Points per 2 Possession
advanved_detailed_season_df['WNetPtsPos'] = advanved_detailed_season_df['WOffPtsPos'] - advanved_detailed_season_df['WDefPtsPos']
advanved_detailed_season_df['LNetPtsPos'] = advanved_detailed_season_df['LOffPtsPos'] - advanved_detailed_season_df['LDefPtsPos']

# Assist per Possession
advanved_detailed_season_df['WAstP'] = advanved_detailed_season_df['WAst'] /  advanved_detailed_season_df['WPos']
advanved_detailed_season_df['LAstP'] = advanved_detailed_season_df['LAst'] /  advanved_detailed_season_df['LPos']


advanved_detailed_season_df.head()

Unnamed: 0,Season,WTeamID,WScore,LTeamID,LScore,WLoc,WFGM,WFGA,WFGM3,WFGA3,...,WPos,LPos,WOffPtsPos,LOffPtsPos,WDefPtsPos,LDefPtsPos,WNetPtsPos,LNetPtsPos,WAstP,LAstP
0,2003,1104,68,1328,62,N,27,58,3,14,...,74.848,70.592,0.908508,0.878286,0.878286,0.908508,0.030221,-0.030221,0.173685,0.113327
1,2003,1272,70,1393,63,N,26,62,8,20,...,68.284,67.72,1.02513,0.930301,0.930301,1.02513,0.094829,-0.094829,0.234316,0.103367
2,2003,1266,73,1437,61,N,24,58,8,18,...,63.644,64.028,1.147005,0.952708,0.952708,1.147005,0.194297,-0.194297,0.235686,0.140564
3,2003,1296,56,1457,50,N,18,38,3,9,...,57.516,57.54,0.973642,0.868961,0.868961,0.973642,0.104681,-0.104681,0.191251,0.156413
4,2003,1400,77,1208,71,N,30,61,6,14,...,63.668,62.772,1.209399,1.131078,1.131078,1.209399,0.078321,-0.078321,0.188478,0.191168


### Calculation of Advanced Metrics by Season

In [136]:
# Determining which columns pertain to winners and losers
W_cols = []
L_cols = []

for c in advanved_detailed_season_df.columns:
    if c[0] == "W": #Column pertains to winner
        W_cols.append(c)
    elif c[0] == "L": # Column pertains to loser
        L_cols.append(c)
    else: # Column applies to both
        W_cols.append(c)
        L_cols.append(c)

In [137]:
winners_season_totals_df = advanved_detailed_season_df[W_cols].groupby(["Season","WTeamID"]).sum().reset_index()
winners_season_totals_df.head()


Unnamed: 0,Season,WTeamID,WScore,WFGM,WFGA,WFGM3,WFGA3,WFTM,WFTA,WOR,...,WFGPct,WFG3Pct,WFTR,WFTAR,WTSP,WPos,WOffPtsPos,WDefPtsPos,WNetPtsPos,WAstP
0,2003,1102,825,271,480,120,259,163,249,46,...,6.815206,5.398589,4.270526,6.500567,2030728.0,675.564,14.717831,11.245398,3.472432,3.655099
1,2003,1103,1141,390,720,71,187,290,402,122,...,7.053325,4.716761,5.224313,7.208349,3978162.0,936.272,15.86497,14.029778,1.835191,3.201642
2,2003,1104,1270,439,992,120,354,272,383,230,...,7.554681,5.906111,4.728497,6.672295,4354016.0,1150.988,18.779597,15.343111,3.436486,3.521438
3,2003,1105,556,179,433,64,157,134,180,102,...,2.894502,2.741719,2.202557,2.977323,2048104.0,535.48,7.264406,6.072738,1.191667,1.454221
4,2003,1106,888,322,700,76,207,168,270,166,...,6.00307,4.985266,3.225302,5.145364,2834226.0,881.72,13.092858,10.962341,2.130517,2.4939


In [138]:
losers_season_totals_df = advanved_detailed_season_df[L_cols].groupby(["Season","LTeamID"]).sum().reset_index()
losers_season_totals_df.head()


Unnamed: 0,Season,LTeamID,LScore,LFGM,LFGA,LFGM3,LFGA3,LFTM,LFTA,LOR,...,LFGPct,LFG3Pct,LFTR,LFTAR,LTSP,LPos,LOffPtsPos,LDefPtsPos,LNetPtsPos,LAstP
0,2003,1102,778,265,634,99,324,149,230,71,...,6.796967,4.895253,3.889684,6.006825,1790344.0,850.28,14.692165,18.237504,-3.54534,3.042051
1,2003,1103,986,343,788,76,247,224,296,142,...,6.103607,4.24697,4.036028,5.350286,3333574.0,953.056,14.415529,16.201768,-1.786239,2.659924
2,2003,1104,670,234,609,58,202,144,203,150,...,4.196258,3.206261,2.659445,3.753494,2151588.0,697.508,10.525695,12.399883,-1.874189,1.595748
3,2003,1105,1310,455,1169,133,383,267,388,249,...,7.406804,6.608656,4.443809,6.369713,4669836.0,1448.168,17.188943,20.515484,-3.326541,3.517597
4,2003,1106,893,334,848,95,287,130,191,178,...,5.91176,4.820217,2.367428,3.46639,2779964.0,1000.276,13.385059,15.83104,-2.445981,2.37817


### Calculating Num Wins & Losses per Team by Year

In [139]:
# Read in data
compact_season_df = pd.read_csv("../data//Kaggle-Data/MDataFiles_Stage1/MRegularSeasonCompactResults.csv")
# Filter to Season > 2003 (min year for deatailed daya)
compact_season_df = compact_season_df[compact_season_df["Season"] >= 2003].reset_index(drop=True)
compact_season_df.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
0,2003,10,1104,68,1328,62,N,0
1,2003,10,1272,70,1393,63,N,0
2,2003,11,1266,73,1437,61,N,0
3,2003,11,1296,56,1457,50,N,0
4,2003,11,1400,77,1208,71,N,0


In [140]:
# Group by Season and WTeamID, count rows, replace NA's w/ 0's for undefeated teams
wins_per_season_df = compact_season_df.groupby(["Season","WTeamID"])["DayNum"].count().reset_index().fillna(0)
wins_per_season_df.rename(columns = {"DayNum": "NumWins"}, inplace = True)

wins_per_season_df.head()

Unnamed: 0,Season,WTeamID,NumWins
0,2003,1102,12
1,2003,1103,13
2,2003,1104,17
3,2003,1105,7
4,2003,1106,13


In [141]:
# Group by Season and LTeamID, count rows, replace NA's w/ 0's for teams without win
losses_per_season_df = compact_season_df.groupby(["Season","LTeamID"])["DayNum"].count().reset_index().fillna(0)
losses_per_season_df.rename(columns = {"DayNum": "NumLosses"}, inplace = True)

losses_per_season_df.head()

Unnamed: 0,Season,LTeamID,NumLosses
0,2003,1102,16
1,2003,1103,14
2,2003,1104,11
3,2003,1105,19
4,2003,1106,15


### Computing Weighted avg. Advanced Metrics per Team by Season

In [142]:
combined_winners_df = wins_per_season_df.merge(winners_season_totals_df, on=["Season","WTeamID"])
combined_winners_df.head()

Unnamed: 0,Season,WTeamID,NumWins,WScore,WFGM,WFGA,WFGM3,WFGA3,WFTM,WFTA,...,WFGPct,WFG3Pct,WFTR,WFTAR,WTSP,WPos,WOffPtsPos,WDefPtsPos,WNetPtsPos,WAstP
0,2003,1102,12,825,271,480,120,259,163,249,...,6.815206,5.398589,4.270526,6.500567,2030728.0,675.564,14.717831,11.245398,3.472432,3.655099
1,2003,1103,13,1141,390,720,71,187,290,402,...,7.053325,4.716761,5.224313,7.208349,3978162.0,936.272,15.86497,14.029778,1.835191,3.201642
2,2003,1104,17,1270,439,992,120,354,272,383,...,7.554681,5.906111,4.728497,6.672295,4354016.0,1150.988,18.779597,15.343111,3.436486,3.521438
3,2003,1105,7,556,179,433,64,157,134,180,...,2.894502,2.741719,2.202557,2.977323,2048104.0,535.48,7.264406,6.072738,1.191667,1.454221
4,2003,1106,13,888,322,700,76,207,168,270,...,6.00307,4.985266,3.225302,5.145364,2834226.0,881.72,13.092858,10.962341,2.130517,2.4939


In [143]:
combined_losers_df = losses_per_season_df.merge(losers_season_totals_df, on=["Season","LTeamID"])
combined_losers_df.head()

Unnamed: 0,Season,LTeamID,NumLosses,LScore,LFGM,LFGA,LFGM3,LFGA3,LFTM,LFTA,...,LFGPct,LFG3Pct,LFTR,LFTAR,LTSP,LPos,LOffPtsPos,LDefPtsPos,LNetPtsPos,LAstP
0,2003,1102,16,778,265,634,99,324,149,230,...,6.796967,4.895253,3.889684,6.006825,1790344.0,850.28,14.692165,18.237504,-3.54534,3.042051
1,2003,1103,14,986,343,788,76,247,224,296,...,6.103607,4.24697,4.036028,5.350286,3333574.0,953.056,14.415529,16.201768,-1.786239,2.659924
2,2003,1104,11,670,234,609,58,202,144,203,...,4.196258,3.206261,2.659445,3.753494,2151588.0,697.508,10.525695,12.399883,-1.874189,1.595748
3,2003,1105,19,1310,455,1169,133,383,267,388,...,7.406804,6.608656,4.443809,6.369713,4669836.0,1448.168,17.188943,20.515484,-3.326541,3.517597
4,2003,1106,15,893,334,848,95,287,130,191,...,5.91176,4.820217,2.367428,3.46639,2779964.0,1000.276,13.385059,15.83104,-2.445981,2.37817


In [144]:
combined_season_df = combined_winners_df.merge(combined_losers_df, left_on=["Season", "WTeamID"], right_on=["Season", "LTeamID"])
combined_season_df.head()


Unnamed: 0,Season,WTeamID,NumWins,WScore,WFGM,WFGA,WFGM3,WFGA3,WFTM,WFTA,...,LFGPct,LFG3Pct,LFTR,LFTAR,LTSP,LPos,LOffPtsPos,LDefPtsPos,LNetPtsPos,LAstP
0,2003,1102,12,825,271,480,120,259,163,249,...,6.796967,4.895253,3.889684,6.006825,1790344.0,850.28,14.692165,18.237504,-3.54534,3.042051
1,2003,1103,13,1141,390,720,71,187,290,402,...,6.103607,4.24697,4.036028,5.350286,3333574.0,953.056,14.415529,16.201768,-1.786239,2.659924
2,2003,1104,17,1270,439,992,120,354,272,383,...,4.196258,3.206261,2.659445,3.753494,2151588.0,697.508,10.525695,12.399883,-1.874189,1.595748
3,2003,1105,7,556,179,433,64,157,134,180,...,7.406804,6.608656,4.443809,6.369713,4669836.0,1448.168,17.188943,20.515484,-3.326541,3.517597
4,2003,1106,13,888,322,700,76,207,168,270,...,5.91176,4.820217,2.367428,3.46639,2779964.0,1000.276,13.385059,15.83104,-2.445981,2.37817


In [145]:
# Make sure both DataFrames have same number of columns
assert(len(combined_winners_df.columns) == len(combined_losers_df.columns))

# Create new DataFrame to store weighted results
weighted_season_df = pd.DataFrame()

# Iterate through features, 
for idx in range(len(combined_winners_df.columns)): 
    
    win_col_name = combined_winners_df.columns[idx]
    
    # Set new columnn name removing W/L prefix
    if win_col_name[0] == "W":
        new_col_name = "avg_" + win_col_name[1:]
    else:
        new_col_name =  win_col_name
        
    # Take weighted avg of stats from wins and losses
    weighted_season_df[new_col_name] = (combined_season_df[combined_winners_df.columns[idx]] + combined_season_df[combined_losers_df.columns[idx]]) / (combined_season_df["NumWins"] + combined_season_df["NumLosses"]) 


weighted_season_df.rename(columns = {"avg_TeamID": "TeamID"}, inplace = True)
weighted_season_df["Season"] = combined_season_df["Season"]
weighted_season_df["TeamID"] = combined_season_df["WTeamID"]
weighted_season_df["NumLosses"] = combined_season_df["NumLosses"]
weighted_season_df["NumWins"] = combined_season_df["NumWins"]
weighted_season_df["NumLosses"] = combined_season_df["NumLosses"]

weighted_season_df.head()


Unnamed: 0,Season,TeamID,NumWins,avg_Score,avg_FGM,avg_FGA,avg_FGM3,avg_FGA3,avg_FTM,avg_FTA,...,avg_FG3Pct,avg_FTR,avg_FTAR,avg_TSP,avg_Pos,avg_OffPtsPos,avg_DefPtsPos,avg_NetPtsPos,avg_AstP,NumLosses
0,2003,1102,12,57.25,19.142857,39.785714,7.821429,20.821429,11.142857,17.107143,...,0.367637,0.291436,0.446693,136466.857143,54.494429,1.050357,1.052961,-0.002604,0.239184,16
1,2003,1103,13,78.777778,27.148148,55.851852,5.444444,16.074074,19.037037,25.851852,...,0.33199,0.342976,0.465135,270805.037037,69.975111,1.1215,1.119687,0.001813,0.217095,14
2,2003,1104,17,69.285714,24.035714,57.178571,6.357143,19.857143,14.857143,20.928571,...,0.325442,0.263855,0.37235,232343.0,66.017714,1.046618,0.990821,0.055796,0.182757,11
3,2003,1105,7,71.769231,24.384615,61.615385,7.576923,20.769231,15.423077,21.846154,...,0.35963,0.255629,0.359501,258382.307692,76.294154,0.940513,1.022624,-0.082111,0.191224,19
4,2003,1106,13,63.607143,23.428571,55.285714,6.107143,17.642857,10.642857,16.464286,...,0.350196,0.19974,0.307563,200506.785714,67.214143,0.94564,0.956906,-0.011267,0.174003,15


### Computing Avg. of Massey Ordinals

In [146]:
# Read in Massey Ordinal Data
massey_ordinals = pd.read_csv("../data//Kaggle-Data/MDataFiles_Stage1/MMasseyOrdinals.csv")

# Only include rankings of teams right before tournament starts
massey_ordinals = massey_ordinals[massey_ordinals['RankingDayNum'] == 133]

# Group by Team and Season, calculate average rank across all systems
avg_ordinal_ranking = massey_ordinals.groupby(['Season','TeamID'])['OrdinalRank'].mean().reset_index()

avg_ordinal_ranking.head()

Unnamed: 0,Season,TeamID,OrdinalRank
0,2003,1102,156.03125
1,2003,1103,168.0
2,2003,1104,38.03125
3,2003,1105,308.96875
4,2003,1106,262.6875


## Adding Seeds to Data

In [227]:
tournament_seeds = pd.read_csv('../data//Kaggle-Data/MDataFiles_Stage1/MNCAATourneySeeds.csv')
tournament_seeds.head()

Unnamed: 0,Season,Seed,TeamID
0,1985,W01,1207
1,1985,W02,1210
2,1985,W03,1228
3,1985,W04,1260
4,1985,W05,1374


In [228]:
def clean_seed_col(cell):
    return(int(cell[1:3]))

tournament_seeds['Seed'] = tournament_seeds['Seed'].map(clean_seed_col)

tournament_seeds.head()

Unnamed: 0,Season,Seed,TeamID
0,1985,1,1207
1,1985,2,1210
2,1985,3,1228
3,1985,4,1260
4,1985,5,1374


# Combining Tournament Matches w/ Regular Season Data

In [147]:
compact_tournament_df = pd.read_csv("../data//Kaggle-Data/MDataFiles_Stage1/MNCAATourneyCompactResults.csv")
compact_tournament_df = compact_tournament_df[compact_tournament_df["Season"] >= 2003].reset_index(drop=True)
compact_tournament_df.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
0,2003,134,1421,92,1411,84,N,1
1,2003,136,1112,80,1436,51,N,0
2,2003,136,1113,84,1272,71,N,0
3,2003,136,1141,79,1166,73,N,0
4,2003,136,1143,76,1301,74,N,1


In [148]:
num_games = len(compact_tournament_df.index)
random_selection = np.random.choice(a=[0,1], size=num_games)

ATeamID = np.empty(num_games)
BTeamID = np.empty(num_games)
AScore = np.empty(num_games)
BScore = np.empty(num_games)
Winner_label = np.empty(num_games)

for game_num in range(num_games):
    # Randomly choose number
    random_scramble = np.random.choice([0,1])
    # Number = 1: Team A won, Number = 0: Team B won
    Winner_label[game_num] = random_scramble
    
    if random_scramble == 1:
        ATeamID[game_num] = compact_tournament_df['WTeamID'][game_num]
        AScore[game_num] = compact_tournament_df['WScore'][game_num]
        
        BTeamID[game_num] = compact_tournament_df['LTeamID'][game_num]
        BScore[game_num] = compact_tournament_df['LScore'][game_num]
    else:
        ATeamID[game_num] = compact_tournament_df['LTeamID'][game_num]
        AScore[game_num] = compact_tournament_df['LScore'][game_num]
        
        BTeamID[game_num] = compact_tournament_df['WTeamID'][game_num]
        BScore[game_num] = compact_tournament_df['WScore'][game_num]

In [257]:
scrambled_tournament_df = pd.DataFrame({"Season":compact_tournament_df['Season'],
                                        "DayNum":compact_tournament_df['DayNum'],
                                        "WinnerLabel":Winner_label,
                                        "ATeamID":ATeamID, 
                                        "AScore":AScore, 
                                        "BTeamID":BTeamID, 
                                        "BScore":BScore,
                                        "NumOT": compact_tournament_df['NumOT']})

scrambled_tournament_df.head()
                                                                                                                                                                        

Unnamed: 0,Season,DayNum,WinnerLabel,ATeamID,AScore,BTeamID,BScore,NumOT
0,2003,134,0.0,1411.0,84.0,1421.0,92.0,1
1,2003,136,1.0,1112.0,80.0,1436.0,51.0,0
2,2003,136,0.0,1272.0,71.0,1113.0,84.0,0
3,2003,136,1.0,1141.0,79.0,1166.0,73.0,0
4,2003,136,1.0,1143.0,76.0,1301.0,74.0,1


In [268]:
A_stats = weighted_season_df.add_prefix('A_')
B_stats = weighted_season_df.add_prefix('B_')

A_ordinal_ranking = avg_ordinal_ranking.rename(columns={"Season":"A_Season", "OrdinalRank":"A_OrdinalRank"})
B_ordinal_ranking = avg_ordinal_ranking.rename(columns={"Season":"B_Season", "OrdinalRank":"B_OrdinalRank"})

A_seeds = tournament_seeds.add_prefix("A_")
B_seeds = tournament_seeds.add_prefix("B_")

full_joined_df = scrambled_tournament_df.merge(A_stats, how="left", 
                              left_on=["Season","ATeamID"], 
                              right_on = ["A_Season","A_TeamID"])

full_joined_df = full_joined_df.merge(B_stats, how="left", 
                              left_on=["Season","BTeamID"], 
                              right_on = ["B_Season","B_TeamID"])

full_joined_df = full_joined_df.merge(A_ordinal_ranking, how="left", 
                                      left_on=["Season","ATeamID"], 
                                      right_on=["A_Season","TeamID"])

full_joined_df = full_joined_df.merge(B_ordinal_ranking, how="left", 
                                      left_on=["Season","BTeamID"], 
                                      right_on=["B_Season","TeamID"])

full_joined_df = full_joined_df.merge(A_seeds, how="left",
                                     left_on=["Season","ATeamID"],
                                     right_on=["A_Season","A_TeamID"])

full_joined_df = full_joined_df.merge(B_seeds, how="left",
                                     left_on=["Season","BTeamID"],
                                     right_on=["B_Season","B_TeamID"])



# Caclulating Winning Percentage for both teams
full_joined_df['A_WinPct'] = full_joined_df['A_NumWins'] / (full_joined_df['A_NumWins'] + full_joined_df['A_NumLosses'])
full_joined_df['B_WinPct'] = full_joined_df['B_NumWins'] / (full_joined_df['B_NumWins'] + full_joined_df['B_NumLosses'])


# # Number of Possessions = (avg_FGA-avg_OR) + avg_TO + (.436 x avg_FTA)
# full_joined_df['A_avg_POS'] = (full_joined_df['A_avg_FGA'] - full_joined_df['A_avg_OR']) + full_joined_df['A_avg_TO'] + (0.436 * full_joined_df['A_avg_FTA'])
# full_joined_df['B_avg_POS'] = (full_joined_df['B_avg_FGA'] - full_joined_df['B_avg_OR']) + full_joined_df['B_avg_TO'] + (0.436 * full_joined_df['B_avg_FTA'])

# # Calculating points per possession
# full_joined_df['A_avg_PerPos'] = full_joined_df['A_avg_Score'] / full_joined_df['A_avg_POS']
# full_joined_df['B_avg_PerPos'] = full_joined_df['B_avg_Score'] / full_joined_df['B_avg_POS']

full_joined_df = full_joined_df.dropna()
full_joined_df = full_joined_df.reset_index(drop=True)

# Remove duplicate, redundant columns
unique_cols = full_joined_df.head().T.drop_duplicates().T.columns
full_joined_df = full_joined_df[unique_cols]

full_joined_df.head()


Unnamed: 0,Season,DayNum,WinnerLabel,ATeamID,AScore,BTeamID,BScore,NumOT,A_NumWins,A_avg_Score,...,B_avg_DefPtsPos,B_avg_NetPtsPos,B_avg_AstP,B_NumLosses,A_OrdinalRank,B_OrdinalRank,A_Seed,B_Seed,A_WinPct,B_WinPct
0,2003,134,0.0,1411.0,84.0,1421.0,92.0,1,18.0,72.8,...,1.122992,-0.099691,0.188002,16.0,239.28125,240.34375,16,16,0.6,0.448276
1,2003,136,1.0,1112.0,80.0,1436.0,51.0,0,25.0,85.214286,...,0.967096,0.071297,0.217935,10.0,2.676471,153.125,1,16,0.892857,0.655172
2,2003,136,0.0,1272.0,71.0,1113.0,84.0,0,23.0,74.517241,...,0.999149,0.1098,0.227179,11.0,21.705882,36.0,7,10,0.793103,0.62069
3,2003,136,1.0,1141.0,79.0,1166.0,73.0,0,23.0,79.344828,...,0.939856,0.211275,0.243801,4.0,45.6875,20.735294,11,6,0.793103,0.878788
4,2003,136,1.0,1143.0,76.0,1301.0,74.0,1,21.0,74.482759,...,1.023209,0.061666,0.218855,12.0,36.40625,50.3125,8,9,0.724138,0.6


In [269]:
drop_cols = ['Season','DayNum','WinnerLabel','ATeamID','AScore','BTeamID','BScore','NumOT']
feature_df = full_joined_df.drop(columns=drop_cols,axis=1)
feature_df.head()

Unnamed: 0,A_NumWins,A_avg_Score,A_avg_FGM,A_avg_FGA,A_avg_FGM3,A_avg_FGA3,A_avg_FTM,A_avg_FTA,A_avg_OR,A_avg_DR,...,B_avg_DefPtsPos,B_avg_NetPtsPos,B_avg_AstP,B_NumLosses,A_OrdinalRank,B_OrdinalRank,A_Seed,B_Seed,A_WinPct,B_WinPct
0,18.0,72.8,24.733333,55.266667,5.933333,18.5,17.4,28.066667,13.166667,24.8,...,1.122992,-0.099691,0.188002,16.0,239.28125,240.34375,16,16,0.6,0.448276
1,25.0,85.214286,30.321429,65.714286,7.035714,20.071429,17.535714,25.0,15.178571,27.642857,...,0.967096,0.071297,0.217935,10.0,2.676471,153.125,1,16,0.892857,0.655172
2,23.0,74.517241,26.275862,60.0,7.0,20.068966,14.965517,22.896552,14.068966,25.965517,...,0.999149,0.1098,0.227179,11.0,21.705882,36.0,7,10,0.793103,0.62069
3,23.0,79.344828,26.62069,52.689655,6.827586,17.931034,19.275862,25.172414,10.586207,23.275862,...,0.939856,0.211275,0.243801,4.0,45.6875,20.735294,11,6,0.793103,0.878788
4,21.0,74.482759,27.344828,58.724138,6.413793,17.034483,13.37931,19.517241,11.241379,24.37931,...,1.023209,0.061666,0.218855,12.0,36.40625,50.3125,8,9,0.724138,0.6


In [270]:
feature_difference_df = pd.DataFrame()
feature_difference_df['NumWins_diff'] = feature_df['A_NumWins'] - feature_df['B_NumWins']
feature_difference_df['NumLosses_diff'] = feature_df['A_NumLosses'] - feature_df['B_NumLosses']
feature_difference_df['WinPct_diff'] = feature_df['A_WinPct'] - feature_df['B_WinPct']
feature_difference_df['PPG_diff'] = feature_df['A_avg_Score'] - feature_df['B_avg_Score']
feature_difference_df['FGM_diff'] = feature_df['A_avg_FGM'] - feature_df['B_avg_FGM']
feature_difference_df['FGA_diff'] = feature_df['A_avg_FGA'] - feature_df['B_avg_FGA']
feature_difference_df['FGM3_diff'] = feature_df['A_avg_FGM3'] - feature_df['B_avg_FGM3']
feature_difference_df['FGA3_diff'] = feature_df['A_avg_FGA3'] - feature_df['B_avg_FGA3']
feature_difference_df['FTM_diff'] = feature_df['A_avg_FTM'] - feature_df['B_avg_FTM']
feature_difference_df['FTA_diff'] = feature_df['A_avg_FTA'] - feature_df['B_avg_FTA']
feature_difference_df['OR_diff'] = feature_df['A_avg_OR'] - feature_df['B_avg_OR']
feature_difference_df['DR_diff'] = feature_df['A_avg_DR'] - feature_df['B_avg_DR']
feature_difference_df['Ast_diff'] = feature_df['A_avg_Ast'] - feature_df['B_avg_Ast']
feature_difference_df['TO_diff'] = feature_df['A_avg_TO'] - feature_df['B_avg_TO']
feature_difference_df['Stl_diff'] = feature_df['A_avg_Stl'] - feature_df['B_avg_Stl']
feature_difference_df['Blk_diff'] = feature_df['A_avg_Blk'] - feature_df['B_avg_Blk']
feature_difference_df['PF_diff'] = feature_df['A_avg_PF'] - feature_df['B_avg_PF']
feature_difference_df['FGPct_diff'] = feature_df['A_avg_FGPct'] - feature_df['B_avg_FGPct']
feature_difference_df['FG3Pct_diff'] = feature_df['A_avg_FG3Pct'] - feature_df['B_avg_FG3Pct']
feature_difference_df['EFGP_diff'] = feature_df['A_avg_EFGP'] - feature_df['B_avg_EFGP']
feature_difference_df['TP_diff'] = feature_df['A_avg_TP'] - feature_df['B_avg_TP']
feature_difference_df['ORP_diff'] = feature_df['A_avg_ORP'] - feature_df['B_avg_ORP']
feature_difference_df['DRP_diff'] = feature_df['A_avg_DRP'] - feature_df['B_avg_DRP']
feature_difference_df['RP_diff'] = feature_df['A_avg_RP'] - feature_df['B_avg_RP']
feature_difference_df['FTR_diff'] = feature_df['A_avg_FTR'] - feature_df['B_avg_FTR']
feature_difference_df['FTAR_diff'] = feature_df['A_avg_FTAR'] - feature_df['B_avg_FTAR']
feature_difference_df['TSP_diff'] = feature_df['A_avg_TSP'] - feature_df['B_avg_TSP']
feature_difference_df['Pos_diff'] = feature_df['A_avg_Pos'] - feature_df['B_avg_Pos']
feature_difference_df['OffPtsPos_diff'] = feature_df['A_avg_OffPtsPos'] - feature_df['B_avg_OffPtsPos']
feature_difference_df['DefPtsPos_diff'] = feature_df['A_avg_DefPtsPos'] - feature_df['B_avg_DefPtsPos']
feature_difference_df['NetPtsPos_diff'] = feature_df['A_avg_NetPtsPos'] - feature_df['B_avg_NetPtsPos']
feature_difference_df['AstP_diff'] = feature_df['A_avg_AstP'] - feature_df['B_avg_AstP']
feature_difference_df['OrdinalRank_diff'] = feature_df['A_OrdinalRank'] - feature_df['B_OrdinalRank']
feature_difference_df['Seed_diff'] = feature_df['A_Seed'] - feature_df['B_Seed']




feature_difference_df.head()


Unnamed: 0,NumWins_diff,NumLosses_diff,WinPct_diff,PPG_diff,FGM_diff,FGA_diff,FGM3_diff,FGA3_diff,FTM_diff,FTA_diff,...,FTR_diff,FTAR_diff,TSP_diff,Pos_diff,OffPtsPos_diff,DefPtsPos_diff,NetPtsPos_diff,AstP_diff,OrdinalRank_diff,Seed_diff
0,5.0,-4.0,0.151724,1.593103,0.354023,-1.526437,-0.549425,0.5,1.434483,7.135632,...,0.042472,0.152277,11761.181609,-0.279669,0.022172,-0.12167,0.143843,0.014735,-1.0625,0
1,6.0,-7.0,0.237685,17.421182,5.493842,9.852217,1.759852,4.58867,4.673645,5.448276,...,0.03713,0.031691,107964.332512,10.73136,0.082223,-0.040683,0.122906,0.013217,-150.448529,-15
2,5.0,-5.0,0.172414,-1.448276,-0.931034,3.103448,3.0,7.482759,-2.586207,-3.310345,...,-0.05397,-0.071937,400.413793,1.073931,-0.040473,-0.053821,0.013348,0.011253,-14.294118,-3
3,-6.0,2.0,-0.085684,0.102403,-2.07628,-4.76489,-1.142111,-2.553814,5.397074,5.142111,...,0.124461,0.127689,-9497.456635,2.647394,-0.040235,0.086365,-0.1266,-0.025458,24.952206,5
4,3.0,-4.0,0.124138,2.082759,3.011494,5.390805,-1.552874,-5.465517,-2.387356,-0.949425,...,-0.057522,-0.039185,23262.262069,3.441223,-0.020934,-0.026102,0.005169,0.008755,-13.90625,-1


## Scaling Data

In [297]:
scaler = preprocessing.StandardScaler().fit(feature_difference_df)
scaled_feature_diff_df = scaler.transform(feature_difference_df)

In [296]:
x_train, x_test, y_train, y_test = train_test_split(scaled_feature_diff_df, full_joined_df['WinnerLabel'], test_size=0.1)


svm = SVC()
svm.fit(x_train, y_train)

y_pred = svm.predict(x_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.66      0.75      0.70        52
         1.0       0.78      0.69      0.73        65

    accuracy                           0.72       117
   macro avg       0.72      0.72      0.72       117
weighted avg       0.72      0.72      0.72       117



# Training & Testing Models

In [46]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

from sklearn.naive_bayes import GaussianNB

from sklearn.tree import DecisionTreeClassifier

from sklearn.neighbors import KNeighborsClassifier

from sklearn import preprocessing

from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA


In [168]:
# Helpful Functions

def get_bet_on_team(row):
    if row['Prediction'] == 1:
        return row['ATeamID']
    else:
        return row['BTeamID']
    
def calc_net_payout(moneyline, amount_bet = 1):
    if moneyline < 0:
        return -amount_bet*(100/moneyline)
    else:
        return amount_bet*(moneyline/100)
    
def test_model(model, features, targets):
    x_train, x_test, y_train, y_test = train_test_split(features, targets, test_size=0.2)

    model.fit(x_train, y_train)
    
    y_pred = model.predict(x_test)
    
    test_bet_output = full_joined_df.iloc[y_test.index][['Season', 'DayNum', 'ATeamID', 'BTeamID','WinnerLabel']]
    test_bet_output['Prediction'] = y_pred
    test_bet_output['Correct?'] = test_bet_output['WinnerLabel'] == test_bet_output['Prediction']
    test_bet_output['BetOnTeamID'] = test_bet_output.apply(get_bet_on_team, axis=1)
    
    test_bet_full_df = test_bet_output.merge(moneyline_df, how="left",  
                                         left_on=["Season","DayNum","BetOnTeamID"], right_on=["Season","DayNum","TeamID"])[['Season','BetOnTeamID','Correct?','Moneyline']]

    test_bet_full_df = test_bet_full_df.dropna()
    
    initial_investment = len(test_bet_full_df.index)
    running_balanace = initial_investment

    for idx, row in test_bet_full_df.iterrows():
        if row["Correct?"]:
            running_balanace += calc_net_payout(row["Moneyline"])
        else:
            running_balanace -= 1

    ROI = (running_balanace - initial_investment)/initial_investment
    accuracy = sum(y_pred == y_test)/len(y_pred)
    return ROI, accuracy
    

In [171]:
# Loading betting data
moneyline_df = pd.read_csv("../data/Pre-Processed-Data/CleanedMoneylineData.csv",index_col=0)
moneyline_df.head()

Unnamed: 0,Season,DayNum,TeamID,Moneyline
0,2008,0,1263,200.0
1,2008,0,1350,-240.0
2,2008,0,1404,13000.0
3,2008,0,1272,-39000.0
4,2008,1,1205,-160.0


# Preprocessing

In [207]:
scaler = preprocessing.StandardScaler().fit(feature_df)
scaled_feature_df = scaler.transform(feature_df)

# LR Model

In [249]:
num_trials = 100
ROI_tracker = np.empty(num_trials)
accuracy_tracker = np.empty(num_trials)

for idx in range(num_trials):
    ROI, acc = test_model(LogisticRegression(),feature_difference_df, full_joined_df['WinnerLabel'])
    ROI_tracker[idx] = ROI
    accuracy_tracker[idx] = acc

print("Avg. ROI:", np.mean(ROI_tracker))
print("Avg. Accuracy:", np.mean(accuracy_tracker))

ZeroDivisionError: division by zero

# SVM Model

In [172]:
num_trials = 100
ROI_tracker = np.empty(num_trials)
accuracy_tracker = np.empty(num_trials)

for idx in range(num_trials):
    ROI, acc = test_model(SVC(),scaled_feature_diff_df, full_joined_df['WinnerLabel'])
    ROI_tracker[idx] = ROI
    accuracy_tracker[idx] = acc

print("Avg. ROI:", np.mean(ROI_tracker))
print("Avg. Accuracy:", np.mean(accuracy_tracker))

Avg. ROI: -0.03431323474952479
Avg. Accuracy: 0.6796581196581198


# Naive Baysian Classifier

In [228]:
num_trials = 100
ROI_tracker = np.empty(num_trials)
accuracy_tracker = np.empty(num_trials)

for idx in range(num_trials):
    ROI, acc = test_model(GaussianNB(),scaled_feature_df, full_joined_df['WinnerLabel'])
    ROI_tracker[idx] = ROI
    accuracy_tracker[idx] = acc

print("Avg. ROI:", np.mean(ROI_tracker))
print("Avg. Accuracy:", np.mean(accuracy_tracker))

Avg. ROI: -0.0639112969549941
Avg. Accuracy: 0.6597008547008546


# Decision Tree

In [229]:
num_trials = 100
ROI_tracker = np.empty(num_trials)
accuracy_tracker = np.empty(num_trials)

for idx in range(num_trials):
    ROI, acc = test_model(DecisionTreeClassifier(),scaled_feature_df, full_joined_df['WinnerLabel'])
    ROI_tracker[idx] = ROI
    accuracy_tracker[idx] = acc

print("Avg. ROI:", np.mean(ROI_tracker))
print("Avg. Accuracy:", np.mean(accuracy_tracker))

Avg. ROI: -0.01994978669460487
Avg. Accuracy: 0.6217521367521368


# KNearest Neighbors

In [181]:
num_trials = 100
ROI_tracker = np.empty(num_trials)
accuracy_tracker = np.empty(num_trials)

for idx in range(num_trials):
    ROI, acc = test_model(KNeighborsClassifier(),scaled_feature_diff_df, full_joined_df['WinnerLabel'])
    ROI_tracker[idx] = ROI
    accuracy_tracker[idx] = acc

print("Avg. ROI:", np.mean(ROI_tracker))
print("Avg. Accuracy:", np.mean(accuracy_tracker))

Avg. ROI: -0.02844922166819723
Avg. Accuracy: 0.6211965811965814


# Sandbox 1

In [165]:
train_idx = full_joined_df[full_joined_df['Season'] <= 2015].index
test_idx = full_joined_df[full_joined_df['Season'] >= 2016].index

x_train = feature_difference_df.iloc[train_idx]
x_test = feature_difference_df.iloc[test_idx]
y_train = full_joined_df['WinnerLabel'].iloc[train_idx]
y_test = full_joined_df['WinnerLabel'].iloc[test_idx]

model_pipeline_knn = Pipeline([
    ('scale', preprocessing.StandardScaler()),
    ('pca', PCA(0.90, svd_solver='full')),
    ('knn', KNeighborsClassifier())
])


params = {
         'knn__n_neighbors': [9] # Tested np.array(range(1,30))
}

knn_model = GridSearchCV(model_pipeline_knn, 
                   params,
                   cv=5, 
                   scoring='neg_log_loss', 
                   refit=True)

knn_model.fit(x_train, y_train)

y_pred = knn_model.predict(x_test)
print(sum(y_pred == y_test)/len(y_pred))

0.6158536585365854


In [183]:
feature_difference_df.columns

Index(['Score_diff', 'FGM_diff', 'FGA_diff', 'FGM3_diff', 'FGA3_diff',
       'FTM_diff', 'FTA_diff', 'FGPct_diff', 'FG3Pct_diff', 'OR_diff',
       'DR_diff', 'Ast_diff', 'AstP_diff', 'TO_diff', 'Stl_diff', 'Blk_diff',
       'PF_diff', 'EFGP_diff', 'TP_diff', 'ORP_diff', 'DRP_diff', 'RP_diff',
       'FTR_diff', 'FTAR_diff', 'TSP_diff', 'OrdinalRank_diff', 'WinPct_diff',
       'Pos_diff', 'OffPtsPos_diff', 'DefPtsPos_diff', 'NetPtsPos_diff'],
      dtype='object')

In [188]:
x_train_sus.columns

Index(['Seed1', 'Seed2', 'Round', 'ScoreT1', 'ScoreT2', 'Seed_diff',
       'ScoreT_diff', 'FGM_diff', 'FGM%_diff', 'FGA_diff', 'FGM3_diff',
       'FGM3%_diff', 'FGA3_diff', 'FTM_diff', 'FTM%_diff', 'FTA_diff',
       'OR_diff', 'DR_diff', 'Ast_diff', 'TO_diff', 'Stl_diff', 'Blk_diff',
       'PF_diff', 'Pos_diff', 'OffRtg_diff', 'DefRtg_diff', 'NetRtg_diff',
       'AstR_diff', 'TOR_diff', 'TSP_diff', 'eFGP_diff', 'FTAR_diff',
       'ORP_diff', 'DRP_diff', 'RP_diff', 'LQuality', 'WQuality', 'Wpowerrank',
       'Lpowerrank'],
      dtype='object')

In [213]:
norm_common_cols = ['A_NumWins', 'A_avg_Score', 'A_avg_FGM', 'A_avg_FGA', 'A_avg_FGM3',
       'A_avg_FGA3', 'A_avg_FTM', 'A_avg_FTA', 'A_avg_OR', 'A_avg_DR',
       'A_avg_Ast', 'A_avg_TO', 'A_avg_Stl', 'A_avg_Blk', 'A_avg_PF',
       'A_avg_EFGP', 'A_avg_TP', 'A_avg_ORP', 'A_avg_DRP', 'A_avg_RP',
       'A_avg_FGPct', 'A_avg_FG3Pct', 'A_avg_FTR', 'A_avg_FTAR', 'A_avg_TSP',
       'A_avg_Pos', 'A_avg_OffPtsPos', 'A_avg_DefPtsPos', 'A_avg_NetPtsPos',
       'A_avg_AstP', 'A_NumLosses', 'B_NumWins', 'B_avg_Score', 'B_avg_FGM',
       'B_avg_FGA', 'B_avg_FGM3', 'B_avg_FGA3', 'B_avg_FTM', 'B_avg_FTA',
       'B_avg_OR', 'B_avg_DR', 'B_avg_Ast', 'B_avg_TO', 'B_avg_Stl',
       'B_avg_Blk', 'B_avg_PF', 'B_avg_EFGP', 'B_avg_TP', 'B_avg_ORP',
       'B_avg_DRP', 'B_avg_RP', 'B_avg_FGPct', 'B_avg_FG3Pct', 'B_avg_FTR',
       'B_avg_FTAR', 'B_avg_TSP', 'B_avg_Pos', 'B_avg_OffPtsPos',
       'B_avg_DefPtsPos', 'B_avg_NetPtsPos', 'B_avg_AstP', 'B_NumLosses',
       'A_OrdinalRank', 'B_Season_y', 'B_OrdinalRank', 'A_WinPct', 'B_WinPct']


sus_common_cols = ['Seed1', 'Seed2', 'ScoreT1', 'ScoreT2', 
       'ScoreT_diff', 'FGM_diff', 'FGM%_diff', 'FGA_diff', 'FGM3_diff',
       'FGM3%_diff', 'FGA3_diff', 'FTM_diff', 'FTM%_diff', 'FTA_diff',
       'OR_diff', 'DR_diff', 'Ast_diff', 'TO_diff', 'Stl_diff', 'Blk_diff',
       'PF_diff', 'Pos_diff', 'OffRtg_diff', 'DefRtg_diff', 'NetRtg_diff',
       'AstR_diff', 'TOR_diff', 'TSP_diff', 'eFGP_diff', 'FTAR_diff',
       'ORP_diff', 'DRP_diff', 'RP_diff', 'LQuality', 'WQuality', 'Wpowerrank',
       'Lpowerrank']

x_train_sus_2 = x_train_sus[sus_common_cols]
x_test_sus_2 = x_test_sus[sus_common_cols]

### Adding Seeds to Data

In [214]:
x_train, x_test, y_train, y_test = train_test_split(feature_df, full_joined_df['WinnerLabel'], test_size=0.2)

model_pipeline_knn = Pipeline([
    ('scale', preprocessing.StandardScaler()),
    ('pca', PCA(0.90, svd_solver='full')),
    ('knn', KNeighborsClassifier())
])


params = {
         'knn__n_neighbors': [9] # Tested np.array(range(1,30))
}

knn_model = GridSearchCV(model_pipeline_knn, 
                   params,
                   cv=5, 
                   scoring='neg_log_loss', 
                   refit=True)

knn_model.fit(x_train_sus_2, y_train_sus)

y_pred = knn_model.predict(x_test_sus_2)
print(sum(y_pred == y_test_sus)/len(y_pred))

0.7854477611940298


In [77]:
tourney_data_train = full_joined_df[full_joined_df['Season'] <= 2015]
tourney_data_test = full_joined_df[full_joined_df['Season'] >= 2016]

x_train = tourney_data_train[feature_columns]
x_test = tourney_data_test[feature_columns]

y_train = full_joined_df['WinnerLabel'].iloc[x_train.index]
y_test = full_joined_df['WinnerLabel'].iloc[x_test.index]

model_pipeline_knn = Pipeline([
    ('scale', preprocessing.StandardScaler()),
    ('pca', PCA(0.90, svd_solver='full')),
    ('knn', KNeighborsClassifier())
])


params = {
         'knn__n_neighbors': [9] # Tested np.array(range(1,30))
}

knn_model = GridSearchCV(model_pipeline_knn, 
                   params,
                   cv=5, 
                   scoring='neg_log_loss', 
                   refit=True)

knn_model.fit(x_train, y_train)

y_pred = knn_model.predict(x_test)
print(sum(y_pred == y_test)/len(y_pred))

0.6128048780487805


0.8731343283582089


In [298]:
feature_difference_df.columns

Index(['Score_diff', 'FGM_diff', 'FGA_diff', 'FGM3_diff', 'FGA3_diff',
       'FTM_diff', 'FTA_diff', 'OR_diff', 'DR_diff', 'Ast_diff', 'TO_diff',
       'Stl_diff', 'Blk_diff', 'PF_diff', 'EFGP_diff', 'TP_diff', 'ORP_diff',
       'FTR_diff', 'OrdinalRank_diff', 'WinPct_diff', 'POS_diff',
       'PerPos_diff'],
      dtype='object')

In [307]:
x_train_sus.columns

Index(['ScoreT_diff', 'FGM_diff', 'FGM%_diff', 'FGA_diff', 'FGM3_diff',
       'FGM3%_diff', 'FGA3_diff', 'FTM_diff', 'FTM%_diff', 'FTA_diff',
       'OR_diff', 'DR_diff', 'Ast_diff', 'TO_diff', 'Stl_diff', 'Blk_diff',
       'PF_diff', 'Pos_diff', 'AstR_diff', 'TOR_diff', 'eFGP_diff', 'ORP_diff',
       'DRP_diff', 'RP_diff'],
      dtype='object')

In [313]:
drop_cols = ['AstR_diff']
x_train_sus = x_train_sus.drop(drop_cols, axis=1)
x_test_sus = x_test_sus.drop(drop_cols, axis=1)

# Sandbox 2

In [109]:
tourney_data = pd.read_csv('../../xgballing/tourney_result_2003+.csv')
tourney_data.drop_duplicates(inplace=True)
# split data into training (2003-2015) and test (2016-2019)
tourney_data_train = tourney_data[tourney_data['Season'] <= 2015].drop('Unnamed: 0',axis=1)
tourney_data_test = tourney_data[tourney_data['Season'] >= 2016].drop('Unnamed: 0',axis=1)

In [126]:
# Split Data on both training and test sets
tourney_data_train_clean = tourney_data_train.drop(['Season', 'WTeamID', 'LTeamID','LDivision','WDivision'], axis=1)
tourney_data_test_clean = tourney_data_test.drop(['Season', 'WTeamID', 'LTeamID','LDivision','WDivision'], axis=1)
# Train data
x_train_sus = tourney_data_train_clean.drop('result', axis=1)
y_train_sus = tourney_data_train_clean.result
# Test data
x_test_sus = tourney_data_test_clean.drop('result', axis=1)
y_test_sus = tourney_data_test_clean.result

#X_train, y_train = shuffle(X_train, y_train)

In [246]:
model_pipeline_knn = Pipeline([
    ('scale', preprocessing.StandardScaler()),
    ('pca', PCA(0.90, svd_solver='full')),
    ('knn', KNeighborsClassifier())
])


params = {
         'knn__n_neighbors': [9] # Tested np.array(range(1,30))
}

knn_model = GridSearchCV(model_pipeline_knn, 
                   params,
                   cv=5, 
                   scoring='neg_log_loss', 
                   refit=True)

knn_model.fit(X_train, y_train)

y_pred = knn_model.predict(X_test)
print(sum(y_pred == y_test)/len(y_pred))

0.8731343283582089


In [None]:
model_pipeline_knn = Pipeline([
    ('scale', preprocessing.StandardScaler()),
    ('pca', PCA(0.90, svd_solver='full')),
    ('knn', KNeighborsClassifier())
])
params = {
         'knn__n_neighbors': [9] # Tested np.array(range(1,30))
}

knn_model = GridSearchCV(model_pipeline_knn, 
                   params,
                   cv=5, 
                   scoring='neg_log_loss', 
                   refit=True)

knn_model.fit(X_train, y_train)

score = knn_model.score(X_test, y_test)
print(score)

In [235]:
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA

model_pipeline_knn = Pipeline([
    ('scale', preprocessing.StandardScaler()),
    ('pca', PCA(0.90, svd_solver='full')),
    ('knn', KNeighborsClassifier())
])
params = {
         'knn__n_neighbors': [9] # Tested np.array(range(1,30))
}



num_trials = 100
ROI_tracker = np.empty(num_trials)
accuracy_tracker = np.empty(num_trials)

for idx in range(num_trials):
    knn_model = GridSearchCV(model_pipeline_knn, 
                   params,
                   cv=5, 
                   scoring='neg_log_loss', 
                   refit=True)
    ROI, acc = test_model(knn_model,scaled_feature_df, full_joined_df['WinnerLabel'])
    ROI_tracker[idx] = ROI
    accuracy_tracker[idx] = acc

print("Avg. ROI:", np.mean(ROI_tracker))
print("Avg. Accuracy:", np.mean(accuracy_tracker))

Avg. ROI: -0.04885348848222565
Avg. Accuracy: 0.6064102564102563


# Informed Beetting

In [139]:
def calc_implied_p(moneyline):
    if moneyline < 0:
        return (-1*(moneyline)) / (-1*(moneyline) + 100)
    else:
        return 100 / (moneyline + 100)

In [136]:
tournament_compact_results_df = pd.read_csv("../data//Kaggle-Data/MDataFiles_Stage1/MNCAATourneyCompactResults.csv")
tournament_compact_results_df = tournament_compact_results_df[tournament_compact_results_df['Season'] >= 2008]

In [137]:
winner_moneylines = pd.merge(tournament_compact_results_df, moneyline_df, how='left', 
                             left_on=['Season','DayNum','WTeamID'], 
                             right_on = ['Season','DayNum','TeamID'])[["Season", "DayNum", "WTeamID", "LTeamID","Moneyline"]]

winner_moneylines.rename(columns = {"Moneyline": "WMoneyline"}, inplace = True)

tournament_moneyline_df = pd.merge(winner_moneylines, moneyline_df, how='left', 
                             left_on=['Season','DayNum','LTeamID'], 
                             right_on = ['Season','DayNum','TeamID'])[["Season", "DayNum", "WTeamID", "LTeamID","WMoneyline","Moneyline"]]

tournament_moneyline_df.rename(columns = {"Moneyline": "LMoneyline"}, inplace = True)

tournament_moneyline_df.head()

Unnamed: 0,Season,DayNum,WTeamID,LTeamID,WMoneyline,LMoneyline
0,2008,134,1291,1164,-375.0,315.0
1,2008,136,1181,1125,-4500.0,2250.0
2,2008,136,1242,1340,-5000.0,2500.0
3,2008,136,1243,1425,140.0,-160.0
4,2008,136,1266,1246,-275.0,235.0


In [140]:
tournament_probability_df = tournament_moneyline_df
tournament_probability_df["WTeam_Win%"] = tournament_probability_df["WMoneyline"].apply(calc_implied_p)
tournament_probability_df["LTeam_Win%"] = tournament_probability_df["LMoneyline"].apply(calc_implied_p)

tournament_probability_df.head()

Unnamed: 0,Season,DayNum,WTeamID,LTeamID,WMoneyline,LMoneyline,WTeam_Win%,LTeam_Win%
0,2008,134,1291,1164,-375.0,315.0,0.789474,0.240964
1,2008,136,1181,1125,-4500.0,2250.0,0.978261,0.042553
2,2008,136,1242,1340,-5000.0,2500.0,0.980392,0.038462
3,2008,136,1243,1425,140.0,-160.0,0.416667,0.615385
4,2008,136,1266,1246,-275.0,235.0,0.733333,0.298507


In [154]:
model = LogisticRegression()
x_train, x_test, y_train, y_test = train_test_split(scaled_feature_df, full_joined_df['WinnerLabel'], test_size=0.2)
model.fit(x_train, y_train)
model_prob = model.predict_proba(x_test)
y_pred = model.predict(x_test)
    
test_bet_output = full_joined_df.iloc[y_test.index][['Season', 'DayNum', 'ATeamID', 'BTeamID','WinnerLabel']]
test_bet_output['Prediction'] = y_pred
test_bet_output['Correct?'] = test_bet_output['WinnerLabel'] == test_bet_output['Prediction']
test_bet_output['BetOnTeamID'] = test_bet_output.apply(get_bet_on_team, axis=1)
test_bet_output['P(A)'] = [prob[1] for prob in model_prob]
test_bet_output['P(B)'] = [prob[0] for prob in model_prob]

In [160]:
informed_bet_input = test_bet_output[["Season", "DayNum","ATeamID","BTeamID","WinnerLabel",'P(A)','P(B)']]
informed_bet_input = informed_bet_input.merge(moneyline_df, how="left", 
                         left_on=["Season","DayNum","ATeamID"], 
                         right_on=["Season","DayNum","TeamID"]).rename(columns={"Moneyline":"A_Moneyline"})
informed_bet_input = informed_bet_input.merge(moneyline_df, how="left", 
                         left_on=["Season","DayNum","BTeamID"], 
                         right_on=["Season","DayNum","TeamID"]).rename(columns={"Moneyline":"B_Moneyline"})

In [167]:
moneyline_df['Moneyline'].apply(calc_implied_p)


0         0.333333
1         0.705882
2         0.007634
3         0.997442
4         0.615385
            ...   
125207    0.655172
125208    0.363636
125209    0.666667
125210    0.344828
125211    0.687500
Name: Moneyline, Length: 122851, dtype: float64

In [22]:
x_train, x_test, y_train, y_test = train_test_split(feature_df, full_joined_df['WinnerLabel'], test_size=0.1)
logisticRegr = LogisticRegression()
logisticRegr.fit(x_train, y_train)
score = logisticRegr.score(x_test, y_test)
print(score)

0.6752136752136753


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [92]:
x_train, x_test, y_train, y_test = train_test_split(feature_df, full_joined_df['WinnerLabel'], test_size=0.1)

svm = SVC()
svm.fit(x_train, y_train)

y_pred = svm.predict(x_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.80      0.69      0.74        64
         1.0       0.68      0.79      0.73        53

    accuracy                           0.74       117
   macro avg       0.74      0.74      0.73       117
weighted avg       0.74      0.74      0.74       117



## Betting

Unnamed: 0,Season,DayNum,TeamID,Moneyline
0,2008,0,1263,200.0
1,2008,0,1350,-240.0
2,2008,0,1404,13000.0
3,2008,0,1272,-39000.0
4,2008,1,1205,-160.0


In [28]:
test_bet_output = full_joined_df.iloc[y_test.index][['Season', 'DayNum', 'ATeamID', 'BTeamID','WinnerLabel']]
test_bet_output['Prediction'] = y_pred
test_bet_output['Correct?'] = test_bet_output['WinnerLabel'] == test_bet_output['Prediction']
test_bet_output['BetOnTeamID'] = test_bet_output.apply(get_bet_on_team, axis=1)
test_bet_output.head()

Unnamed: 0,Season,DayNum,ATeamID,BTeamID,WinnerLabel,Prediction,Correct?,BetOnTeamID
125,2004,152,1163.0,1181.0,1.0,0.0,False,1181.0
1053,2019,136,1125.0,1268.0,0.0,0.0,True,1268.0
1106,2019,152,1120.0,1438.0,0.0,0.0,True,1438.0
145,2005,137,1242.0,1137.0,0.0,1.0,False,1242.0
1048,2019,136,1199.0,1436.0,1.0,1.0,True,1199.0


In [36]:
test_bet_full_df = test_bet_output.merge(moneyline_df, how="left",  
                                         left_on=["Season","DayNum","BetOnTeamID"], right_on=["Season","DayNum","TeamID"])[['Season','BetOnTeamID','Correct?','Moneyline']]

test_bet_full_df = test_bet_full_df.dropna()

test_bet_full_df.head()

Unnamed: 0,Season,BetOnTeamID,Correct?,Moneyline
1,2019,1268.0,True,-170.0
2,2019,1438.0,True,-330.0
4,2019,1199.0,True,-450.0
5,2021,1329.0,True,-340.0
6,2015,1211.0,True,-2640.0


In [85]:
initial_investment = len(test_bet_full_df.index)
running_balanace = initial_investment

for idx, row in test_bet_full_df.iterrows():
    if row["Correct?"]:
        running_balanace += calc_net_payout(row["Moneyline"])
    else:
        running_balanace -= 1
    
ROI = (running_balanace - initial_investment)/initial_investment
print(ROI)

0.05395732570731582


In [88]:
test_model(SVC(),feature_df, full_joined_df['WinnerLabel'])

-0.12766304480648763

In [80]:
num_trials = 1000
tracker = np.empty(num_trials)

for idx in range(num_trials):
    tracker[idx] = ROI

np.mean(tracker)

-0.04415400527915118

In [33]:
val = moneyline_df[(moneyline_df["Season"] == 2010) & 
                   (moneyline_df["DayNum"] == 143) & (moneyline_df["TeamID"] == 1393)]["Moneyline"].item()

In [34]:
def get_moneyline(season, daynum, teamID):
    moneyline_df

In [35]:
initial_investment = len(test_bet_output.index)
running_balance = initial_investment

for game_num in test_bet_output.index:
    
    prediction = test_bet_output['Prediction'][game_num]
    label = test_bet_output['WinnerLabel'][game_num]
    season = test_bet_output["Season"]
    daynum = test_bet_output["DayNum"]
    
    if prediction == 1: # We Bet on A
        teamID = test_bet_output["ATeamID"]
    else: # We bet on B
        teamID = test_bet_output["BTeamID"]
        
    
    if prediction == label: # We were correct
        if label == 1: # Team A won
            
            moneyline = moneyline_df[(moneyline_df["Season"] == season) &
                                     (moneyline_df["DayNum"] == daynum) &
                                     (moneyline_df["TeamID"] == teamID)]["Moneyline"].item()
            print("won bet on A")
        else:
            print("won bet on B")
    
    else: # We were wrong
        if prediction == 1: # We bet on A
            print("lost bet on A")
        else: 
            print("lost bet on B")
            

lost bet on B
won bet on B
won bet on B
lost bet on A


ValueError: Can only compare identically-labeled Series objects

In [None]:
full_joined_df.isna().sum()

## Massey Ordinals

In [None]:
# Read in Massey Ordinal Data
massey_ordinals = pd.read_csv("../data//Kaggle-Data/MDataFiles_Stage1/MMasseyOrdinals.csv")

# Remove rankings that have low or high ordinal rank such as top 25 (allows for more reasonable computation of average rank)
# massey_ordinals = massey_ordinals[(massey_ordinals['OrdinalRank']>320) & (massey_ordinals['OrdinalRank']<360)] 

# Only include rankings of teams right before tournament starts
massey_ordinals = massey_ordinals[massey_ordinals['RankingDayNum'] == 133]

massey_ordinals.head()

In [None]:
avg_ordinal_ranking = massey_ordinals.groupby(['Season','TeamID'])['OrdinalRank'].mean().reset_index()
A_ordinal_ranking = avg_ordinal_ranking.rename(columns={"Season":"A_Season", "OrdinalRank":"A_OrdinalRank"})
B_ordinal_ranking = avg_ordinal_ranking.rename(columns={"Season":"B_Season", "OrdinalRand":"B_OrdinalRank"})

In [None]:
full_joined_df.merge(A_ordinal_ranking, how="left", 
                 left_on=["Season","ATeamID"], 
                 right_on=["A_Season","TeamID"])

In [None]:
A_ordinal_ranking

In [None]:
massey_ordinals = pd.read_csv("../data//Kaggle-Data/MDataFiles_Stage1/MMasseyOrdinals.csv")

massey_ordinals[(massey_ordinals['Season'] == 2021) & (massey_ordinals['TeamID'] == 1124)]

In [None]:
massey_ordinals.groupby('SystemName').mean()