In [5]:
"""
TODO:
* refactor makeDiffVec to optimize for speed
"""

# import modules
import pandas as pd
import numpy as np
import random as rd

In [6]:
# load datasets: the regular-season compact results come from data/raw,
# the team stats table comes from data/interim
# (both paths are relative, so they resolve against the current working directory)
dfSeasonResultsCom = pd.read_csv("../data/raw/WRegularSeasonCompactResults.csv")
dfStats = pd.read_csv("../data/interim/TeamStats.csv")

# preview the first rows of the results table
dfSeasonResultsCom.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
0,1998,18,3104,91,3202,41,H,0
1,1998,18,3163,87,3221,76,H,0
2,1998,18,3222,66,3261,59,H,0
3,1998,18,3307,69,3365,62,H,0
4,1998,18,3349,115,3411,35,H,0


## Preparation: Team Vectors & Team Vector Difference
* load the saved data file instead of re-running this section every time
* load the regular season compact results and, for each matchup, look up each team's stats in `dfStats`, subtract them, then save the differences in a DataFrame and write it to a file

In [7]:
# Look up the stats row(s) of a single team in a single season.
# Kept at module level so it can also be called by hand while exploring.
def getTeamVec(dfStats, Season, TeamID):
    """Return the stat columns of ``dfStats`` for one season/team pair.

    Parameters
    ----------
    dfStats : pandas.DataFrame
        Table with "Season" and "TeamID" columns; everything from the third
        column onward is returned as the stats vector.
    Season, TeamID
        Values compared for equality against the "Season" and "TeamID" columns.

    Returns
    -------
    pandas.DataFrame
        The matching row(s) without the first two columns, keeping their
        original index labels. Empty when nothing matches.
    """
    isSeason = dfStats["Season"] == Season
    isTeam = dfStats["TeamID"] == TeamID
    return dfStats.loc[isSeason & isTeam].iloc[:, 2:]

In [8]:
def _teamStatsFor(dfMatches, teamCol, dfLookup, statCols):
    # One stats row per matchup for the team in `teamCol`, in matchup order (NaN when the team is unknown).
    dfJoined = dfMatches[["Season", teamCol]].merge(
        dfLookup,
        how = "left",
        left_on = ["Season", teamCol],
        right_on = ["Season", "TeamID"],
        validate = "many_to_one", # duplicate (Season, TeamID) rows would multiply matchups
    )
    return dfJoined[statCols]


def makeDiffVec(dfStats, dfSeasonMatchups, seed = None):
    """Build one stats-difference row per matchup, with a random team order.

    For every game in ``dfSeasonMatchups`` the stats of the two teams are
    looked up in ``dfStats`` (by "Season" and "TeamID") and subtracted as
    Team1 - Team2. Team1 starts out as the winner; for a random half of the
    rows the two teams are swapped, the difference is negated and
    "Team1WinPer" is set to 0.0, so the result contains both outcomes.

    Parameters
    ----------
    dfStats : pandas.DataFrame
        Team stats. The first two columns are expected to be "Season" and
        "TeamID"; every column from the third onward is treated as a stat.
    dfSeasonMatchups : pandas.DataFrame
        Game results with the columns "Season", "DayNum", "WTeamID",
        "WScore", "LTeamID", "LScore", "WLoc" and "NumOT". Not modified.
    seed : optional
        Seed for the swap decisions. ``None`` (default) draws from the global
        ``random`` module state, one ``choice`` per row in row order.

    Returns
    -------
    pandas.DataFrame
        Columns "Season", "Team1", "Team2", the stat columns, "Team1WinPer",
        on a fresh 0..n-1 index. A matchup involving a team without a row in
        ``dfStats`` keeps its row and gets NaN stats, so rows never shift
        relative to their matchup.

    Raises
    ------
    KeyError
        If one of the dropped result columns is missing from ``dfSeasonMatchups``.
    pandas.errors.MergeError
        If ``dfStats`` holds more than one row for a (Season, TeamID) pair.
    """
    # keep only Season + the two team ids; winner starts out as Team1
    dfMatches = dfSeasonMatchups.drop(columns = ["DayNum", "WScore", "LScore", "WLoc", "NumOT"])
    dfMatches = dfMatches.rename(columns = {'WTeamID': 'Team1', 'LTeamID': 'Team2'})
    dfMatches = dfMatches.reset_index(drop = True)

    # stat columns are selected by position, like getTeamVec does
    statCols = list(dfStats.columns[2:])
    dfLookup = dfStats[["Season", "TeamID"] + statCols]

    # winner stats minus loser stats, aligned row by row with dfMatches
    dfDiffVecStats = (_teamStatsFor(dfMatches, "Team1", dfLookup, statCols)
                      - _teamStatsFor(dfMatches, "Team2", dfLookup, statCols))

    # models need more than just one result (i.e. need more than just team1 always winning)
    # true --> swap team1 <-> team2, false --> keep
    rng = rd if seed is None else rd.Random(seed)
    swapMask = np.array([rng.choice([True, False]) for _ in range(len(dfMatches))], dtype = bool)

    # compute both new id columns before assigning so neither reads already-swapped values
    winnerIds = dfMatches["Team1"].to_numpy()
    loserIds = dfMatches["Team2"].to_numpy()
    newTeam1 = np.where(swapMask, loserIds, winnerIds)
    newTeam2 = np.where(swapMask, winnerIds, loserIds)
    dfMatches["Team1"] = newTeam1
    dfMatches["Team2"] = newTeam2

    # swapped rows hold t2 - t1, i.e. the original difference times -1
    dfDiffVecStats = dfDiffVecStats.mul(np.where(swapMask, -1, 1), axis = 0)

    # 1.0 where Team1 is still the winner, 0.0 where the teams were swapped
    serTeam1Win = pd.Series(np.where(swapMask, 0.0, 1.0), name = "Team1WinPer")

    return pd.concat([dfMatches, dfDiffVecStats, serTeam1Win], axis = 1)
    
# seasons to build difference vectors for: 2014 through 2018 inclusive
seasons = list(range(2014,2019))

# isin() keeps the filter tied to `seasons`, so changing the range above
# cannot leave seasons out or index past the end of the list
dfDiffVec = makeDiffVec(dfStats, dfSeasonResultsCom.loc[dfSeasonResultsCom["Season"].isin(seasons)])

# write the result without the row index, keeping the header row
dfDiffVec.to_csv("../data/processed/DifferenceVectors.csv", index = False, header = True)
dfDiffVec.tail(25)

Unnamed: 0,Season,Team1,Team2,WinRatio,PPG,AllowedPPG,Team1WinPer
26070,2018,3138,3141,-0.027,-4.534,-6.781,0.0
26071,2018,3169,3413,-0.261,-6.977,2.108,1.0
26072,2018,3179,3356,0.248,17.733,4.262,1.0
26073,2018,3180,3189,0.014,-7.623,-2.116,0.0
26074,2018,3212,3380,0.042,0.203,2.091,1.0
26075,2018,3219,3251,-0.193,-1.245,7.518,0.0
26076,2018,3225,3294,-0.231,4.468,11.146,0.0
26077,2018,3214,3299,-0.162,-1.026,2.717,0.0
26078,2018,3249,3311,0.168,9.712,-0.672,0.0
26079,2018,3283,3320,0.052,7.91,6.606,0.0
