In [9]:
"""
TODO:
* refactor makeDiffVec to optimize for speed
"""

# import modules
import pandas as pd
import numpy as np
import random as rd

In [10]:
# read the raw and interim CSVs this notebook works from (loaded in the order listed)
dfSeasonResultsDet, dfStats, dfToPredict = (
    pd.read_csv(csvPath)
    for csvPath in (
        "../data/raw/WRegularSeasonDetailedResults.csv",
        "../data/interim/TeamStats.csv",
        "../data/raw/WSampleSubmissionStage1.csv",
    )
)

# preview the detailed regular-season results
dfSeasonResultsDet.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF
0,2010,11,3103,63,3237,49,H,0,23,54,...,13,6,10,11,27,11,23,7,6,19
1,2010,11,3104,73,3399,68,N,0,26,62,...,21,14,27,14,26,7,20,4,2,27
2,2010,11,3110,71,3224,59,A,0,29,62,...,14,19,23,17,23,8,15,6,0,15
3,2010,11,3111,63,3267,58,A,0,27,52,...,26,16,25,22,22,15,11,14,5,14
4,2010,11,3119,74,3447,70,H,1,30,74,...,17,11,21,21,32,12,14,4,2,14


## Preparation: Team Vectors & Team Vector Difference
* load the saved data file instead of recomputing it every time
* load the regular season results and, for each matchup, look up each team's stats in dfStats, subtract them, then save the differences in a df and output it to file

In [14]:
# keep only games from the 2010 season onward for training
allSeasons = dfSeasonResultsDet["Season"].unique()
selectSeasons = allSeasons[allSeasons >= 2010]

# filter by membership rather than by selectSeasons[0]: unique() returns seasons in
# order of appearance, so the first element is only the minimum when the file happens
# to be sorted, and indexing [0] raises if no season passes the cutoff
dfForTraining = dfSeasonResultsDet.loc[dfSeasonResultsDet["Season"].isin(selectSeasons)]

In [15]:
# look up one team's stats for one season
# kept at module level so it can also be called by hand while exploring
def getTeamVec(dfStats, Season, TeamID):
    """Return the rows of dfStats matching Season and TeamID, minus the first two columns.

    The result keeps the original row index and contains every column of
    dfStats from position 2 onward; it is empty when nothing matches.
    """
    inSeason = dfStats["Season"] == Season
    isTeam = dfStats["TeamID"] == TeamID
    return dfStats.loc[inSeason & isTeam].iloc[:, 2:]

In [16]:
def addDiffVecs(dfStats, dfMatches, predictionData = False):
    """Build one row per matchup holding Team1's stats minus Team2's stats.

    Parameters
    ----------
    dfStats : DataFrame
        Team stats with "Season" and "TeamID" columns. Every column from
        position 2 onward is treated as a stat to difference.
    dfMatches : DataFrame
        Training mode (predictionData falsy): needs "Season", "WTeamID",
        "LTeamID" and "WLoc" ("H"/"A"/"N"; any other value becomes NaN).
        Prediction mode: needs "ID" strings of the form
        "<season>_<team1>_<team2>" and a "Pred" column.
    predictionData : bool
        When true, matchups are parsed from "ID", Location is 0 for every
        row, and no rows are swapped.

    Returns
    -------
    DataFrame
        Columns Season, Team1, Team2, Pred, Location followed by the stat
        columns of dfStats, on a fresh 0..n-1 index. Location is 1 when Team1
        is home, -1 when Team2 is home, 0 for neutral. Stat columns are NaN
        for a matchup where either team has no row in dfStats.

    Raises
    ------
    ValueError
        If a (Season, TeamID) pair needed for a matchup appears more than
        once in dfStats.
    """
    if predictionData:
        # split "season_team1_team2" IDs into integer columns
        dfDiffVecs = pd.DataFrame(
            [x.split("_") for x in dfMatches["ID"]],
            columns = ["Season", "Team1", "Team2"],
        ).astype(int)
        dfDiffVecs["Location"] = 0
        # assign by position: dfMatches may not carry a 0..n-1 index
        dfDiffVecs["Pred"] = dfMatches["Pred"].to_numpy()
    else:
        dfDiffVecs = (
            dfMatches[["Season", "WTeamID", "LTeamID", "WLoc"]]
            .rename(columns = {"WTeamID": "Team1", "LTeamID": "Team2", "WLoc": "Location"})
            .reset_index(drop = True)
        )
        # 1 = Team1 home, -1 = Team2 home, 0 = neutral, NaN = unrecognised code
        dfDiffVecs["Location"] = dfDiffVecs["Location"].map({"H": 1, "A": -1, "N": 0})
        dfDiffVecs["Pred"] = 1.0 # Team1 is the winner until (possibly) swapped below

    dfDiffVecs = dfDiffVecs[["Season", "Team1", "Team2", "Pred", "Location"]].copy()
    nRows = len(dfDiffVecs)

    # index the stat columns by (Season, TeamID) so all matchups are looked up at once
    statCols = dfStats.columns[2:]
    statsByTeam = dfStats.iloc[:, 2:].copy()
    statsByTeam.index = pd.MultiIndex.from_arrays(
        [dfStats["Season"].to_numpy(), dfStats["TeamID"].to_numpy()]
    )
    keysTeam1 = pd.MultiIndex.from_arrays(
        [dfDiffVecs["Season"].to_numpy(), dfDiffVecs["Team1"].to_numpy()]
    )
    keysTeam2 = pd.MultiIndex.from_arrays(
        [dfDiffVecs["Season"].to_numpy(), dfDiffVecs["Team2"].to_numpy()]
    )

    # only teams that actually play matter; an ambiguous stats row for one of them is an error
    isNeeded = statsByTeam.index.isin(keysTeam1) | statsByTeam.index.isin(keysTeam2)
    statsByTeam = statsByTeam[isNeeded]
    if statsByTeam.index.has_duplicates:
        raise ValueError("dfStats has more than one row for a (Season, TeamID) used in dfMatches")

    # reindex leaves NaN rows for teams missing from dfStats, so their diffs stay NaN
    statDiffs = (
        statsByTeam.reindex(keysTeam1).to_numpy(dtype = float)
        - statsByTeam.reindex(keysTeam2).to_numpy(dtype = float)
    )

    # model needs Team2 to win some games to learn: randomly mirror about half the rows
    if not predictionData:
        # one rd.choice per row, in row order, so seeding `random` makes the swaps reproducible
        swap = np.array([rd.choice([True, False]) for _ in range(nRows)], dtype = bool)
        winners = dfDiffVecs["Team1"].to_numpy().copy()
        losers = dfDiffVecs["Team2"].to_numpy().copy()
        dfDiffVecs["Team1"] = np.where(swap, losers, winners)
        dfDiffVecs["Team2"] = np.where(swap, winners, losers)
        dfDiffVecs["Pred"] = np.where(swap, 0.0, 1.0) # 0.0 = Team1 lost
        dfDiffVecs["Location"] = dfDiffVecs["Location"] * np.where(swap, -1, 1) # home/away flips, neutral stays 0
        statDiffs[swap] *= -1 # difference is now (old Team2 - old Team1)

    dfStatDiffs = pd.DataFrame(statDiffs, columns = statCols, index = dfDiffVecs.index)
    return pd.concat([dfDiffVecs, dfStatDiffs], axis = 1)
    
# addDiffVecs randomly swaps Team1/Team2 via the stdlib `random` module; seed it so the
# written training file is the same on every Restart & Run All
rd.seed(42)

dfDiffVecs = addDiffVecs(dfStats, dfForTraining)
filename = "TrainingDataDifferenceVectors.csv"

# to build the prediction set instead, use these two lines in place of the two above
# dfDiffVecs = addDiffVecs(dfStats, dfToPredict, predictionData = True)
# filename = "PredictionDataDifferenceVectors.csv"

# write without the row index so the CSV holds only the data columns
dfDiffVecs.to_csv("../data/processed/" + filename, index = False, header = True)
dfDiffVecs.head(20)

Unnamed: 0,Season,Team1,Team2,Pred,Location,WinRatio,PPG,AllowedPPG,FGPer,Turnovers,Blocks,Rebounds,Assists,PerFouls
0,2010,3103,3237,1.0,1,0.496,10.912,-10.331,0.035,-5.304,-1.493,3.017,4.504,-0.86
1,2010,3399,3104,0.0,0,0.069,1.103,0.655,-0.012,-2.448,-0.517,-0.621,-1.379,1.241
2,2010,3110,3224,1.0,-1,0.102,3.567,-5.136,0.052,-0.569,2.112,-3.077,1.648,-1.257
3,2010,3267,3111,0.0,1,-0.165,-7.109,-1.77,-0.03,-3.778,-2.999,-2.556,-1.521,-2.618
4,2010,3119,3447,1.0,1,0.033,-8.041,-8.317,-0.016,-2.559,-0.442,-1.783,-3.158,-1.983
5,2010,3407,3120,0.0,-1,-0.091,0.875,2.339,-0.002,2.565,-2.7,-2.56,-3.805,3.119
6,2010,3146,3125,0.0,-1,0.197,8.03,1.999,0.028,3.018,-0.426,3.123,-1.244,-0.479
7,2010,3132,3152,1.0,1,0.485,11.949,-6.657,0.014,-5.142,0.024,3.006,-0.643,-1.7
8,2010,3310,3138,0.0,1,0.186,-3.794,-4.481,-0.016,-4.9,-2.01,-6.6,0.188,0.161
9,2010,3140,3430,1.0,1,0.431,6.101,-13.838,0.037,-3.786,1.823,3.796,4.327,1.744
