In [1]:
"""
TODO:
* features to add: PPG, allowed PPG, steals, turnovers, rebounds, assists, FG%, home/away/neutral, conference (1-whatever), title in last 4 yrs?, 
+ https://adeshpande3.github.io/Applying-Machine-Learning-to-March-Madness
"""

# import modules
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

## Preparations: Load & Manipulate Data

In [2]:
# load raw datasets
dfTeamNames = pd.read_csv("../data/raw/WTeams.csv")
dfSeasonInfo = pd.read_csv("../data/raw/WSeasons.csv")
dfSeasonResults = pd.read_csv("../data/raw/WRegularSeasonCompactResults.csv")
dfTourneyResults = pd.read_csv("../data/raw/WNCAATourneyCompactResults.csv")
dfSeasonResults.tail()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
101888,2018,132,3195,68,3239,58,H,0
101889,2018,132,3251,60,3421,42,H,0
101890,2018,132,3311,69,3372,65,N,0
101891,2018,132,3343,63,3335,34,N,0
101892,2018,132,3384,66,3352,56,H,0


In [5]:
# read in possible matchup of teams needed to predict
# ex. 2014_3103_3107, 2014_3103_3113, 2014_3103_3119
dfPred = pd.read_csv('../data/raw/WSampleSubmissionStage1.csv')
dfPred["Season"] = dfPred["ID"].apply(lambda x: x[0:4]).astype(int)
dfPred["Team1"] = dfPred["ID"].apply(lambda x: x[5:9]).astype(int)
dfPred["Team2"] = dfPred["ID"].apply(lambda x: x[10:14]).astype(int)
dfPred = dfPred[["ID","Season","Team1","Team2","Pred"]]
dfPred.head()
dfPred.to_csv("../data/interim/TeamPred.csv", index=None, header=True)

In [6]:
# manipulate datasets
dfSeasonResults["Team1"] = dfSeasonResults["WTeamID"]
dfSeasonResults["Team2"] = dfSeasonResults["LTeamID"]
dfSeasonResults["Pred"] = [1.0]*len(dfSeasonResults.iloc[:,0])
dfSeasonResults.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,Team1,Team2,Pred
0,1998,18,3104,91,3202,41,H,0,3104,3202,1.0
1,1998,18,3163,87,3221,76,H,0,3163,3221,1.0
2,1998,18,3222,66,3261,59,H,0,3222,3261,1.0
3,1998,18,3307,69,3365,62,H,0,3307,3365,1.0
4,1998,18,3349,115,3411,35,H,0,3349,3411,1.0


## Preparations: Make Features & Target

In [7]:
allSeasons = list(dfSeasonInfo["Season"])

In [8]:
# prepare dfStats aka features
def getStats(df, allSeasons):
    
    def winLossCnt(df):
        dfWinLoss = dfTeamNames[["TeamID"]].copy()
        dfWinLoss = dfWinLoss.reindex(columns = ["TeamID","Wins","Losses"], fill_value=0) # create columns and init w/ 0s
        # find indx of winTeam & loseTeam and add one to their column counts
        for index, row in df.iterrows():
            dfWinLoss.loc[dfWinLoss["TeamID"] == row["WTeamID"], "Wins"] += 1
            dfWinLoss.loc[dfWinLoss["TeamID"] == row["LTeamID"], "Losses"] += 1
        return dfWinLoss
    
    def Ppg(df):
        return
    
    def allowPpg(df):
        return
    
    def homeAwayNeutral(df):
        return
    
    def conf(df):
        return
    
    dfStats = pd.DataFrame([])
    for season in allSeasons:
        dfStatsOneSeason = dfTeamNames[["TeamID"]].copy()
        dfStatsOneSeason["Season"] = [season]*len(dfStatsOneSeason.iloc[:,0])
        
        dfWinLoss = winLossCnt(df.loc[df["Season"]==season])
        
        dfStatsOneSeason = pd.merge(dfStatsOneSeason, dfWinLoss, on=["TeamID"])
        dfStats = pd.concat([dfStats, dfStatsOneSeason], axis=0)
        
    return dfStats
    
dfStats = getStats(dfSeasonResults, allSeasons)
dfStats.head()
dfStats.to_csv("../data/interim/TeamStats.csv", index=None, header=True)

In [10]:
# load raw datasets
dfPred = pd.read_csv("../data/interim/TeamPred.csv")
dfStats = pd.read_csv("../data/interim/TeamStats.csv")
dfTourneyResults = pd.read_csv("../data/raw/WNCAATourneyCompactResults.csv")
dfSeasonResults = pd.read_csv("../data/raw/WRegularSeasonCompactResults.csv")

# dfStats.loc[dfStats["Season"]==2018].head()
# dfPred.tail()
# dfTourneyResults.tail()
dfSeasonResults.tail()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
101888,2018,132,3195,68,3239,58,H,0
101889,2018,132,3251,60,3421,42,H,0
101890,2018,132,3311,69,3372,65,N,0
101891,2018,132,3343,63,3335,34,N,0
101892,2018,132,3384,66,3352,56,H,0


In [11]:
# get the team stats vector given season & id number
def getTeamVec(dfStats, Season, TeamID): 
    dfReduced = dfStats.loc[(dfStats["Season"]==Season) & (dfStats["TeamID"]==TeamID)]
    teamVec = dfReduced.iloc[:,2:].to_numpy()
    return teamVec

# test
teamVec = getTeamVec(dfStats, 2018, 3101)
teamVec

array([[12, 14]])

In [12]:
def makeDiffVec(dfStats, dfSeasonMatchups):
    
    diffVec = []
    winVec = []
    for index, row in dfSeasonMatchups.iterrows():
        teamVecW = getTeamVec(dfStats, row["Season"], row["WTeamID"])
        teamVecL = getTeamVec(dfStats, row["Season"], row["LTeamID"])
        teamDiffVec = (teamVecW - teamVecL).ravel().tolist() # ravel flattens nparray first, then we can move tolist
        teamWinVec = [1]
        
        diffVec.append(teamDiffVec)
        winVec.append(teamWinVec)
        
    return diffVec, winVec
    
seasons = list(range(2014,2019))
diffVec, winVec = makeDiffVec(dfStats, dfSeasonResults.loc[(dfSeasonResults["Season"]==seasons[0]) 
                                                           | (dfSeasonResults["Season"]==seasons[1]) 
                                                           | (dfSeasonResults["Season"]==seasons[2])
                                                           | (dfSeasonResults["Season"]==seasons[3])
                                                           | (dfSeasonResults["Season"]==seasons[4])])
dfVectors = pd.DataFrame([diffVec, winVec]).transpose()
dfVectors.columns = ["DiffVec","WinVec"]
dfVectors.head()
dfVectors.to_csv("../data/interim/TrainingVectors.csv", index=None, header=True)

## Model

In [None]:
def MatchupDf()

In [13]:
# feature selection
x = diffVec
# target
y = winVec
xTrain, xTest, yTrain, yTest = train_test_split(x, y, test_size = 0.2, random_state = 6)

logReg = LogisticRegression()
logReg.fit(xTrain, yTrain)
pred = logisticRegr.predict(xTest)
score = logisticRegr.score(xTest, yTest)

  y = column_or_1d(y, warn=True)


ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 1