In [10]:
import pandas as pd 
import numpy as np
import team, game as g
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import export_graphviz
from sklearn.externals.six import StringIO
from IPython.display import Image
# import pydotplus

In [11]:
def getNCAATeamIds():
    """
    Collect every team that played in an NCAA tournament game.

    Reads "data/NCAATourneyCompactResults.csv" and, for each game, builds a
    custom ID of the form "<TeamID>_<Season>" for both the winning and the
    losing team.

    Returns:
        dict: maps each custom ID to the constant 1, so the dictionary acts
        as a membership set (``customId in result``).
    """
    ncaaTournResults = pd.read_csv("data/NCAATourneyCompactResults.csv")
    ncaaTourneyTeams = {}
    # Walk the columns instead of DataFrame.iterrows(): iterrows() builds one
    # Series per row and upcasts integers to float whenever the row mixes ints
    # and floats, which would leak ".0" into the ID strings.
    gameColumns = zip(
        ncaaTournResults["Season"],
        ncaaTournResults["WTeamID"],
        ncaaTournResults["LTeamID"],
    )
    for season, wTeamId, lTeamId in gameColumns:
        ncaaTourneyTeams[str(wTeamId) + "_" + str(season)] = 1
        ncaaTourneyTeams[str(lTeamId) + "_" + str(season)] = 1
    return ncaaTourneyTeams

def getTeamNames():
    """
    Build a lookup table from team ID to team name.

    Reads "Data/Teams.csv" and pairs each value of its "TeamID" column with
    the "TeamName" value on the same row. If a team ID appears more than
    once, the name from the last row wins.

    Returns:
        dict: team ID -> team name.
    """
    teamTable = pd.read_csv("Data/Teams.csv")
    return dict(zip(teamTable["TeamID"], teamTable["TeamName"]))

In [12]:
def getSeasonStats(ncaaTourneyTeams):
    """
    Aggregate regular-season results into one Team object per team-season.

    Reads "data/MasseyOrdinals_Prelim2018.csv" (keeping only rows whose
    SystemName is "RPI" and whose RankingDayNum is 133) and
    "data/RegularSeasonDetailedResults.csv". For every regular-season game
    the winner's and loser's Team objects are created on first sight, get
    their name and RPI assigned, and are fed the game row through
    ``updateStats(row, won)``. A winner additionally gets ``winsVsTourney``
    incremented when the loser's custom ID is in ``ncaaTourneyTeams``.

    Args:
        ncaaTourneyTeams: container of custom IDs ("<TeamID>_<Season>") used
            only for membership tests.

    Returns:
        dict: custom ID -> team.Team. A team's RPI is None when no matching
        ranking row exists for its season.

    Raises:
        KeyError: if a team ID from the results file is missing from the
            mapping returned by getTeamNames().
    """
    teams = {}
    names = getTeamNames()
    unfiltRanks = pd.read_csv("data/MasseyOrdinals_Prelim2018.csv")
    # NOTE(review): day 133 is presumably the final pre-tournament ranking
    # day -- confirm against the data set's documentation.
    rankings = unfiltRanks[(unfiltRanks["SystemName"] == "RPI") & (unfiltRanks["RankingDayNum"] == 133)]

    # Index the rankings once instead of filtering the frame twice per game.
    # setdefault keeps the first matching row, like the former `.iloc[0]`.
    rpiBySeasonTeam = {}
    for rankSeason, rankTeamId, ordinalRank in zip(rankings["Season"], rankings["TeamID"], rankings["OrdinalRank"]):
        rpiBySeasonTeam.setdefault((rankSeason, rankTeamId), ordinalRank)

    regSeasonResults = pd.read_csv("data/RegularSeasonDetailedResults.csv")
    # iterrows() is kept on purpose: updateStats() receives the whole row.
    for _, row in regSeasonResults.iterrows():
        season = row["Season"]
        wTeamId = row["WTeamID"]
        lTeamId = row["LTeamID"]
        customWId = str(wTeamId) + "_" + str(season)
        customLId = str(lTeamId) + "_" + str(season)

        # Look the two ranks up independently. Previously both lookups shared
        # one try block, so a missing winner rank also reset the loser's RPI
        # to None even when the loser was ranked.
        wRPI = rpiBySeasonTeam.get((season, wTeamId))
        lRPI = rpiBySeasonTeam.get((season, lTeamId))

        if customWId not in teams:
            teams[customWId] = team.Team(customWId)
        if customLId not in teams:
            teams[customLId] = team.Team(customLId)

        wTeam = teams[customWId]
        wTeam.RPI = wRPI
        wTeam.name = names[wTeamId]
        wTeam.updateStats(row, True)
        if customLId in ncaaTourneyTeams:
            wTeam.winsVsTourney += 1

        lTeam = teams[customLId]
        lTeam.name = names[lTeamId]
        lTeam.RPI = lRPI
        lTeam.updateStats(row, False)
    return teams

In [18]:
def getMatchups(teams):
    """
    Join NCAA tournament games with both teams' season data.

    Reads "data/NCAATourneyCompactResults.csv". For every game whose winner
    and loser custom IDs ("<TeamID>_<Season>") are both present in ``teams``,
    one record is produced containing the winner's ``objToDict()`` entries
    with each key prefixed by "w", the loser's entries prefixed by "l", plus
    "dayNum" and "season". Games with an unknown team are skipped.

    Args:
        teams: dict mapping custom ID -> object exposing ``objToDict()``.

    Returns:
        pandas.DataFrame: one row per matched tournament game.
    """
    ncaaTournResults = pd.read_csv("data/NCAATourneyCompactResults.csv")
    gameColumns = zip(
        ncaaTournResults["Season"],
        ncaaTournResults["DayNum"],
        ncaaTournResults["WTeamID"],
        ncaaTournResults["LTeamID"],
    )

    matchups = []
    for season, dayNum, wTeamId, lTeamId in gameColumns:
        customWId = str(wTeamId) + "_" + str(season)
        customLId = str(lTeamId) + "_" + str(season)
        if customWId not in teams or customLId not in teams:
            continue

        # Build fresh prefixed dicts. Renaming keys in place while iterating
        # over .keys() is unsafe on Python 3: entries get skipped or renamed
        # twice because the dict changes under the iterator.
        matchupData = dict(
            ("w" + key, value) for key, value in teams[customWId].objToDict().items()
        )
        matchupData.update(
            ("l" + key, value) for key, value in teams[customLId].objToDict().items()
        )
        matchupData["dayNum"] = dayNum
        matchupData["season"] = season
        matchups.append(matchupData)

    return pd.DataFrame(matchups)

def getMatchupData():
    """
    Return the data frame of historical NCAA tournament matchups.

    The cached file "Data/output/matchups.csv" is returned when it can be
    read. Otherwise the frame is rebuilt with getNCAATeamIds() ->
    getSeasonStats() -> getMatchups(), written to that same path (creating
    the directory if needed), and returned.

    Returns:
        pandas.DataFrame: one row per tournament matchup.
    """
    import os  # local import: only needed to create the cache directory

    cachePath = "Data/output/matchups.csv"
    try:
        return pd.read_csv(cachePath)
    except Exception:
        # Deliberate best effort: any failure to read the cache (missing,
        # empty or unparsable file) falls through to a full rebuild below.
        pass

    # The rebuild runs outside the except block so that an error raised here
    # is not reported as happening "during handling" of the cache miss.
    ncaaTourneyTeams = getNCAATeamIds()
    teamObjs = getSeasonStats(ncaaTourneyTeams)
    matchups = getMatchups(teamObjs)

    cacheDir = os.path.dirname(cachePath)
    if not os.path.isdir(cacheDir):
        os.makedirs(cacheDir)
    matchups.to_csv(cachePath, index=False)
    return matchups

In [19]:
def findChampionshipMatches():
    """
    Flag the championship game of every season in the matchup data.

    Loads the matchups through getMatchupData() and adds a boolean column
    "chipGame" that is True for rows whose "dayNum" equals the largest
    "dayNum" within the same "season". Several rows of one season are
    flagged if they share that maximum.

    NOTE(review): this treats the latest game present for a season as the
    championship game; if the final is missing from the matchup data, an
    earlier round would be flagged instead -- confirm the data is complete.

    Returns:
        pandas.DataFrame: the matchup frame with the extra "chipGame" column.
    """
    matchups = getMatchupData()
    # transform() broadcasts each season's maximum back onto that season's
    # rows, so the result lines up with the frame for the comparison. The
    # "max" string alias is used instead of the builtin max, whose implicit
    # translation to the pandas reduction is deprecated.
    lastDayOfSeason = matchups.groupby(["season"])["dayNum"].transform("max")
    matchups["chipGame"] = lastDayOfSeason == matchups["dayNum"]
    return matchups


In [20]:
def getPredictionsChips():
    """
    Train a Random Forest on non-championship games and score it on the
    championship games.

    The label column "baseline" is ``wRPI < lRPI``. Rows flagged "chipGame"
    by findChampionshipMatches() form the test set; all other rows form the
    training set. Identifier, name, season, day and label columns are
    dropped before fitting; every remaining column is used as a feature.

    NOTE(review): the label is the baseline column itself while wRPI and
    lRPI stay among the features, so the reported model accuracy measures
    agreement with the RPI baseline rather than with game outcomes --
    confirm this is intended.

    Returns:
        tuple: (rows, baselineAcc, modelAcc) where
            rows is a 2-D array sorted by its first column, with columns
                prediction, label, loser name, loser ID, winner name,
                winner ID, predict_proba column 0, predict_proba column 1;
            baselineAcc is the fraction of test rows whose label is True;
            modelAcc is the fraction of test rows where prediction == label.

    Raises:
        ValueError: if no row is flagged as a championship game.
    """
    matchups = findChampionshipMatches()
    matchups["baseline"] = matchups["wRPI"] < matchups["lRPI"]

    isChipGame = matchups["chipGame"].astype(bool)
    train = matchups[~isChipGame]
    test = matchups[isChipGame]
    if test.shape[0] == 0:
        # Fail with a clear message instead of a ZeroDivisionError below.
        raise ValueError("no championship games found in matchup data")
    baselineAcc = float(test["baseline"].mean())

    trainLabels = np.array(train["baseline"])
    testLabels = np.array(test["baseline"])
    testNames = np.array(test[["lname", "l_id", "wname", "w_id"]])

    # Drop qualitative & output columns
    nonFeatureCols = ["w_id", "l_id", "baseline", "wname", "lname", "season", "dayNum", "chipGame"]
    trainFeatures = np.array(train.drop(nonFeatureCols, axis=1))
    testFeatures = np.array(test.drop(nonFeatureCols, axis=1))
    # Consider about sqrt(number of features) candidates at each split.
    maxFeatures = int(trainFeatures.shape[1] ** 0.5)

    rf = RandomForestClassifier(n_estimators=1000, random_state=42, oob_score=True, max_features=maxFeatures)
    rf.fit(trainFeatures, trainLabels)

    predictions = rf.predict(testFeatures)
    predictProbs = rf.predict_proba(testFeatures)
    # Direct equality works for any label dtype, unlike ~(a ^ b), which is
    # only a logical comparison for boolean arrays.
    modelAcc = float(np.mean(predictions == testLabels))

    stack = np.column_stack((
        predictions, testLabels,
        testNames[:, 0], testNames[:, 1], testNames[:, 2], testNames[:, 3],
        predictProbs[:, 0], predictProbs[:, 1],
    ))
    return stack[stack[:, 0].argsort()], baselineAcc, modelAcc

### Utilize historical matchup data to build RF model. 
def getPredictions(year, train=None, test=None):
    """
    Train a Random Forest on every season except ``year`` and score it on
    the tournament games of ``year``.

    The label column "baseline" is ``wRPI < lRPI``. Rows whose "w_id" ends
    with "_<year>" form the test set; all other rows form the training set.
    Identifier, name, season, day and label columns are dropped before
    fitting; every remaining column is used as a feature.

    NOTE(review): the label is the baseline column itself while wRPI and
    lRPI stay among the features, so the reported model accuracy measures
    agreement with the RPI baseline rather than with game outcomes --
    confirm this is intended.

    Args:
        year: season to hold out, as a string or an int (e.g. "2003").
        train: ignored; kept for backward compatibility. The training split
            is always derived from getMatchupData().
        test: ignored; kept for backward compatibility.

    Returns:
        tuple: (rows, baselineAcc, modelAcc) where
            rows is a 2-D array sorted by its first column, with columns
                prediction, label, loser name, loser ID, winner name,
                winner ID, predict_proba column 0, predict_proba column 1;
            baselineAcc is the fraction of test rows whose label is True;
            modelAcc is the fraction of test rows where prediction == label.

    Raises:
        ValueError: if the matchup data holds no game for ``year``.
    """
    matchups = getMatchupData()
    matchups["baseline"] = matchups["wRPI"] < matchups["lRPI"]

    # Anchor the match on the "_<season>" suffix of the custom ID. A plain
    # substring/regex search could also hit digits elsewhere in the ID and
    # rejects an integer year.
    isTestSeason = matchups["w_id"].str.endswith("_" + str(year))
    train = matchups[~isTestSeason]
    test = matchups[isTestSeason]
    if test.shape[0] == 0:
        # Fail with a clear message instead of a ZeroDivisionError below.
        raise ValueError("no tournament games found for season {}".format(year))
    baselineAcc = float(test["baseline"].mean())

    trainLabels = np.array(train["baseline"])
    testLabels = np.array(test["baseline"])
    testNames = np.array(test[["lname", "l_id", "wname", "w_id"]])

    # Drop qualitative & output columns
    nonFeatureCols = ["w_id", "l_id", "baseline", "wname", "lname", "season", "dayNum"]
    trainFeatures = np.array(train.drop(nonFeatureCols, axis=1))
    testFeatures = np.array(test.drop(nonFeatureCols, axis=1))
    # Consider about sqrt(number of features) candidates at each split.
    maxFeatures = int(trainFeatures.shape[1] ** 0.5)

    rf = RandomForestClassifier(n_estimators=1000, random_state=42, oob_score=True, max_features=maxFeatures)
    rf.fit(trainFeatures, trainLabels)

    predictions = rf.predict(testFeatures)
    predictProbs = rf.predict_proba(testFeatures)
    # Direct equality works for any label dtype, unlike ~(a ^ b), which is
    # only a logical comparison for boolean arrays.
    modelAcc = float(np.mean(predictions == testLabels))

    stack = np.column_stack((
        predictions, testLabels,
        testNames[:, 0], testNames[:, 1], testNames[:, 2], testNames[:, 3],
        predictProbs[:, 0], predictProbs[:, 1],
    ))
    return stack[stack[:, 0].argsort()], baselineAcc, modelAcc

In [21]:
def drawTree(rf, treeName, feature_names=None):
    """
    Render the first tree of a fitted forest to "<treeName>.pdf".

    Args:
        rf: fitted ensemble exposing ``estimators_``; only
            ``rf.estimators_[0]`` is drawn.
        treeName: output file name without the ".pdf" extension.
        feature_names: optional sequence of column names used to label the
            split nodes. It must be passed in explicitly because the
            feature list is a local variable of the prediction functions
            and is not visible here. With None, export_graphviz falls back
            to its generic feature labels.

    Raises:
        ImportError: if the optional ``pydotplus`` package is not installed.
    """
    # Imported lazily: the notebook-level import of pydotplus is commented
    # out, so the name is otherwise undefined when this function runs.
    import pydotplus

    # out_file=None makes export_graphviz return the DOT source as a string,
    # so no intermediate StringIO buffer is needed.
    dot_data = export_graphviz(
        rf.estimators_[0],
        out_file=None,
        filled=True,
        rounded=True,
        special_characters=True,
        feature_names=feature_names,
    )
    graph = pydotplus.graph_from_dot_data(dot_data)
    graph.write_pdf("{}.pdf".format(treeName))

In [25]:
# Championship-game evaluation: train on every non-championship tournament
# game and test on each season's final.
indPredicts = [["Predict", "Actual", "L Name", "L ID", "W Name", "W ID", "Prob For", "Prob Against"]]
baseAccs = []
modelAccs = []

# Chip games testing
outputPreds, baselineAcc, modelAcc = getPredictionsChips()
print(baselineAcc, modelAcc)
# Iterate over the rows returned by the call above. The loop used to read a
# variable named `output`, which this cell never assigns, so it either raised
# NameError on a fresh kernel or silently reused stale rows from an earlier run.
for row in outputPreds:
    indPredicts.append(row.tolist())
predsDF = pd.DataFrame(indPredicts)
predsDF
# pd.DataFrame(indPredicts).to_csv("data/output/chipTestResults.csv", index=False, header=False)



0.5333333333333333 0.9333333333333333


Unnamed: 0,0,1,2,3,4,5,6,7
0,Predict,Actual,L Name,L ID,W Name,W ID,Prob For,Prob Against
1,False,False,Kansas,1242_2003,Syracuse,1393_2003,0.688,0.312
2,False,False,Illinois,1228_2005,North Carolina,1314_2005,0.585,0.415
3,False,False,Ohio St,1326_2007,Florida,1196_2007,0.642,0.358
4,False,False,Memphis,1272_2008,Kansas,1242_2008,0.632,0.368
5,False,False,Kentucky,1246_2014,Connecticut,1163_2014,0.549,0.451
6,False,False,Wisconsin,1458_2015,Duke,1181_2015,0.534,0.466
7,True,True,Georgia Tech,1210_2004,Connecticut,1163_2004,0.379,0.621
8,True,False,UCLA,1417_2006,Florida,1196_2006,0.402,0.598
9,True,True,Michigan St,1277_2009,North Carolina,1314_2009,0.336,0.664


In [27]:
# Per-season evaluation: hold out one season at a time (2003 through 2017),
# train on the remaining seasons and collect the predictions and accuracies.
indPredicts = [
    ["Predict", "Actual", "L Name", "L ID", "W Name", "W ID", "Prob For", "Prob Against"],
]
baseAccs, modelAccs = [], []
for i in range(2003, 2018):
    outputPreds, baselineAcc, modelAcc = getPredictions(str(i))
    baseAccs += [baselineAcc]
    modelAccs += [modelAcc]
    # Convert each numpy row to a plain list before storing it.
    for row in outputPreds:
        indPredicts += [row.tolist()]

# pd.DataFrame(indPredicts).to_csv("data/output/testResults.csv", index=False, header=False)
print(baseAccs, modelAccs)
# The header list is the first record, so it shows up as row 0 of the frame.
predsDF = pd.DataFrame(indPredicts)
predsDF



[0.703125, 0.734375, 0.734375, 0.640625, 0.734375, 0.734375, 0.734375, 0.6875, 0.6268656716417911, 0.6567164179104478, 0.6417910447761194, 0.6417910447761194, 0.7313432835820896, 0.7014925373134329, 0.7164179104477612] [1.0, 0.953125, 0.984375, 0.9375, 0.953125, 0.96875, 0.9375, 0.921875, 0.9850746268656716, 0.9253731343283582, 0.8955223880597015, 0.9104477611940298, 0.9402985074626866, 0.9253731343283582, 0.9104477611940298]


Unnamed: 0,0,1,2,3,4,5,6,7
0,Predict,Actual,L Name,L ID,W Name,W ID,Prob For,Prob Against
1,False,False,Kansas,1242_2003,Syracuse,1393_2003,0.521,0.479
2,False,False,Texas,1400_2003,Syracuse,1393_2003,0.514,0.486
3,False,False,Wake Forest,1448_2003,Auburn,1120_2003,0.904,0.096
4,False,False,Louisville,1257_2003,Butler,1139_2003,0.853,0.147
5,False,False,Xavier,1462_2003,Maryland,1268_2003,0.749,0.251
6,False,False,Florida,1196_2003,Michigan St,1277_2003,0.856,0.144
7,False,False,Alabama,1104_2003,Indiana,1231_2003,0.586,0.414
8,False,False,Mississippi St,1280_2003,Butler,1139_2003,0.761,0.239
9,False,False,Stanford,1390_2003,Connecticut,1163_2003,0.545,0.455
