In [32]:
import pandas as pd 
import numpy as np
import team, game as g
from sklearn.ensemble import RandomForestClassifier
# Used for developing visual of Random Forest if desired
from sklearn.tree import export_graphviz
from sklearn.externals.six import StringIO
from IPython.display import Image
import pydotplus

# Analysis Summary

#### Our goal from this analysis will be to predict the outcome of March Madness games. To do this, we will be using a Random Forest that takes in historical data from NCAA tourney matchups as input and outputs predictions for the winning team for each game in a March Madness tournament. 

### Baseline Model
#### The baseline model we will use to compare our model's results against will be solely tied to a team's RPI. Our baseline model poses the following hypothesis:
That in any NCAA tournament game, the team with the lower RPI will win the game. 
Intuitively, this is a reasonable prediction. RPI (Rating Percentage Index) ranks teams based on their wins, losses, and strength of schedule for the past season. If team A has a lower RPI than team B at the end of a season, it is fair to assume that team A's performance throughout the season has been at a higher level than team B's. For this reason, we can guess that any new basketball fan watching their first March Madness tournament would choose team A over team B in a game, disregarding any other biases.

### Random Forest Approach
We would like to build on our baseline model and see if we can develop an approach that more accurately predicts the outcome of games. One of the most exciting parts of March Madness is the array of upsets that occur throughout the tournament. In general, upsets occur when a team with a higher RPI beats a team with a lower RPI. We would like to create a model that performs better than our baseline model by more accurately predicting the outcome of games, in particular predicting when upsets occur. For us to accomplish this, we can utilize RPI as well as some additional attributes that provide more information surrounding each team's level of performance in the past season. These factors can be used to build a random forest in order to: 
1) Identify which factors are correlated to predicting the outcome of a game
2) Predict when an upset is going to occur in March Madness

We will build a random forest by passing as input a data frame where each row corresponds to an NCAA tournament game. Each row will contain data regarding each team's yearly averages and totals in statistical categories, RPI, the game's outcome, and whether or not the team with the lower RPI won. This last piece of information will be our dependent variable. The random forest will utilize each feature in our training data set describing the winning team's and losing team's performance during the season, in order to learn which factors are tied to predicting the outcome of a game. Once the random forest has been trained, tournament data where the outcome of each game has been excluded can be used as input to the model to generate a set of predictions for each game in that year's tournament.

### Data Sources Overview
- Kaggle provided

### Data Cleansing/Preparation
- Description of chosen statistical categories to use in model
- Finding yearly averages/totals for each team

### Random Forest Creation
- Created 15 years' worth of predictions
- Parameter Selection
    - OOB when creating RF each time
    - Max features
    - n trees

### Model Results
- 15 Year comparison with baseline model
- Championship game predictions vs. baseline model

### Conclusion




In [33]:
def getNCAATeamiIds():
    """
    Collect every team/season pair that played in an NCAA tournament game.

    Reads the compact tournament results and, for the winner and the loser of
    each game, builds a key of the form "<TeamID>_<Season>". Returns a
    dictionary mapping each distinct key to 1 (it is used as a membership set).
    """
    tourneyGames = pd.read_csv("data/NCAATourneyCompactResults.csv")
    seenTeams = {}
    for _, game in tourneyGames.iterrows():
        seasonSuffix = "_" + str(game["Season"])
        # Winner first, then loser, so keys are inserted in the same order as before.
        for idColumn in ("WTeamID", "LTeamID"):
            seenTeams.setdefault(str(game[idColumn]) + seasonSuffix, 1)
    return seenTeams

def getTeamNames():
    """
    Build a lookup from team ID to team name.

    Reads the Teams CSV and returns a dictionary whose keys are the values of
    the "TeamID" column and whose values are the matching "TeamName" entries.
    """
    teamTable = pd.read_csv("Data/Teams.csv")
    return {record["TeamID"]: record["TeamName"] for _, record in teamTable.iterrows()}

In [12]:
def getSeasonStats(ncaaTourneyTeams):
    """
    Build one Team object per team/season from regular season results.

    Args:
        ncaaTourneyTeams: container of "<TeamID>_<Season>" keys (as produced
            from the tournament results). A win against a team whose key is
            in this container increments the winner's `winsVsTourney`.

    Returns:
        Dictionary mapping "<TeamID>_<Season>" to a `team.Team` object whose
        `name`, `RPI` and accumulated stats (via `updateStats`) have been
        filled in from every regular season game the team played. `RPI` is
        None when no RPI ranking row exists for that team and season.
    """
    teams = {}
    names = getTeamNames()
    unfiltRanks = pd.read_csv("data/MasseyOrdinals_Prelim2018.csv")
    # Keep only the RPI system at ranking day 133
    # (presumably the last ranking before the tournament -- TODO confirm).
    rankings = unfiltRanks[(unfiltRanks["SystemName"] == "RPI") & (unfiltRanks["RankingDayNum"] == 133)]

    # Index the rankings once instead of filtering the frame for every game.
    # setdefault keeps the first row per (season, team), matching `.iloc[0]`.
    rpiBySeasonTeam = {}
    for rankSeason, rankTeamId, rank in zip(rankings["Season"], rankings["TeamID"], rankings["OrdinalRank"]):
        rpiBySeasonTeam.setdefault((rankSeason, rankTeamId), rank)

    regSeasonResults = pd.read_csv("data/RegularSeasonDetailedResults.csv")
    for index, row in regSeasonResults.iterrows():
        season = row["Season"]
        wTeamId = row["WTeamID"]
        lTeamId = row["LTeamID"]
        customWId = str(wTeamId) + "_" + str(season)
        customLId = str(lTeamId) + "_" + str(season)

        # Look the two teams up independently: previously both lookups shared
        # one try block, so a missing winner ranking also discarded the
        # loser's RPI (and overwrote a value set by an earlier game).
        wRPI = rpiBySeasonTeam.get((season, wTeamId))
        lRPI = rpiBySeasonTeam.get((season, lTeamId))

        if customWId not in teams:
            teams[customWId] = team.Team(customWId)
        if customLId not in teams:
            teams[customLId] = team.Team(customLId)

        wTeam = teams[customWId]
        wTeam.RPI = wRPI
        wTeam.name = names[wTeamId]
        wTeam.updateStats(row, True)
        if customLId in ncaaTourneyTeams:
            wTeam.winsVsTourney += 1

        lTeam = teams[customLId]
        lTeam.name = names[lTeamId]
        lTeam.RPI = lRPI
        lTeam.updateStats(row, False)
    return teams

In [18]:
def getMatchups(teams):
    """
    Join NCAA tournament results with each team's season data.

    Args:
        teams: dictionary mapping "<TeamID>_<Season>" to an object exposing
            `objToDict()`, which returns that team's season data as a dict.

    Returns:
        Data frame with one row per tournament game for which both teams are
        present in `teams`. Every key of the winner's dict is prefixed with
        "w", every key of the loser's dict with "l", and the game's "dayNum"
        and "season" are added. Games with a missing team are skipped.
    """
    matchups = []
    ncaaTournResults = pd.read_csv("data/NCAATourneyCompactResults.csv")
    for index, row in ncaaTournResults.iterrows():
        season = row["Season"]
        dayNum = row["DayNum"]
        customWId = str(row["WTeamID"]) + "_" + str(season)
        customLId = str(row["LTeamID"]) + "_" + str(season)

        if customWId not in teams or customLId not in teams:
            continue

        # Build fresh prefixed dicts. The previous version added and deleted
        # keys while iterating over dict.keys(), which raises RuntimeError on
        # Python 3 because keys() is a live view there.
        matchupData = {"w" + key: value for key, value in teams[customWId].objToDict().items()}
        matchupData.update(("l" + key, value) for key, value in teams[customLId].objToDict().items())
        matchupData["dayNum"] = dayNum
        matchupData["season"] = season
        matchups.append(matchupData)

    return pd.DataFrame(matchups)

def getMatchupData():
    """
    Return the data frame of historical NCAA tournament matchups.

    Reads the cached CSV at "Data/output/matchups.csv" when it can be loaded.
    Otherwise rebuilds the frame (tournament team ids -> season stats ->
    matchups), writes it to that same path without the index, and returns it.
    """
    try:
        return pd.read_csv("Data/output/matchups.csv")
    except (IOError, ValueError):
        # IOError: cache file missing/unreadable. ValueError: pandas could not
        # parse it (e.g. empty file). Either way, rebuild from the raw data.
        # Fixed: this branch used to call populateNCAATourneyTeams(), which is
        # not defined in this notebook; getNCAATeamiIds() builds that dict.
        ncaaTourneyTeams = getNCAATeamiIds()
        teamObjs = getSeasonStats(ncaaTourneyTeams)
        matchups = getMatchups(teamObjs)
        # NOTE(review): other cells read from "data/..." (lower case); confirm
        # which spelling exists on case-sensitive file systems.
        matchups.to_csv("Data/output/matchups.csv", index=False)
        return matchups

In [19]:
def findChampionshipMatches():
    """
    Return the NCAA tourney matchups with an extra boolean "chipGame" column.

    "chipGame" is True for rows whose "dayNum" equals the largest "dayNum"
    of their season, i.e. the last game day recorded for that season, which
    this notebook treats as the championship game.
    """
    matchups = getMatchupData()
    # transform broadcasts each season's maximum dayNum back onto every row of
    # that season, so it can be compared element-wise with the dayNum column.
    # The string "max" is used instead of the builtin `max`: passing the
    # builtin is a deprecated alias in recent pandas releases.
    lastDayOfSeason = matchups.groupby(["season"])["dayNum"].transform("max")
    matchups["chipGame"] = lastDayOfSeason == matchups["dayNum"]
    return matchups


In [42]:
# Flag each season's final game, then display only those championship rows.
matchups = findChampionshipMatches()
matchups.loc[matchups["chipGame"]]

Unnamed: 0,dayNum,lDRB,lEFG,lFTA,lFTP,lMOL,lMOV,lORB,lPOSS,lRPI,...,wTO,wTOF,w_id,wconfTournWins,wdEff,wname,wnumGamesPlayed,woEff,wwinsVsTourney,chipGame
63,154,26.9,0.408333,24.133333,0.661631,5.867986,23.581709,14.3,72.553333,6,...,13.62069,14.448276,1393_2003,2,98.529129,Syracuse,29,112.749541,5,True
127,154,26.09375,0.590909,23.0,0.681974,3.114224,19.534958,11.375,71.66875,16,...,13.787879,12.424242,1163_2004,3,89.8857,Connecticut,33,111.097074,8,True
191,154,23.393939,0.5,17.151515,0.725846,0.033333,17.893417,10.969697,64.618182,2,...,16.677419,19.032258,1314_2005,2,91.615572,North Carolina,31,115.602174,8,True
255,154,22.090909,0.683673,19.212121,0.699394,2.59353,12.360264,10.878788,62.109091,10,...,14.878788,16.272727,1196_2006,4,95.169808,Florida,33,116.501456,7,True
319,154,24.515152,0.5625,19.606061,0.70247,3.323661,16.869421,11.181818,64.115152,1,...,14.0,13.470588,1196_2007,4,92.638035,Florida,34,119.599871,8,True
383,154,26.676471,0.783019,23.852941,0.596792,0.148148,19.755656,14.117647,69.394118,3,...,12.909091,15.848485,1242_2008,4,88.485578,Kansas,33,116.404478,8,True
447,154,24.9375,0.429577,23.3125,0.698263,10.656257,15.333729,13.96875,65.16875,6,...,13.0,15.709677,1314_2009,2,96.797445,North Carolina,31,118.741033,6,True
511,154,23.59375,0.728261,23.9375,0.741678,3.209877,12.074432,9.25,62.91875,12,...,11.117647,14.794118,1181_2010,4,91.111407,Duke,34,115.043575,8,True
578,154,24.064516,0.59,21.451613,0.72485,10.281282,13.577203,10.870968,64.806452,33,...,11.742857,12.142857,1163_2011,5,99.982906,Connecticut,35,110.543668,12,True
645,154,25.787879,0.467213,22.545455,0.696358,5.575745,19.243111,11.121212,66.109091,6,...,11.352941,11.882353,1246_2012,3,86.961502,Kentucky,34,112.715253,11,True


In [20]:
def _evaluateRandomForest(train, test, dropCols):
    """
    Fit a random forest on `train`, score it on `test`, and score the baseline.

    Args:
        train, test: matchup data frames that contain the boolean "baseline"
            column plus the "lname", "l_id", "wname" and "w_id" columns.
        dropCols: identifier / label columns removed before the remaining
            columns are used as features.

    Returns:
        (rows, baselineAcc, modelAcc). `rows` has one row per test matchup,
        ordered by prediction, with columns: prediction, actual label, loser
        name, loser id, winner name, winner id, and the two `predict_proba`
        columns 0 and 1 (in the classifier's class order).

    Raises:
        ValueError: if `test` has no rows, so no accuracy can be computed.

    NOTE(review): the label is `wRPI < lRPI`, while wRPI and lRPI are not in
    `dropCols` and therefore stay in the feature matrix, so the classifier can
    derive the label directly from its inputs. The w*/l* column layout also
    fixes which team won. The accuracies returned here should not be read as
    out-of-sample accuracy for picking game winners -- confirm intent.
    """
    if test.shape[0] == 0:
        raise ValueError("No test matchups selected; cannot compute accuracy")

    # Share of test games in which the lower-RPI team won.
    baselineAcc = 1.0 * sum(test["baseline"]) / test.shape[0]

    trainLabels = np.array(train["baseline"])
    testLabels = np.array(test["baseline"])
    testNames = np.column_stack((test["lname"], test["l_id"], test["wname"], test["w_id"]))

    # Drop qualitative & output columns
    trainFeatures = np.array(train.drop(dropCols, axis=1))
    testFeatures = np.array(test.drop(dropCols, axis=1))
    # Square root of the feature count as the per-split feature budget.
    maxFeatures = int(trainFeatures.shape[1] ** 0.5)

    rf = RandomForestClassifier(n_estimators=1000, random_state=42, oob_score=True, max_features=maxFeatures)
    rf.fit(trainFeatures, trainLabels)

    predictions = rf.predict(testFeatures)
    predictProbs = rf.predict_proba(testFeatures)
    # Fraction of test games where the prediction equals the label.
    modelAcc = float(np.mean(predictions == testLabels))
    stack = np.column_stack((predictions.T, testLabels.T, testNames[:, 0], testNames[:, 1], testNames[:, 2], testNames[:, 3], predictProbs[:, 0], predictProbs[:, 1]))
    return stack[stack[:, 0].argsort()], baselineAcc, modelAcc


def getPredictionsChips():
    """
    Predict every championship game with a random forest trained on all
    non-championship tournament games.

    The label is whether the winning team had the lower RPI; the baseline
    model always takes the team with the lower RPI as the winner.

    Returns:
        Tuple of (array with one row per championship game holding the
        prediction, the actual label, team names/ids and class probabilities,
        the baseline model's accuracy, the random forest's accuracy).
    """
    matchups = findChampionshipMatches()
    matchups["baseline"] = matchups["wRPI"] < matchups["lRPI"]
    isChip = matchups["chipGame"] == True
    dropCols = ["w_id", "l_id", "baseline", "wname", "lname", "season", "dayNum", "chipGame"]
    return _evaluateRandomForest(matchups[~isChip], matchups[isChip], dropCols)


### Utilize historical matchup data to build RF model.
def getPredictions(year, train=None, test=None):
    """
    Predict the tournament games of one season with a random forest trained
    on the games of every other season.

    Args:
        year: season to hold out as the test set, e.g. "2003" or 2003.
        train, test: ignored; kept so existing calls keep working. The
            train/test split is always derived from `year`.

    Returns:
        Tuple of (array with one row per test game holding the prediction,
        the actual label, team names/ids and class probabilities, the
        baseline model's accuracy, the random forest's accuracy).

    Raises:
        ValueError: if no matchup belongs to `year`.
    """
    matchups = getMatchupData()
    matchups["baseline"] = matchups["wRPI"] < matchups["lRPI"]
    # w_id is "<TeamID>_<Season>"; anchoring on the "_<year>" suffix avoids
    # regex/substring matching and also accepts integer years.
    inTestSeason = matchups["w_id"].str.endswith("_" + str(year))
    dropCols = ["w_id", "l_id", "baseline", "wname", "lname", "season", "dayNum"]
    return _evaluateRandomForest(matchups[~inTestSeason], matchups[inTestSeason], dropCols)

In [21]:
def drawTree(rf, treeName, feature_names=None):
    """
    Render the first tree of a fitted random forest to "<treeName>.pdf".

    Args:
        rf: fitted forest exposing `estimators_`; only `estimators_[0]` is drawn.
        treeName: output file name without the ".pdf" extension.
        feature_names: optional list of feature names used to label the split
            nodes. Previously this function read an undefined global
            `feature_names` and failed with NameError; with the default None,
            export_graphviz falls back to its generic feature labels.
    """
    # out_file=None makes export_graphviz return the DOT source as a string,
    # so no StringIO buffer (formerly sklearn.externals.six) is needed.
    dot_data = export_graphviz(rf.estimators_[0], out_file=None, filled=True, rounded=True, special_characters=True, feature_names=feature_names)
    graph = pydotplus.graph_from_dot_data(dot_data)
    graph.write_pdf("{}.pdf".format(treeName))

In [49]:
# The first row holds the column labels; one row per predicted game follows.
indPredicts = [["Predict", "Actual", "L Name", "L ID", "W Name", "W ID", "Prob For", "Prob Against"]]
baseAccs = []
modelAccs = []

# Chip games testing
outputPreds, baselineAcc, modelAcc = getPredictionsChips()
print("Baseline Model Accuracy: {}".format(round(baselineAcc, 2)))
print("Our Model's Accuracy: {}".format(round(modelAcc, 2)))
# Fixed: the loop iterated over `output`, a name this cell never defines
# (NameError on a fresh kernel); the predictions are in `outputPreds`.
for row in outputPreds:
    indPredicts.append(row.tolist())
predsDF = pd.DataFrame(indPredicts)
predsDF
# pd.DataFrame(indPredicts).to_csv("data/output/chipTestResults.csv", index=False, header=False)



Baseline Model Accuracy: 0.53
Our Model's Accuracy: 0.93


Unnamed: 0,0,1,2,3,4,5,6,7
0,Predict,Actual,L Name,L ID,W Name,W ID,Prob For,Prob Against
1,False,False,Kansas,1242_2003,Syracuse,1393_2003,0.688,0.312
2,False,False,Illinois,1228_2005,North Carolina,1314_2005,0.585,0.415
3,False,False,Ohio St,1326_2007,Florida,1196_2007,0.642,0.358
4,False,False,Memphis,1272_2008,Kansas,1242_2008,0.632,0.368
5,False,False,Kentucky,1246_2014,Connecticut,1163_2014,0.549,0.451
6,False,False,Wisconsin,1458_2015,Duke,1181_2015,0.534,0.466
7,True,True,Georgia Tech,1210_2004,Connecticut,1163_2004,0.379,0.621
8,True,False,UCLA,1417_2006,Florida,1196_2006,0.402,0.598
9,True,True,Michigan St,1277_2009,North Carolina,1314_2009,0.336,0.664


In [56]:
# Hold out one season at a time (2003-2017), train on the others, and collect
# the per-game predictions plus the per-season accuracies of both models.
indPredicts = [["Predict", "Actual", "L Name", "L ID", "W Name", "W ID", "Prob For", "Prob Against"]]
baseAccs, modelAccs = [], []
seasons = range(2003, 2018)
for i in seasons:
    outputPreds, baselineAcc, modelAcc = getPredictions(str(i))
    baseAccs.append(round(baselineAcc, 2))
    modelAccs.append(round(modelAcc, 2))
    indPredicts.extend(row.tolist() for row in outputPreds)

# pd.DataFrame(indPredicts).to_csv("data/output/testResults.csv", index=False, header=False)
accDF = pd.DataFrame({"Season": seasons, "Baseline": baseAccs, "RF": modelAccs})
predsDF = pd.DataFrame(indPredicts)


In [55]:
# Preview the first rows of the per-game prediction table. Row 0 holds the
# column labels because they were appended as the first data row.
predsDF.head()

Unnamed: 0,0,1,2,3,4,5,6,7
0,Predict,Actual,L Name,L ID,W Name,W ID,Prob For,Prob Against
1,False,False,Kansas,1242_2003,Syracuse,1393_2003,0.521,0.479
2,False,False,Texas,1400_2003,Syracuse,1393_2003,0.514,0.486
3,False,False,Wake Forest,1448_2003,Auburn,1120_2003,0.904,0.096
4,False,False,Louisville,1257_2003,Butler,1139_2003,0.853,0.147


In [54]:
# Per-season accuracy of the RPI baseline ("Baseline") and the random forest ("RF").
accDF

Unnamed: 0,Season,Baseline,RF
0,2003,0.7,1.0
1,2004,0.73,0.95
2,2005,0.73,0.98
3,2006,0.64,0.94
4,2007,0.73,0.95
5,2008,0.73,0.97
6,2009,0.73,0.94
7,2010,0.69,0.92
8,2011,0.63,0.99
9,2012,0.66,0.93
