Import packages and define filepaths.

In [1]:
import math
import pandas as pd
scoreDataFp = '/Users/eotles/Documents/workspace/MarchMadness/2k16/ScoreData_2k16.csv'
teamDataFp  = '/Users/eotles/Documents/workspace/MarchMadness/2k16/TeamData_2k16.csv'

Import game data from this season. Created a modified data file based on data available at http://www.masseyratings.com/. NOTE: I am intentionally ignoring home/away information - maybe I'll get to this next year.

In [2]:
data = pd.read_csv(scoreDataFp)
#ignore home/away
for team_col in ['Team 1', 'Team 2']:
    data[team_col] = data[team_col].str.replace('@', '')
data.head()

Unnamed: 0,Date,Team 1,Team 1 Score,Team 2,Team 2 Score,Unnamed: 5
0,9/25/15,Lakehead,88,Manitoba,81,
1,9/26/15,Lakehead,80,Manitoba,73,
2,10/1/15,Lakehead,78,Winnipeg,73,
3,10/2/15,Concordia QC,77,Queen's ON,76,
4,10/2/15,Winnipeg,76,Lakehead,73,


Import team ranking information.

In [3]:
team_data = pd.read_csv(teamDataFp)
teams = list(team_data['Team Name'].unique())

A class to keep track of score information, it's driver method, and a simple pageRank approximator method.

In [4]:
class score_keeper(object):
    def __init__(self):
        self.score_dict = {}
    
    def add_game(self, game):
        for team1, _ in game.iteritems():
            for team2, score2 in game.iteritems():
                self._add_score(team1, team2, score2)
    
    #team 1 gives points to team 2
    def _add_score(self, team1, team2, score):
        if(self.score_dict.has_key(team1) == False):
            self.score_dict[team1] = {}
        if(self.score_dict[team1].has_key(team2) == False):
            self.score_dict[team1][team2] = 0
        self.score_dict[team1][team2] += score
        

def fill_scores(data, scoreFx):
    scores = score_keeper()

    for row_idx, row in enumerate(data.itertuples()):
        if(row[2] in teams and row[4] in teams):
            s1, s2 = scoreFx(row[3], row[5])
            game = {row[2]: s1, 
                    row[4]: s1}
            scores.add_game(game)
    
    return(scores)

def apprx_pageRank(scores):
    score_data = pd.DataFrame(scores.score_dict).fillna(0)
    score_data = score_data.div(score_data.sum(axis=1), axis=0)

    conv = score_data
    for _ in xrange(50):
        conv = conv.dot(conv)
    return(conv)

We can choose to give teams differing amounts of credit for how they play each game. Here are their descriptions:
rawScore: directly uses the scores from the game - so running up the score is advantageous,
equalizedScore: ensures all games are weighted equally but distributes credit porportional to score,
sigmoidScore: uses a sigmoid function (https://en.wikipedia.org/wiki/Logistic_function) to assign credit based on point-differential.

In [5]:
def rawScore(s1, s2):
    return(int(s1), int(s2))

def equalizedScore(s1, s2):
    s1, s2 = rawScore(s1, s2)
    total = s1+s2
    s1 = float(s1)/total
    s2 = float(s2)/total
    return(s1, s2)

def sigmoidScore(s1, s2):
    x = float(s1 - s2)
    s1 = 0.05 + 0.9/(1+math.exp(x/6))
    s2 = 1-s2
    return(s1, s2)

apr_res = {}
for scoreFx in [rawScore, equalizedScore, sigmoidScore]:
    apr_res[scoreFx.__name__] = apprx_pageRank(fill_scores(data, scoreFx)).ix[1]
apr_res = pd.DataFrame(apr_res)

<function rawScore at 0x10f6976e0>
<function equalizedScore at 0x10f697578>
<function sigmoidScore at 0x10f697668>


Here's a peak at the scores...

In [6]:
apr_res.head()

Unnamed: 0,equalizedScore,rawScore,sigmoidScore
Arizona,0.021282,0.022301,0.027308
Ark Little Rock,0.003356,0.002782,0.003497
Austin Peay,0.003882,0.003925,0.000771
Baylor,0.030564,0.03069,0.02739
Buffalo,0.005407,0.005521,0.001513


Convert scores to ranks and rename some columns.

In [10]:
ranks = apr_res.rank(ascending=False)
ranks = ranks.reset_index()
renamed_cols = {col: col.replace('Score', 'Rank') for col in ranks.columns if 'Score' in col}
renamed_cols['index'] = 'Team Name'
ranks = ranks.rename(columns=renamed_cols)
ranks = ranks.merge(team_data, how='left', on='Team Name')
ranks = ranks.set_index('Team Name')
ranks.index.name = None
ranks = ranks.sort_values(by='Rank')

Here are the calculated ranks alongside the offical rank.

In [9]:
pd.set_option('display.max_rows', len(ranks))
ranks
#pd.reset_option('display.max_rows')

Unnamed: 0,equalizedRank,rawRank,sigmoidRank,Rank
Kansas,2,2,5,1
North Carolina,9,8,11,2
Virginia,31,33,18,3
Oregon,7,7,22,4
Michigan St,28,28,30,5
Oklahoma,6,5,2,6
Villanova,14,13,27,7
Xavier,19,15,37,8
West Virginia,11,10,16,9
Miami FL,34,35,36,10


Really should do some validation to see if one of these models has forecasting power. Maybe next year.  ¯\\_(ツ)_/¯