In [2]:
URL = 'http://www.jhowell.net/cf/scores/Sked2015.htm'

In [3]:
import urllib2
import string
import re
import operator
from bs4 import BeautifulSoup

In [4]:
#Page is cached. Uncomment to reload
response = urllib2.urlopen(URL)
page = BeautifulSoup(response.read(), 'html.parser')

Functions to Extract Games

In [5]:
def get_games(team):
    return team.find_all('tr')[1:]

In [6]:
def get_team_1(team):
    try:
        return team.find_all('p')[0].text
    except:
        return None

In [7]:
def get_score_1(game):
    try:
        score = int(game.find_all('td')[5].text)
        return score
    except:
        return None

In [8]:
def get_score_2(game):
    try:
        score = int(game.find_all('td')[6].text)
        return score
    except:
        return None

In [9]:
def get_opponent(game):
    try:
        opp = game.find_all('td')[3].text
        opp = string.strip(opp, '*')
        return opp
    except:
        return None

In [10]:
def del_conf(team_name):
    my_re = re.compile(r"( \(\w\w+ \w+\)$)|( \(\w+\)$)")
    try:
        return re.sub(my_re, '', team_name)
    except:
        return team_name

In [11]:
def get_game_details(game, team1):
    if get_score_1(game):
        return (del_conf(team1), del_conf(get_opponent(game)), get_score_1(game), get_score_2(game))
    else:
        return

In [12]:
def parse_games(team):
    team1 = get_team_1(team)
    games = [get_game_details(game, team1) for game in get_games(team)[:-1] \
             if get_game_details(game, team1)]
    return games

In [13]:
teams = page.find_all('table')

In [14]:
get_score_1(parse_games(teams[0])[1][0])

#### Now, get all the games

In [15]:
all_games = []
for team in teams:
    all_games.extend(parse_games(team))

In [16]:
len(all_games)

1526

There are approximately twice as many games as there should be because the teams appear in both orders

In [17]:
def alpha(game):
    team1, team2, score1, score2 = game
    if team1 < team2:
        return game
    else:
        return (team2, team1, score2, score1)

In [18]:
all_games_alpha = [alpha(game) for game in all_games]
all_games_alpha = list(set(all_games_alpha))

In [19]:
len(all_games_alpha)

849

It works!

### Random Walk Model

In [20]:
import networkx as nx

First, prepare the edges.

In [21]:
def game_to_edge_ratio(game):
    team1, team2, score1, score2 = game
    if score1 <= score2:
        return (team1, team2, np.sqrt(float(score2-score1)/score2))
    else:
        return (team2, team1, np.sqrt(float(score1-score2)/score1))

In [22]:
def game_to_edge(game):
    team1, team2, score1, score2 = game
    if score1 <= score2:
        return (team1, team2, float(score2-score1))
    else:
        return (team2, team1, float(score1-score2))

In [23]:
edge_list = [game_to_edge(game) for game in all_games_alpha]

Sanity Check: Did I get all the games with no repeats?

In [24]:
print len([edge for edge in edge_list if edge[0]=='Michigan' or edge[1]=='Michigan'])
[edge for edge in edge_list if edge[0]=='Michigan' or edge[1]=='Michigan']

12


[(u'Nevada-Las Vegas', u'Michigan', 21.0),
 (u'Michigan', u'Ohio State', 29.0),
 (u'Michigan', u'Utah', 7.0),
 (u'Maryland', u'Michigan', 28.0),
 (u'Northwestern', u'Michigan', 38.0),
 (u'Rutgers', u'Michigan', 33.0),
 (u'Penn State', u'Michigan', 12.0),
 (u'Minnesota', u'Michigan', 3.0),
 (u'Oregon State', u'Michigan', 28.0),
 (u'Brigham Young', u'Michigan', 31.0),
 (u'Indiana', u'Michigan', 7.0),
 (u'Michigan', u'Michigan State', 4.0)]

In [25]:
G = nx.DiGraph()

In [26]:
G.add_weighted_edges_from(edge_list)

In [27]:
ranked = nx.pagerank_numpy(G, alpha=.65)

I'll need the teams alphabetically sorted later...

In [28]:
page_rank_sorted = sorted(ranked.items(), key=operator.itemgetter(0))

But let's look at what the teams look like sorted by the random walk ranking.

In [29]:
sorted(ranked.items(), key=operator.itemgetter(1))[:-20:-1]

[(u'Michigan State', 0.027315590655555846),
 (u'Clemson', 0.024016553126832024),
 (u'Mississippi', 0.021393158728406494),
 (u'Nebraska', 0.021166713587665323),
 (u'Oklahoma', 0.01953325110156494),
 (u'Alabama', 0.019247611682379314),
 (u'Texas', 0.01656581759414959),
 (u'Ohio State', 0.014897761592585502),
 (u'Stanford', 0.014805920449450682),
 (u'Florida', 0.014299861011530583),
 (u'Florida State', 0.014212947786712076),
 (u'Michigan', 0.014153615545303208),
 (u'Northwestern', 0.013977634777024739),
 (u'Oklahoma State', 0.013062217936205042),
 (u'Iowa', 0.012918017884925255),
 (u'Notre Dame', 0.012584219720175715),
 (u'North Carolina', 0.012269137960045085),
 (u'Utah', 0.012199946394122025),
 (u'Southern California', 0.011959238718724252)]

### Regression Model

In [30]:
import numpy as np

In [31]:
all_teams = set()
for team1, team2, _, _ in all_games_alpha:
    all_teams.add(team1)
    all_teams.add(team2)
all_teams = list(all_teams)

I want to make a matrix with a row for each game. The teams will be the columns. I'll assign each team a number that will correspond to the column index.

In [32]:
all_teams_tuples = zip(all_teams, range(len(all_teams)))
team_dict = dict(all_teams_tuples)

In [33]:
my_matrix = np.empty((0,len(all_teams)), int)
my_vector = np.empty((0, 1))

In [34]:
for team1, team2, score1, score2 in all_games_alpha:
    row = np.zeros(len(all_teams))
    row[team_dict[team1]]=1
    row[team_dict[team2]]=-1
    my_matrix = np.append(my_matrix, [row], axis=0)
    my_vector = np.append(my_vector, [[score1 - score2]], axis=0)

#adding the Massey Fudge Factor
my_matrix = np.append(my_matrix, [np.ones(len(all_teams))], axis=0)
my_vector = np.append(my_vector, [[0]], axis=0)

Sanity check: Do my matrix and vector have compatible dimensions?

In [35]:
print my_matrix.shape
print my_vector.shape

(850, 209)
(850, 1)


Least Squares -- my favorite thing

In [36]:
soln = np.linalg.lstsq(my_matrix, my_vector)

In [37]:
ranks = list(np.ndarray.flatten(soln[0]))

In [38]:
ranked = zip(all_teams, ranks)

In [39]:
sorted_x = sorted(ranked, key=operator.itemgetter(1))

Let's see the results

In [40]:
sorted_x[:-20:-1]

[(u'Oklahoma', 43.892517133063997),
 (u'Alabama', 36.088531668831735),
 (u'Ohio State', 35.737391707293526),
 (u'Baylor', 35.515951626404529),
 (u'Portland State', 34.376327474231836),
 (u'Clemson', 33.573424032981038),
 (u'North Carolina', 33.170438199393089),
 (u'Stanford', 30.998567560992356),
 (u'Notre Dame', 30.658449415711758),
 (u'Texas Christian', 30.393717621607674),
 (u'Mississippi', 30.278975427385006),
 (u'Michigan State', 29.67441738132484),
 (u'Oklahoma State', 29.640534564153739),
 (u'West Virginia', 29.220940046822321),
 (u'Michigan', 28.928575845412027),
 (u'Tennessee', 28.875982671115157),
 (u'Florida State', 28.289389014105325),
 (u'Southern California', 27.79823902570385),
 (u'Iowa', 27.384779267523122)]

Portland State! Are you kidding me? I'll use regularization to correct this apparent over-fitting.

### Regularized Regression

In [41]:
from sklearn import linear_model, grid_search, preprocessing

In [42]:
lm = linear_model.Ridge()
parameters = {'alpha':[0.01, 0.1, 1, 10, 100]}


In [43]:
clf = grid_search.GridSearchCV(lm, parameters)

In [44]:
clf.fit(my_matrix, my_vector)

GridSearchCV(cv=None, error_score='raise',
       estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, solver='auto', tol=0.001),
       fit_params={}, iid=True, loss_func=None, n_jobs=1,
       param_grid={'alpha': [0.01, 0.1, 1, 10, 100]},
       pre_dispatch='2*n_jobs', refit=True, score_func=None, scoring=None,
       verbose=0)

In [45]:
ridge = clf.best_estimator_

In [46]:
ridge.fit(my_matrix, my_vector)

Ridge(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, solver='auto', tol=0.001)

I have my model, and it's been fited. Let's take a look.

In [47]:
linear_rank = zip(all_teams, list(preprocessing.scale(ridge.coef_[0])))

In [48]:
sorted(linear_rank, key=operator.itemgetter(1))[:-20:-1]

[(u'Oklahoma', 1.9973154915506381),
 (u'Alabama', 1.6442108323922235),
 (u'Ohio State', 1.6405665018802429),
 (u'Baylor', 1.5933948927333106),
 (u'Clemson', 1.533437994312824),
 (u'North Carolina', 1.5209924933965138),
 (u'Portland State', 1.5201599715152863),
 (u'Stanford', 1.4272528786186829),
 (u'Notre Dame', 1.3986416195180393),
 (u'Mississippi', 1.3906418447099382),
 (u'Texas Christian', 1.3854220388407812),
 (u'Michigan State', 1.3482610862597395),
 (u'Oklahoma State', 1.331915809423772),
 (u'West Virginia', 1.3255555932541623),
 (u'Tennessee', 1.3231521782847095),
 (u'Michigan', 1.3108131258562579),
 (u'Florida State', 1.294242932858116),
 (u'Southern California', 1.2748292665186916),
 (u'Washington', 1.2425086328785349)]

It knocked down Portland State a little bit, but not much...

### Ensemble

I need to put the two models on the same scale.

In [49]:
linear_rank_sorted = sorted(linear_rank, key=operator.itemgetter(0))

In [50]:
page_ranks = list(preprocessing.scale(zip(*page_rank_sorted)[1]))

In [51]:
final = zip(linear_rank_sorted, page_ranks)

In [52]:
final = [(team, first + second) for (team, first), second in final]

In [53]:
final = sorted(final, key=operator.itemgetter(1))

In [54]:
final[:-50:-1]

[(u'Michigan State', 6.3305334555398547),
 (u'Clemson', 5.7861921324823227),
 (u'Oklahoma', 5.2586742501493324),
 (u'Mississippi', 5.0632831395480622),
 (u'Alabama', 4.8424059676597313),
 (u'Nebraska', 4.4504476386806999),
 (u'Ohio State', 3.8768765674952488),
 (u'Stanford', 3.643254054838057),
 (u'Michigan', 3.382569710834948),
 (u'Florida State', 3.3791196934709307),
 (u'Texas', 3.3636451291354001),
 (u'North Carolina', 3.176033371871279),
 (u'Oklahoma State', 3.1623309588184769),
 (u'Florida', 3.1563705784657952),
 (u'Notre Dame', 3.1233567245829557),
 (u'Iowa', 3.0253105523967658),
 (u'Northwestern', 2.8744056795091248),
 (u'Southern California', 2.8613419320275528),
 (u'Texas Christian', 2.825229709896675),
 (u'Utah', 2.7358652784753144),
 (u'Houston', 2.5666576383583184),
 (u'Navy', 2.556719178314939),
 (u'Louisiana State', 2.5281781361262841),
 (u'Baylor', 2.5199280791545706),
 (u'Oregon', 2.3813302297935257),
 (u'South Florida', 2.1409994681981166),
 (u'Bowling Green State', 2.