In [2]:
URL = "http://www.jhowell.net/cf/scores/Sked2017.htm"

In [3]:
import urllib
import string
import re
import operator
from bs4 import BeautifulSoup

In [4]:
with urllib.request.urlopen(URL) as response:
   response = response.read()

In [5]:
#Page is cached. Uncomment to reload
page = BeautifulSoup(response, 'html.parser')

In [6]:
#page

Functions to Extract Games

In [7]:
def get_games(team):
    return team.find_all('tr')[1:]

In [8]:
def get_team_1(team):
    try:
        return team.find_all('p')[0].text
    except:
        return None

In [9]:
def get_score_1(game):
    try:
        score = int(game.find_all('td')[5].text)
        return score
    except:
        return None

In [10]:
def get_score_2(game):
    try:
        score = int(game.find_all('td')[6].text)
        return score
    except:
        return None

In [11]:
def get_opponent(game):
    try:
        opp = game.find_all('td')[3].text
        opp = opp.replace('*', '')
        return opp
    except:
        return None

In [12]:
def del_conf(team_name):
    my_re = re.compile(r"( \(\w\w+ \w+\)$)|( \(\w+\)$)")
    try:
        return re.sub(my_re, '', team_name)
    except:
        return team_name

In [13]:
def get_game_details(game, team1):
    if get_score_1(game):
        return (del_conf(team1), del_conf(get_opponent(game)), get_score_1(game), get_score_2(game))
    else:
        return

In [14]:
def parse_games(team):
    team1 = get_team_1(team)
    games = [get_game_details(game, team1) for game in get_games(team)[:-1] \
             if get_game_details(game, team1)]
    return games

In [15]:
teams = page.find_all('table')

#### Now, get all the games

In [19]:
all_games = []
for team in teams:
    all_games.extend(parse_games(team))

In [20]:
len(all_games)

920

In [28]:
all_games[:5]

[('Air Force', 'Virginia Military Institute', 62, 0),
 ('Air Force', 'Michigan', 13, 29),
 ('Air Force', 'San Diego State', 24, 28),
 ('Air Force', 'New Mexico', 38, 56),
 ('Air Force', 'Navy', 45, 48)]

There are approximately twice as many games as there should be because the teams appear in both orders

In [29]:
def alpha(game):
    team1, team2, score1, score2 = game
    if team1 < team2:
        return game
    else:
        return (team2, team1, score2, score1)

In [30]:
all_games_alpha = [alpha(game) for game in all_games]
all_games_alpha = list(set(all_games_alpha))

In [31]:
len(all_games_alpha)

525

It works!

In [32]:
##TODO: Make this object oriented.

### Random Walk Model

In [33]:
import networkx as nx

First, prepare the edges.

In [34]:
def game_to_edge_ratio(game):
    team1, team2, score1, score2 = game
    if score1 <= score2:
        return (team1, team2, np.sqrt(float(score2-score1)/score2))
    else:
        return (team2, team1, np.sqrt(float(score1-score2)/score1))

In [35]:
def game_to_edge(game):
    team1, team2, score1, score2 = game
    if score1 <= score2:
        return (team1, team2, float(score2-score1))
    else:
        return (team2, team1, float(score1-score2))

In [36]:
edge_list = [game_to_edge(game) for game in all_games_alpha]

Sanity Check: Did I get all the games with no repeats?

In [37]:
print(len([edge for edge in edge_list if edge[0]=='Michigan' or edge[1]=='Michigan']))
[edge for edge in edge_list if edge[0]=='Michigan' or edge[1]=='Michigan']

7


[('Purdue', 'Michigan', 18.0),
 ('Indiana', 'Michigan', 7.0),
 ('Michigan', 'Penn State', 29.0),
 ('Florida', 'Michigan', 16.0),
 ('Air Force', 'Michigan', 16.0),
 ('Cincinnati', 'Michigan', 22.0),
 ('Michigan', 'Michigan State', 4.0)]

In [38]:
G = nx.DiGraph()

In [39]:
G.add_weighted_edges_from(edge_list)

In [40]:
ranked = nx.pagerank_numpy(G, alpha=.65)

I'll need the teams alphabetically sorted later...

In [41]:
page_rank_sorted = sorted(ranked.items(), key=operator.itemgetter(0))

But let's look at what the teams look like sorted by the random walk ranking.

In [42]:
sorted(ranked.items(), key=operator.itemgetter(1))[:-20:-1]

[('Georgia', 0.02755634387176749),
 ('Notre Dame', 0.02243333768137154),
 ('Clemson', 0.017777368984828477),
 ('Alabama', 0.016636833055618746),
 ('Mississippi State', 0.015981097638500013),
 ('Iowa State', 0.015928893782690558),
 ('Syracuse', 0.015827012920808355),
 ('Texas Christian', 0.015777768998249377),
 ('Oklahoma', 0.014337028804768984),
 ('Arizona State', 0.013354335517201218),
 ('Penn State', 0.012652002208014065),
 ('Central Florida', 0.012547097675031656),
 ('Auburn', 0.011972696838047876),
 ('Southern California', 0.011960887365065592),
 ('Washington', 0.011781285359938016),
 ('Ohio State', 0.011457010625894326),
 ('Texas', 0.010972944784021624),
 ('North Carolina State', 0.010670798095622734),
 ('South Carolina', 0.010295092293857505)]

### Regression Model

In [43]:
import numpy as np

In [44]:
all_teams = set()
for team1, team2, _, _ in all_games_alpha:
    all_teams.add(team1)
    all_teams.add(team2)
all_teams = list(all_teams)

I want to make a matrix with a row for each game. The teams will be the columns. I'll assign each team a number that will correspond to the column index.

In [45]:
all_teams_tuples = zip(all_teams, range(len(all_teams)))
team_dict = dict(all_teams_tuples)

In [46]:
my_matrix = np.empty((0,len(all_teams)), int)
my_vector = np.empty((0, 1))

In [47]:
for team1, team2, score1, score2 in all_games_alpha:
    row = np.zeros(len(all_teams))
    row[team_dict[team1]]=1
    row[team_dict[team2]]=-1
    my_matrix = np.append(my_matrix, [row], axis=0)
    my_vector = np.append(my_vector, [[score1 - score2]], axis=0)

#adding the Massey Fudge Factor
my_matrix = np.append(my_matrix, [np.ones(len(all_teams))], axis=0)
my_vector = np.append(my_vector, [[0]], axis=0)

Sanity check: Do my matrix and vector have compatible dimensions?

In [48]:
print(my_matrix.shape)
print(my_vector.shape)

(526, 210)
(526, 1)


Least Squares -- my favorite thing

In [49]:
soln = np.linalg.lstsq(my_matrix, my_vector)

In [50]:
ranks = list(np.ndarray.flatten(soln[0]))

In [51]:
ranked = zip(all_teams, ranks)

In [52]:
sorted_x = sorted(ranked, key=operator.itemgetter(1))

Let's see the results

In [53]:
sorted_x[:-20:-1]

[('Alabama', 45.928740962811524),
 ('Ohio State', 44.917722389827276),
 ('Penn State', 44.416209454554092),
 ('Notre Dame', 43.297013876086247),
 ('Clemson', 40.680722081124621),
 ('Georgia', 40.071158174643969),
 ('Central Florida', 36.452915824755884),
 ('Wisconsin', 36.181800714614418),
 ('Oklahoma State', 35.887366427495721),
 ('Virginia Tech', 35.842868735008352),
 ('Texas Christian', 35.780378630715205),
 ('Auburn', 35.215588406437256),
 ('Oklahoma', 33.262606032530144),
 ('Washington', 32.879365223288595),
 ('Western Illinois', 30.792069743221862),
 ('Iowa State', 30.050840263547247),
 ('Stanford', 28.26525899121577),
 ('Mississippi State', 27.771213016342486),
 ('Texas', 26.482788051338474)]

Portland State! Are you kidding me? I'll use regularization to correct this apparent over-fitting.

### Regularized Regression

In [54]:
from sklearn import linear_model, grid_search, preprocessing



In [55]:
lm = linear_model.Ridge()
parameters = {'alpha':[0.01, 0.1, 1, 10, 100]}


In [56]:
clf = grid_search.GridSearchCV(lm, parameters)

In [57]:
clf.fit(my_matrix, my_vector)

GridSearchCV(cv=None, error_score='raise',
       estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'alpha': [0.01, 0.1, 1, 10, 100]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [58]:
ridge = clf.best_estimator_

In [59]:
ridge.fit(my_matrix, my_vector)

Ridge(alpha=1, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

I have my model, and it's been fited. Let's take a look.

In [81]:
linear_rank = list(zip(all_teams, list(preprocessing.scale(ridge.coef_[0]))))

In [82]:
sorted(linear_rank, key=operator.itemgetter(1))[:-20:-1]

[('Alabama', 2.7230592832389084),
 ('Ohio State', 2.4911256321972517),
 ('Penn State', 2.3401652504512276),
 ('Notre Dame', 2.2280623790343941),
 ('Georgia', 2.1935288301567959),
 ('Clemson', 2.1017297073294432),
 ('Central Florida', 2.0692236346298518),
 ('Texas Christian', 1.9935344878789933),
 ('Oklahoma State', 1.9420499013734265),
 ('Auburn', 1.9035038376726066),
 ('Washington', 1.8427335422098621),
 ('Wisconsin', 1.8052968033792947),
 ('Virginia Tech', 1.719440248698205),
 ('Oklahoma', 1.6740332789857304),
 ('Iowa State', 1.5158717526253098),
 ('Mississippi State', 1.3827515200707592),
 ('Stanford', 1.3827498113137531),
 ('Miami (Florida)', 1.274610717674209),
 ('Iowa', 1.26012940474303)]

It knocked down Portland State a little bit, but not much...

### Ensemble

I need to put the two models on the same scale.

In [83]:
linear_rank_sorted = sorted(linear_rank, key=operator.itemgetter(0))

In [85]:
page_ranks = list(preprocessing.scale(list(zip(*page_rank_sorted))[1]))

In [86]:
final = list(zip(linear_rank_sorted, page_ranks))

In [87]:
final = [(team, first + second) for (team, first), second in final]

In [88]:
final = sorted(final, key=operator.itemgetter(1))

In [89]:
final[:-50:-1]

[('Georgia', 7.9665924228095619),
 ('Notre Dame', 6.7036410728412452),
 ('Alabama', 5.730578611413125),
 ('Clemson', 5.3981083672598924),
 ('Texas Christian', 4.783481828014696),
 ('Iowa State', 4.3440939100004243),
 ('Penn State', 4.3384611545126504),
 ('Mississippi State', 4.2241951556282107),
 ('Ohio State', 4.1867704219666351),
 ('Oklahoma', 4.0990896596481496),
 ('Central Florida', 4.0409507542107228),
 ('Auburn', 3.7297545740181675),
 ('Washington', 3.6205061988726568),
 ('Syracuse', 3.4071586618211027),
 ('Arizona State', 3.1270282396771449),
 ('Southern California', 2.9416760413458567),
 ('Wisconsin', 2.8799018142456321),
 ('Oklahoma State', 2.8359331215857821),
 ('Stanford', 2.7511014869813439),
 ('North Carolina State', 2.7467891441795267),
 ('Texas', 2.7135753253338102),
 ('Fresno State', 2.4460819072832591),
 ('Virginia Tech', 2.4039525398304544),
 ('Miami (Florida)', 2.35939770419481),
 ('San Diego State', 2.1488598088395801),
 ('South Carolina', 2.137934314688108),
 ('Was