In [15]:
# 2017 Division IA schedules-and-results page that the rest of the notebook scrapes.
URL = "http://www.jhowell.net/cf/scores/Sked2017.htm"

In [16]:
import operator
import re
import string
import urllib
import urllib.request

import numpy as np
from bs4 import BeautifulSoup

In [25]:
# Download the raw page. The context manager closes the connection; only the
# bytes are kept, under the name `response` that later cells read.
with urllib.request.urlopen(URL) as http_response:
    response = http_response.read()

In [28]:
# Parse the downloaded bytes in `response` with the 'html.parser' backend.
page = BeautifulSoup(response, 'html.parser')

In [29]:
# Display the whole parsed document to inspect the table layout (long output).
page

<html><head><title>2017 Division IA Schedules and Results</title></head>
<body><p><a href="http://www.jhowell.net/cf/scores/ScoresIndex.htm">Return to the all-time scores index</a></p>
<p>Listed below are the Division IA schedules and results for 2017.
<br/>This file was last updated on Sunday, October 22, 2017.
<br/>Conference affiliation is in parenthesis. Conference games are prefixed with an asterisk.</p>
 
<table border="1">
<tr><td bgcolor="#COCOCO" colspan="9"><p align="center">Air Force (MWC)</p></td></tr>
<tr><td align="right">9/2</td><td>Sat</td><td>vs.</td><td>Virginia Military Institute</td><td>W</td><td align="right">62</td><td align="right">0</td></tr>
<tr><td align="right">9/16</td><td>Sat</td><td>@</td><td>Michigan</td><td>L</td><td align="right">13</td><td align="right">29</td></tr>
<tr><td align="right">9/23</td><td>Sat</td><td>vs.</td><td>*San Diego State</td><td>L</td><td align="right">24</td><td align="right">28</td></tr>
<tr><td align="right">9/30</td><td>Sat</td>

Functions to Extract Games

In [None]:
def get_games(team):
    """Return every ``<tr>`` row of a team's table except the first one.

    In the page excerpt shown above, the first row of a table holds the team
    name rather than a game, which is why it is skipped.
    """
    rows = team.find_all('tr')
    return rows[1:]

In [None]:
def get_team_1(team):
    """Return the text of the first ``<p>`` inside a team's table.

    In the page excerpt shown above this is the team name followed by its
    conference in parentheses, e.g. ``"Air Force (MWC)"``.

    Returns None when ``team`` has no ``<p>`` element or is not a parsed tag.
    Only lookup failures are absorbed, so interrupts and unrelated errors
    still propagate.
    """
    try:
        return team.find_all('p')[0].text
    except (AttributeError, IndexError, TypeError):
        return None

In [None]:
def get_score_1(game):
    """Return the listed team's score for a game row, or None.

    The score is read from the sixth ``<td>`` (index 5) of the row and
    converted to ``int``.

    Returns None when the row has fewer cells, the cell is not an integer
    (for example blank), or ``game`` is not a parsed row. Only those parsing
    failures are absorbed; interrupts and unrelated errors propagate.
    """
    try:
        return int(game.find_all('td')[5].text)
    except (AttributeError, IndexError, TypeError, ValueError):
        return None

In [None]:
def get_score_2(game):
    """Return the opponent's score for a game row, or None.

    The score is read from the seventh ``<td>`` (index 6) of the row and
    converted to ``int``.

    Returns None when the row has fewer cells, the cell is not an integer
    (for example blank), or ``game`` is not a parsed row. Only those parsing
    failures are absorbed; interrupts and unrelated errors propagate.
    """
    try:
        return int(game.find_all('td')[6].text)
    except (AttributeError, IndexError, TypeError, ValueError):
        return None

In [None]:
def get_opponent(game):
    """Return the opponent's name for a game row, or None.

    The name is read from the fourth ``<td>`` (index 3). Conference games are
    prefixed with an asterisk on the page, so asterisks at either end of the
    name are stripped.

    Returns None when the row has fewer cells or ``game`` is not a parsed row.
    """
    try:
        # str.strip replaces the Python 2-only string.strip(); under Python 3
        # the old call raised AttributeError, which the bare except hid, so
        # every opponent came back as None.
        return game.find_all('td')[3].text.strip('*')
    except (AttributeError, IndexError, TypeError):
        return None

In [None]:
def del_conf(team_name):
    """Strip a trailing parenthesised conference tag from a team name.

    Removes a final `` (word)`` or `` (word word)`` suffix, e.g.
    ``"Air Force (MWC)" -> "Air Force"``. Names without such a suffix are
    returned unchanged, and non-string input (such as None) is returned as-is.

    NOTE(review): the pattern only allows word characters inside the
    parentheses, so a tag containing a hyphen is left in place. It also
    strips any trailing one-word parenthetical, so a qualifier that is part
    of a team's name would be removed too. Verify both against the team
    names on the page.
    """
    conf_suffix = re.compile(r"( \(\w\w+ \w+\)$)|( \(\w+\)$)")
    try:
        return conf_suffix.sub('', team_name)
    except TypeError:
        # Raised by re for non-string input, e.g. a missing opponent (None).
        return team_name

In [None]:
def get_game_details(game, team1):
    """Build a ``(team, opponent, team_score, opponent_score)`` tuple for a row.

    Conference tags are removed from both names with ``del_conf``.

    Returns None when either score cannot be parsed from the row. Scores are
    compared against None rather than tested for truthiness, so a game in
    which the listed team scored 0 is kept instead of being dropped.
    """
    score1 = get_score_1(game)
    score2 = get_score_2(game)
    if score1 is None or score2 is None:
        return None
    return (del_conf(team1), del_conf(get_opponent(game)), score1, score2)

In [None]:
def parse_games(team):
    """Return the list of game tuples parsed from one team's table.

    Rows for which ``get_game_details`` returns None are left out. Each row is
    parsed once.
    """
    team1 = get_team_1(team)
    games = []
    # The final row is excluded, as in the original slice.
    # NOTE(review): presumably a summary/footer row — confirm against the page.
    for game in get_games(team)[:-1]:
        details = get_game_details(game, team1)
        if details is not None:
            games.append(details)
    return games

In [None]:
# Every <table> on the page; the excerpt above shows a table opening with a team-name row,
# so each one is treated as a single team's schedule (assumes the page has no other tables — TODO confirm).
teams = page.find_all('table')

In [None]:
# Spot-check the parser: show the second parsed game of the first team.
# (The previous expression passed a team-name string to get_score_1, which can
# only ever return None.)
parse_games(teams[0])[1]

#### Now, get all the games

In [None]:
# Flatten the per-team game lists into a single list, in page order.
all_games = [game for team in teams for game in parse_games(team)]

In [None]:
# Total parsed rows, before de-duplication.
len(all_games)

There are approximately twice as many games as there should be, because each game appears once in each team's table, with the teams in opposite order.

In [None]:
def alpha(game):
    """Return the game with its two teams in alphabetical order.

    ``game`` is ``(team1, team2, score1, score2)``. When ``team1`` sorts
    strictly before ``team2`` the same object is returned; otherwise both the
    teams and their scores are swapped so each score stays with its team.
    """
    first, second, first_score, second_score = game
    return game if first < second else (second, first, second_score, first_score)

In [None]:
# Put every game in canonical team order, then let a set collapse the duplicates.
all_games_alpha = list({alpha(game) for game in all_games})

In [None]:
# Number of distinct games after de-duplication.
len(all_games_alpha)

It works!

### Random Walk Model

In [None]:
import networkx as nx

First, prepare the edges.

In [None]:
def game_to_edge_ratio(game):
    """Convert a game into a ``(loser, winner, weight)`` edge.

    ``game`` is ``(team1, team2, score1, score2)``. The weight is
    ``sqrt((winner_score - loser_score) / winner_score)``. On a tie, team1 is
    placed in the loser slot and the weight is 0.

    A 0-0 game also gets weight 0 instead of raising ZeroDivisionError.
    """
    team1, team2, score1, score2 = game
    if score1 <= score2:
        loser, winner, low, high = team1, team2, score1, score2
    else:
        loser, winner, low, high = team2, team1, score2, score1
    # high == 0 only when both scores are 0; avoid dividing by zero there.
    ratio = float(high - low) / high if high else 0.0
    return (loser, winner, np.sqrt(ratio))

In [None]:
def game_to_edge(game):
    """Convert a game into a ``(loser, winner, margin)`` edge.

    ``game`` is ``(team1, team2, score1, score2)``. The margin is the absolute
    score difference as a float. On a tie, team1 is placed in the loser slot
    and the margin is 0.0.
    """
    team1, team2, score1, score2 = game
    margin = float(abs(score1 - score2))
    if score1 > score2:
        return (team2, team1, margin)
    return (team1, team2, margin)

In [None]:
# One (loser, winner, margin) edge per distinct game.
edge_list = list(map(game_to_edge, all_games_alpha))

Sanity Check: Did I get all the games with no repeats?

In [None]:
# Collect every edge that involves Michigan once, then show the count and the edges.
michigan_edges = [edge for edge in edge_list if 'Michigan' in edge[:2]]
print(len(michigan_edges))
michigan_edges

In [None]:
# Directed graph, so an edge loser -> winner is distinct from winner -> loser.
G = nx.DiGraph()

In [None]:
# Each (loser, winner, margin) tuple becomes an edge loser -> winner with weight = margin.
# NOTE(review): if two games produce the same (loser, winner) pair, a DiGraph keeps a single
# edge, so presumably only one margin survives — confirm whether rematches need summing.
G.add_weighted_edges_from(edge_list)

In [None]:
# PageRank over the loser -> winner graph with damping factor 0.65.
# `nx.pagerank_numpy` was removed in NetworkX 3.0; `nx.pagerank` takes the
# same arguments and also uses the 'weight' edge attribute by default.
ranked = nx.pagerank(G, alpha=.65)

I'll need the teams alphabetically sorted later...

In [None]:
# (team, score) pairs ordered by team name. Taken now because `ranked` is rebound later in the notebook.
page_rank_sorted = sorted(ranked.items(), key=operator.itemgetter(0))

But let's look at what the teams look like sorted by the random walk ranking.

In [None]:
# Sort ascending by score, then walk backwards from the end: the 19 highest-ranked teams, best first.
sorted(ranked.items(), key=operator.itemgetter(1))[:-20:-1]

### Regression Model

In [None]:
import numpy as np

In [None]:
# Every distinct team name that appears on either side of a game.
all_teams = list({team for game in all_games_alpha for team in game[:2]})

I want to make a matrix with a row for each game. The teams will be the columns. I'll assign each team a number that will correspond to the column index.

In [None]:
# (team, column index) pairs. Built as a list so the name stays usable after
# dict() has read it; a bare zip is a one-shot iterator in Python 3.
all_teams_tuples = [(team, column) for column, team in enumerate(all_teams)]
# team name -> column index in the design matrix
team_dict = dict(all_teams_tuples)

In [None]:
# Zero-row placeholders: one column per team for the design matrix, a single column for the target.
my_matrix = np.empty((0,len(all_teams)), int)
my_vector = np.empty((0, 1))

In [None]:
# Build the design matrix and target from scratch on every run. Rows are
# collected in Python lists and converted once, which avoids re-copying the
# whole array for each game and makes the cell safe to re-run (the previous
# version appended to whatever my_matrix / my_vector already held).
design_rows = []
margins = []
for team1, team2, score1, score2 in all_games_alpha:
    row = np.zeros(len(all_teams))
    row[team_dict[team1]] = 1
    row[team_dict[team2]] = -1
    design_rows.append(row)
    margins.append([score1 - score2])

# Massey fudge factor: a final row of ones with target 0, which asks the
# ratings to sum to zero.
design_rows.append(np.ones(len(all_teams)))
margins.append([0])

my_matrix = np.array(design_rows)
my_vector = np.array(margins, dtype=float)

Sanity check: Do my matrix and vector have compatible dimensions?

In [None]:
print(my_matrix.shape)
print(my_vector.shape)
# One target value per matrix row.
assert my_matrix.shape[0] == my_vector.shape[0]

Least Squares -- my favorite thing

In [None]:
# Least-squares ratings; soln[0] holds one coefficient per team column.
# rcond=None selects NumPy's machine-precision cutoff explicitly, which
# avoids the FutureWarning that older NumPy versions emit when it is omitted.
soln = np.linalg.lstsq(my_matrix, my_vector, rcond=None)

In [None]:
# Flatten the solution array into a plain list of per-team ratings.
ranks = list(soln[0].flatten())

In [None]:
# (team, rating) pairs from the regression. Materialised as a list because a
# bare zip is a one-shot iterator in Python 3 and would be empty after its
# first use. Note that this rebinds `ranked`, which previously held the
# PageRank dict.
ranked = list(zip(all_teams, ranks))

In [None]:
# Order the (team, rating) pairs from lowest to highest rating.
sorted_x = sorted(ranked, key=lambda pair: pair[1])

Let's see the results

In [None]:
# Walk backwards from the end of the ascending list: the 19 highest ratings, best first.
sorted_x[:-20:-1]

Portland State! Are you kidding me? I'll use regularization to correct this apparent over-fitting.

### Regularized Regression

In [None]:
# `sklearn.grid_search` was removed in scikit-learn 0.20; GridSearchCV now
# lives in `sklearn.model_selection`. The alias keeps the notebook's existing
# `grid_search.GridSearchCV` reference working.
from sklearn import linear_model, preprocessing
from sklearn import model_selection as grid_search

In [None]:
# Ridge regression model plus the candidate `alpha` (regularization strength) values to search over.
lm = linear_model.Ridge()
parameters = {'alpha':[0.01, 0.1, 1, 10, 100]}


In [None]:
# Cross-validated search over every value in `parameters`, using the library's default CV and scoring.
clf = grid_search.GridSearchCV(lm, parameters)

In [None]:
# Fit on the full game matrix. my_vector is a 2-D column, which the later `ridge.coef_[0]` indexing presumably relies on — confirm.
clf.fit(my_matrix, my_vector)

In [None]:
# The Ridge model configured with the best alpha found by the search.
ridge = clf.best_estimator_

In [None]:
# NOTE(review): GridSearchCV refits the best estimator on the full data by default,
# so this extra fit is presumably redundant — confirm before removing.
ridge.fit(my_matrix, my_vector)

I have my model, and it's been fitted. Let's take a look.

In [None]:
# (team, standardised ridge coefficient) pairs. Materialised as a list because
# `linear_rank` is read more than once later in the notebook; a bare zip is a
# one-shot iterator in Python 3 and would be empty on the second read.
linear_rank = list(zip(all_teams, preprocessing.scale(ridge.coef_[0])))

In [None]:
# Sort ascending by coefficient, then walk backwards from the end: the 19 highest, best first.
sorted(linear_rank, key=operator.itemgetter(1))[:-20:-1]

It knocked down Portland State a little bit, but not much...

### Ensemble

I need to put the two models on the same scale.

In [None]:
# Order by team name (element 0) so the entries can be paired with the alphabetically sorted PageRank scores.
linear_rank_sorted = sorted(linear_rank, key=operator.itemgetter(0))

In [None]:
# Standardise the PageRank scores, keeping the alphabetical team order of
# page_rank_sorted. The scores are pulled out with a comprehension because
# zip(...) cannot be indexed in Python 3.
page_ranks = list(preprocessing.scale([score for _, score in page_rank_sorted]))

In [None]:
# Both inputs are ordered by team name. Check that they list the same teams
# before pairing them by position, because zip would silently mis-pair or
# truncate otherwise.
assert [team for team, _ in linear_rank_sorted] == [team for team, _ in page_rank_sorted], \
    "regression and PageRank team lists differ"
final = list(zip(linear_rank_sorted, page_ranks))

In [None]:
# Ensemble score: standardised regression rating plus standardised PageRank.
final = [(pair[0], pair[1] + page_score) for pair, page_score in final]

In [None]:
# Order the (team, combined score) pairs from lowest to highest.
final = sorted(final, key=lambda entry: entry[1])

In [None]:
# Walk backwards from the end of the ascending list: the 49 highest combined scores, best first.
final[:-50:-1]