# KING

In [38]:
from __future__ import division
from pandas import concat, read_csv, to_datetime
from ggplot import *
from sklearn import linear_model
import pandas as pd
import numpy as np
from numpy import floor, histogram
from scipy import stats
from scipy.stats import gaussian_kde
from sklearn import linear_model, svm

# Load the 2015 team list and the per-game team statistics. Both CSVs are
# read with their first column as the index and joined on that index.
teams = read_csv('./data/2015/team.csv', index_col=0)
statistics = read_csv('./data/2015/team-game-statistics.csv', index_col=0)
games = teams.join(statistics)

# Discard every column whose label contains 'Unnamed'.
unnamed_columns = [column for column in games if 'Unnamed' in column]
games = games.drop(unnamed_columns, axis=1)

# Dates are parsed with the YYYYMMDD format.
games['Date'] = to_datetime(games['Date'], format='%Y%m%d')

# Highest and lowest 'Points' per 'Game Code', attached to every row of
# that game as 'Winning Points' / 'Losing Points'.
points_by_game = games.groupby('Game Code')['Points']
winning_scores = points_by_game.max()
losing_scores = points_by_game.min()
scores = pd.DataFrame({'Losing Points': losing_scores, 'Winning Points': winning_scores})
games = games.join(scores, on='Game Code')

def is_win(game):
    """Return whether ``game['Points']`` is strictly greater than ``game['Losing Points']``.

    Equal values are not counted as a win.
    """
    points, losing_points = game['Points'], game['Losing Points']
    return losing_points < points

def margin(game):
    """Return the signed point margin for one team-game row.

    When ``is_win(game)`` is true the result is ``Points - Losing Points``;
    otherwise it is ``Points - Winning Points``.
    """
    opponent_column = 'Losing Points' if is_win(game) else 'Winning Points'
    return game['Points'] - game[opponent_column]

# Derived per-row columns: apply margin() and is_win() to every row
# (axis=1) and store the results as 'Margin' and 'Is Win'.
games['Margin'] = games.apply(margin, axis=1)
games['Is Win'] = games.apply(is_win, axis=1)

def range_bin(array, step):
    """Return the multiples of ``step`` that span the values of ``array``.

    The sequence starts at the largest multiple of ``step`` that is less
    than or equal to ``array.min()`` and its last element is the largest
    multiple of ``step`` that is less than or equal to ``array.max()``.

    Parameters:
        array: object exposing ``min()`` and ``max()`` (e.g. a numpy array
            or pandas Series).
        step: integer spacing between consecutive values.

    Returns:
        ``range(start, stop, step)``.
    """
    # Floor rather than int(): int() truncates toward zero, which for a
    # negative minimum (e.g. -5 with step 10) would start the range at 0 and
    # leave the smallest values uncovered. Non-negative input is unaffected.
    start = int(np.floor(array.min() / step)) * step
    stop = int(np.floor(array.max() / step)) * step + step
    return range(start, stop, step)

def histogram_random(data, bins, samples):
    """Draw ``samples`` random values from a histogram approximation of ``data``.

    ``data`` is histogrammed into ``bins * 2`` bins, the counts are turned
    into a cumulative distribution normalized by its last element, and
    uniform random numbers from ``np.random.rand`` are mapped through it.
    Each draw is reported as the midpoint of the bin it falls in.

    Returns:
        Array of ``samples`` bin midpoints.
    """
    counts, edges = np.histogram(data, bins=2 * bins)
    midpoints = edges[:-1] + 0.5 * np.diff(edges)

    cumulative = np.cumsum(counts)
    cumulative = cumulative / cumulative[-1]

    uniform_draws = np.random.rand(samples)
    return midpoints[np.searchsorted(cumulative, uniform_draws)]

def kde_random(data, samples):
    """Draw ``samples`` random values from a Gaussian KDE fitted to ``data``.

    The density is evaluated on ``samples`` evenly spaced grid points between
    ``min(data)`` and ``max(data)``, accumulated into a cumulative
    distribution normalized by its last element, and uniform random numbers
    from ``np.random.rand`` are mapped through it onto the grid.

    Parameters:
        data: one-dimensional sequence of observations.
        samples: number of values to draw (also the grid resolution).

    Returns:
        Array of ``samples`` grid points on success. If any step fails
        (for example ``gaussian_kde`` rejecting the data, or ``min``/``max``
        on an empty sequence) a list of ``samples`` zeros is returned instead.
    """
    try:
        x_grid = np.linspace(min(data), max(data), samples)
        pdf = gaussian_kde(data).evaluate(x_grid)
        cdf = np.cumsum(pdf)
        cdf = cdf / cdf[-1]
        values = np.random.rand(samples)
        value_bins = np.searchsorted(cdf, values)
        return x_grid[value_bins]
    except Exception:
        # Deliberate best-effort fallback for data the KDE cannot handle.
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt / SystemExit still stop a long simulation run.
        return [0] * samples

# Column names of `games` used as model inputs: point_estimator and
# margin_estimator train on exactly these columns, and predict_scores
# simulates one random series per entry, in this order.
feature_columns = ['Rush Att', 'Rush Yard',
       'Rush TD', 'Pass Att', 'Pass Comp', 'Pass Yard', 'Pass TD',
       'Pass Int', 'Pass Conv', 'Kickoff Ret', 'Kickoff Ret Yard',
       'Kickoff Ret TD', 'Punt Ret', 'Punt Ret Yard', 'Punt Ret TD',
       'Fum Ret', 'Fum Ret Yard', 'Fum Ret TD', 'Int Ret', 'Int Ret Yard',
       'Int Ret TD', 'Misc Ret', 'Misc Ret Yard', 'Misc Ret TD',
       'Field Goal Att', 'Field Goal Made', 'Off XP Kick Att',
       'Off XP Kick Made', 'Off 2XP Att', 'Off 2XP Made', 'Def 2XP Att',
       'Def 2XP Made', 'Safety', 'Punt', 'Punt Yard',
       'Kickoff', 'Kickoff Yard', 'Kickoff Touchback',
       'Kickoff Out-Of-Bounds', 'Kickoff Onside', 'Fumble', 'Fumble Lost',
       'Tackle Solo', 'Tackle Assist', 'Tackle For Loss',
       'Tackle For Loss Yard', 'Sack', 'Sack Yard', 'QB Hurry',
       'Fumble Forced', 'Pass Broken Up', 'Kick/Punt Blocked',
       '1st Down Rush', '1st Down Pass', '1st Down Penalty',
       'Time Of Possession', 'Penalty', 'Penalty Yard', 'Third Down Att',
       'Third Down Conv', 'Fourth Down Att', 'Fourth Down Conv',
       'Red Zone Att', 'Red Zone TD', 'Red Zone Field Goal',
       'First Down Total']

def point_estimator(team):
    """Fit a default ``linear_model.Lasso`` predicting ``team['Points']``.

    The regressors are the ``feature_columns`` of ``team``. Returns the
    value of ``Lasso.fit`` (the fitted estimator).
    """
    features = team[feature_columns]
    target = team['Points']
    return linear_model.Lasso().fit(features, target)

def margin_estimator(team):
    """Fit a default ``linear_model.Lasso`` predicting ``team['Margin']``.

    The regressors are the ``feature_columns`` of ``team``. Returns the
    value of ``Lasso.fit`` (the fitted estimator).
    """
    features = team[feature_columns]
    target = team['Margin']
    return linear_model.Lasso().fit(features, target)

def predict_scores(team, model, iterations, debug=False):
    """Simulate ``iterations`` games for ``team`` and predict a value for each.

    Every column in ``feature_columns`` is sampled independently with
    ``kde_random``; the samples form one simulated row per iteration, which
    is passed to ``model.predict``.

    Parameters:
        team: rows of ``games`` for one team.
        model: fitted estimator exposing ``coef_`` and ``predict``.
        iterations: number of simulated rows.
        debug: when true, print the non-zero model coefficients.

    Returns:
        List of ``iterations`` predictions, each floored at 0.
    """
    sampled_features = [kde_random(team[feature], iterations) for feature in feature_columns]
    simulations = pd.DataFrame(sampled_features).transpose()

    if debug:
        coefficients = pd.DataFrame(model.coef_, feature_columns)
        nonzero = abs(coefficients[0]) > 0
        print(coefficients[nonzero])

    # Negative predictions are replaced by 0.
    return [max(0, score) for score in model.predict(simulations)]

# Number of simulated outcomes per team in each matchup; passed as
# `iterations` to predict_scores and used as the probability denominator.
k = 10000

# Matchups to simulate, as (team1, team2) name pairs. The names are matched
# against games.Name in the simulation loops.
bowls = [
    ('Arizona', 'New Mexico'),
    ('Utah', 'BYU'),
    ('Appalachian State', 'Ohio'),
    ('San Jose State', 'Georgia State'),
    ('Louisiana Tech', 'Arkansas State'),
    ('Western Kentucky', 'South Florida'),
    ('Akron', 'Utah State'),
    ('Toledo', 'Temple'),
    ('Boise State', 'Northern Illinois'),
    ('Georgia Southern', 'Bowling Green'),
    ('Western Michigan', 'Middle Tennessee'),
    ('San Diego State', 'Cincinnati'),
    ('Marshall', 'Connecticut'),
    ('Washington State', 'Miami (Florida)'),
    ('Washington', 'Southern Mississippi'),
    ('Duke', 'Indiana'),
    ('Virginia Tech', 'Tulsa'),
    ('Nebraska', 'UCLA'),
    ('Pittsburgh', 'Navy'),
    ('Central Michigan', 'Minnesota'),
    ('Air Force', 'California'),
    ('North Carolina', 'Baylor'),
    ('Nevada', 'Colorado State'),
    ('Texas Tech', 'LSU'),
    ('Memphis', 'Auburn'),
    ('Mississippi State', 'North Carolina State'),
    ('Louisville', 'Texas A&M'),
    ('Wisconsin', 'USC'),
    ('Houston', 'Florida State'),
    ('Clemson', 'Oklahoma'),
    ('Alabama', 'Michigan State'),
    ('Northwestern', 'Tennessee'),
    ('Notre Dame', 'Ohio State'),
    ('Michigan', 'Florida'),
    ('Iowa', 'Stanford'),
    ('Oklahoma State', 'Mississippi'),
    ('Penn State', 'Georgia'),
    ('Kansas State', 'Arkansas'),
    ('Oregon', 'TCU'),
    ('West Virginia', 'Arizona State'),
    ('Clemson', 'Alabama')
]

# Recorded results, index-aligned with `bowls`: (team name, number, number).
# The simulation loops only read element 0 (the name) and compare it with
# the predicted winner. Presumably the two numbers are the winning and
# losing scores -- TODO confirm.
# NOTE: this list has 40 entries while `bowls` has 41, so the last matchup
# in `bowls` has no recorded result; the loops guard with i < len(winners).
winners = [
    ('Arizona', 45, 37),
    ('Utah', 35, 28),
    ('Appalachian State', 31, 29),
    ('San Jose State', 27, 16),
    ('Louisiana Tech', 47, 28),
    ('Western Kentucky', 45, 35),
    ('Akron', 23, 21),
    ('Toledo', 32, 17),
    ('Boise State', 55, 7),
    ('Georgia Southern', 58, 27),
    ('Western Michigan', 45, 31),
    ('San Diego State', 42, 7),
    ('Marshall', 16, 10),
    ('Washington State', 20, 14),
    ('Washington', 44, 31),
    ('Duke', 44, 41),
    ('Virginia Tech', 55, 52),
    ('Nebraska', 37, 29),
    ('Navy', 44, 28),
    ('Minnesota', 21, 14),
    ('California', 55, 36),
    ('Baylor', 49, 38),
    ('Nevada', 28, 23),
    ('LSU', 56, 27),
    ('Auburn', 31, 10),
    ('Mississippi State', 51, 28),
    ('Louisville', 27, 21),
    ('Wisconsin', 23, 21),
    ('Houston', 38, 24),
    ('Clemson', 37, 17),
    ('Alabama', 38, 0),
    ('Tennessee', 45, 6),
    ('Ohio State', 44, 28),
    ('Michigan', 41, 7),
    ('Stanford', 45, 16),
    ('Mississippi', 48, 20),
    ('Georgia', 24, 17),
    ('Arkansas', 45, 23),
    ('TCU', 47, 41),
    ('West Virginia', 43, 42)
]

# Simulate every matchup with per-team points models, print the favourite
# with both win probabilities and the median spread, and count how many
# predicted winners match the recorded ones.
correct = 0
quantile_levels = [x / 100 for x in range(0, 100, 10)]  # 0.0, 0.1, ..., 0.9

for i, bowl in enumerate(bowls):
    team1 = games[games.Name == bowl[0]]
    team2 = games[games.Name == bowl[1]]

    # One column of simulated scores per team (team1 is simulated first).
    team1_scores = predict_scores(team1, point_estimator(team1), k)
    team2_scores = predict_scores(team2, point_estimator(team2), k)
    outcome = pd.DataFrame([team1_scores, team2_scores]).transpose()

    # Share of simulations each team wins outright; exact ties count for neither.
    team1_probability = len(outcome[outcome[0] > outcome[1]]) / k
    team2_probability = len(outcome[outcome[1] > outcome[0]]) / k

    # The spread is always underdog minus favourite; team2 is the pick
    # unless team1's probability is strictly higher.
    if team1_probability > team2_probability:
        winner = bowl[0]
        spreads = (outcome[1] - outcome[0]).quantile(quantile_levels)
        summary = [bowl[0], team1_probability, bowl[1], team2_probability]
    else:
        winner = bowl[1]
        spreads = (outcome[0] - outcome[1]).quantile(quantile_levels)
        summary = [bowl[1], team2_probability, bowl[0], team1_probability]

    print(', '.join(str(x) for x in summary))
    # Position 5 of the quantile levels is 0.5, i.e. the median spread.
    print(spreads.iloc[5])

    # Only matchups that have a recorded result are scored.
    if i < len(winners) and winner == winners[i][0]:
        correct += 1

print('Accuracy: %s' % (correct / len(winners)))

Arizona, 0.5913, New Mexico, 0.4087
-5.30054822593
Utah, 0.5187, BYU, 0.4812
-1.13385798144
Appalachian State, 0.7255, Ohio, 0.2741
-14.0479956441
San Jose State, 0.5428, Georgia State, 0.4572
-1.54302027563
Louisiana Tech, 0.5072, Arkansas State, 0.4923
-0.522121422808
Western Kentucky, 0.668, South Florida, 0.332
-9.04857840087
Utah State, 0.6172, Akron, 0.3828
-5.77542781283
Toledo, 0.6518, Temple, 0.3482
-5.71886754818
Boise State, 0.5427, Northern Illinois, 0.4573
-2.69197662378
Bowling Green, 0.755, Georgia Southern, 0.245
-9.19202025275
Western Michigan, 0.518, Middle Tennessee, 0.4796
-1.30622358426
Cincinnati, 0.5266, San Diego State, 0.4734
-1.36392374216
Marshall, 0.6763, Connecticut, 0.3198
-9.34312457996
Washington State, 0.6044, Miami (Florida), 0.3956
-5.10000412776
Southern Mississippi, 0.6869, Washington, 0.3124
-13.8555623706
Indiana, 0.5631, Duke, 0.4369
-2.48402670949
Tulsa, 0.6706, Virginia Tech, 0.3294
-7.47671230182
UCLA, 0.5731, Nebraska, 0.4267
-4.83605289941
N

In [40]:
# Same simulation as the points-model cell, but each team's model is fitted
# on 'Margin' (margin_estimator) and the simulated values are compared
# directly.
# NOTE(review): predict_scores floors every prediction at 0, so the values
# compared here are never negative, and a simulation where both teams get
# the same value satisfies both `>=` tests below. The two probabilities can
# therefore add up to more than 1 -- confirm whether this is intended.
correct = 0
quantile_levels = [x / 100 for x in range(0, 100, 10)]  # 0.0, 0.1, ..., 0.9

for i, bowl in enumerate(bowls):
    team1 = games[games.Name == bowl[0]]
    team2 = games[games.Name == bowl[1]]

    # One column of simulated values per team (team1 is simulated first).
    team1_margins = predict_scores(team1, margin_estimator(team1), k)
    team2_margins = predict_scores(team2, margin_estimator(team2), k)
    outcome = pd.DataFrame([team1_margins, team2_margins]).transpose()

    team1_probability = len(outcome[outcome[0] >= outcome[1]]) / k
    team2_probability = len(outcome[outcome[1] >= outcome[0]]) / k

    # The spread is taken from the underdog's simulated values; team2 is the
    # pick unless team1's probability is strictly higher.
    if team1_probability > team2_probability:
        winner = bowl[0]
        spreads = outcome[1].quantile(quantile_levels)
        summary = [bowl[0], team1_probability, bowl[1], team2_probability]
    else:
        winner = bowl[1]
        spreads = outcome[0].quantile(quantile_levels)
        summary = [bowl[1], team2_probability, bowl[0], team1_probability]

    print(', '.join(str(x) for x in summary))
    # Position 5 of the quantile levels is 0.5 (the median); printed negated.
    print(-1 * spreads.iloc[5])

    # Only matchups that have a recorded result are scored.
    if i < len(winners) and winner == winners[i][0]:
        correct += 1

print('Accuracy: %s' % (correct / len(winners)))

New Mexico, 0.6229, Arizona, 0.5739
-1.49506980399
BYU, 0.5854, Utah, 0.4898
-13.9871831884
Appalachian State, 0.6339, Ohio, 0.4952
-4.8399796237
Georgia State, 0.6289, San Jose State, 0.6015
-2.09376032142
Louisiana Tech, 0.6099, Arkansas State, 0.5266
-6.26895871662
Western Kentucky, 0.6637, South Florida, 0.3715
-11.3219354937
Akron, 0.6208, Utah State, 0.5975
-0.728243074279
Toledo, 0.5775, Temple, 0.4732
-12.9954364572
Boise State, 0.5958, Northern Illinois, 0.5011
-12.4446866863
Bowling Green, 0.6961, Georgia Southern, 0.3674
-8.66834636237
Middle Tennessee, 0.6522, Western Michigan, 0.547
-2.65792137988
San Diego State, 0.6461, Cincinnati, 0.4829
-6.51165179369
Marshall, 0.6936, Connecticut, 0.4812
-0.0
Miami (Florida), 0.5924, Washington State, 0.5896
-6.17827390876
Southern Mississippi, 0.6717, Washington, 0.4771
-7.29361226395
Duke, 0.7353, Indiana, 0.4916
-0.0
Virginia Tech, 0.7819, Tulsa, 0.4015
-0.0
UCLA, 0.6484, Nebraska, 0.4448
-5.82376451505
Navy, 0.6786, Pittsburgh, 0.