In [None]:
import numpy
import pandas
import sklearn
import sklearn.linear_model
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.model_selection import LeavePGroupsOut
from sklearn.model_selection import cross_validate
from sklearn_pandas import DataFrameMapper

from sportsball.nfl.nfl import Season

# Build the 2018 season object. `sheettitle` is passed straight through to
# Season (presumably the title of a backing spreadsheet -- confirm in
# sportsball.nfl.nfl).
season_2018 = Season(
    year=2018,
    sheettitle='NFL 2018 Expected Wins',
)

# Attach every available rating system to the season, in this fixed order.
# Only the 'massey' ratings are read back by the visible cells of this
# notebook; the other systems are attached but not used here.
season_2018.attach_sagarin_ratings()
season_2018.attach_538_ratings()
season_2018.attach_massey_ratings()
season_2018.attach_scorex_ratings()
season_2018.attach_vegas_ratings()


def genratings(season, system):
    """Yield one flat rating record per (team, week) for one rating system.

    Args:
        season: object whose ``teams`` mapping holds team objects exposing
            ``team_id`` and a ``ratings`` mapping keyed by system name.
        system: key selecting the rating system inside each team's
            ``ratings`` mapping.

    Yields:
        dict: ``team_id``, ``system_name`` and ``week``, updated with the
        entries of that week's rating (rating entries win on a key clash).
    """
    for _, club in season.teams.items():
        weekly = club.ratings[system]
        for wk in weekly:
            record = dict(team_id=club.team_id, system_name=system, week=wk)
            record.update(weekly[wk])
            yield record


# Flatten the Massey ratings into one row per (team, week). Only one system
# is requested, so the system_name column is dropped right away.
ratings_gnrtr = genratings(season_2018, 'massey')
ratings_pdf = pandas.DataFrame(list(ratings_gnrtr)).drop(
    columns=['system_name'])
def _game_row(game_id, game, mirrored):
    """Describe one game as a flat record; ``mirrored`` swaps the two sides.

    In the mirrored record the real home team (and its score) fills the
    ``away_*`` slots and vice versa, and ``home_adv`` is -1 instead of 1.
    Neutral-site games get ``home_adv`` 0 in both orientations.
    """
    if mirrored:
        listed_away, listed_home = game.home_team, game.away_team
        listed_away_score, listed_home_score = game.home_score, game.away_score
        site_sign = -1
    else:
        listed_away, listed_home = game.away_team, game.home_team
        listed_away_score, listed_home_score = game.away_score, game.home_score
        site_sign = 1
    return {
        'game_id': game_id,
        'week': game.week,
        'away_team': listed_away.team_id,
        'home_team': listed_home.team_id,
        'home_adv': 0 if game.neutral else site_sign,
        'away_score': listed_away_score,
        'home_score': listed_home_score,
    }


_GAME_COLUMNS = [
    'game_id',
    'week',
    'away_team',
    'home_team',
    'home_adv',
    'away_score',
    'home_score',
]

# game_id is the game's position in season_2018.games, so both orientations
# of a game share one id and unplayed games leave gaps in the numbering.
_played_games = [
    (position, game)
    for position, game in enumerate(season_2018.games)
    if game.played
]

# All as-played rows first, followed by all mirrored rows.
dtf = pandas.DataFrame(
    [_game_row(position, game, False) for position, game in _played_games]
    + [_game_row(position, game, True) for position, game in _played_games]
)[_GAME_COLUMNS]
def _ratings_for(side):
    """Return ``ratings_pdf`` with its columns renamed for one side of a game.

    ``team_id`` becomes ``<side>_team`` and each of def/hfa/off/rating gets a
    ``_<side>`` suffix, so the result can be joined onto the games frame on
    ``week`` plus that side's team column.
    """
    renames = {'team_id': side + '_team'}
    for stat in ('def', 'hfa', 'off', 'rating'):
        renames[stat] = stat + '_' + side
    return ratings_pdf.rename(index=str, columns=renames)


# Attach the away side's ratings first, then the home side's. Left joins
# keep every game row even when no rating row matches that team and week.
for _side in ('away', 'home'):
    dtf = dtf.merge(
        _ratings_for(_side),
        how='left',
        on=['week', _side + '_team'],
    )
# Store the team identifiers as categoricals.
dtf['away_team'] = pandas.Categorical(dtf['away_team'])
dtf['home_team'] = pandas.Categorical(dtf['home_team'])

# Drop tied games: the target below is "listed home side scored more", which
# a tie cannot answer. The explicit .copy() makes the filtered frame
# independent of the merged one, so the .loc assignments that follow do not
# raise pandas' SettingWithCopyWarning.
dtf = dtf.loc[dtf.home_score != dtf.away_score].copy()

# home_adv == 1: the listed home side really played at home, so the listed
# away side gets no home-field term. home_adv == -1 is the mirrored row,
# where the listed home side was actually the visitor.
dtf.loc[dtf.home_adv == 1, 'hfa_away'] = 0
dtf.loc[dtf.home_adv == -1, 'hfa_home'] = 0
# NOTE(review): rows with home_adv == 0 (neutral site) keep BOTH hfa values
# untouched -- confirm that is intended rather than zeroing both.

# Binary target: 1 when the listed home side outscored the listed away side.
y = numpy.array((dtf.home_score > dtf.away_score) * 1)

# Model inputs: the def/hfa/off components for the away side, then the same
# three for the home side. The combined `rating_*` columns are not used.
FEATURE_COLUMNS = (
    'def_away',
    'hfa_away',
    'off_away',
    'def_home',
    'hfa_home',
    'off_home',
)

# Each column is selected as a one-element list and paired with `None` as
# its transformer (presumably a plain pass-through in sklearn_pandas --
# confirm against its documentation).
_feature_specs = [([column], None) for column in FEATURE_COLUMNS]
featurizer = DataFrameMapper(_feature_specs, sparse=True)

X = featurizer.fit_transform(dtf)

In [20]:
# hyperopt search space: a single hyperparameter, the ridge penalty `alpha`,
# drawn via hp.loguniform between 1e-6 and 1e-1. The hyperopt label is the
# prefix below plus '_alpha'.
name = 'hypers'

_alpha_label = '{}_alpha'.format(name)
_alpha_low, _alpha_high = 1e-6, 1e-1

space = {
    'alpha': hp.loguniform(
        _alpha_label,
        numpy.log(_alpha_low),
        numpy.log(_alpha_high),
    ),
}

In [6]:
from sklearn.model_selection import GroupKFold

In [16]:
# 20 grouped folds keyed on game_id, so the two orientations of one game
# always share a fold. Materialised as a list so the same folds can be
# reused by every cross-validation call.
_game_groups = numpy.array(dtf.game_id)
_fold_splitter = GroupKFold(n_splits=20)
cvfolds = list(_fold_splitter.split(dtf, y, _game_groups))

In [19]:
from sklearn.linear_model import RidgeClassifier

In [79]:
from sklearn.calibration import CalibratedClassifierCV

In [91]:
def logistic(x):
    """Map real-valued scores into (0, 1) with the logistic function.

    Args:
        x: scalar or array-like supporting unary minus (e.g. a numpy array).

    Returns:
        ``1 / (1 + exp(-x))``, evaluated elementwise.
    """
    # For very negative x, exp(-x) overflows to inf and the quotient is
    # exactly 0.0, which is the correct limit. Silence the overflow signal
    # so extreme scores neither emit a RuntimeWarning nor raise under
    # numpy.errstate(over='raise'). The returned values are unchanged.
    with numpy.errstate(over='ignore'):
        return 1 / (1 + numpy.exp(-x))


def myscorer(estimator, X, y):
    """Log loss of a fitted estimator, from its logistic-squashed scores.

    Args:
        estimator: fitted model exposing ``decision_function``.
        X: feature matrix to score.
        y: true labels for ``X``.

    Returns:
        The value of ``log_loss`` itself, not negated. A lower value is
        better, which is the opposite of scikit-learn's usual
        "greater is better" scorer convention.
    """
    margins = estimator.decision_function(X=X)
    probabilities = logistic(margins)
    return log_loss(y_true=y, y_pred=probabilities)


def objective(params):
    """hyperopt objective: cross-validated log loss of a calibrated ridge.

    Args:
        params: keyword arguments forwarded to ``RidgeClassifier`` (the
            search space supplies ``alpha``).

    Returns:
        dict with ``'loss'`` (mean log loss over ``cvfolds``, lower is
        better) and ``'status'`` set to ``STATUS_OK``.
    """
    # Imported locally because the notebook's cell-level import of
    # cross_val_score sits below the first call to objective(); without
    # this, a fresh "Restart & Run All" fails here with a NameError.
    from sklearn.model_selection import cross_val_score

    ridge = RidgeClassifier(normalize=True, fit_intercept=True, **params)
    # NOTE(review): this inner cv=3 calibration split is not group-aware, so
    # presumably the two orientations of one game can land in different
    # calibration folds -- confirm whether that matters.
    calibrated = CalibratedClassifierCV(ridge, cv=3, method='sigmoid')

    fold_scores = cross_val_score(
        estimator=calibrated,
        X=X,
        y=y,
        cv=cvfolds,
        scoring='neg_log_loss',
    )

    # 'neg_log_loss' scores are negated losses; flip the sign so hyperopt
    # minimises a positive log loss.
    return {
        'loss': -fold_scores.mean(),
        'status': STATUS_OK,
    }

In [92]:
# Evaluate the objective once with a hand-picked alpha, as a sanity check of
# the loss it reports.
objective({'alpha': 0.004})

{'loss': 0.6491064515027032, 'status': 'ok'}

In [48]:
from sklearn.metrics import log_loss

In [50]:
# Uncalibrated ridge classifier with a fixed penalty of 1.0.
pipe = RidgeClassifier(
    alpha=1.0,
    fit_intercept=True,
    normalize=True,
)

In [54]:
from sklearn.model_selection import cross_val_score

In [53]:
# Score the uncalibrated ridge on the shared grouped folds using the custom
# log-loss scorer.
# NOTE(review): the stored output under this cell is a single negative
# number, while myscorer returns an un-negated log_loss -- the output looks
# stale from an earlier version of the cell; re-run to refresh it.
cross_val_score(
    estimator=pipe,
    X=X,
    y=y,
    scoring=myscorer,
    cv=cvfolds,
)

-0.6618573083485888

In [94]:
# Fresh hyperopt Trials object; it is passed to fmin and its losses() are
# read back afterwards.
trials = Trials()

In [97]:
# Tune alpha with TPE for 250 evaluations, recording every evaluation in
# `trials`.
# NOTE(review): no `rstate` is passed, so the search is presumably not
# reproducible between runs -- confirm and pin a seed if stable results
# are needed.
best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=250,
    trials=trials,
    verbose=1,
)

In [98]:
# Lowest loss among all evaluations recorded in `trials`.
min(trials.losses())

0.646429245865589

In [99]:
# Show the parameter assignment fmin returned.
# NOTE(review): the stored result (alpha ~= 0.0999) sits right at the 1e-1
# upper bound of the search space, so the range may be too narrow --
# consider widening it.
print(best)

{'hypers_alpha': 0.09993161695315884}


In [104]:
# Plain ridge classifier using the alpha copied from the printed fmin result.
pipe = RidgeClassifier(
    alpha=0.09993161695315884,
    fit_intercept=True,
    normalize=True,
)

In [150]:
# The same tuned ridge, wrapped in sigmoid calibration with 5 folds.
# NOTE(review): the tuning objective calibrates with cv=3, whereas this
# refit uses cv=5 -- confirm the difference is intentional.
_tuned_ridge = RidgeClassifier(
    alpha=0.09993161695315884,
    fit_intercept=True,
    normalize=True,
)
clf_sigmoid = CalibratedClassifierCV(_tuned_ridge, method='sigmoid', cv=5)

In [151]:
# Fit the calibrated classifier on every row of X (no hold-out set).
clf_sigmoid.fit(X, y)

CalibratedClassifierCV(base_estimator=RidgeClassifier(alpha=0.09993161695315884, class_weight=None, copy_X=True,
        fit_intercept=True, max_iter=None, normalize=True,
        random_state=None, solver='auto', tol=0.001),
            cv=5, method='sigmoid')

In [134]:
# One (a_, b_) pair per fitted calibration fold, taken from the first
# calibrator of each fold's classifier. Column 0 holds a_, column 1 holds b_.
_fold_sigmoids = [
    fold_model.calibrators_[0]
    for fold_model in clf_sigmoid.calibrated_classifiers_
]
clsfparams = numpy.array(
    [(sigmoid.a_, sigmoid.b_) for sigmoid in _fold_sigmoids])

In [153]:
# Average of column 0 (the calibrators' a_ values) across the folds.
clsfparams[:, 0].mean()

-2.0472903800278957

In [154]:
# Average of column 1 (the calibrators' b_ values) across the folds.
clsfparams[:, 1].mean()

-0.17993980284865016

In [145]:
# Fit the uncalibrated ridge classifier on every row of X.
pipe.fit(X, y)

RidgeClassifier(alpha=0.09993161695315884, class_weight=None, copy_X=True,
        fit_intercept=True, max_iter=None, normalize=True,
        random_state=None, solver='auto', tol=0.001)

In [146]:
# Fitted coefficients. X comes from featurizer.fit_transform, so they
# presumably follow the order of featurizer.transformed_names_.
pipe.coef_

array([[ 0.03671798, -0.09451312, -0.04866814, -0.03671798,  0.09451312,
         0.04866814]])

In [148]:
# Output column names of the featurizer, for reading the coefficients.
featurizer.transformed_names_

['def_away', 'hfa_away', 'off_away', 'def_home', 'hfa_home', 'off_home']

In [156]:
# First row of the feature matrix, as a spot check of the values.
X[0, :]

array([ 4.27,  0.  , 23.41,  2.73,  2.71, 27.68])

In [157]:
# Calibrated class probabilities for the same rows the model was fit on.
# NOTE(review): this displays the whole array; consider slicing it (as the
# decision_function cell does with X[:10, :]) to keep the output short.
clf_sigmoid.predict_proba(X)

array([[0.25914888, 0.74085112],
       [0.30107684, 0.69892316],
       [0.3969018 , 0.6030982 ],
       [0.28296128, 0.71703872],
       [0.48383918, 0.51616082],
       [0.43073678, 0.56926322],
       [0.26118032, 0.73881968],
       [0.34063955, 0.65936045],
       [0.42961343, 0.57038657],
       [0.31073742, 0.68926258],
       [0.3934468 , 0.6065532 ],
       [0.47850972, 0.52149028],
       [0.25870418, 0.74129582],
       [0.25695891, 0.74304109],
       [0.43499305, 0.56500695],
       [0.42789386, 0.57210614],
       [0.4210367 , 0.5789633 ],
       [0.1911304 , 0.8088696 ],
       [0.402972  , 0.597028  ],
       [0.33716305, 0.66283695],
       [0.34622906, 0.65377094],
       [0.40816317, 0.59183683],
       [0.37847315, 0.62152685],
       [0.35725321, 0.64274679],
       [0.27534236, 0.72465764],
       [0.45357863, 0.54642137],
       [0.45386358, 0.54613642],
       [0.36332056, 0.63667944],
       [0.34505367, 0.65494633],
       [0.50492355, 0.49507645],
       [0.

In [107]:
# Uncalibrated comparison: the ridge decision scores of the first 10 rows,
# squashed through logistic().
logistic(pipe.decision_function(X[:10, :]))

array([0.62726215, 0.60451023, 0.55047125, 0.62050078, 0.49786658,
       0.52988922, 0.63163079, 0.5899702 , 0.53154812, 0.59826773])

In [147]:
# Output column names of the featurizer (this display is repeated in
# another cell of the notebook).
featurizer.transformed_names_

['def_away', 'hfa_away', 'off_away', 'def_home', 'hfa_home', 'off_home']

In [64]:
# Fitted coefficients of `pipe`.
# NOTE(review): the stored output here differs from the other coef_ cell, so
# it presumably came from a different fit (execution counts are out of
# order) -- re-run the notebook top to bottom to make the outputs agree.
pipe.coef_

array([[ 0.04151575, -0.09884618, -0.05349901, -0.04151575,  0.09884618,
         0.05349901]])

In [76]:
# Manual recomputation of the scores as X . coef_ (no intercept term is
# added here, unlike pipe.decision_function), squashed through logistic().
# NOTE(review): this displays the whole array; consider slicing the result.
logistic(numpy.dot(X, pipe.coef_.reshape(-1, 1))[:, 0])

array([0.63650996, 0.61175472, 0.55299253, 0.6265383 , 0.49491411,
       0.52733217, 0.64009533, 0.59454379, 0.52953813, 0.60426524,
       0.55676592, 0.49797992, 0.64580842, 0.64720329, 0.52694077,
       0.52792399, 0.53003192, 0.71101147, 0.54996973, 0.5893482 ,
       0.57601697, 0.54579963, 0.57071236, 0.56894237, 0.62642968,
       0.50956796, 0.50737336, 0.57842916, 0.58210116, 0.48169533,
       0.51458132, 0.58537627, 0.57672749, 0.6225241 , 0.50682946,
       0.63470253, 0.44323643, 0.62369701, 0.57817807, 0.60755674,
       0.5901203 , 0.57253334, 0.55745155, 0.61120584, 0.57096694,
       0.5330553 , 0.62764714, 0.63912915, 0.54050477, 0.45550464,
       0.49585672, 0.62616837, 0.56361034, 0.49465346, 0.44738443,
       0.62286996, 0.46385456, 0.37313795, 0.53749659, 0.61908976,
       0.45064743, 0.63636239, 0.5993137 , 0.53522191, 0.59258387,
       0.62177377, 0.67745366, 0.64076191, 0.63072916, 0.54723483,
       0.55412109, 0.68299862, 0.51959274, 0.59350165, 0.64767