In [1]:
from models.logit.base import get_match_result_data

# Load the match results together with the player <-> index lookup tables.
match_data = get_match_result_data()
df, player_mapping, inverse_player_mapping = match_data

In [2]:
# Order matches chronologically. `DataFrame.sort` is deprecated (and removed in
# later pandas releases); `sort_values` is the supported replacement. Rebinding
# instead of sorting in place keeps the cell safe to re-run.
df = df.sort_values('date')

  """Entry point for launching an IPython kernel.


In [3]:
# Time-based split: everything before 2016 is used for training, and the
# calendar year 2016 is held out for validation.
is_train = df['date'] < '2016-01-01'
is_val = (df['date'] >= '2016-01-01') & (df['date'] < '2017-01-01')

# Copy so that columns added later do not write into views of `df`.
train_df = df[is_train].copy()
val_df = df[is_val].copy()

In [None]:
from models.logit.base import get_X_y

# Build the feature matrix and target for each split using the same player mapping.
(train_X, train_y), (val_X, val_y) = (
    get_X_y(split_df, player_mapping) for split_df in (train_df, val_df)
)

In [None]:
%matplotlib inline
import pandas as pd
from matplotlib import pyplot as plt
from models.logit.base import sipko_weights

# Sample weights relative to the most recent training match, plotted by row position.
latest_train_date = train_df['date'].max()
weights = sipko_weights(latest_train_date, train_df, 0.8)
positions = range(len(weights))
plt.plot(positions, weights)

#### Tune flat time, decay weights
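Let's tune the flat time and decay weight passed to `sipko_weights`, scoring each combination by AUC and accuracy on the 2016 validation set.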

In [None]:
import numpy as np
from ml.prior_logit import NonZeroLogit
from sklearn.metrics import roc_auc_score

N_ATTEMPTS = 50
LMBDA = 100.
PRIOR = -2.

# Candidate values for the two weighting hyperparameters.
flat_times = np.linspace(0., 2., 11)
decay_weights = np.linspace(0., 2., 21)

# Every (flat time, decay weight) pair, flat time varying slowest.
param_grid = [(ft, dw) for ft in flat_times for dw in decay_weights]

# The reference date does not change inside the loop, so compute it once.
reference_date = train_df['date'].max()

perfs = []
for ft, dw in param_grid:
    print(ft, dw)
    weights = sipko_weights(reference_date, train_df, dw, flat_time=ft)

    # Fit on the weighted training data with fixed regularisation, prior and seed.
    nzl = NonZeroLogit(lmbda=LMBDA, prior=PRIOR, seed=10)
    nzl.fit(train_X, train_y, sample_weight=weights)

    # Score the validation set with the positive-class probability.
    val_preds = nzl.predict_proba(val_X)[:, 1]
    predicted_label = (val_preds > 0.5).astype(int)
    auc = roc_auc_score(val_df['y'], val_preds)
    accuracy = (val_df['y'] == predicted_label).mean()

    # Record order matches the columns used downstream: decay weight, flat time, auc, accuracy.
    record = (dw, ft, auc, accuracy)
    perfs.append(record)

In [None]:
# One row per (decay weight, flat time) combination tried in the grid search.
perf_df = pd.DataFrame(
    perfs,
    columns=['disc', 'ft', 'auc', 'accuracy'],
)

# Best configurations first. `DataFrame.sort` is deprecated/removed, so use
# `sort_values`.
perf_df.sort_values('auc', ascending=False).head()

In [None]:
# Best decay weights for a flat time of 1. The `ft` values come from
# np.linspace, so compare with a tolerance rather than exact float equality.
# `sort_values` replaces the deprecated/removed `DataFrame.sort`.
perf_df[np.isclose(perf_df['ft'], 1.)].sort_values('auc', ascending=False).head()

In [None]:
# Validation AUC against the decay weight (the `disc` column), one point per grid cell.
plt.scatter(perf_df['disc'], perf_df['auc'])
plt.xlabel('decay weight (disc)')
plt.ylabel('validation AUC')

In [None]:
# Validation AUC against the flat time (the `ft` column), one point per grid cell.
plt.scatter(perf_df['ft'], perf_df['auc'])
plt.xlabel('flat time (ft)')
plt.ylabel('validation AUC')

Looks like a discount factor of 0.6 does the trick...

#### Use Final Model

In [None]:
from ml.prior_logit import NonZeroLogit
from models.logit.base import _get_weights
from sklearn.metrics import roc_auc_score

# Final model and its sample weights.
# NOTE(review): the tuning cell used lmbda=100. with seed=10 and weights from
# sipko_weights; here lmbda=2., no seed, and _get_weights with a 365-day
# half-life are used. Confirm this is intended.
nzl = NonZeroLogit(lmbda=2., prior=-2.)

latest_train_date = train_df['date'].max()
weights = _get_weights(latest_train_date, train_df, halflife=365.)

In [None]:
# Fit the final model on the weighted training data.
nzl.fit(
    train_X,
    train_y,
    sample_weight=weights,
)

In [None]:
# Keep only the second column of predict_proba (the positive-class probability).
val_proba = nzl.predict_proba(val_X)
preds = val_proba[:, 1]

In [None]:
from sklearn.isotonic import IsotonicRegression

# Store the raw (uncalibrated) model probabilities on both splits.
train_proba = nzl.predict_proba(train_X)
train_df['pred'] = train_proba[:, 1]
val_df['pred'] = preds

In [None]:
# Calibrate the raw probabilities with isotonic regression fitted on the
# training predictions. By default IsotonicRegression returns NaN for inputs
# outside the range seen during fit; validation scores can fall outside the
# training score range, so clip them to the nearest fitted value instead.
iso = IsotonicRegression(out_of_bounds='clip')
iso.fit(train_df['pred'], train_df['y'])

In [None]:
# Map the raw validation probabilities through the fitted isotonic calibrator.
calibrated_val = iso.predict(val_df['pred'])
val_df['cal_pred'] = calibrated_val

In [None]:
from sklearn.metrics import roc_auc_score

# AUC of the raw (uncalibrated) probabilities on the validation set.
roc_auc_score(
    y_true=val_df['y'],
    y_score=val_df['pred'],
)

In [None]:
# Accuracy of the calibrated probabilities at a 0.5 decision threshold.
predicted_win = val_df['cal_pred'] > 0.5
accuracy = (val_df['y'] == predicted_win).mean()
accuracy

In [None]:
%matplotlib inline
from sklearn.calibration import calibration_curve
from matplotlib import pyplot as plt

# calibration_curve returns (fraction of positives, mean predicted probability)
# per bin, in that order. A reliability diagram puts the predicted probability
# on the x-axis and the observed frequency on the y-axis.
frac_positive, mean_predicted = calibration_curve(val_df['y'], val_df['cal_pred'])

plt.plot(mean_predicted, frac_positive, marker='o', label='calibrated model')
# Diagonal reference: a perfectly calibrated model lies on y = x.
plt.plot([0., 1.], [0., 1.], linestyle='--', label='perfect calibration')
plt.xlabel('mean predicted probability')
plt.ylabel('fraction of positives')
plt.legend()

#### Evaluate Betting Performance 

In [None]:
def evaluate_betting(val_df, buff=0):
    """Simulate flat one-unit bets driven by the calibrated predictions.

    A bet is placed on a player whenever the probability implied by the
    decimal odds (1 / odds) is lower than the calibrated probability of that
    player winning, minus ``buff``. Both players can be backed in the same
    match; each bet costs one unit and a winning bet returns its odds.

    Parameters
    ----------
    val_df : pandas.DataFrame
        Must contain the columns ``p1_odds``, ``p2_odds``, ``cal_pred`` (used
        as the probability that player 1 wins) and ``y`` (1 when player 1 won,
        0 otherwise). The boolean columns ``bet1`` and ``bet2`` are written to
        this frame as a side effect.
    buff : float, optional
        Extra edge required over the implied probability before betting.

    Returns
    -------
    tuple
        ``(total_profit, profit_over_time)`` where ``profit_over_time`` is the
        cumulative profit in row order.
    """
    val_df['bet1'] = (1. / val_df['p1_odds']) < (val_df['cal_pred'] - buff)
    val_df['bet2'] = (1. / val_df['p2_odds']) < (1. - val_df['cal_pred'] - buff)

    # Work with numeric stakes: adding two boolean Series is a logical OR, which
    # would charge only one unit for a match where both players are backed.
    stake1 = val_df['bet1'].astype(float)
    stake2 = val_df['bet2'].astype(float)

    bet_revenues = (
        val_df['p1_odds'] * stake1 * val_df['y'] +
        val_df['p2_odds'] * stake2 * (1. - val_df['y'])
    )
    bet_spending = stake1 + stake2
    profit_over_time = bet_revenues.cumsum() - bet_spending.cumsum()

    return bet_revenues.sum() - bet_spending.sum(), profit_over_time

In [None]:
# Simulate betting on the validation set and plot the cumulative profit.
betting_result = evaluate_betting(val_df)
units_won, over_time = betting_result
plt.plot(over_time)

In [None]:
# Display the total profit (in units) returned by evaluate_betting.
units_won

#### Compare to always betting on player 1

Let's make sure we lose money if we ignore the model and blindly back the same side in every match.

In [None]:
# Baseline: always back player 1. evaluate_betting uses `cal_pred` as the
# probability that player 1 wins, so forcing it to 1 sets bet1 whenever
# 1 / p1_odds < 1 and never sets bet2. (A value of 0 would do the opposite
# and always back player 2.)
new_val_df = val_df.copy()
new_val_df['cal_pred'] = 1.

# NOTE: this rebinds `units_won` / `over_time` from the model evaluation.
units_won, over_time = evaluate_betting(new_val_df)
plt.plot(over_time)

Cool, we lose a ton of money by betting blindly without using the model.

In [None]:
import numpy as np

# Sum of the probabilities implied by the two players' decimal odds.
implied_p1 = 1 / val_df['p1_odds']
implied_p2 = 1. / val_df['p2_odds']
val_df['tot_probs'] = implied_p1 + implied_p2

In [None]:
# Matches whose implied probabilities sum to less than 1, selected with a
# single .loc call instead of chained indexing.
inspect_cols = ['p1_odds', 'p2_odds', 'winner', 'loser', 'maxw', 'maxl', 'tot_probs']
val_df.loc[val_df['tot_probs'] < 1., inspect_cols]