# Prepare
---

In [None]:
%load_ext autoreload
%autoreload 2

import sys
from pathlib import Path
# Append the parent of the current working directory to the import path
# (presumably the checkout root that holds the `alphabetter` package) unless
# it is already listed there.
alphabetter_path = Path('').resolve().parent
if sys.path.count(str(alphabetter_path)) == 0:
    sys.path.append(str(alphabetter_path))

import logging
from alphabetter.config import default as config
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from alphabetter.ml import *
from tqdm.notebook import tqdm
from datetime import date, datetime
from pathlib import Path
from pprint import pprint
from IPython.display import display, JSON

# Notebook-wide settings.
# `random_state` is passed to every ShuffleSplitter created in this notebook;
# None presumably leaves the splits unseeded -- confirm against ShuffleSplitter.
random_state = None
# Use tqdm's notebook progress bar as the progress bar class in the config.
config.progress_bar_class = tqdm

# Raise the root logger's threshold to WARNING.
logger = logging.getLogger()
logger.setLevel(logging.WARNING)

# Match dataset
---

## Select a dataset of matches

In [None]:
# Build the working dataset: the result of `select_dataset()` after
# `match.drop_without_points()` has been applied to it.
df = select_dataset().match.drop_without_points()

## Save the dataset of matches

In [None]:
# Persist the match dataset and print whatever `save_match_dataset` returns
# (presumably the location written -- confirm).
print(save_match_dataset(df))

# Outcome predictor
---

## Read a dataset of matches

In [None]:
# Rebind `df` to the dataset returned by `read_match_dataset()`, so this
# section can be run without re-running the selection cells above.
df = read_match_dataset()

## Describe the dataset of matches

In [None]:
# Summary table: a single unnamed column holding the output of
# `summarize_dataset`, with the index labelled "Dataset".
display(
    pd.DataFrame({'': summarize_dataset(df)})
    .rename_axis(index='Dataset')
)
# Pie chart of the number of matches per league; the trailing semicolon keeps
# the notebook from echoing the returned Axes object.
(
    df.groupby('match.league')
    .size()
    .plot.pie(title='Matches by league', cmap='tab20', legend=False)
);

## Create an outcome predictor

In [None]:
from sklearn.linear_model import SGDClassifier

# Outcome predictor: league and team features (the latter with
# `encode_venue=True`) fed into an SGD-trained linear classifier that uses
# the logistic loss and an L1 penalty.
predictor = SKLearnPredictor(
    classifier=SGDClassifier(penalty='l1', loss='log_loss'),
    features=[LeagueFeatures(), TeamFeatures(encode_venue=True)],
)

## Validate the outcome predictor

In [None]:
# Cross-validate the predictor: 8 splits produced by a ShuffleSplitter
# (test_frac=0.5, train_draws=True, RareTeamDropper(min_matches=10)) seeded
# with the notebook-wide `random_state`.
predictor_cv = PredictorCrossValidator(
    splitter=ShuffleSplitter(
        train_draws=True,
        test_frac=0.5,
        droppers=[
            RareTeamDropper(min_matches=10),
        ],
        random_state=random_state,
    ),
    n_splits=8,
    median_split_metrics='accuracy',
)
# The validator returns a frame of per-split scores and a predicted dataset
# (given `median_split_metrics`, presumably the split with median accuracy --
# confirm against PredictorCrossValidator).
predictor_score_df, predicted_df = predictor_cv(predictor, df)
display(predictor_score_df.describe().T.drop(columns='count').round(4))

fig, ax = plt.subplots(figsize=(6, 3), dpi=90)
predictor_score_df.accuracy.plot.hist(ax=ax)
# Label the axis only AFTER plotting: pandas applies its own axis labels once
# the histogram is drawn, which can wipe a label set beforehand. The other
# histogram cells in this notebook also label after plotting.
ax.set_xlabel('accuracy')
print(f'Median dataset prediction accuracy: {predicted_df.prediction.accuracy():.4f}')

## Save the outcome predictor

In [None]:
# Fit the predictor on the whole match dataset `df`, then save it and print
# whatever `predictor.save()` returns.
predictor.fit(df)
print(predictor.save())

## Save the dataset of predicted matches

In [None]:
# Persist the predicted dataset produced by the cross-validation cell and print
# whatever `save_predicted_match_dataset` returns.
print(save_predicted_match_dataset(predicted_df))

# Better
---

## Read a dataset of predicted matches

In [None]:
# Rebind `predicted_df` to the stored predicted dataset after `odds.dropna()`
# (presumably discards matches that have no odds -- confirm).
predicted_df = read_predicted_match_dataset().odds.dropna()

## Describe the dataset of predicted matches

In [None]:
# Summary table of the predicted matches, index labelled "Dataset".
display(
    pd.DataFrame({'': summarize_dataset(predicted_df)})
    .rename_axis(index='Dataset')
)

# One row per league: accuracy of the predictions next to accuracy of the odds.
prediction_df = (
    predicted_df.groupby('match.league')
    .apply(
        lambda league_df: pd.Series({
            'Prediction accuracy': league_df.prediction.accuracy(),
            'Odds accuracy': league_df.odds.accuracy(),
        })
    )
    .rename_axis(index=None)
)
# Positive advantage means the predictions were right more often than the odds.
prediction_df['Prediction advantage'] = (
    prediction_df['Prediction accuracy'] - prediction_df['Odds accuracy']
)
display(prediction_df.sort_values('Prediction advantage', ascending=False))

# Pie chart of predicted matches per league (semicolon hides the Axes repr).
(
    predicted_df.groupby('match.league')
    .size()
    .plot.pie(title='Predicted matches by league', cmap='tab20', legend=False)
);

## Drop leagues with low prediction accuracy advantage

In [None]:
# Leagues that remain eligible for betting.
bet_leagues = {
    'Austrian Football Bundesliga',
    'Belgian Pro League',
    'Bundesliga',
    'EFL Championship',
    'Eredivisie',
    'La Liga',
    'Ligue 1',
    'Premier League',
    'Premier League Russia',
    'Primeira Liga',
    'Scottish Premier League',
    'Segunda División',
    'Serbian SuperLiga',
    'Serie A',
    'Super League Greece',
}

# Step 1: apply the allow-list of leagues.
league_dropper = LeagueDropper(allowed_leagues=bet_leagues)
predicted_df = league_dropper.drop(predicted_df)

# Step 2: fit the advantage-based dropper on what is left, then apply it.
# The positional 0.0 is presumably the minimum prediction advantage -- confirm.
# NOTE(review): the class name is spelled "Droppeer"; verify that this matches
# the name exported by alphabetter.ml before renaming anything.
lpal_dropper = LowPredictionAdvantageLeagueDroppeer(0.0)
lpal_dropper.fit(predicted_df)
predicted_df = lpal_dropper.drop(predicted_df)

## Create a better

In [None]:
# Betting strategy under validation. Parameter meanings are defined by
# OPCBetter in alphabetter.ml; `outcomes` lists the outcome codes it is given
# to choose from.
better = OPCBetter(
    outcomes=['1', 'X', '2', '1X', '2X'],
    bet_rate=0.05,
    accuracy_factor=2,
    expediency_contrast=3,
)

## Validate the better

In [None]:
# Cross-validate the better on 50 shuffled splits (test_frac=0.7), seeded with
# the notebook-wide `random_state`.
better_cv = BetterCrossValidator(
    n_splits=50,
    median_split_metrics='roi_per_week',
    splitter=ShuffleSplitter(random_state=random_state, test_frac=0.7),
)
better_score_df, bet_df = better_cv(better, predicted_df.odds.dropna())

# One histogram per metric on a 2x2 grid sharing the y axis.
metrics_list = ['roi', 'roi_per_week', 'bets_per_month', 'win_rate']
fig, axes = plt.subplots(2, 2, figsize=(8, 5.5), dpi=90, sharey=True)
for ax, metrics in zip(axes.flat, metrics_list):
    better_score_df[metrics].plot.hist(ax=ax)
    ax.set_xlabel(metrics)
fig.tight_layout()

# Descriptive statistics of the per-split scores, without the `count` column.
display(
    better_score_df.describe()
    .T.drop(columns='count')
    .round(4)
)

## Describe the dataset of bet matches

In [None]:
# Per-league bet summary over the rows kept by `bet.drop_null()`:
# bet count, ROI per bet, and their product.
league_summary_df = (
    bet_df.bet.drop_null()
    .groupby('match.league')
    .apply(
        lambda league_bets: pd.Series({
            'N': league_bets.bet.count(),
            'ROI': league_bets.bet.roi_per_bet(),
            'Total ROI': league_bets.bet.roi_per_bet() * league_bets.bet.count(),
        })
    )
)

# Figure 1: two pies -- bets by league (left) and bets by outcome (right).
fig, axes = plt.subplots(1, 2, figsize=(8, 8))
league_summary_df['N'].rename(None).plot.pie(ax=axes[0], title='Bets by league', cmap='tab20');
# Leagues without any bet are removed only after the league pie is drawn.
league_summary_df = league_summary_df.query('N != 0')
(
    bet_df.bet.drop_null()
    .groupby('bet.outcome')
    .size()
    .plot.pie(ax=axes[1], title='Bets by outcome', legend=False)
);
fig.tight_layout()

# Figure 2: horizontal bars per league; figure height grows with league count.
fig, axes = plt.subplots(1, 2, figsize=(8, 2 + len(league_summary_df) / 8), dpi=90, sharey=True)
for bar_ax, column in zip(axes, ('ROI', 'Total ROI')):
    league_summary_df.rename_axis(index=None).plot.barh(
        y=column, title=column, grid=True, legend=False, ax=bar_ax)

## Save the better

In [None]:
# Fit the better on the predicted dataset after `odds.dropna()`, then save it
# and print whatever `better.save()` returns.
better.fit(predicted_df.odds.dropna())
print(better.save())

## Save the bet dataset

In [None]:
# Persist the bet dataset returned by the better cross-validation and print
# whatever `save_bet_match_dataset` returns.
print(save_bet_match_dataset(bet_df))

# Accountant
---

## Read a dataset of bet matches

In [None]:
# Rebind `bet_df` to the stored bet dataset; the summary frame on the last
# line is the cell's value, so the notebook renders it as the cell output.
bet_df = read_bet_match_dataset()
pd.DataFrame({'': summarize_dataset(bet_df)})

## Create an accountant

In [None]:
# Money-management model under validation. Parameter semantics are defined by
# ParametricAccountant in alphabetter.ml; the arguments are grouped here by
# the kind of quantity their names suggest.
accountant = ParametricAccountant(
    # funds
    capital=10_000,
    credit=5_000,
    # stake bounds
    min_investment=300,
    min_investment_fraction=0.05,
    max_investment_fraction=0.15,
    # shape parameter
    alpha=0.5,
)

## Validate the accountant

In [None]:
# Cross-validate the accountant on 200 shuffled splits (test_frac=1), seeded
# with the notebook-wide `random_state`.
accountant_cv = AccountantCrossValidator(
    n_splits=200,
    median_split_metrics='qoc_months',
    splitter=ShuffleSplitter(random_state=random_state, test_frac=1),
)
accountant_score_df, accounted_df = accountant_cv(accountant, bet_df)

# One histogram per metric on a 2x2 grid sharing the y axis.
metrics_list = ['roi', 'annual_roc', 'doc_months', 'qoc_months']
fig, axes = plt.subplots(2, 2, figsize=(8, 5.5), dpi=90, sharey=True)
for ax, metrics in zip(axes.flat, metrics_list):
    scores = accountant_score_df[metrics]
    # Opacity equals the share of splits with a non-missing score, so metrics
    # that are often undefined are drawn fainter.
    scores.plot.hist(ax=ax, alpha=scores.notna().mean())
    ax.set_xlabel(metrics)
fig.tight_layout()

# Descriptive statistics of the per-split scores (rendered as the cell value).
accountant_score_df.describe().T.round(4)

## Save the accountant

In [None]:
# Fit the accountant on the bet dataset, then save it and print whatever
# `accountant.save()` returns.
accountant.fit(bet_df)
print(accountant.save())