# Bet before game

In [1]:
import bz2
import datetime
import glob
import json
import os
import tarfile
import zipfile
from typing import List
from typing import Tuple, List, Dict
from unittest.mock import patch

import betfairlightweight
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from betfairlightweight import filters
from scipy.stats import poisson

In [2]:
%matplotlib inline

In [3]:
# Change this certs path to wherever you're storing your certificates
certs_path = '/Users/guillaume_baquiast/Documents/tmp'

# Change these login details to your own
# NOTE(review): the values below are placeholders; avoid committing real
# credentials to the notebook.
my_username = "your_email"
my_password = "your_password"
my_app_key = "your_app_key"

# API client shared by the cells below through the global name `trading`.
# NOTE(review): no login call is visible in this notebook; presumably none is
# needed to replay historical files -- confirm.
trading = betfairlightweight.APIClient(username=my_username,
                                       password=my_password,
                                       app_key=my_app_key,
                                       certs=certs_path)

# Stream listener shared by the cells below through the global name `listener`.
# NOTE(review): max_latency=None presumably turns off latency checks, which
# would be meaningless when replaying historical data -- confirm against the
# betfairlightweight documentation.
listener = betfairlightweight.StreamListener(max_latency=None)

# Functions

In [4]:
# loading from tar and extracting files
def load_markets(file_paths: List[str]):
    """Yield bz2-decompressing file objects for every market file found.

    Each entry of ``file_paths`` may be:

    * a directory, searched recursively for ``*.bz2`` files;
    * a ``.tar`` archive, whose regular-file members are yielded;
    * a ``.zip`` archive, whose non-directory entries are yielded.

    Entries that are neither a directory nor a ``.tar``/``.zip`` file are
    skipped silently.

    Every yielded file object is closed as soon as the consumer asks for the
    next one (or abandons the generator), so it must be read inside the
    loop body that receives it.

    Args:
        file_paths: Directories and/or archive paths to scan.

    Yields:
        Binary file objects returning the decompressed content of one file.
    """
    for file_path in file_paths:
        if os.path.isdir(file_path):
            # Join with a real path separator: plain string concatenation
            # turned "dir" + "**/..." into the pattern "dir*", which also
            # matched sibling directories sharing the same prefix.
            pattern = os.path.join(file_path, '**', '*.bz2')
            for path in glob.iglob(pattern, recursive=True):
                with bz2.BZ2File(path, 'rb') as f:
                    yield f
        elif os.path.isfile(file_path):
            ext = os.path.splitext(file_path)[1]
            # iterate through a tar archive
            if ext == '.tar':
                with tarfile.TarFile(file_path) as archive:
                    for member in archive:
                        # extractfile() returns None for directories and
                        # other non-regular members; skip them.
                        if not member.isfile():
                            continue
                        with bz2.open(archive.extractfile(member)) as f:
                            yield f
            # or a zip archive
            elif ext == '.zip':
                with zipfile.ZipFile(file_path) as archive:
                    for info in archive.infolist():
                        if info.is_dir():
                            continue
                        with archive.open(info) as raw, bz2.open(raw) as f:
                            yield f


def retrieve_game_data(market_paths: List) -> Dict:
    """Replay historical market files and flatten them into column lists.

    Every file produced by ``load_markets(market_paths)`` is streamed through
    the global ``trading`` client and ``listener``. One row is recorded per
    runner of every market book update.

    Args:
        market_paths: Paths forwarded unchanged to ``load_markets``.

    Returns:
        A dict mapping each column name to a list of values, all lists having
        the same length (one entry per runner per update).
    """
    column_names = (
        "event_name",
        "event_id",
        "market_type",
        "market_time",
        "open_date",
        "market_id",
        "publish_time",
        "runner_name",
        "ltp",
        "total_matched",
        "in_play",
    )
    data_dict = {column: [] for column in column_names}

    for file_obj in load_markets(market_paths):
        stream = trading.streaming.create_historical_generator_stream(
            file_path=file_obj,
            listener=listener,
        )

        # While patched, open(f, mode) simply returns f. Presumably this lets
        # the stream consume the already-open file object handed over as
        # ``file_path`` -- confirm against the betfairlightweight version in use.
        with patch("builtins.open", lambda f, _: f):
            gen = stream.get_generator()

            for market_books in gen():
                for market_book in market_books:
                    definition = market_book.market_definition
                    for position, runner in enumerate(market_book.runners):
                        # The runner's name is looked up by position in the
                        # market definition's runner list.
                        row = {
                            "event_name": definition.event_name,
                            "event_id": definition.event_id,
                            "market_type": definition.market_type,
                            "market_time": definition.market_time,
                            "open_date": definition.open_date,
                            "market_id": market_book.market_id,
                            "publish_time": market_book.publish_time,
                            "runner_name": definition.runners[position].name,
                            "ltp": runner.last_price_traded,
                            "total_matched": runner.total_matched,
                            "in_play": market_book.inplay,
                        }
                        for column, value in row.items():
                            data_dict[column].append(value)

    return data_dict


def preprocess_game_data(data_dict: Dict) -> pd.DataFrame:
    """Build a dense per-tick price table from the raw column lists.

    Steps:

    1. Build a DataFrame from ``data_dict`` and drop exact duplicate rows.
    2. Create one row for every (publish_time, market_type, runner_name)
       combination, so each runner has a row at every tick.
    3. Forward-fill ``ltp`` in time within each (market_type, runner_name).
    4. Derive ``odd`` (``ltp - 1``) and ``proba`` (``1 / ltp``).

    Only ``ltp`` is forward-filled; the other columns stay NaN on rows that
    were created by step 2.

    Args:
        data_dict: Column lists containing at least ``publish_time``,
            ``market_type``, ``runner_name`` and ``ltp``.

    Returns:
        The table sorted by market_type, runner_name and publish_time.
    """
    # Create dataframe from data dict
    df_game = pd.DataFrame(data_dict).drop_duplicates()

    # Ensure to have all information at each tick: cross join of every tick
    # with every (market, runner) pair through a constant helper key.
    df_time_ids = df_game[["publish_time"]].drop_duplicates().assign(key=1)
    df_market_ids = (
        df_game[["market_type", "runner_name"]].drop_duplicates().assign(key=1)
    )
    df_ids = df_time_ids.merge(df_market_ids, on="key", how="inner").drop(columns="key")

    df_game = df_ids.merge(df_game, on=["publish_time", "market_type", "runner_name"], how="left")

    # Sort and fill na
    df_game = df_game.sort_values(["market_type", "runner_name", "publish_time"])
    # GroupBy.ffill() replaces the deprecated fillna(method="ffill") form.
    df_game["ltp"] = df_game.groupby(["market_type", "runner_name"])["ltp"].ffill()

    # Add odd and proba
    df_game["odd"] = df_game["ltp"] - 1
    df_game["proba"] = 1 / df_game["ltp"]

    return df_game

In [5]:
def get_lambdas_for_proba_score(
    proba_00: float, proba_10: float
) -> Tuple[float, float]:
    """Recover the two Poisson goal rates from the 0-0 and 1-0 probabilities.

    With independent Poisson goal counts of rates ``lambda_1`` and
    ``lambda_2``:

        P(0-0) = exp(-(lambda_1 + lambda_2))
        P(1-0) = lambda_1 * exp(-(lambda_1 + lambda_2))

    hence ``lambda_1 = P(1-0) / P(0-0)`` and
    ``lambda_2 = -log(P(0-0)) - lambda_1``.

    Note that ``lambda_2`` comes out negative when the two input
    probabilities are not consistent with this model; the value is returned
    as computed.

    Args:
        proba_00: Probability of the score 0 - 0; must be strictly positive.
        proba_10: Probability of the score 1 - 0.

    Returns:
        ``(lambda_1, lambda_2)``.

    Raises:
        ValueError: If ``proba_00`` is not strictly positive (including NaN).
    """
    # ``not x > 0`` is also true for NaN, unlike ``x <= 0``.
    if not proba_00 > 0:
        raise ValueError(
            f"proba_00 must be strictly positive, got {proba_00!r}"
        )

    lambda_1 = proba_10 / proba_00
    lambda_2 = -np.log(proba_00) - lambda_1

    return lambda_1, lambda_2


def get_proba_score(
    score: Tuple,
    lambdas: Tuple,
) -> float:
    """Return the probability of an exact score under independent Poissons.

    Args:
        score: Goal counts, one per team.
        lambdas: Poisson rates, aligned element-wise with ``score``.

    Returns:
        The product of the per-team Poisson probabilities.
    """
    # np.prod replaces the alias np.product, which was removed in NumPy 2.0.
    return np.prod(poisson.pmf(k=score, mu=lambdas, loc=0))


def get_proba_scores_matrix(lambdas: Tuple, max_goals: int = 9) -> pd.DataFrame:
    """Tabulate the probability of every exact score on a square grid.

    Goal counts of the two teams are modelled as independent Poisson
    variables, so the probability of a score is the product of the two
    per-team probabilities.

    Args:
        lambdas: ``(lambda_1, lambda_2)``, the Poisson rate of each team.
        max_goals: Number of goal counts considered per team; scores run from
            0 to ``max_goals - 1``. Defaults to 9 (scores 0..8).

    Returns:
        A DataFrame with columns ``score_1``, ``score_2`` and ``proba`` and
        ``max_goals ** 2`` rows, ordered by ``score_1`` then ``score_2``.
    """
    lambda_1, lambda_2 = lambdas
    goals = np.arange(max_goals)

    # One pmf vector per team; their outer product is the full score grid.
    pmf_1 = poisson.pmf(k=goals, mu=lambda_1, loc=0)
    pmf_2 = poisson.pmf(k=goals, mu=lambda_2, loc=0)

    scores = [
        (score_1, score_2)
        for score_1 in range(max_goals)
        for score_2 in range(max_goals)
    ]

    return pd.DataFrame(
        {
            "score_1": [score_1 for score_1, _ in scores],
            "score_2": [score_2 for _, score_2 in scores],
            # Row-major flattening matches the (score_1, score_2) order above.
            "proba": np.outer(pmf_1, pmf_2).ravel(),
        }
    )
            

def get_proba_match(df_proba_scores: pd.DataFrame) -> pd.DataFrame:
    """Aggregate exact-score probabilities into the three match outcomes.

    Args:
        df_proba_scores: Table with columns ``score_1``, ``score_2`` and
            ``proba``.

    Returns:
        A three-row DataFrame with columns ``outcome`` and ``proba``, in the
        order ``win_1``, ``win_2``, ``draw``.
    """
    goals_1 = df_proba_scores["score_1"]
    goals_2 = df_proba_scores["score_2"]
    proba = df_proba_scores["proba"]

    # Each outcome is the total probability of the scores that produce it.
    outcome_proba = {
        "win_1": proba[goals_1 > goals_2].sum(),
        "win_2": proba[goals_2 > goals_1].sum(),
        "draw": proba[goals_1 == goals_2].sum(),
    }

    return pd.DataFrame(
        {
            "outcome": list(outcome_proba.keys()),
            "proba": list(outcome_proba.values()),
        }
    )


def get_proba_under_k_goals(df_proba_scores: pd.DataFrame):
    """Derive over/under total-goals probabilities from exact-score ones.

    For every ``k`` from 0 to 8, the "Under k.5 Goals" probability is the sum
    of ``proba`` over the scores whose total is at most ``k``; the matching
    "Over k.5 Goals" probability is its complement to 1.

    Args:
        df_proba_scores: Table with columns ``score_1``, ``score_2`` and
            ``proba``.

    Returns:
        A DataFrame with columns ``market_type``, ``runner_name`` and
        ``proba``: two rows per ``k``, "Under" first, then "Over".
    """
    total_goals = df_proba_scores["score_1"] + df_proba_scores["score_2"]

    rows = []
    for k in range(9):
        under = df_proba_scores.loc[total_goals <= k, "proba"].sum()
        market_type = f"OVER_UNDER_{k}5"
        rows.append((market_type, f"Under {k}.5 Goals", under))
        rows.append((market_type, f"Over {k}.5 Goals", 1 - under))

    return pd.DataFrame(rows, columns=["market_type", "runner_name", "proba"])

In [6]:
def plot_proba_through_time(
    df_game: pd.DataFrame,
    df_game_filter: pd.Series = None,
    date_time_start_game: str = None,
    x_lim_hours_start_game: Tuple = (5, 2),
    y_lim: Tuple = (0, 1.1),
    title: str = None,
    plot_game_time_limits: bool = True,
    goal_times: List = None,
    arbitrage_bound: float = None,
):
    """Plot the ``proba`` column against ``publish_time`` with game markers.

    Args:
        df_game: Table with at least ``publish_time`` and ``proba`` columns.
        df_game_filter: Optional boolean mask selecting the rows to plot;
            all rows are plotted when omitted.
        date_time_start_game: Kick-off time, in any form accepted by
            ``np.datetime64``. When omitted, everything positioned relative
            to kick-off (x limits, game limits, goal markers) is skipped.
        x_lim_hours_start_game: ``(hours_before, hours_after)`` kick-off used
            as x-axis limits; a falsy value leaves the limits automatic.
        y_lim: y-axis limits.
        title: Plot title.
        plot_game_time_limits: Draw a vertical line at kick-off and another
            105 minutes later (45 + 15 + 45).
        goal_times: Minutes of play at which goals were scored. Minutes
            above 45 are shifted by 15 to account for the break.
        arbitrage_bound: When truthy, draw a horizontal line at 1 and dashed
            lines at ``1 - arbitrage_bound`` and ``1 + arbitrage_bound``.
    """
    # Without a kick-off time np.datetime64(None) would be NaT, which yields
    # invalid axis limits and line positions; treat it as "not available".
    start = None
    if date_time_start_game is not None:
        start = np.datetime64(date_time_start_game)

    xlim = None
    if x_lim_hours_start_game and start is not None:
        xlim = (
            start - np.timedelta64(x_lim_hours_start_game[0], 'h'),
            start + np.timedelta64(x_lim_hours_start_game[1], 'h'),
        )

    df_plot = df_game if df_game_filter is None else df_game[df_game_filter]

    ax = df_plot.plot(
        x="publish_time",
        y="proba",
        xlim=xlim,
        ylim=y_lim,
        figsize=(15, 7)
    )

    if plot_game_time_limits and start is not None:
        ax.axvline(x=start, color="orange", label="game_in")
        ax.axvline(x=start + np.timedelta64(45 + 15 + 45, 'm'), color="orange")

    if start is not None:
        # ``goal_times or []`` covers the None default without sharing a
        # mutable default list between calls.
        for goal_time in goal_times or []:
            if goal_time > 45:
                goal_time += 15
            ax.axvline(x=start + np.timedelta64(goal_time, 'm'), color="grey",
                       linestyle="--")

    if arbitrage_bound:
        ax.axhline(y=1, color="red")
        ax.axhline(y=1 - arbitrage_bound, color="red", linestyle="--")
        ax.axhline(y=1 + arbitrage_bound, color="red", linestyle="--")

    ax.legend()
    ax.set_title(title)
    plt.show()

# Pre game proba

In [7]:
# Metadata table; the cells below read its "event_name", "market_time" and
# "path" columns.
df_metadata = pd.read_csv("/Users/guillaume_baquiast/Documents/tmp/champions_league_metadata.csv")

In [8]:
# Alternative example game, currently disabled:
# event_name = "Barcelona v Paris St-G"
# goal_times = [27, 32, 65, 85, 70]

# Game selected for the exploration below.
event_name = "Real Madrid v Liverpool"
# Parse market_time as day-first dates so it can be used in datetime arithmetic.
df_metadata["market_time"] = pd.to_datetime(df_metadata["market_time"], dayfirst=True)

In [9]:
def get_compare_proba_df(event_name, df_metadata, time_lim=(3, 1)):
    """Compare observed pre-game probabilities with a Poisson score model.

    The observed probability of each runner is the mean of ``proba`` over a
    pre-game time window. The two Poisson goal rates are then fitted on the
    observed CORRECT_SCORE probabilities of "0 - 0" and "1 - 0", and used to
    derive model probabilities for the CORRECT_SCORE, MATCH_ODDS and
    OVER_UNDER_x5 markets.

    Args:
        event_name: Event to analyse, formatted as "<team 1> v <team 2>".
        df_metadata: Table with ``event_name``, ``market_time`` and ``path``
            columns.
        time_lim: ``(hours_start, hours_end)``; observations are averaged
            over ``[kick-off - hours_start, kick-off - hours_end)``. Both
            values must be integers.

    Returns:
        A DataFrame with columns ``market_type``, ``runner_name``,
        ``proba_poisson``, ``market_type_col`` and ``proba_obs``, restricted
        to the runners present in both the model and the observations.

    Raises:
        ValueError: If the event is absent from ``df_metadata`` or if the
            "0 - 0" / "1 - 0" correct-score runners have no observation in
            the time window.
    """
    event_metadata = df_metadata[df_metadata["event_name"] == event_name]
    if event_metadata.empty:
        raise ValueError(f"No metadata found for event {event_name!r}")

    start_game_date_time = event_metadata["market_time"].tolist()[0]
    market_paths = event_metadata["path"].tolist()

    data_dict = retrieve_game_data(market_paths)
    df_game = preprocess_game_data(data_dict)

    # Observation window, expressed in hours before kick-off.
    time_min = np.datetime64(start_game_date_time) - np.timedelta64(time_lim[0], 'h')
    time_max = np.datetime64(start_game_date_time) - np.timedelta64(time_lim[1], 'h')

    filter_time = (df_game["publish_time"] >= time_min) & (df_game["publish_time"] < time_max)

    obs_proba = df_game[filter_time].groupby(["market_type", "runner_name"])["proba"].mean().reset_index()

    def observed_score_proba(runner_name):
        # Observed probability of one exact score, or a clear error if absent.
        is_runner = (
            (obs_proba["market_type"] == "CORRECT_SCORE")
            & (obs_proba["runner_name"] == runner_name)
        )
        values = obs_proba.loc[is_runner, "proba"].tolist()
        if not values:
            raise ValueError(
                f"No CORRECT_SCORE observation for {runner_name!r} in event {event_name!r}"
            )
        return values[0]

    proba_00 = observed_score_proba("0 - 0")
    proba_10 = observed_score_proba("1 - 0")

    lambdas = get_lambdas_for_proba_score(proba_00, proba_10)

    market_columns = ["market_type", "runner_name", "proba"]

    # CORRECT_SCORE
    df_proba_scores_matrix = get_proba_scores_matrix(lambdas)

    df_correct_score = df_proba_scores_matrix.copy()
    df_correct_score["runner_name"] = (
        df_correct_score["score_1"].astype(str) + " - " +
        df_correct_score["score_2"].astype(str)
    )
    df_correct_score["market_type"] = "CORRECT_SCORE"

    # MATCH_ODDS: rows come back as win_1, win_2, draw.
    teams = event_name.split(" v ")
    df_proba_match = get_proba_match(df_proba_scores_matrix)
    df_proba_match["market_type"] = "MATCH_ODDS"
    df_proba_match["runner_name"] = [teams[0], teams[1], "The Draw"]

    # OVER_UNDER_x5
    df_proba_over_under = get_proba_under_k_goals(df_proba_scores_matrix)

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    df_proba_per_market = pd.concat(
        [
            df_correct_score[market_columns],
            df_proba_match[market_columns],
            df_proba_over_under[market_columns],
        ],
        ignore_index=True,
    )

    # Add column for plot colors: all over/under markets share one label.
    df_proba_per_market["market_type_col"] = df_proba_per_market["market_type"]
    df_proba_per_market.loc[df_proba_per_market["market_type_col"].str.contains("OVER_UNDER"), "market_type_col"] = "OVER_UNDER"

    # Merge with obs proba
    df_compare_proba = (
        df_proba_per_market.rename(columns={"proba": "proba_poisson"})
        .merge(obs_proba.rename(columns={"proba": "proba_obs"}), on=["market_type", "runner_name"], how="inner")
    )

    return df_compare_proba

# Get data

In [None]:
# Compute the observed-vs-Poisson comparison for every event and stack the
# per-event tables, tagging each row with the event it belongs to.
compare_proba_frames = []

for event_name in df_metadata["event_name"].unique():
    print(event_name)

    try:
        df_event_compare_proba = get_compare_proba_df(
            event_name=event_name, df_metadata=df_metadata, time_lim=(3, 1)
        )
    # Best effort: a failing event is reported and skipped. Catching
    # Exception rather than everything keeps Ctrl-C able to stop the loop.
    except Exception as error:
        print("Something went wrong")
        print(repr(error))
        continue

    compare_proba_frames.append(df_event_compare_proba.assign(event_name=event_name))

if compare_proba_frames:
    df_compare_proba = pd.concat(compare_proba_frames, ignore_index=True)
else:
    # No event could be processed: keep an empty table with the same columns.
    df_compare_proba = pd.DataFrame(
        columns=[
            "market_type",
            "runner_name",
            "proba_poisson",
            "market_type_col",
            "proba_obs",
            "event_name",
        ]
    )