# Evaluate arbitrage opportunities

In [1]:
import bz2
import datetime
import glob
import json
import os
import tarfile
import zipfile
from typing import Dict, List, Tuple
from unittest.mock import patch

import betfairlightweight
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from betfairlightweight import filters

In [2]:
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In [3]:
# Betfair connection settings.
# Each value is taken from an environment variable when it is set, so real
# credentials do not have to be written into the notebook; the original
# placeholder values are kept as fallbacks.

# Directory where the API certificates are stored (override: BETFAIR_CERTS_PATH)
certs_path = os.environ.get("BETFAIR_CERTS_PATH", '/Users/guillaume_baquiast/Documents/tmp')

# Login details (override: BETFAIR_USERNAME / BETFAIR_PASSWORD / BETFAIR_APP_KEY)
my_username = os.environ.get("BETFAIR_USERNAME", "your_email")
my_password = os.environ.get("BETFAIR_PASSWORD", "your_password")
my_app_key = os.environ.get("BETFAIR_APP_KEY", "your_app_key")

# API client used below to build the historical streams
trading = betfairlightweight.APIClient(
    username=my_username,
    password=my_password,
    app_key=my_app_key,
    certs=certs_path,
)

# Stream listener shared by every historical stream; created with max_latency=None
listener = betfairlightweight.StreamListener(max_latency=None)


In [4]:
# loading from tar and extracting files
def load_markets(file_paths: List[str]):
    """Yield an open, bz2-decompressing binary file object per market file.

    Each entry of ``file_paths`` may be:

    * a directory: every ``*.bz2`` file found at any depth below it is yielded;
    * a ``.tar`` archive: every regular-file member is yielded;
    * a ``.zip`` archive: every non-directory entry is yielded.

    Anything else (missing path, other extension) is silently skipped.

    Every yielded handle is closed as soon as the consumer asks for the next
    item, or when the generator itself is closed, so a handle must be fully
    consumed before advancing the iteration.
    """
    for file_path in file_paths:
        if os.path.isdir(file_path):
            # os.path.join keeps the pattern anchored inside file_path even
            # when the path has no trailing separator (plain concatenation
            # would also match sibling directories sharing the same prefix).
            pattern = os.path.join(file_path, '**', '*.bz2')
            for path in glob.iglob(pattern, recursive=True):
                with bz2.BZ2File(path, 'rb') as f:
                    yield f
        elif os.path.isfile(file_path):
            ext = os.path.splitext(file_path)[1]
            # iterate through a tar archive
            if ext == '.tar':
                with tarfile.TarFile(file_path) as archive:
                    for member in archive:
                        # extractfile() returns None for directories/links
                        if not member.isfile():
                            continue
                        with archive.extractfile(member) as raw, bz2.open(raw) as f:
                            yield f
            # or a zip archive
            elif ext == '.zip':
                with zipfile.ZipFile(file_path) as archive:
                    for name in archive.namelist():
                        # entries ending with "/" are directories
                        if name.endswith('/'):
                            continue
                        with archive.open(name) as raw, bz2.open(raw) as f:
                            yield f

In [5]:
def plot_proba_through_time(
    df_game: pd.DataFrame,
    df_game_filter: pd.Series = None,
    date_time_start_game: str = None,
    x_lim_hours_start_game: Tuple = (5, 2),
    y_lim: Tuple = (0, 1.1),
    title: str = None,
    plot_game_time_limits: bool = True,
    goal_times: List = None,
    arbitrage_bound: float = None,
):
    """Plot the ``proba`` column of ``df_game`` against ``publish_time``.

    Args:
        df_game: frame with at least ``publish_time`` and ``proba`` columns.
        df_game_filter: optional boolean mask selecting the rows to plot;
            all rows are plotted when omitted.
        date_time_start_game: start time of the game, in any form accepted by
            ``np.datetime64``. When omitted, nothing anchored on the start
            time (x-limits, game limits, goal markers) is drawn.
        x_lim_hours_start_game: ``(hours_before, hours_after)`` window around
            the start time used as x-axis limits; falsy disables the limits.
        y_lim: y-axis limits.
        title: figure title.
        plot_game_time_limits: draw a vertical line at the start time and one
            45 + 15 + 45 minutes later.
        goal_times: minutes at which a dashed grey vertical line is drawn;
            values above 45 are shifted by 15 minutes.
        arbitrage_bound: when truthy, draw a horizontal line at 1 and dashed
            lines at ``1 - arbitrage_bound`` and ``1 + arbitrage_bound``.
    """
    # Without a start time every start-anchored element is skipped rather
    # than being placed at NaT.
    kick_off = None if date_time_start_game is None else np.datetime64(date_time_start_game)

    xlim = None
    if x_lim_hours_start_game and kick_off is not None:
        xlim = (
            kick_off - np.timedelta64(x_lim_hours_start_game[0], 'h'),
            kick_off + np.timedelta64(x_lim_hours_start_game[1], 'h'),
        )

    df_to_plot = df_game if df_game_filter is None else df_game[df_game_filter]
    df_to_plot.plot(
        x="publish_time",
        y="proba",
        xlim=xlim,
        ylim=y_lim,
        figsize=(15, 7)
    )

    if plot_game_time_limits and kick_off is not None:
        plt.axvline(x=kick_off, color="orange", label="game_in")
        plt.axvline(x=kick_off + np.timedelta64(45+15+45, 'm'), color="orange")

    if goal_times and kick_off is not None:
        for goal_time in goal_times:
            # minutes above 45 get the same 15-minute offset as the
            # 45 + 15 + 45 game-limit line
            if goal_time > 45:
                goal_time += 15
            plt.axvline(x=kick_off + np.timedelta64(goal_time, 'm'), color="grey",
                        linestyle="--")

    if arbitrage_bound:
        plt.axhline(y=1, color="red")
        plt.axhline(y=1-arbitrage_bound, color="red", linestyle="--")
        plt.axhline(y=1+arbitrage_bound, color="red", linestyle="--")

    plt.legend()
    plt.title(title)
    plt.show()

# Load metadata

In [6]:
# Load the per-game metadata table from CSV
df_metadata = pd.read_csv("/Users/guillaume_baquiast/Documents/tmp/champions_league_metadata.csv")

# Guess time of start of game: 20:00 on the day given by the "date" column.
# The dates are day-first (e.g. "16/02/2021"), so the format is spelled out;
# without it, dates whose day is <= 12 can be read month-first.
df_metadata["market_time"] = pd.to_datetime(
    df_metadata["date"] + " 20:00",
    format="%d/%m/%Y %H:%M",
)

In [7]:
# Preview the first two metadata rows
df_metadata.head(2)

Unnamed: 0,event_name,event_id,market_time,open_date,path,date,league,country
0,Barcelona v Paris St-G,30186199.0,2021-02-16 20:00:00,16/02/2021 20:00,/Users/guillaume_baquiast/Documents/tmp/data/B...,16/02/2021,Champions League,Europe
1,RB Leipzig v Liverpool,30186224.0,2021-02-16 20:00:00,16/02/2021 20:00,/Users/guillaume_baquiast/Documents/tmp/data/B...,16/02/2021,Champions League,Europe


# Load game

In [8]:
def retrieve_game_data(market_paths: List) -> Dict:
    """Collect one record per runner and per market book from historical files.

    Every file object produced by ``load_markets(market_paths)`` is replayed
    through a historical generator stream built from the notebook-level
    ``trading`` client and ``listener``.

    Returns:
        A dict mapping each column name to a list of values; all lists have
        the same length (one entry per runner of every market book seen).
    """
    column_names = (
        "event_name",
        "event_id",
        "market_type",
        "market_time",
        "open_date",
        "market_id",
        "publish_time",
        "runner_name",
        "ltp",
        "total_matched",
        "in_play",
    )
    data_dict = {column_name: [] for column_name in column_names}

    for file_obj in load_markets(market_paths):
        stream = trading.streaming.create_historical_generator_stream(
            file_path=file_obj,
            listener=listener,
        )

        # While patched, open(x, mode) simply hands back x; presumably this
        # lets the stream read the already-open file object given as file_path.
        with patch("builtins.open", lambda f, _: f):
            gen = stream.get_generator()

            for market_books in gen():
                for market_book in market_books:
                    definition = market_book.market_definition
                    for runner_idx, runner in enumerate(market_book.runners):
                        record = {
                            "event_name": definition.event_name,
                            "event_id": definition.event_id,
                            "market_type": definition.market_type,
                            "market_time": definition.market_time,
                            "open_date": definition.open_date,
                            "market_id": market_book.market_id,
                            "publish_time": market_book.publish_time,
                            # the runner's name is taken from the definition
                            # entry at the same position as the book runner
                            "runner_name": definition.runners[runner_idx].name,
                            "ltp": runner.last_price_traded,
                            "total_matched": runner.total_matched,
                            "in_play": market_book.inplay,
                        }
                        for column_name, value in record.items():
                            data_dict[column_name].append(value)

    return data_dict


def preprocess_game_data(data_dict: Dict) -> pd.DataFrame:
    """Turn the raw column dict into a frame with one row per tick and runner.

    Steps:
        1. Build a frame from ``data_dict`` and drop exact duplicate rows.
        2. Build every (publish_time) x (market_type, runner_name) combination
           and left-join the data onto it, so each runner has a row at every
           publish time. Columns other than the three join keys are NaN on
           the rows added by this step.
        3. Forward-fill ``ltp`` through time within each
           (market_type, runner_name) group.
        4. Add ``odd`` (``ltp - 1``) and ``proba`` (``1 / ltp``).

    Args:
        data_dict: mapping of column name to list of values; must contain
            ``publish_time``, ``market_type``, ``runner_name`` and ``ltp``.

    Returns:
        The frame sorted by market_type, runner_name and publish_time.
    """
    # Create dataframe from data dict
    df_game = pd.DataFrame(data_dict).drop_duplicates()

    # Ensure to have all information at each tick: cross join of the distinct
    # publish times with the distinct (market_type, runner_name) pairs, done
    # through a constant helper key.
    df_time_ids = df_game[["publish_time"]].drop_duplicates().assign(key=1)
    df_market_ids = df_game[["market_type", "runner_name"]].drop_duplicates().assign(key=1)

    df_ids = df_time_ids.merge(df_market_ids, on="key", how="inner").drop(columns="key")

    df_game = df_ids.merge(df_game, on=["publish_time", "market_type", "runner_name"], how="left")

    # Sort and fill na: sorting first makes the forward fill go through time.
    # GroupBy.ffill() replaces fillna(method="ffill"), which recent pandas
    # versions deprecate.
    df_game = df_game.sort_values(["market_type", "runner_name", "publish_time"])
    df_game["ltp"] = df_game.groupby(["market_type", "runner_name"])["ltp"].ffill()

    # Add odd and proba
    df_game["odd"] = df_game["ltp"] - 1
    df_game["proba"] = 1 / df_game["ltp"]

    return df_game

In [9]:
# Game to analyse
event_name = "Barcelona v Paris St-G"

# Data paths listed in the metadata for that event
market_paths = df_metadata.loc[df_metadata["event_name"] == event_name, "path"].tolist()

# Read the raw stream records, then build the per-tick frame
data_dict = retrieve_game_data(market_paths)
df_game = preprocess_game_data(data_dict)

In [10]:
# Inputs
x_lim_hours_start_game=(5, 2)
date_time_start_game = df_metadata[df_metadata["event_name"]==event_name]["market_time"].tolist()[0]


def _get_time_filter(
    date_time_start_game: str,
    x_lim_hours_start_game: Tuple,
) -> pd.Series:
    min_start_time = (np.datetime64(date_time_start_game) - np.timedelta64(x_lim_hours_start_game[0], 'h'))
    max_start_time = (np.datetime64(date_time_start_game) + np.timedelta64(x_lim_hours_start_game[1], 'h'))
    return (
        (df_proba_per_market["publish_time"] >= min_start_time)
        & (df_proba_per_market["publish_time"] <= max_start_time)
    )


def get_market_liquidity():
    return 

def get_vanilla_arbitrage_frequency(
    df_game: pd.DataFrame,
    date_time_start_game: str = None,
    x_lim_hours_start_game: Tuple = (5, 2),
    threshold_arbitrage: float = .02,
) -> pd.DataFrame:
    # Get time filter
    if date_time_start_game and x_lim_hours_start_game:
        time_filter = _get_time_filter(date_time_start_game, x_lim_hours_start_game)
    else:
        time_filter = [True] * df_proba_per_market.shape[0]
        
    df_proba_per_market = (
        df_game[time_filter]
        .groupby(["market_type", "market_id", "publish_time"])["proba"].sum()
        .reset_index()
    )
    
    # Compute liquidity statistics
    df_nb_ticks = (
        df_proba_per_market[time_filter]
        .groupby("market_type")["market_id"].count()
        .sort_values(ascending=False)
        .reset_index()
        .rename(columns={"market_id": "nb_ticks"})
    )
    max_nb_ticks = df_nb_ticks["nb_ticks"].max()
    df_nb_ticks["liquidity_pc"] = df_nb_ticks["nb_ticks"] / max_nb_ticks
    
    # Compute arbitrage frequency
    arbitrage_freq_dict = {
        "market_type": [],
        "arbitrage_freq": [],
    }
    
    for market_type in df_nb_ticks["market_type"]:
        market_filter = (df_proba_per_market["market_type"] == market_type)

        arbitrage_pc = (
            sum((df_proba_per_market.loc[market_filter, "proba"] - 1).abs() > threshold_arbitrage) / max_nb_ticks
        )

        arbitrage_freq_dict["market_type"].append(market_type)
        arbitrage_freq_dict["arbitrage_freq"].append(arbitrage_pc)
    
    arbitrage_freq_df = pd.DataFrame(arbitrage_freq_dict)
    
    return arbitrage_freq_df.merge(df_nb_ticks, on="market_type", how="left").sort_values("arbitrage_freq", ascending=False)

In [11]:
# Arbitrage statistics per market type for the selected event, limited to the
# default window around the market_time of its first metadata row
df_arbitrage = get_vanilla_arbitrage_frequency(
    df_game=df_game,
    date_time_start_game=df_metadata.loc[
        df_metadata["event_name"] == event_name, "market_time"
    ].tolist()[0],
)

df_arbitrage

NameError: name 'df_proba_per_market' is not defined

In [None]:
def get_all_scores_under_i_goals(nb_goals_max):
    """List the ``"i - k"`` score strings whose two numbers sum to at most ``nb_goals_max``.

    Scores are ordered by the first number, then by the second one. The list
    is empty when ``nb_goals_max`` is negative.
    """
    scores = []
    for first_goals in range(nb_goals_max + 1):
        # the second number may use whatever is left of the goal budget
        for second_goals in range(nb_goals_max - first_goals + 1):
            scores.append(f"{first_goals} - {second_goals}")
    return scores


def get_arbitrage_under_score(
    df_game: pd.DataFrame,
    date_time_start_game: str = None,
    x_lim_hours_start_game: Tuple = (5, 2),
    threshold_arbitrage: float = .02,
) -> pd.DataFrame:
    """Measure arbitrage between "Under i.5 Goals" and the Correct Score market.

    For each ``i`` from 0 to 8, the rows of two legs are pooled:

    * the ``Under {i}.5 Goals`` runner of the ``OVER_UNDER_{i}5`` market;
    * every ``CORRECT_SCORE`` runner whose name is not one of the scores
      returned by ``get_all_scores_under_i_goals(i)``.

    The ``proba`` of the pooled rows is summed per ``publish_time``. A tick is
    flagged when that sum differs from 1 by more than ``threshold_arbitrage``.

    NOTE(review): the previous version of this function could not run (it used
    names before they were defined) and returned nothing, so the output shape
    and the denominator below are a best-effort completion, to be confirmed.

    Args:
        df_game: frame with ``market_type``, ``runner_name``, ``publish_time``
            and ``proba`` columns. It is not modified.
        date_time_start_game: start of the game; when given together with
            ``x_lim_hours_start_game``, only rows published inside the window
            around it are used. All rows are used otherwise.
        x_lim_hours_start_game: ``(hours_before, hours_after)`` window.
        threshold_arbitrage: minimal absolute deviation from 1.

    Returns:
        A frame with one row per ``i`` for which both legs have rows:

        * ``market_type``: the ``OVER_UNDER_{i}5`` market of the pair
        * ``arbitrage_freq``: share of the pair's ticks that are flagged
    """
    # Get time filter
    if date_time_start_game is not None and x_lim_hours_start_game:
        start_game = np.datetime64(date_time_start_game)
        time_filter = (
            (df_game["publish_time"] >= start_game - np.timedelta64(x_lim_hours_start_game[0], 'h'))
            & (df_game["publish_time"] <= start_game + np.timedelta64(x_lim_hours_start_game[1], 'h'))
        )
        df_window = df_game[time_filter]
    else:
        df_window = df_game

    is_correct_score = df_window["market_type"] == "CORRECT_SCORE"

    # Compute arbitrage frequency
    arbitrage_freq_dict = {
        "market_type": [],
        "arbitrage_freq": [],
    }

    for i in range(0, 9):
        over_under_market = f"OVER_UNDER_{i}5"
        is_under_leg = (
            (df_window["market_type"] == over_under_market)
            & (df_window["runner_name"] == f"Under {i}.5 Goals")
        )
        is_score_leg = (
            is_correct_score
            & ~df_window["runner_name"].isin(get_all_scores_under_i_goals(i))
        )

        # A pair with a missing leg cannot sum to 1, so it is skipped
        if not (is_under_leg.any() and is_score_leg.any()):
            continue

        # Masks are used directly instead of writing a scratch column into
        # the caller's frame.
        proba_per_tick = (
            df_window[is_under_leg | is_score_leg]
            .groupby("publish_time")["proba"].sum()
        )
        arbitrage_pc = ((proba_per_tick - 1).abs() > threshold_arbitrage).mean()

        arbitrage_freq_dict["market_type"].append(over_under_market)
        arbitrage_freq_dict["arbitrage_freq"].append(arbitrage_pc)

    return pd.DataFrame(arbitrage_freq_dict)

In [None]:
# Scatter plot of liquidity_pc (x) against arbitrage_freq (y) from df_arbitrage
df_arbitrage.plot.scatter(x="liquidity_pc", y="arbitrage_freq")

In [None]:
# Same table, ordered by decreasing liquidity_pc
df_arbitrage.sort_values("liquidity_pc", ascending=False)

In [None]:
# Largest number of rows found for a single market_type.
# NOTE(review): df_proba_per_market is only assigned inside functions in the
# visible source; this cell presumably relies on a notebook-level variable
# created elsewhere - confirm.
df_proba_per_market.groupby("market_type")["market_type"].count().sort_values(ascending=False).max()

In [None]:
# NOTE(review): df_proba_filter is not defined anywhere in the visible source -
# confirm where it comes from before running this cell.
df_proba_per_market[df_proba_filter].plot(x="publish_time", y="proba")

In [None]:
# Element-wise "> 1" comparison over the filtered frame
df_proba_per_market[df_proba_filter] > 1