In [None]:
# default_exp data

# Data

> Load transaction data into a usable format

In [None]:
#hide
from nbdev.showdoc import *

## Overview

The data module can be used to load in transaction data sets from multiple sources into a format that the gamba library can use. This module contains a number of data loading functions for existing public data repositories so you can replicate or extend work right from the start.

## Reading CSV Files

This can be done using the `read_csv` method, which is a wrapper around the [pandas library's method](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html) of the same name.

In [None]:
#export
def read_csv(file, parse_dates=None, index_col=None, delimiter=",", dummy_data=False):
    """Load a delimited text file into a pandas dataframe.

    Thin wrapper around `pandas.read_csv` which forwards the most commonly
    needed options.

    Args:
        file: path or file-like object, passed straight to `pandas.read_csv`.
        parse_dates: columns to parse as datetimes. `None` (the default)
            means no date columns are parsed.
        index_col: column(s) to use as the row index, or `None` for a
            default integer index.
        delimiter: field separator, e.g. `","` for CSV or `"\\t"` for
            tab-separated files.
        dummy_data: currently unused; kept so existing callers that pass it
            keep working.

    Returns:
        The loaded `pandas.DataFrame`.
    """
    # Build the empty list per call rather than sharing one mutable default
    # object between every invocation of the function.
    if parse_dates is None:
        parse_dates = []

    return pd.read_csv(
        file, parse_dates=parse_dates, index_col=index_col, delimiter=delimiter
    )

This can be used to read a regular CSV file as you'd expect, but can also be used for tab-separated files - as some of [the transparency project](http://www.thetransparencyproject.org/download_index.php)'s data sets are.

## Setting Column Names

The gamba library's methods expect a dataframe with certain column names. The most important task after loading data as a [pandas dataframe](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html) is to set these column names according to the type of data the column contains. These names should match the following table for basic data;

| Column Name       | Description                                          |
|-------------------|------------------------------------------------------|
| ***player_id***   | a unique identifier for each player                  |
| ***bet_size***    | the size of the bet (in raw currency form, e.g. USD) |
| ***bet_time***    | the datetime the bet was placed                      |
| ***payout_size*** | the size of the payout (also in raw currency)        |

Advanced data sets may contain more information about each bet. These additional columns can be included using names from the table below. Note that methods in other parts of the library will reject dataframes which contain column names not in one of these two tables.

| Column Name        | Description                                                                                                                              |
|--------------------|------------------------------------------------------------------------------------------------------------------------------------------|
| ***payout_time***  | the [timestamp](https://www.unixtimestamp.com/) that the payout was paid                                                                 |
| ***decimal_odds*** | the [decimal odds](https://en.wikipedia.org/wiki/Odds#Decimal_odds) for the given bet                                                    |
| ***house_edge***   | the percentage taken by the house (value of 3 for 3% house edge)                                                                         |
| ***game_type***    | the game being played as a string e.g. 'coinflip', 'roulette' - doesn't have to be one of a fixed set but should be unique per game type |
| ***provider***     | the operator's name - this is useful for mixed operator datasets                                                                         |

## Loading Existing Datasets

Several public repositories provide transaction data that can be loaded by the gamba library (see Public Repositories in the menu). The data module contains methods for loading some of these sets into the correct format, which are used in the respective replications. If you're loading in a similar data set, feel free to explore the source code of these methods to see how it's done, and modify them for your own needs!

In [None]:
#export
def prepare_labrie_data(filename, savedir="labrie_individuals/", loud=False, year=2008):
    """Load the original LaBrie data and rename its columns to be compatible with the rest of the gamba library.

    The renamed data is also written to `gamba_ready_labrie_data_<year>.csv`
    in the current working directory.

    Args:
        filename: path to the raw data file. The 2008 layout is read as
            tab-separated, the 2007 layout as comma-separated; both must
            contain a `Date` column.
        savedir: currently unused - the per-individual split that would write
            into this directory is disabled (see the comment below).
        loud: if True, print the column names before and after renaming.
        year: which layout to expect, either 2008 (default) or 2007.

    Returns:
        The dataframe with gamba-compatible column names.

    Raises:
        ValueError: if `year` is not one of the supported layouts.
    """
    # Layout per supported year: extra reader options, the positional
    # gamba-compatible column names, and the file the result is saved to.
    layouts = {
        2008: (
            {"delimiter": "\t"},
            ["player_id", "bet_time", "bet_size", "payout_size", "bet_count"],
            "gamba_ready_labrie_data_2008.csv",
        ),
        2007: (
            {},
            [
                "player_id",
                "bet_time",
                "product_id",
                "bet_size",
                "payout_size",
                "bet_count",
            ],
            "gamba_ready_labrie_data_2007.csv",
        ),
    }

    # Fail early and clearly instead of carrying a None dataframe forward.
    if year not in layouts:
        raise ValueError(
            "unsupported year {!r}, expecting one of {}".format(year, sorted(layouts))
        )

    read_options, gamba_columns, output_name = layouts[year]
    labrie_data = pd.read_csv(filename, parse_dates=["Date"], **read_options)

    # rename columns to make them compatible with gamba.measures
    if loud:
        print("original columns:", list(labrie_data.columns))

    labrie_data.columns = gamba_columns
    labrie_data.to_csv(output_name, index=False)

    if loud:
        print("better columns:", list(labrie_data.columns))

    # split_individual_transactions(labrie_data, savedir)

    if loud:
        print("LaBrie data ready to use!")

    return labrie_data

In [None]:
#export
def prepare_braverman_data(filename, loud=False):
    "Loads the tab-separated Braverman and Shaffer data, renames the columns to be compatible with the rest of the gamba library, and saves the result to `gamba_ready_braverman_data.csv` in the current working directory. Returns the renamed dataframe."

    # positional replacements for the five columns of the raw file
    gamba_columns = ["player_id", "bet_time", "bet_size", "payout_size", "bet_count"]

    transactions = pd.read_csv(filename, delimiter="\t", parse_dates=["TimeDATE"])
    transactions.columns = gamba_columns

    if loud:
        print("Braverman data ready to use!")

    transactions.to_csv("gamba_ready_braverman_data.csv", index=False)

    return transactions

In [None]:
#export
def prepare_philander_data(filename, loud=False):
    "Loads in the tab-separated analytic data set of high-risk internet gamblers, adds a binary `self_exclude` column (1 where Sereason is 3, otherwise 0), removes the Sereason, random, and clustering columns as described in Philander's 2014 study, and renames the remaining columns."

    unused_columns = ['Sereason', 'random', 'p2clusteringactivity', 'p2clusterhalf1', 'p2clusterhalf2']
    # positional names for the columns left after the drop; self_exclude is appended last
    gamba_columns = [
        'player_id', 'country', 'gender', 'age', 'total_wagered', 'num_bets',
        'frequency', 'duration', 'bets_per_day', 'net_loss',
        'intensity', 'variability', 'frequency_1m', 'trajectory',
        'z_intensity', 'z_variability', 'z_frequency', 'z_trajectory',
        'self_exclude',
    ]

    raw_data = read_csv(filename, delimiter='\t')

    # apply the binary self-exclude technique (middle of page 5)
    exclusion_flag = np.where(raw_data['Sereason'] == 3, 1, 0)

    # assign() works on a copy, so the freshly loaded frame is left untouched
    philander_data = raw_data.assign(self_exclude=exclusion_flag).drop(columns=unused_columns)
    philander_data.columns = gamba_columns

    if loud:
        print(len(philander_data), 'players loaded')

    return philander_data

## Final Checks

It's good practice to check that your column names match those used by the gamba library, and make sure that no extra columns exist. The `check_data` method below can be given the dataframe, and it will raise an error if anything isn't as it should be;

In [None]:
#export
def check_data(dataframe):
    """Check that every column name of `dataframe` is one the gamba library recognises.

    Prints the validated column names when the check passes.

    Args:
        dataframe: any object exposing a `columns` iterable of names
            (typically a pandas dataframe).

    Raises:
        ValueError: if one or more column names are not in the list of
            acceptable names; the message lists the offending names.
    """
    acceptable_names = ['player_id','bet_size','bet_time','payout_size','payout_time','decimal_odds','house_edge','game_type','provider']

    invalid_names = [name for name in dataframe.columns if name not in acceptable_names]
    if invalid_names:
        # ValueError is still an Exception, so callers catching the old type keep working
        raise ValueError(
            'invalid column name(s) provided: {}, expecting only names from {}'.format(
                invalid_names, acceptable_names
            )
        )

    print(list(dataframe.columns))

## Plotting

The data module contains some basic visualisation methods which can be applied before any behavioural measures are calculated. This is useful for showing the distributions of player bet sizes, times, payouts, and so on.

In [None]:
#export
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
def plot_player_career(player_df, savename=None):
    """Creates a candlestick-style plot of a player's betting activity over the course of their career.

    This works best on regularly-spaced sequential data but can also provide
    insight into intra-session win/loss patterns.

    Each bet is drawn as a vertical segment starting where the previous one
    ended: a losing bet (payout smaller than the bet) drops by the bet size
    and is drawn in red, any other bet rises by the payout size and is drawn
    in green.

    Args:
        player_df: dataframe with `bet_size` and `payout_size` columns. The
            index label of each row is used as its x position, so a simple
            0..n-1 index gives evenly spaced bets.
        savename: optional file name; when given, the figure is also saved
            there (dpi 200, transparent background).

    Returns:
        The `matplotlib.pyplot` module, with the new figure as current figure.
    """
    plt.figure(figsize=[5, 3])

    running_total = 0
    for i, bet in player_df.iterrows():
        # Only the sizes are needed; the time columns are deliberately not
        # read so that data without the optional payout_time still plots.
        bet_size = bet["bet_size"]
        payout_size = bet["payout_size"]

        if payout_size < bet_size:  # the bet loses
            next_total = running_total - bet_size
            colour = "#d30505"
        else:
            next_total = running_total + payout_size
            colour = "#00B007"

        plt.plot(
            [i, i], [running_total, next_total], marker="o", color=colour, markersize=12
        )
        running_total = next_total

    plt.xlabel(None)
    if savename is not None:
        plt.savefig(savename, dpi=200, transparent=True)

    return plt

In [None]:
#export
def plot_player_career_split(player_df):
    """Plot a player's betting and payout trajectory on a single plot.

    Green indicates cumulative payouts (top) and red indicates cumulative
    bets (bottom). A cumulative value line is also plotted between the two:
    each bet is a vertical segment starting where the previous one ended,
    dropping by the bet size (red) when the payout is smaller than the bet
    and rising by the payout size (green) otherwise.

    Args:
        player_df: dataframe which must include both the `bet_size` and
            `payout_size` columns. The index label of each row is used as
            the x position of its segment.

    Returns:
        The `matplotlib.pyplot` module, with the new figure as current figure.
    """
    plt.figure()

    running_total = 0
    for i, bet in player_df.iterrows():
        # Only the sizes are needed; the time columns are deliberately not
        # read so that data without the optional payout_time still plots.
        bet_size = bet["bet_size"]
        payout_size = bet["payout_size"]

        if payout_size < bet_size:  # the bet loses
            next_total = running_total - bet_size
            colour = "red"
        else:
            next_total = running_total + payout_size
            colour = "green"

        plt.plot([i, i], [running_total, next_total], marker="o", color=colour)
        running_total = next_total

    bets = player_df["bet_size"].values
    payouts = player_df["payout_size"].values
    plt.plot(
        range(len(bets)),
        np.cumsum(-bets),
        marker="o",
        color="red",
        label="Cumulative Bets",
    )
    plt.plot(
        range(len(payouts)),
        np.cumsum(payouts),
        marker="o",
        color="green",
        label="Cumulative Payouts",
    )
    plt.legend()
    plt.xlim(0, len(bets) * 1.02)

    # symmetric y axis, wide enough for whichever cumulative total is larger
    y_limit = max([sum(bets), sum(payouts)])
    plt.ylim(-y_limit, y_limit)
    return plt

In [None]:
#export
def visualise_provider_dates(player_bets, providers, provider_labels=None):
    """
    Visualises the start and end dates of bets from one or more providers on a stacked gantt style plot.

    Args:
        player_bets: dataframe with `provider` and `bet_time` columns.
        providers: sequence of provider names to plot, one row per provider
            in the order given.
        provider_labels: optional sequence of y-axis labels, one per
            provider; defaults to the provider names themselves.

    Returns:
        The `matplotlib.pyplot` module, with the new figure as current figure.
    """
    plt.figure(figsize=[8,1.5])

    for i, provider in enumerate(providers):
        provider_bets = player_bets[player_bets['provider'] == provider]
        start = provider_bets["bet_time"].min()
        end = provider_bets["bet_time"].max()

        plt.plot([start, end], [i,i], label=provider)
        plt.scatter([start, end], [i,i], s=100, marker='|')

    # gca() always yields an axes, even when no provider was plotted
    ax = plt.gca()
    ax.xaxis.set_major_locator(mdates.MonthLocator([1,4,7,10])) # ticks on first day of each quarter
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%b %Y')) # 3 letter month abbreviation and year

    # one tick per provider, however many providers were given
    tick_labels = providers if provider_labels is None else provider_labels
    plt.yticks(range(len(providers)), tick_labels)
    plt.ylim(-0.5, len(providers)-0.5)
    plt.grid(True)
    return plt