In [130]:
!pip3 install tabula-py

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [131]:
path = "/content/drive/Othercomputers/My MacBook Pro/data-science/projects/pools-dashboard/"


In [132]:
import pandas as pd
import tabula
import re
import numpy as np

In [133]:
def process(df):
    # rename
    # print('hi', df.columns.values)
    df = df.copy().reset_index()
    df_cols = list(df.columns)
    a = [re.sub('\\r|\&|\/|\.', '', x).strip() for x in df_cols]
    a = ["_".join(x.split()).lower() for x in a]
    df.columns = a

    # datatypes
    df['amount'] = df['amount'].str.split(' ').str[-1]
    df['payout_winnings'] = df['payout_winnings'].str.split(' ').str[-1]

    df['transaction_date_time'] = df['transaction_date_time'].str.split('\\r').str[0] + ' ' + \
                                  df['transaction_date_time'].str.split('\\r').str[1]
    df['draw_eventdate_time'] = df['draw_eventdate_time'].str.split('\\r').str[0] + ' ' + \
                                df['draw_eventdate_time'].str.split('\\r').str[1]
    df['status_receiptno'] = df['status_receiptno'].str.split('\\r').str[0]
    df['type'] = df['type'].str.split('\\r').str[0]

    df['amount'] = pd.to_numeric(df['amount'], errors='coerce')
    df['payout_winnings'] = pd.to_numeric(df['payout_winnings'], errors='coerce')
    df['transaction_date_time'] = pd.to_datetime(df['transaction_date_time'], format='%d %b %Y %I:%M %p')
    df['draw_eventdate_time'] = pd.to_datetime(df['draw_eventdate_time'], format='%d %b %Y %I:%M %p')

    def date_filter(df):
        if (df['type'] == 'Withdrawal') or (df['type'] == 'Deposit'):
            return df['transaction_date_time']
        else:
            return df['draw_eventdate_time']

    df['datetime'] = df.apply(date_filter, axis=1)

    df = df.drop(['channel'], axis=1)

    return df


def football_table(df):
    df = df.copy()
    df = df.loc[(df['type'] == 'Football') | (df['type'] == 'Deposit') | (df['type'] == 'Withdrawal')]
    df = df.sort_values(by=['datetime'])

    # df['returns']= df[['payout_winnings','amount']].max(axis=1)
    # df['returns']= np.where(df['payout_winnings']== 0,  df['returns']*-1, df['returns'])
    # df['returns'] = np.where(df['type'] == 'Withdrawal', df['returns'] * -1, df['returns'])

    def returns_filter(df):
        if (df['type'] == 'Football') and (df['status_receiptno'] == 'Settled'):
            return df['payout_winnings'] - df['amount']
        elif df['type'] == 'Withdrawal':
            return -df['amount']
        elif (df['type'] == 'Deposit'):
            return df['amount']

    df['returns'] = df.apply(returns_filter, axis=1)
    df['cum_sum'] = df['returns'].cumsum()
    df['perc_returns'] = df['returns'] / df['cum_sum'].shift(periods=1)

    # selection_details split into columns
    df['selection_details'] = df['selection_details'] \
        .apply(lambda x: re.sub('\\r|\&|\@|\([a-zA-Z]+\)', ' ', x).strip())
    df['live'] = np.where(df['selection_details'].str.contains('live'), 1, 0)

    df['league'] = df['selection_details'].str.split('-').str[0]
    df['match'] = df['selection_details'].str.split('-').str[1]
    df['home'] = df['match'].str.split('vs').str[0]
    df['away'] = df['match'].str.split('vs').str[1]
    df['bet_side'] = df['selection_details'].str.split('-').str[2]
    df[['league', 'match', 'bet_side', 'home', 'away']] = \
        df[['league', 'match', 'bet_side', 'home', 'away']].apply(lambda x: x.str.strip())
    df['sub_type'] = df['bet_side'].str.split(' ').str[0]
    df['bet_side'] = df['bet_side'].str.split(' ').str[1]
    df['odds'] = df['selection_details'].str.split(' ').str[-1]
    df['odds'] = pd.to_numeric(df['odds'], errors='coerce')
    df['imp_prob'] = 1 / df['odds']

    df['home'].fillna('Unknown', inplace=True)
    df['away'].fillna('Unknown', inplace=True)

    def bet_filter(df):
        if df['bet_side'] == 'Draw':
            return 'D'
        elif str(df.bet_side) in str(df.home):
            return 'H'
        else:
            return 'A'

    df['bet_side'] = df.apply(bet_filter, axis=1)

    df = df.drop(['selection_details'], axis=1)

    return df


def win_ratio_table(df):
    df = df.copy()
    df = df.loc[df['status_receiptno'] == 'Settled']
    df['win_ind'] = np.where(df['returns'] > 0, 1, 0)
    return df

def return_by_match_table(df):
    df = df.copy()
    df = df[['datetime', 'home', 'away', 'amount', 'payout_winnings', 'imp_prob', 'bet_side', 'win_ind', 'returns']]
    df['match_total_bet'] = df.groupby(['home', 'away', 'datetime'])['amount'].transform(np.sum)
    df['match_total_returns'] = df.groupby(['home', 'away', 'datetime'])['payout_winnings'].transform(np.sum)
    df['bet_per_match'] = df['amount'] / df['match_total_bet']
    df['imp_prob_weighted'] = df['imp_prob'] * df['bet_per_match']
    df['date'] = df['datetime'].dt.date

    # groupby
    # aggregate bets for each match, since same event
    bet_side_one_hot = pd.get_dummies(df['bet_side'])
    df = pd.concat([df, bet_side_one_hot], axis=1)
    # df[['bet_side_A', 'bet_side_D', 'bet_side_H']] = df[['bet_side_A', 'bet_side_D', 'bet_side_H']]\
    df[['A', 'D', 'H']] = df[['A', 'D', 'H']].apply(lambda x: x * df['bet_per_match'])

    df = df.drop(['match_total_returns', 'match_total_bet', 'bet_per_match', 'imp_prob'], axis=1)
    df['num_of_bets'] = 1
    df = df.groupby(['home', 'away', 'date']).sum()

    df['frac_bets_win'] = df['win_ind'] / df['num_of_bets']
    df['agg_win_ind'] = np.where((df['payout_winnings'] - df['amount']) > 0, 1, 0)
    a = df[['A', 'D', 'H']]
    df = df.assign(side_agg=a.idxmax(axis=1), max_side_bet=a.max(axis=1))
    df['exact_frac_win'] = df['max_side_bet'] * df['agg_win_ind']

    return df.reset_index()


def return_by_match_table_simple(df):
    df['match_name'] = df['home'] + ' v ' + df['away']
    df = df[['match_name', 'date', 'returns']]
    return df


In [134]:
def fetch_eg_csv(path):
    # note that file path starts from pools-dashboard directory
    df_list = tabula.read_pdf(path + "data/TransactionHistory.pdf", pages='all')
    df = pd.concat(df_list)
    return df

def convert_to_df(df_input):
    if df_input is not None:
        df = df_input.copy()
        df = process(df)
        df = football_table(df)
        df_wr = win_ratio_table(df)
        ## wr= charts.win_ratio(df_wr)
        # df_match = return_by_match_table(df_wr)
        # df_match2 = return_by_match_table_simple(df_match)
    return df


In [149]:
df = fetch_eg_csv(path)

Nov 25, 2022 6:55:47 AM org.apache.pdfbox.pdmodel.font.PDType1Font <init>



In [136]:
df.head()

Unnamed: 0,TRANSACTI\rON DATE &\rTIME,TYPE,CHANNEL,SELECTION / DETAILS,AMOUNT,DRAW /\rEVENT\rDATE &\rTIME,STATUS / RECEIPT\rNO.,PAYOUT /\rWINNINGS
0,11 Aug 2022\r12:22 AM,Withdrawal,Web,eNETS Transaction Fee,$ 0.80,-,-,-
1,11 Aug 2022\r12:22 AM,Deposit,Web,Deposit - eNETS,$ 110.80,-,-,-
2,11 Aug 2022\r02:56 AM,TOTO\rTOTO\rOrdinary,Web,"13,16,18,33,42,45\rSelf Pick",$ 1.00,11 Aug 2022\r06:30 PM,Settled\rB/0111562/0000001,$ 0.00
3,11 Aug 2022\r03:01 AM,Football,Web,English Premier\r(Outrights/Specials) - Englis...,$ 1.00,29 May 2023\r12:00 AM,Placed\rO/0115148/0000089,-
4,11 Aug 2022\r03:01 AM,Football,Web,English Premier\r(Outrights/Specials) - Englis...,$ 6.00,29 May 2023\r12:00 AM,Placed\rO/0115148/0000088,-


In [137]:
df_clean = process(df)
df_clean.head()

Unnamed: 0,index,transaction_date_time,type,selection_details,amount,draw_eventdate_time,status_receiptno,payout_winnings,datetime
0,0,2022-08-11 00:22:00,Withdrawal,eNETS Transaction Fee,0.8,NaT,-,,2022-08-11 00:22:00
1,1,2022-08-11 00:22:00,Deposit,Deposit - eNETS,110.8,NaT,-,,2022-08-11 00:22:00
2,2,2022-08-11 02:56:00,TOTO,"13,16,18,33,42,45\rSelf Pick",1.0,2022-08-11 18:30:00,Settled,0.0,2022-08-11 18:30:00
3,3,2022-08-11 03:01:00,Football,English Premier\r(Outrights/Specials) - Englis...,1.0,2023-05-29 00:00:00,Placed,,2023-05-29 00:00:00
4,4,2022-08-11 03:01:00,Football,English Premier\r(Outrights/Specials) - Englis...,6.0,2023-05-29 00:00:00,Placed,,2023-05-29 00:00:00


In [138]:
df_clean[df_clean['status_receiptno']=='Placed'].head()['selection_details']

3    English Premier\r(Outrights/Specials) - Englis...
4    English Premier\r(Outrights/Specials) - Englis...
5    English Premier\r(Outrights/Specials) - Englis...
6    Spanish League\r(Outrights/Specials) - Spanish...
7    Italian League (Outrights/Specials) -\rItalian...
Name: selection_details, dtype: object

In [188]:
def open_bets(df):
  df = df.copy()
  df = df.loc[(df['type'] == 'Football') | (df['type'] == 'Deposit') | (df['type'] == 'Withdrawal')]
  df = df.sort_values(by=['datetime'])
  df= df[df['status_receiptno']=='Placed']
  df = df[['draw_eventdate_time', 'selection_details','amount']]
  # make odds and win column
  df['odds'] = df[['selection_details']].apply(lambda x: x.str.split('@').str[-1])
  df['odds'] = pd.to_numeric(df['odds'])
  df['potential_win'] = df['odds'] * df['amount']
  # match column
  df['match'] = df['selection_details'] \
      .apply(lambda x: re.sub('\\r|\&|\@|\([a-zA-Z]+\)', ' ', x).strip()) 
  df['match'] = df['match'].str.split('-').str[1] + df['match'].str.split('-').str[2] 
  df['match'] =  df['match'].str.rsplit(' ',1).str[0]
  df= df.drop(columns=['selection_details'])
  # add totals
  df= df[['draw_eventdate_time','match','odds','amount','potential_win']]
  df['draw_eventdate_time'] = pd.to_datetime(df['draw_eventdate_time']).dt.normalize()
  df=df.reset_index(drop=True)
  return df

In [194]:
df_open = open_bets(df_clean)
df_open.head()

Unnamed: 0,draw_eventdate_time,match,odds,amount,potential_win
0,2023-05-29,English Premier 2022/2023 Championship Winne...,8.0,6.0,48.0
1,2023-05-29,English Premier 2022/2023 Championship Winne...,1.55,15.0,23.25
2,2023-05-29,English Premier 2022/2023 Championship Winne...,9.0,2.0,18.0
3,2023-05-29,English Premier 2022/2023 Championship Winne...,3.4,1.0,3.4
4,2023-06-05,Spanish League 2022/2023 Championship Winner...,1.9,4.0,7.6


  return reduction(axis=axis, out=out, **passkwargs)


31.849999999999998

In [141]:
df_closed[df_closed['status_receiptno']=='Placed']

Unnamed: 0,index,transaction_date_time,type,amount,draw_eventdate_time,status_receiptno,payout_winnings,datetime,returns,cum_sum,perc_returns,live,league,match,home,away,bet_side,sub_type,odds,imp_prob
4,4,2022-08-11 03:01:00,Football,6.0,2023-05-29,Placed,,2023-05-29,,,,0,English Premier (Outrights/Specials),English Premier 2022/2023,English Premier 2022/2023,Unknown,A,Championship,8.0,0.125
5,5,2022-08-11 03:01:00,Football,15.0,2023-05-29,Placed,,2023-05-29,,,,0,English Premier (Outrights/Specials),English Premier 2022/2023,English Premier 2022/2023,Unknown,A,Championship,1.55,0.645161
26,13,2022-08-27 12:21:00,Football,2.0,2023-05-29,Placed,,2023-05-29,,,,0,English Premier (Outrights/Specials),English Premier 2022/2023,English Premier 2022/2023,Unknown,A,Championship,9.0,0.111111
3,3,2022-08-11 03:01:00,Football,1.0,2023-05-29,Placed,,2023-05-29,,,,0,English Premier (Outrights/Specials),English Premier 2022/2023,English Premier 2022/2023,Unknown,A,Championship,3.4,0.294118
6,6,2022-08-11 03:13:00,Football,4.0,2023-06-05,Placed,,2023-06-05,,,,0,Spanish League (Outrights/Specials),Spanish League 2022/2023,Spanish League 2022/2023,Unknown,A,Championship,1.9,0.526316
7,7,2022-08-11 03:16:00,Football,1.0,2023-06-05,Placed,,2023-06-05,,,,0,Italian League (Outrights/Specials),Italian League 2022/2023,Italian League 2022/2023,Unknown,A,Championship,8.0,0.125


In [142]:
df_wr = win_ratio_table(df_clean)
df_wr.head()

KeyError: ignored