In [1]:


import utils
import pandas as pd
import numpy as np
from tqdm import tqdm
import asyncio
import plotly.express as px 
import plotly.graph_objects as go
import statsmodels.formula.api as smf


%autoawait asyncio


/Users/gchebi/muun/python-clients/curiosity/curiosity/notebooks


In [54]:
# Analytics events exported from BigQuery with the query in analytics_query.sql.
# reset_index() materialises the row number as an 'index' column; later cells use that
# column to identify individual analytics rows.
analytics_csv_path = '~/Downloads/bq-results-20240320-165335-1710953676042.csv'
analytics_df = pd.read_csv(analytics_csv_path).reset_index()

In [44]:
# Retrieving swaps, user data and on-chain operations concurrently.
# `dfs` keeps the results in call order:
#   dfs[0] -> result of utils.get_swap_data for the swaps date window
#   dfs[1] -> result of utils.get_user_and_sessions_data; needed to map user_uuid to
#             user_id (swaps has the uuid reference, analytics has the id reference)
#   dfs[2] -> result of utils.get_onchain_operations for the same window as the swaps
# NOTE(review): whether the date bounds are inclusive depends on the `utils` helpers,
# which are not visible here -- confirm.
swaps_start_date = '2024-02-02'
swaps_end_date = '2024-02-28'
user_creation_start_date = '2020-01-01'
user_creation_end_date = '2024-05-01'

dfs = await asyncio.gather(
    utils.get_swap_data(swaps_start_date, swaps_end_date),
    utils.get_user_and_sessions_data(user_creation_start_date, user_creation_end_date),
    utils.get_onchain_operations(swaps_start_date, swaps_end_date)
)

In [45]:
# Simplifying the user table into an id <-> uuid lookup.
# The session column is dropped on a copy (instead of `del` on dfs[1]) so this cell can
# be re-run without raising KeyError; rows that only differed by session then collapse
# in drop_duplicates().
user_id_to_uuid = (
    dfs[1]
    .drop(columns=['session_uuid'], errors='ignore')
    .drop_duplicates()
)


In [46]:
# Attach user_id to every swap by joining the sender's uuid against the lookup table.
# This is pandas' default inner join, so swaps whose sender has no match are dropped.
swaps_df = dfs[0].merge(
    user_id_to_uuid,
    left_on='sender_uuid',
    right_on='user_uuid',
)

In [55]:
analytics_df.columns

Index(['index', 'user_id', 'ts', 's_new_op_confirmation-routingFeeInSat',
       's_new_op_confirmation-onchainFee',
       's_new_op_confirmation-sats_per_virtual_byte',
       's_new_op_confirmation-fee', 's_new_op_confirmation-amount',
       's_new_op_confirmation-type', 's_new_op_confirmation-outputAmountInSat',
       's_new_op_confirmation-total',
       's_new_op_confirmation-outputPaddingInSat', 's_new_op_error-type',
       'e_new_op_action-type', 's_new_op_loading-type'],
      dtype='object')

In [56]:
# Matching events within swaps.
# Note: PR 11328 is adding swap_uuid data into analytics. That should simplify this
# matching.

# This is the max lag to be considered after the swap timestamp
MAX_TIME_FORWARD_DIFF_IN_SECS = 1.5 * 60 * 60 # 1.5 hours
# This is the time window size to consider previous to the swap timestamp
MAX_TIME_BACKWARDS_DIFF_IN_SECS = 15

# Split the analytics table by user once, instead of re-filtering the whole table by
# user_id for every swap. Row order inside each per-user frame is the original order.
events_by_user = {
    analytics_user_id: user_events
    for analytics_user_id, user_events in analytics_df.groupby('user_id', sort=False)
}
# Used for swaps whose user has no analytics rows at all.
no_events_df = analytics_df.iloc[0:0]

# One entry per swap, in swaps_df row order.
error_lists = []
matched_analytics_row_lists = []
relevant_lists = []
action_lists = []
full_lists = []
for _, swap_row in tqdm(swaps_df.iterrows(), total=len(swaps_df)):
    swap_ts = swap_row['timestamp']
    user_id = swap_row['user_id']
    max_event_time = swap_ts + pd.Timedelta(MAX_TIME_FORWARD_DIFF_IN_SECS, unit='s')
    min_event_time = swap_ts - pd.Timedelta(MAX_TIME_BACKWARDS_DIFF_IN_SECS, unit='s')
    # NOTE(review): `ts` is read from the CSV without date parsing, so these bounds are
    # presumably compared as strings -- confirm the two timestamp formats sort alike.
    user_events_df = (
        events_by_user.get(user_id, no_events_df).
        query(f"ts >= '{min_event_time}'").
        query(f"ts <= '{max_event_time}'")
    )

    # We want to track several lists of ordered events for the given swap
    error_list = []
    matched_analytics_row_list = []
    action_list = []
    full_list = []
    relevant_list = []
    for _, analytics_row in user_events_df.iterrows():
        error_value = analytics_row['s_new_op_error-type']
        action_value = analytics_row['e_new_op_action-type']
        row_ix = analytics_row['index']
        loading_value = analytics_row['s_new_op_loading-type']
        # Errors, loading screens and confirmation screens feed the "relevant" list;
        # user actions only appear in the action/full lists.
        if not pd.isna(error_value):
            error_list.append(error_value)
            full_list.append(error_value)
            relevant_list.append(error_value)
        if not pd.isna(action_value):
            action_list.append(action_value)
            full_list.append(action_value)
        if not pd.isna(loading_value):
            relevant_list.append(loading_value)
            full_list.append(loading_value)
        # A confirmation screen is recorded by its analytics row index, not by a name.
        if not pd.isna(analytics_row['s_new_op_confirmation-total']):
            matched_analytics_row_list.append(row_ix)
            full_list.append(row_ix)
            relevant_list.append(row_ix)
    error_lists.append(error_list)
    matched_analytics_row_lists.append(matched_analytics_row_list)
    relevant_lists.append(relevant_list)
    action_lists.append(action_list)
    full_lists.append(full_list)
    

25593it [01:20, 317.63it/s]


In [57]:
swaps_df['errors'] = error_lists
swaps_df['matched_analytics_fees_index'] = matched_analytics_row_lists
swaps_df['relevant'] = relevant_lists
swaps_df['actions'] = action_lists
swaps_df['full'] = full_lists


In [58]:
def get_relevant_event(event_list):
    """Pick the event that tells what happened right after a swap was started.

    `event_list` holds the ordered events matched to one swap. The "relevant" event is
    the element that immediately follows the first 'submarine_swap' entry (string
    entries are compared case-insensitively).

    Returns one of:
    - the element following the first 'submarine_swap'; in this notebook that is either
      the analytics-table index of a confirmation screen (which contains the onchain
      fees) or the error for which the user could not see the confirmation screen;
    - 'no_events' when the list is empty;
    - 'no_submarine_swap' when the list has no 'submarine_swap' entry;
    - 'submarine_swap_without_next_event' when the first 'submarine_swap' is the last
      element, or is directly followed by another 'submarine_swap' or a 'to_address'
      entry (i.e. we don't know what happened).
    """
    def _is_one_of(event, names):
        # Non-string entries (analytics row indices) never match a name.
        return isinstance(event, str) and event.lower() in names

    if len(event_list) == 0:
        return 'no_events'

    swap_position = next(
        (position for position, event in enumerate(event_list)
         if _is_one_of(event, ('submarine_swap',))),
        None,
    )
    if swap_position is None:
        return 'no_submarine_swap'

    # Zero- or one-element slice holding whatever comes right after the swap entry.
    following = event_list[swap_position + 1:swap_position + 2]
    if not following or _is_one_of(following[0], ('submarine_swap', 'to_address')):
        return 'submarine_swap_without_next_event'

    # Happy case
    return following[0]
    

In [59]:
swaps_df['relevant_event'] = swaps_df['relevant'].map(get_relevant_event)

In [60]:
# Join every swap with the analytics row whose 'index' equals its relevant event; swaps
# whose relevant event is not an analytics index get NaN analytics columns (left join).
# Helper columns are dropped on copies rather than with `del`, so re-running this cell --
# or the matching cell, which still needs analytics_df['user_id'] -- keeps working.
swap_helper_columns = ['errors', 'matched_analytics_fees_index', 'relevant', 'actions',
                       'full']
merged = pd.merge(
    swaps_df.drop(columns=swap_helper_columns),
    # user_id is removed here so it does not clash with the swaps' own user_id column.
    analytics_df.drop(columns=['user_id']),
    left_on='relevant_event',
    right_on='index',
    how='left',
)


In [61]:
# Getting onchain payments only from the selected group of users.
# The lookup table is unique per row, not necessarily per user_id, so it is
# de-duplicated on user_id first: a repeated user_id would otherwise duplicate that
# user's operations in the join.
onchain_operations = pd.merge(dfs[2], user_id_to_uuid[['user_id']].drop_duplicates())

In [62]:
# Concatenating + sorting swaps and onchain operation
merged = (pd.concat([merged, onchain_operations], ignore_index=True).sort_values
         (['user_id', 'timestamp']))


In [63]:
## Grouping swaps: users often try again when a swap fails, and all those attempts
# should count as one swap group. A row joins the current group only when it belongs to
# the same user as the previous row AND that previous row was a CANCELLED swap at most
# TOLERANCE_IN_SECS earlier; any other row opens a new group.
TOLERANCE_IN_SECS = 5 * 60
retry_window = pd.Timedelta(seconds=TOLERANCE_IN_SECS)
merged.sort_values(['user_id', 'timestamp'], inplace=True)

group_ixs = []
group_ix = 0
last_user = None
last_cancelled_ts = None
for _, row in merged.iterrows():
    same_user = last_user is not None and not (row.user_id != last_user)
    is_retry = (same_user and last_cancelled_ts is not None
                and row.timestamp - last_cancelled_ts <= retry_window)
    if not is_retry:
        group_ix += 1
    group_ixs.append(group_ix)
    last_user = row.user_id

    # Only a cancelled row keeps the retry window open for the next row.
    last_cancelled_ts = row.timestamp if row.status == 'CANCELLED' else None

merged['group_ix'] = group_ixs


In [64]:
# Throwing away groups without swaps: keep the group ids that have at least one
# non-missing swap_uuid.
swaps_per_group = merged.groupby('group_ix')['swap_uuid'].count()
selected_groups = swaps_per_group[swaps_per_group > 0].index

In [65]:
(merged.query('relevant_event == \'insufficient_funds\'').status == 'CANCELLED').mean()


0.9881138975966562

In [66]:
merged = merged[merged.group_ix.isin(selected_groups)]

Questions:
1. What's our data coverage? What's the match accuracy?
2. Create a funnel structure to describe this data.
3. Do users try to fix the insufficient_funds swap?
4. What's the probability that a swap will be submitted? (using fees as features)

1. What's our swap data coverage?

In [67]:
only_swaps = merged[~pd.isna(merged.swap_uuid)].copy()

In [68]:
only_swaps['is_matched'] = only_swaps['s_new_op_confirmation-amount'] == only_swaps['amount']


In [69]:
only_swaps['is_covered'] = np.logical_or(only_swaps['is_matched'], only_swaps['relevant_event']
                                         == 
                                     'insufficient_funds')

In [70]:
only_swaps.is_covered.mean()
# Note: there should be ways to improve this number

0.8711366389247059

In [71]:
## Correcting relevant event when the amounts do not match: if both the confirmation
# screen amount and the swap amount are present but differ, the matched analytics row
# does not correspond to this swap, so the event is relabelled 'unmatched_amounts'.
# Computed as one boolean mask instead of a row-by-row apply over the whole frame.
confirmation_amount = merged['s_new_op_confirmation-amount']
swap_amount = merged['amount']
amounts_differ = (
    confirmation_amount.notna()
    & swap_amount.notna()
    & (confirmation_amount != swap_amount)
)
merged.loc[amounts_differ, 'relevant_event'] = 'unmatched_amounts'

In [72]:
(merged.relevant_event == 'unmatched_amounts').mean()

0.0034541644027012344

2. Defining a funnel structure

User creates a swap
    1. First swap involves an onchain transaction
        1.1. Operation is successful in the first attempt.
        1.2. Operation is successful after more than one attempt (how many?).
            1.2.1. User changed to onchain.
            1.2.2. User decreased the amount so that the operation did not need an 
            onchain tx.
            1.2.3. User decreased the amount and the swap still needed an onchain tx.
            1.2.4. None of the above.
        1.3. Operation failed (how many attempts?)
            1.3.1. User only had 'insufficient_funds' errors, so they never got to see
            the confirmation screen.
            1.3.2. User saw both the 'insufficient_funds' error and the confirmation 
            screen, but did not hit submit.
            1.3.3. User only saw the confirmation screen, but did not hit submit.
            1.3.4. None of the above (can happen if there is a data coverage problem)
    2. First swap does not involve an onchain transaction 
        Same categories as 1.

In [74]:
# Adding indexes within groups
merged['within_group_ix'] = merged.groupby('group_ix')['swap_uuid'].cumcount()

In [75]:
# Checking that all first operations were swaps
pd.isna(merged.query('within_group_ix == 0').swap_uuid).mean()

0.0

In [76]:
# Total number of groups
total = merged.query('within_group_ix == 0').shape[0]
print(f'Total number of swap operation groups: {total}')

Total number of swap operation groups: 16887


In [77]:
num_1 = merged.query('within_group_ix == 0').query('debt_type != \'LEND\'').shape[0]
print(f'Total number of swap operations groups that involved an onchain tx: {num_1}. '
      f'This represents {format(num_1/total, ".1%")} of all cases')

Total number of swap operations groups that involved an onchain tx: 10193. This represents 60.4% of all cases


Non-lend cases

In [78]:
# Groups whose first attempt was not a LEND swap (reported below as the groups that
# involved an onchain tx).
first_attempts = merged.query('within_group_ix == 0')
onchain_group_ixs = first_attempts.query('debt_type != \'LEND\'')['group_ix']
merged_onchain = merged[merged.group_ix.isin(onchain_group_ixs)].copy()

# Per group: number of rows (attempts), and whether any row is FINISHED or has no status
# at all.
merged_onchain_grouped = merged_onchain.groupby('group_ix').agg(
    num_attempts=pd.NamedAgg('within_group_ix', lambda ixs: ixs.max() + 1),
    was_finished=pd.NamedAgg(
        'status',
        lambda statuses: any(pd.isna(s) or s == 'FINISHED' for s in statuses)),
)

finished_onchain = merged_onchain_grouped.query('was_finished == True')
num_1_1 = finished_onchain.query('num_attempts == 1').shape[0]
num_1_2 = finished_onchain.query('num_attempts > 1').shape[0]
num_1_3 = merged_onchain_grouped.query('was_finished == False').shape[0]

print(f'Total number of cases of submitted swaps in one attempt: {num_1_1}. This '
      f'corresponds to {format(num_1_1 / num_1, ".1%")} of the operation groups that '
      f'required an onchain tx.' )

print(f'Total number of cases of submitted swaps in more than one attempt: {num_1_2}. '
      f'This corresponds to {format(num_1_2 / num_1, ".1%")} of the operation groups that '
      f'required an onchain tx.' )

print(f'Total number of cases of cancelled swaps: {num_1_3}. '
      f'This corresponds to {format(num_1_3 / num_1, ".1%")} of the operation groups '
      f'that required an onchain tx.' )

Total number of cases of submitted swaps in one attempt: 4857. This corresponds to 47.7% of the operation groups that required an onchain tx.
Total number of cases of submitted swaps in more than one attempt: 1663. This corresponds to 16.3% of the operation groups that required an onchain tx.
Total number of cases of cancelled swaps: 3673. This corresponds to 36.0% of the operation groups that required an onchain tx.


In [79]:
# Break down the onchain groups that were submitted after more than one attempt by
# comparing the first and the last row of each group.
groups_1_2 = (merged_onchain_grouped
              .query('num_attempts > 1')
              .query('was_finished == True')
              .index)
merged_1_2 = merged[merged.group_ix.isin(groups_1_2)].copy()

groups_1_2_1 = []  # last row has an onchain_fee
groups_1_2_2 = []  # amount decreased and the last row is a LEND
groups_1_2_3 = []  # amount decreased and the last row is not a LEND

for group_ix, group_df in merged_1_2.groupby('group_ix'):
    first_row = group_df.iloc[0]
    last_row = group_df.iloc[-1]
    if not pd.isna(last_row['onchain_fee']):
        groups_1_2_1.append(group_ix)
    elif last_row['amount'] < first_row['amount']:
        if last_row['debt_type'] == 'LEND':
            groups_1_2_2.append(group_ix)
        else:
            groups_1_2_3.append(group_ix)

num_1_2_1 = len(groups_1_2_1)
num_1_2_2 = len(groups_1_2_2)
num_1_2_3 = len(groups_1_2_3)
# Everything that fits none of the three categories above.
num_1_2_4 = num_1_2 - num_1_2_1 - num_1_2_2 - num_1_2_3

print(f'Total number of cases where the user switched to onchain: {num_1_2_1}. This '
      f'corresponds to {format(num_1_2_1 / num_1_2, ".1%")} of the operation '
      f'groups that required more than one attempt to submit.' )

print(f'Total number of cases where the user lowered the amount so that the operation '
      f'become a LEND: {num_1_2_2}. This corresponds to {format(num_1_2_2 / num_1_2, ".1%")} of the operation '
      f'groups that required more than one attempt to submit.' )

print(f'Total number of cases where the user lowered the amount but the operation did '
      f'not become a LEND: {num_1_2_3}. This corresponds to {format(num_1_2_3 / num_1_2, ".1%")} of the operation '
      f'groups that required more than one attempt to submit.' )


Total number of cases where the user switched to onchain: 165. This corresponds to 9.9% of the operation groups that required more than one attempt to submit.
Total number of cases where the user lowered the amount so that the operation become a LEND: 187. This corresponds to 11.2% of the operation groups that required more than one attempt to submit.
Total number of cases where the user lowered the amount but the operation did not become a LEND: 905. This corresponds to 54.4% of the operation groups that required more than one attempt to submit.


In [80]:
# Break down the onchain groups that were never submitted by the kinds of relevant
# events seen across their attempts.
groups_1_3 = merged_onchain_grouped.query('was_finished == False').index
merged_1_3 = merged[merged.group_ix.isin(groups_1_3)].copy()

groups_1_3_1 = []  # only insufficient_funds errors
groups_1_3_2 = []  # insufficient_funds errors and confirmation screens
groups_1_3_3 = []  # only confirmation screens

# groupby visits each group once; filtering the frame per group id would rescan it
# for every group.
for group_ix, group_df in merged_1_3.groupby('group_ix'):
    relevant_events = group_df['relevant_event']
    num_insufficient_funds = sum(1 for x in relevant_events if x == 'insufficient_funds')
    # A confirmation screen is stored as an analytics row index. numpy integers are
    # accepted too, so they are not counted as "others".
    num_confirmation_screen = sum(
        1 for x in relevant_events if isinstance(x, (int, np.integer)))
    num_others = len(relevant_events) - num_confirmation_screen - num_insufficient_funds
    if num_others > 0:
        # Any other kind of event puts the group in the residual category (1.3.4).
        continue
    if num_confirmation_screen == 0:
        groups_1_3_1.append(group_ix)
    elif num_insufficient_funds == 0:
        groups_1_3_3.append(group_ix)
    else:
        groups_1_3_2.append(group_ix)

num_1_3_1 = len(groups_1_3_1)
num_1_3_2 = len(groups_1_3_2)
num_1_3_3 = len(groups_1_3_3)
num_1_3_4 = num_1_3 - num_1_3_1 - num_1_3_2 - num_1_3_3

print(f'Total number of cases where the user only saw the insufficient_funds_error: '
      f'{num_1_3_1}. This corresponds to {format(num_1_3_1 / num_1_3, ".1%")} of the '
      f'operation groups that were never submitted.' )   

print(f'Total number of cases where the user saw the insufficient_funds_error and a '
      f'confirmation screen: {num_1_3_2}. '
      f'This corresponds to {format(num_1_3_2 / num_1_3, ".1%")} of the '
      f'operation groups that were never submitted.' )   

print(f'Total number of cases where the user only saw the confirmation screen: '
      f'{num_1_3_3}. '
      f'This corresponds to {format(num_1_3_3 / num_1_3, ".1%")} of the '
      f'operation groups that were never submitted.' )   



Total number of cases where the user only saw the insufficient_funds_error: 1423. This corresponds to 38.7% of the operation groups that were never submitted.
Total number of cases where the user saw the insufficient_funds_error and a confirmation screen: 265. This corresponds to 7.2% of the operation groups that were never submitted.
Total number of cases where the user only saw the confirmation screen: 1449. This corresponds to 39.5% of the operation groups that were never submitted.


Lend cases

In [81]:
num_2 = total - num_1

In [82]:
# All groups that are not in onchain_group_ixs are treated as lend groups.
merged_lend = merged[~merged.group_ix.isin(onchain_group_ixs)].copy()

# Per group: number of rows (attempts), and whether any row is FINISHED or has no status
# at all.
merged_lend_grouped = merged_lend.groupby('group_ix').agg(
    num_attempts=pd.NamedAgg('within_group_ix', lambda ixs: ixs.max() + 1),
    was_finished=pd.NamedAgg(
        'status',
        lambda statuses: any(pd.isna(s) or s == 'FINISHED' for s in statuses)),
)

finished_lend = merged_lend_grouped.query('was_finished == True')
num_2_1 = finished_lend.query('num_attempts == 1').shape[0]
num_2_2 = finished_lend.query('num_attempts > 1').shape[0]
num_2_3 = merged_lend_grouped.query('was_finished == False').shape[0]

print(f'Total number of cases of submitted swaps in one attempt: {num_2_1}. This '
      f'corresponds to {format(num_2_1 / num_2, ".1%")} of the lend operation groups')

print(f'Total number of cases of submitted swaps in more than one attempt: {num_2_2}. '
      f'This corresponds to {format(num_2_2 / num_2, ".1%")} of the lend operation '
      f'groups.')

print(f'Total number of cases of cancelled swaps: {num_2_3}. '
      f'This corresponds to {format(num_2_3 / num_2, ".1%")} of the lend operation groups.' )

Total number of cases of submitted swaps in one attempt: 4568. This corresponds to 68.2% of the lend operation groups
Total number of cases of submitted swaps in more than one attempt: 880. This corresponds to 13.1% of the lend operation groups.
Total number of cases of cancelled swaps: 1246. This corresponds to 18.6% of the lend operation groups.


In [83]:
# Break down the lend groups that were never submitted by the kinds of relevant events
# seen across their attempts.
groups_2_3 = merged_lend_grouped.query('was_finished == False').index
merged_2_3 = merged[merged.group_ix.isin(groups_2_3)].copy()

groups_2_3_1 = []  # only insufficient_funds errors
groups_2_3_2 = []  # insufficient_funds errors and confirmation screens
groups_2_3_3 = []  # only confirmation screens

# groupby visits each group once; filtering the frame per group id would rescan it
# for every group.
for group_ix, group_df in merged_2_3.groupby('group_ix'):
    relevant_events = group_df['relevant_event']
    num_insufficient_funds = sum(1 for x in relevant_events if x == 'insufficient_funds')
    # A confirmation screen is stored as an analytics row index. numpy integers are
    # accepted too, so they are not counted as "others".
    num_confirmation_screen = sum(
        1 for x in relevant_events if isinstance(x, (int, np.integer)))
    num_others = len(relevant_events) - num_confirmation_screen - num_insufficient_funds
    if num_others > 0:
        # Any other kind of event puts the group in the residual category (2.3.4).
        continue
    if num_confirmation_screen == 0:
        groups_2_3_1.append(group_ix)
    elif num_insufficient_funds == 0:
        groups_2_3_3.append(group_ix)
    else:
        groups_2_3_2.append(group_ix)

num_2_3_1 = len(groups_2_3_1)
num_2_3_2 = len(groups_2_3_2)
num_2_3_3 = len(groups_2_3_3)
num_2_3_4 = num_2_3 - num_2_3_1 - num_2_3_2 - num_2_3_3

print(f'Total number of cases where the user only saw the insufficient_funds_error: '
      f'{num_2_3_1}. '
      f'This corresponds to {format(num_2_3_1 / num_2_3, ".1%")} of the '
      f'lend operation '
      f'groups that were never submitted.' )   

print(f'Total number of cases where the user saw the insufficient_funds_error and a '
      f'confirmation screen: {num_2_3_2}. '
      f'This corresponds to {format(num_2_3_2 / num_2_3, ".1%")} of the '
      f'lend operation '
      f'groups that were never submitted.' )   

print(f'Total number of cases where the user only saw the confirmation screen: '
      f'{num_2_3_3}. '
      f'This corresponds to {format(num_2_3_3 / num_2_3, ".1%")} of the '
      f'lend operation '
      f'groups that were never submitted.' )   



Total number of cases where the user only saw the insufficient_funds_error: 556. This corresponds to 44.6% of the lend operation groups that were never submitted.
Total number of cases where the user saw the insufficient_funds_error and a confirmation screen: 20. This corresponds to 1.6% of the lend operation groups that were never submitted.
Total number of cases where the user only saw the confirmation screen: 447. This corresponds to 35.9% of the lend operation groups that were never submitted.


In [86]:
# Node id -> label shown in the diagram. Ids spell the path through the funnel: "0" is
# the root, "1"/"2" the first split, and each extra digit goes one level deeper.
label_to_desc = {
    "0": 'All swaps',
    "1": 'Swaps that required an onchain tx',
    "2": 'Swaps that did not require an onchain tx',
    "11": 'Submitted in one attempt',
    "12": 'Submitted in more than one attempt',
    "13": 'Never submitted',
    "21": 'Submitted in one attempt',
    "22": 'Submitted in more than one attempt',
    "23": 'Never submitted',
    "121": 'Changed to onchain',
    "122": 'Decreased the amount so <br> that op became a lend',
    "123": 'Decreased the amount but <br> still required an onchain tx',
    "124": 'Others',
    "131": 'Only got insufficient_funds errors',
    "132": 'Got insufficient_funds error <br> and saw confirmation screen',
    "133": 'Only saw confirmation screen',
    "134": 'Others',
    "231": 'Only got insufficient_funds errors',
    "232": 'Got insufficient_funds error <br> and saw confirmation screen',
    "233": 'Only saw confirmation screen',
    "234": 'Others',
    # No third-level nodes are defined for "11", "21" and "22".
}
label_to_ix_dict = {label: position for position, label in enumerate(label_to_desc)}

# Horizontal position of each funnel level and the vertical range the nodes of a level
# are spread over.
COL_1_X = 0.15
COL_2_X = 0.45
COL_3_X = 1
MAX_Y = 1.15
MIN_Y = -0.1
y_span = MAX_Y - MIN_Y

# (source, target, count, parent count), listed in the order the nodes are spread from
# MIN_Y to MAX_Y within their column.
second_level = [
    ("1", "11", num_1_1, num_1),
    ("1", "12", num_1_2, num_1),
    ("1", "13", num_1_3, num_1),
    ("2", "21", num_2_1, num_2),
    ("2", "22", num_2_2, num_2),
    ("2", "23", num_2_3, num_2),
]
third_level = [
    ("12", "121", num_1_2_1, num_1_2),
    ("12", "122", num_1_2_2, num_1_2),
    ("12", "123", num_1_2_3, num_1_2),
    ("12", "124", num_1_2_4, num_1_2),
    ("13", "131", num_1_3_1, num_1_3),
    ("13", "132", num_1_3_2, num_1_3),
    ("13", "133", num_1_3_3, num_1_3),
    ("13", "134", num_1_3_4, num_1_3),
    ("23", "231", num_2_3_1, num_2_3),
    ("23", "232", num_2_3_2, num_2_3),
    ("23", "233", num_2_3_3, num_2_3),
    ("23", "234", num_2_3_4, num_2_3),
]

# Each transition is (source, target, count, share of parent, node x, node y).
transitions = [
    ("0", "1", num_1, num_1/total, COL_1_X, 0.3),
    ("0", "2", num_2, num_2/total, COL_1_X, 0.8),
]
for column_x, level in ((COL_2_X, second_level), (COL_3_X, third_level)):
    last_slot = len(level) - 1
    transitions += [
        (source, target, count, count / parent_count, column_x,
         MIN_Y + slot/last_slot * y_span)
        for slot, (source, target, count, parent_count) in enumerate(level)
    ]

node_to_prop = {target: format(share, '.1%')
                for _, target, _, share, _, _ in transitions if share is not None}
node_to_x = {target: x for _, target, _, _, x, _ in transitions}
node_to_x["0"] = 0.01

node_to_y = {target: y for _, target, _, _, _, y in transitions}
node_to_y["0"] = 0.5

# Append the share of the parent node to every label that has one (the root has none).
labels_with_props = {
    label: (f'{desc} ({node_to_prop[label]})' if label in node_to_prop else desc)
    for label, desc in label_to_desc.items()
}

xs = [node_to_x[node] for node in label_to_desc]
ys = [node_to_y[node] for node in label_to_desc]

sankey_nodes = dict(
    pad=15,
    thickness=20,
    x=xs,
    y=ys,
    line=dict(color="black", width=0.5),
    label=list(labels_with_props.values()),
    color='blue',
)
sankey_links = dict(
    source=[label_to_ix_dict[tup[0]] for tup in transitions],
    target=[label_to_ix_dict[tup[1]] for tup in transitions],
    value=[tup[2] for tup in transitions],
)
fig = go.Figure(data=[go.Sankey(node=sankey_nodes, link=sankey_links)])

fig.update_layout(title_text="Outgoing lightning payments user flow",
                  template='plotly_white', font_size=10)
fig.show()




3. Analyze probability of submission given fee and fee rate

In [87]:
conf_status = merged[~pd.isna(merged['s_new_op_confirmation-fee'])][['amount', 
                                                       's_new_op_confirmation-fee', 
                                                       'status']].copy()


In [88]:
conf_status['relative_fee'] = (conf_status['s_new_op_confirmation-fee'] / 
                            conf_status['amount'])

In [89]:
# Binary target for the regression: 1 when the swap was CANCELLED, 0 for any other
# (or missing) status.
conf_status['is_cancelled'] = (conf_status.status == 'CANCELLED').astype(int)
# The model formula refers to the confirmation fee simply as `fee`.
conf_status = conf_status.rename(columns={'s_new_op_confirmation-fee': 'fee'})

In [90]:
logreg = smf.logit('is_cancelled ~ relative_fee + amount + fee', data = conf_status).fit()
print(logreg.summary())


Optimization terminated successfully.
         Current function value: 0.497912
         Iterations 9
                           Logit Regression Results                           
Dep. Variable:           is_cancelled   No. Observations:                14728
Model:                          Logit   Df Residuals:                    14724
Method:                           MLE   Df Model:                            3
Date:                Mon, 25 Mar 2024   Pseudo R-squ.:                 0.07580
Time:                        14:28:36   Log-Likelihood:                -7333.2
converged:                       True   LL-Null:                       -7934.7
Covariance Type:            nonrobust   LLR p-value:                1.763e-260
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept       -1.6927      0.028    -61.220      0.000      -1.747      -1.639
relative_fee     0.0161

In [141]:
# Predicted probability of cancellation as a function of the fee, one curve per amount.
amounts = [1, 5, 10, 50, 100, 1000, 10000]
fees = range(1, 3000)

curves = {}
for amount in amounts:
    # Grid of every fee for this amount, with the derived feature the model expects.
    fee_grid = pd.DataFrame({'amount': amount, 'fee': fees}).assign(
        relative_fee=lambda grid: grid['fee'] / grid['amount'])
    prediction_frame = logreg.get_prediction(fee_grid).summary_frame(alpha=0.05)
    curves[f'amount = {amount}'] = prediction_frame.predicted.values

results = pd.DataFrame(curves)

# One line per amount column, plotted against the fee grid.
fig = px.line(results, x=fees, y=results.columns)
fig.update_layout(
    template='plotly_white',
    xaxis_title="Fee",
    yaxis_title="Predicted probability of cancel",
    yaxis_range=[0,1]
)

fig.show()


In [145]:
conf_status

Unnamed: 0,amount,fee,status,relative_fee,is_cancelled
13841,97365.0,5100.0,FINISHED,0.05238,0
15739,109039.0,6201.0,FINISHED,0.05687,0
25189,10.0,0.0,FAILED,0.00000,0
25146,10000.0,15.0,FAILED,0.00150,0
25191,1000.0,1.0,FAILED,0.00100,0
...,...,...,...,...,...
25460,20000.0,3318.0,FINISHED,0.16590,0
25268,1.0,2997.0,PAYED,2997.00000,0
25263,24987.0,12.0,FINISHED,0.00048,0
25341,2500.0,2454.0,CANCELLED,0.98160,1


In [152]:
(conf_status.status == 'CANCELLED').mean()


0.24069860813704497

In [153]:
# Scale every observed fee by a multiplier between 0 and 1 and record the mean predicted
# cancellation probability at each multiplier.
epss = np.linspace(0, 1, 1000)
mean_preds = []
for eps in epss:
    scaled_fee = conf_status['fee'] * eps
    mod_conf_status = conf_status.assign(
        fee=scaled_fee,
        # relative_fee must follow the scaled fee, since the model uses both.
        relative_fee=scaled_fee / conf_status['amount'],
    )
    mean_preds.append(logreg.predict(mod_conf_status).mean())

In [158]:
mean_preds[0], mean_preds[-1]

(0.1462497157738863, 0.22956273764258567)

In [157]:
fig = px.line(x=epss, y=mean_preds)
fig.update_layout(template = 'plotly_white', yaxis_title = 'Expected proportion of '
                                                           'cancellations', xaxis_title = 
'Fee multiplier', yaxis_range=[0,1])

In [142]:
# NOTE(review): this cell's execution count (142) is lower than that of the plotting cell
# right before it (157), so the exported figure may not be the one shown just above --
# confirm which `fig` this is meant to save when the notebook is run top to bottom.
fig.write_image('probas.jpeg')


In [143]:
## How much does this affect acceptance in our data?

## Number of attempts

In [102]:
grouped = merged.groupby('group_ix').agg({'within_group_ix': lambda ser: ser.max() + 1})
fig = px.histogram(grouped, x = 'within_group_ix')
fig.update_layout(template = 'plotly_white', xaxis_title = 'Number of attempts', 
                  xaxis_range=[0.5,10.5])


In [110]:
grouped

Unnamed: 0_level_0,within_group_ix
group_ix,Unnamed: 1_level_1
1,1
3,1
4,1
8,1
9,1
...,...
42033,6
42034,1
42039,2
42041,1


In [140]:
# Share of swap groups by number of attempts, with 6 or more attempts pooled together.
# `grouped` has one row per group, so its length is the number of groups.
num_groups = len(grouped)
a = (grouped.groupby('within_group_ix').agg(prop_cases = pd.NamedAgg('within_group_ix',
                                                                     'count')) / num_groups)
# Select attempts 1..5 by value rather than by position, so a missing attempt count
# cannot pull a larger one into the first five rows.
a = a[a.index <= 5].reset_index()
b = pd.DataFrame({'within_group_ix': ['6 or more'],
                  'prop_cases': [1 - a.prop_cases.sum()]})
a = pd.concat([a,b], ignore_index=True)
# Cast to str so the x axis is treated as categories ('1', ..., '5', '6 or more').
a['within_group_ix'] = a['within_group_ix'].astype(str)
a['text'] = a.prop_cases.map(lambda x: format(x, '.1%'))
fig = px.bar(a, x = 'within_group_ix', y = 'prop_cases', text='text')
fig.update_layout(template = 'plotly_white', xaxis_title='Number of attempts', 
                  yaxis_title = 'Proportion of OLP groups')