# Creating a unique payments table

In [None]:
import pandas as pd
import numpy as np
import psycopg2
import pymongo
import json
import datetime
import pickle
import functions as fn
import psycopg2
import io
from sqlalchemy import create_engine

## Extracting payment information

In [None]:
# Load the pickled transactions (the file name suggests a 5% sample — TODO confirm).
# NOTE(review): pickle.load can execute arbitrary code embedded in the file,
# so only load pickles that come from a trusted source.
with open('initial_5pct_transactions.pkl', 'rb') as f:
    initial_5pct = pickle.load(f)

In [None]:
# Top-level payment fields that are copied across unchanged.
keys = ['note', 'action', 'status', 'date_created', 'id',
        'merchant_split_purchase', 'audience', 'date_completed']
# Nested dictionaries whose entries are flattened into '<parent>_<child>' columns.
subdictionary_keys = ['target', 'actor']
# Only including the keys in the payment target subdictionary that contain values
target_keys = ['redeemable_target', 'type']
user_key = ['user']
actor_key = ['id']


def _flatten_payment(payment_details):
    """Flatten one transaction's 'payment' dictionary into a single-level dict.

    Top-level fields listed in ``keys`` are copied as-is. For the nested
    dictionaries listed in ``subdictionary_keys``, the entries named in
    ``target_keys`` and ``actor_key`` become ``'<parent>_<child>'`` and the
    entry named in ``user_key`` contributes its id as ``'<parent>_user_id'``.
    Every other field is ignored.
    """
    payment = {}
    for key, val in payment_details.items():
        if key in keys:
            payment[key] = val
        elif key in subdictionary_keys:
            for subkey, subval in val.items():
                if subkey in target_keys or subkey in actor_key:
                    payment[f'{key}_{subkey}'] = subval
                elif subkey in user_key:
                    # Some transactions don't have end users and as such they are deemed
                    # as pending or cancelled. However, these should not be dropped because
                    # the user still made a transaction. In that case the user entry is not
                    # subscriptable (TypeError) and the column is simply left out.
                    try:
                        payment[f'{key}_{subkey}_{actor_key[0]}'] = subval[actor_key[0]]
                    except TypeError:
                        continue
    return payment


# One flat dictionary per transaction, in the original order.
payments = [_flatten_payment(transaction['payment']) for transaction in initial_5pct]

In [None]:
payments_df = pd.DataFrame(payments)

In [None]:
payments_df['status'].value_counts()

In [None]:
# Identify payers who have pending or cancelled transactions
payer_ids = payments_df.loc[payments_df['status'] != 'settled']['actor_id']

In [None]:
# Extract the payers that have at least one pending/cancelled transaction
unique_payer_ids = payer_ids.unique()

In [None]:
f'There are {len(unique_payer_ids)} payers who have made at least one unsettled transaction'

In [None]:
x = payments_df.loc[payments_df['actor_id'].isin(unique_payer_ids)]

In [None]:
# Keep only the columns needed to compare a payer's transactions over time.
# .copy() makes x an independent frame, so the assignments below modify x itself and
# pandas does not emit SettingWithCopyWarning for writing into a derived selection.
x = x[['actor_id', 'date_completed', 'date_created', 'id', 'note', 'status']].copy()
# Parse the timestamp strings into datetime values.
x['date_completed'] = pd.to_datetime(x['date_completed'], format='%Y-%m-%dT%H:%M:%S')
x['date_created'] = pd.to_datetime(x['date_created'], format='%Y-%m-%dT%H:%M:%S')

In [None]:
x = x.sort_values(['actor_id', 'date_created'])

In [None]:
x

In [None]:
# Identify the actors that made more than one transaction given that they had at least one unsettled transaction.
# A single groupby pass visits each actor once instead of re-filtering the whole frame for every row.
actor_ids = set()
for actor, actor_specific_df in x.groupby('actor_id'):
    if actor_specific_df.shape[0] > 1:
        actor_ids.add(actor)

In [None]:
len(actor_ids)

In [None]:
# 1 when the row's actor is in actor_ids, 0 otherwise.
x['more_than_1'] = x['actor_id'].isin(actor_ids).astype('int64')

In [None]:
multiple_trans = x.loc[x['more_than_1'] == 1]

In [None]:
multiple_trans.head(10)

In [None]:
# Select the transactions which users with unsettled payments have made within 10 minutes of each other.

# This 10 minute rule though is not very specific. User 2534007896014848135 waited for more than an hour
# but appears to only want to make one transaction. This is a loophole in our functions.

# Maximum gap, in seconds, between two consecutive payments for them to be paired up.
# TODO: the choice of 10 minutes still needs to be justified.
MAX_GAP_SECONDS = 600

settled_transaction_ids = set()
unsettled_transaction_ids = set()

for actor in multiple_trans['actor_id'].unique():
    # Creating actor specific dataframes
    same_hour_trans_df = multiple_trans.loc[multiple_trans['actor_id'] == actor]
    # Creation dates of this user's payments, in row order; consecutive rows are compared,
    # so this relies on the frame having been sorted by date_created within each actor.
    transaction_dates = list(same_hour_trans_df['date_created'])
    for earlier, later in zip(transaction_dates, transaction_dates[1:]):
        time_diff = (later - earlier).total_seconds()
        # If the payments are made within 10 minutes then identify those transactions
        if time_diff < MAX_GAP_SECONDS:
            # The user's transactions created at either of the two dates
            transaction_within_10 = same_hour_trans_df.loc[
                same_hour_trans_df['date_created'].isin((earlier, later))]
            # Split the ids by status. set.update adds every id; the previous
            # `.all()` call reduced the ids to a single value before adding it.
            is_settled = transaction_within_10['status'] == 'settled'
            settled_transaction_ids.update(transaction_within_10.loc[is_settled, 'id'])
            unsettled_transaction_ids.update(transaction_within_10.loc[~is_settled, 'id'])
            # However, if for a particular user there are no successful transactions, then only one should be kept
            # CHECK THIS POINT WITH BRIAN

In [None]:
settled_transactions_df = multiple_trans.loc[multiple_trans['id'].isin(settled_transaction_ids)]

In [None]:
settled_transactions_df.head()

In [None]:
unsettled_transactions_df = multiple_trans.loc[multiple_trans['id'].isin(unsettled_transaction_ids)]

In [None]:
unsettled_transactions_df

In [None]:
# Keep the first 9 rows for closer inspection.
# NOTE(review): multiple_trans_in_an_hour_df is not defined in any visible cell, so this
# raises NameError unless it is created elsewhere — confirm which frame was intended.
interesting_actors = multiple_trans_in_an_hour_df.head(9)

In [None]:
# Collect the actors in interesting_actors who created two consecutive payments less than
# 1200 seconds (20 minutes) apart.
payment_ids = set()  # NOTE(review): initialised but never populated in this cell
# Previously used without being created, which raised NameError on the first match.
multiple_trans_in_an_hour_ids = set()
for actor in interesting_actors['actor_id']:
    int_trans_df = interesting_actors.loc[interesting_actors['actor_id'] == actor]
    # Use this actor's own rows; the earlier version read the dates from a frame left
    # over from a previous cell, so every actor was judged on someone else's payments.
    transaction_dates = list(int_trans_df['date_created'])
    for earlier, later in zip(transaction_dates, transaction_dates[1:]):
        if (later - earlier).total_seconds() < 1200:
            multiple_trans_in_an_hour_ids.add(actor)

If users have only made one unsettled transaction, flag users. 

- If those users opened the account recently, it is less likely that they will make a transaction soon given their bad experience with the app. Moreover, we are looking at a history of 2 months, so if they recently opened an account, made an unsuccessful transaction and haven't made one again then we are better off dropping them as they will just be adding noise. 

- On the other hand, if their account has been active for a longer time period, this means that they have probably made more than just the unsuccessful transaction in the past. So it is best to keep them.

- If they have made more than one transaction within a short time period, then drop the unsuccessful ones and keep only the successful one.

In [None]:
x.loc[x['actor_id'] == f'{actor}']

In [None]:
x.head()

In [None]:
actor_specific_df

In [None]:
x = x[['actor_id', 'date_created']]

In [None]:
x['date_created'] = pd.to_datetime(x['date_created'], format='%Y-%m-%dT%H:%M:%S')

In [None]:
# Time, in seconds, since the same actor's previous row (NaN for each actor's first row).
x['diff'] = x.groupby('actor_id')['date_created'].diff().dt.total_seconds()
# Drop the raw timestamp and every row with a missing value (including rows with no
# previous transaction). Reassigning avoids mutating a derived frame in place.
x = x.drop(columns='date_created').dropna(axis=0)

In [None]:
x.head()

In [None]:
# Mean of x['diff'] per actor, with actor_id as a regular column rather than the index.
mean_time_between_transactions_failed_df = (
    x.groupby('actor_id')['diff'].mean().reset_index()
)

In [None]:
mean_time_between_transactions_failed_df

In [None]:
x.info()

In [None]:
# Convert the gaps to seconds. Guarded so that running this after the column has already
# been converted is a no-op instead of failing on values that are no longer timedeltas.
if pd.api.types.is_timedelta64_dtype(x['diff']):
    x['diff'] = x['diff'].dt.total_seconds()

In [None]:
x.head()

In [None]:
payer_ids = payments_df.loc[payments_df['status'] != 'settled']['actor_id']

If they have made only one transaction and it hasn't worked, drop it. Otherwise, if they have made more than one transaction within a certain time window, keep the successful one.

In [None]:
users_with_unsettled_payments.sort_values('actor_id').head()

In [None]:
users_with_unsettled_payments['target_type'].value_counts()

In [None]:
payments_df.head()

In [None]:
payments_df.info()

The 2422 values missing in the date_completed and target_user_id columns come from transactions that don't have a payee and, as such, are never completed (they are deemed pending or cancelled).

In [None]:
# Rename col id to payment_id for easier recognition in the db
payments_df = payments_df.rename(columns = {"id": "payment_id"}) 

In [None]:
# Parse the date_completed and date_created strings into datetime values.
for date_col in ('date_completed', 'date_created'):
    payments_df[date_col] = pd.to_datetime(payments_df[date_col], format='%Y-%m-%dT%H:%M:%S')

In [None]:
# Investigate the non null values in merchant_split_purchase
payments_df.loc[payments_df['merchant_split_purchase'].notnull()].head()

They all appear to be charges instead of payments. We will unpack the merchant_split_purchase into two different cols

In [None]:
# Expand each non-null merchant_split_purchase dictionary into its own columns; rows
# without a value are filled with NaN through index alignment in assign().
merchant_cols = payments_df['merchant_split_purchase'].dropna().apply(pd.Series)
# `columns=` replaces the positional axis argument, which DataFrame.drop no longer
# accepts from pandas 2.0 onwards.
payments_df = payments_df.drop(columns='merchant_split_purchase').assign(**merchant_cols)

In [None]:
payments_df.info()

In [None]:
# Rename to mirror the JSON structure
payments_df = payments_df.rename(columns = {"authorization_id": "merchant_authorization_id"})

In [None]:
# Investigate the non null values in target_redeemable_target
payments_df.loc[payments_df['target_redeemable_target'].notnull()]['target_redeemable_target'].head()

Same thought process as with the merchant_split_purchase col

In [None]:
# Expand each non-null target_redeemable_target dictionary into its own columns; rows
# without a value are filled with NaN through index alignment in assign().
target_cols = payments_df['target_redeemable_target'].dropna().apply(pd.Series)
# `columns=` replaces the positional axis argument, which DataFrame.drop no longer
# accepts from pandas 2.0 onwards.
payments_df = payments_df.drop(columns='target_redeemable_target').assign(**target_cols)

In [None]:
# Rename to mirror the JSON structure
payments_df = payments_df.rename(columns = {"display_name": "target_redeemable_target_display_name",
                                            "type": "target_redeemable_target_type"})

In [None]:
payments_df.info()

## Dropping resulting payments table into the venmo_transactions db

In [None]:
# Retrieve information about the venmo_transactions db
keys = fn.get_keys("/Users/jjherranzsarrion/.secret/local_info.json")
username = keys['username']
password = keys['password']

In [None]:
# Move payments_df table into the database
from urllib.parse import quote

# Percent-encode the credentials so that characters such as '@', ':' or '/' in the
# username or password cannot corrupt the connection URL.
engine = create_engine(
    f"postgresql://{quote(username, safe='')}:{quote(password, safe='')}"
    "@localhost/venmo_transactions"
)
# Writes the frame to a table named 'payments' using to_sql's default options.
payments_df.to_sql('payments', engine)