# Performing EDA on Venmo data

In [None]:
import pandas as pd
import numpy as np
import psycopg2
import pymongo
import json
import datetime
import pickle
import functions as fn

The data must first be retrieved from a MongoDB database.

In [None]:
# Retrieve the venmo transaction collection from the MongoDB
venmo = fn.collection()

In [None]:
# Count all transactions in the venmo data.
# Collection.count() is deprecated since PyMongo 3.7 and removed in 4.0;
# estimated_document_count() is its replacement for an unfiltered count.
# NOTE(review): assumes fn.collection() returns a pymongo Collection — confirm.
venmo.estimated_document_count()

In [None]:
# Inspect the first transaction
venmo.find_one()

In [None]:
# Run fn.initial_5pct(venmo) once to generate a pickle with the first ~350k transactions
#initial_5pct = fn.initial_5pct(venmo)

In [None]:
# Load the above-mentioned pickle into initial_5pct.
# NOTE(review): pickle.load can execute arbitrary code while unpickling —
# only load a file that was generated locally and is trusted.
with open('initial_5pct_transactions.pkl', 'rb') as f:
    initial_5pct = pickle.load(f)

In [None]:
# Fetch a single transaction document to use as the sample record below
first = venmo.find_one()

In [None]:
first

In [None]:
# Flatten one raw transaction document into a single-level dict whose keys are
# the nested key names joined with underscores (e.g. 'payment_actor_username').

transaction_df = []

# Top-level keys whose dict values are expanded one level as '<key>_<subkey>'.
keys = ['mentions', 'likes', 'comments','app']
# Fields of the 'payment' sub-document that are copied as 'payment_<field>'.
payment_keys = (['amount', 'note', 'action', 'status', 'date_created', 'date_reminded',
                 'id', 'date_authorized', 'merchant_split_purchase', 'audience', 'date_completed'])
# Sub-documents of 'payment' that are themselves expanded.
payment_inner_keys = ['target', 'actor']
# Fields kept from the 'target' / 'actor' sub-documents (the two lists are
# applied to both sub-documents, exactly as before).
target_keys = ['redeemable_target', 'merchant', 'phone', 'email', 'type']
actor_keys = ["username", "friends_count", "is_active", "display_name", "friend_status", "email",
              "first_name", "identity", "last_name", "is_blocked", "about", "profile_picture_url", "id",
              "phone", "trust_request", "date_joined", "is_group"]


def _prefixed(prefix, value):
    """Yield (column, value) pairs for *value* under *prefix*.

    A dict is expanded one level into '<prefix>_<key>' columns. Anything else
    (for example None) is kept whole under *prefix* instead of raising on
    `.items()`.
    """
    if isinstance(value, dict):
        for sub_key, sub_val in value.items():
            yield f'{prefix}_{sub_key}', sub_val
    else:
        yield prefix, value


def _flatten_party(prefix, party):
    """Yield the flattened columns of a payment 'target' or 'actor' sub-document."""
    if not isinstance(party, dict):
        # NOTE(review): the sample record has dicts here; a non-dict value is
        # stored whole rather than crashing — confirm against the real data.
        yield prefix, party
        return
    wanted = set(target_keys) | set(actor_keys)
    for field, field_val in party.items():
        if field in wanted:
            yield f'{prefix}_{field}', field_val
        elif field == 'user':
            # The nested 'user' document is expanded one more level.
            yield from _prefixed(f'{prefix}_{field}', field_val)
        # Any other field is deliberately left out.


def _flatten_transaction(record):
    """Return a flat dict built from one raw transaction document *record*.

    Keys listed in `keys` are expanded one level, the 'payment' sub-document
    is expanded according to `payment_keys` / `payment_inner_keys`, and every
    other top-level key is copied unchanged.
    """
    flat = {}
    for key, val in record.items():
        if key in keys:
            flat.update(_prefixed(key, val))
        elif key == 'payment':
            for sub_key, sub_val in val.items():
                if sub_key in payment_keys:
                    flat[f'{key}_{sub_key}'] = sub_val
                elif sub_key in payment_inner_keys:
                    flat.update(_flatten_party(f'{key}_{sub_key}', sub_val))
                # Payment fields outside both lists are dropped.
        else:
            flat[key] = val
    return flat


transaction = _flatten_transaction(first)
# Store a copy so later changes to `transaction` do not alter the stored row.
transaction_df.append(transaction.copy())

In [None]:
transaction_df

## Inspecting payer information

In [None]:
len(initial_5pct)

In [None]:
# Gather the distinct payer (actor) ids found in the initial_5pct transactions
payer_ids = {record['payment']['actor']['id'] for record in initial_5pct}

In [None]:
f'The number of unique payers in the first 5% of transactions is {len(payer_ids)}'

In [None]:
# Transform user information into a dataframe: one row per unique payer
# (the 'actor' of a payment), keeping the first occurrence of each actor id.
payers = []
payer_ids = set()
for transaction in initial_5pct:
    actor = transaction['payment']['actor']
    actor_id = actor['id']
    if actor_id in payer_ids:
        # This payer already has a row; skip repeat transactions.
        continue
    payer_ids.add(actor_id)
    # Shallow copy so the row is independent of the source document.
    payers.append(dict(actor))

payers_df = pd.DataFrame(payers)
# Constant flag column marking every row in this frame as a payer.
payers_df['payer'] = 1

In [None]:
payers_df.head()

In [None]:
# Investigate the dataframe
payers_df.info()

In [None]:
# Isolate nulls to see them more clearly
payers_df.isna().sum()

In [None]:
# Inspect the null value for about, looks like it could be the same row as for date_joined and username.
payers_df.loc[payers_df['about'].isna()]

In [None]:
# Drop the payer row whose 'about' value is null.
# The row is selected by its null mask rather than the hard-coded index label
# 294315, so the cell does not raise a KeyError when it is re-run or when the
# underlying sample changes.
# NOTE(review): the inspection above suggests this is a single row that also
# lacks date_joined and username — confirm before relying on it.
payers_df.drop(index=payers_df.index[payers_df['about'].isna()], inplace=True)

In [None]:
# Remove the columns that contain nothing but null values
null_cols = ['email', 'friend_status', 'friends_count', 'identity', 'phone', 'trust_request']
payers_df.drop(columns=null_cols, inplace=True)

In [None]:
payers_df.info()

In [None]:
payers_df['about'].value_counts()

In [None]:
# Convert the 'date_joined' strings into datetimes.
# The previous version read from an undefined `df['date']` and then called
# strptime on values that were no longer strings; parse the payers_df column
# itself, using the same '%Y-%m-%dT%H:%M:%S' layout.
payers_df['date_joined'] = pd.to_datetime(payers_df['date_joined'], format='%Y-%m-%dT%H:%M:%S')
# Plain datetime.datetime objects, one per payer row, in row order.
new_dates = [ts.to_pydatetime() for ts in payers_df['date_joined']]