# Performing EDA on Venmo data

In [None]:
import pandas as pd
import numpy as np
import psycopg2
import pymongo
import json
import datetime
import pickle
import functions as fn

The data first has to be exported from a MongoDB database.

In [None]:
# Retrieve the venmo transaction collection from the MongoDB.
# NOTE(review): fn.collection() is a project helper not shown here; later cells call
# .count() and .find_one() on its result, so it is presumably a pymongo Collection — confirm.
venmo = fn.collection()

In [None]:
# Count all transactions in the venmo data.
# Collection.count() was deprecated in PyMongo 3.7 and removed in 4.0; for an unfiltered
# count, estimated_document_count() is the replacement and reads the collection metadata
# instead of scanning every document.
# NOTE(review): assumes `venmo` is a pymongo Collection — confirm against fn.collection().
venmo.estimated_document_count()

In [None]:
# Inspect a single transaction document (find_one() with no filter) to see its structure
venmo.find_one()

In [None]:
# One-off step, intentionally left commented out: pass the collection to fn.initial_5pct
# to generate a pickle with the first ~350k transactions.
#initial_5pct = fn.initial_5pct(venmo)

In [None]:
# Load the above-mentioned pickle of transactions.
# NOTE: pickle.load can execute arbitrary code, so only load a file you generated yourself.
with open('initial_5pct_transactions.pkl', 'rb') as f:
    initial_5pct = pickle.load(f)

In [None]:
# Keep one transaction document around as a sample for the flattening logic below
first = venmo.find_one()

In [None]:
# Top-level keys whose value is a nested document to be flattened one level
keys = ['mentions', 'likes', 'comments', 'app']
# Scalar fields of the 'payment' sub-document that are copied as-is
payment_keys = ['amount', 'note', 'action', 'status', 'date_created', 'date_reminded',
                'id', 'date_authorized', 'merchant_split_purchase', 'audience', 'date_completed']
# Sub-documents of 'payment' that describe the two parties of the transaction
payment_inner_keys = ['target', 'actor']
target_keys = ['redeemable_target', 'merchant', 'phone', 'email', 'type']
actor_keys = ["username", "friends_count", "is_active", "display_name", "friend_status", "email",
              "first_name", "identity", "last_name", "is_blocked", "about", "profile_picture_url", "id",
              "phone", "trust_request", "date_joined", "is_group"]


def flatten_transaction(doc):
    """Flatten one nested transaction document into a single-level dict.

    Nested keys are joined with underscores, e.g. ``payment_actor_username`` or
    ``payment_target_user_id``.

    Parameters
    ----------
    doc : dict
        A transaction document as returned by ``venmo.find_one()``.

    Returns
    -------
    dict
        Flat mapping of column name to value. Keys of ``payment`` and of its
        ``target``/``actor`` sub-documents that are not listed in the key lists
        above are ignored. A nested value that is not a dict is not expanded:
        a top-level one is stored unchanged under its own key, while one inside
        ``payment`` (such as a ``user`` that is None) is skipped.
    """
    # The original code had two identical branches for target_keys and actor_keys;
    # a single membership test against their union behaves the same.
    party_keys = set(target_keys) | set(actor_keys)
    flat = {}
    for key, val in doc.items():
        if key in keys and isinstance(val, dict):
            for subkey, subval in val.items():
                flat[f'{key}_{subkey}'] = subval
        elif key == 'payment' and isinstance(val, dict):
            for payment_key, payment_val in val.items():
                if payment_key in payment_keys:
                    flat[f'{key}_{payment_key}'] = payment_val
                elif payment_key in payment_inner_keys and isinstance(payment_val, dict):
                    for party_key, party_val in payment_val.items():
                        if party_key in party_keys:
                            flat[f'{key}_{payment_key}_{party_key}'] = party_val
                        # 'user' can be None when the target is not a user, so only
                        # expand it when it really is a nested document.
                        elif party_key == 'user' and isinstance(party_val, dict):
                            for user_key, user_val in party_val.items():
                                flat[f'{key}_{payment_key}_{party_key}_{user_key}'] = user_val
        else:
            flat[key] = val
    return flat


# Flatten the sample document and collect it in a list of records
transaction = flatten_transaction(first)
transaction_df = [transaction.copy()]

In [None]:
# Display the flattened record(s) built from the sample transaction
transaction_df

## Inspecting payer information

In [None]:
# Number of transactions in the loaded sample
len(initial_5pct)

In [None]:
# Gather the distinct actor (payer) ids found across the initial_5pct transactions
payer_ids = {record['payment']['actor']['id'] for record in initial_5pct}

In [None]:
# Report how many distinct payer ids were collected
f'The number of unique payers in the first 5% of transactions is {len(payer_ids)}'

In [None]:
# Build a dataframe with one row per distinct payer, taken from each transaction's actor record
payers = []
payer_ids = set()
for record in initial_5pct:
    actor = record['payment']['actor']
    actor_id = actor['id']
    if actor_id not in payer_ids:
        # first time this payer is seen: remember the id and keep a shallow copy of the record
        payer_ids.add(actor_id)
        payers.append(dict(actor))

payers_df = pd.DataFrame(payers)
# constant flag column marking every row of this frame as a payer
payers_df['payer'] = 1

In [None]:
# Preview the first rows of the payer dataframe
payers_df.head()

In [None]:
# Investigate the dataframe: column dtypes and non-null counts
payers_df.info()

In [None]:
# Count the null values per column to see them more clearly
payers_df.isna().sum()

In [None]:
# Check for any fully duplicated rows
payers_df.duplicated().sum()

In [None]:
# Inspect the row(s) with a null 'about' value; it looks like this could be the same row
# that is also missing date_joined and username.
payers_df.loc[payers_df['about'].isna()]

It would be interesting to see whether this user has made any transactions.

In [None]:
# Drop the payer row(s) whose 'about' value is null.
# The rows are selected by condition rather than by the hard-coded index label 294315, so the
# cell stays valid if the sample changes and can be re-run safely (a second run finds
# nothing left to drop instead of raising KeyError).
payers_df.drop(index=payers_df.index[payers_df['about'].isna()], inplace=True)

In [None]:
# Remove the columns that contain nothing but null values
null_cols = ['email', 'friend_status', 'friends_count', 'identity', 'phone', 'trust_request']
payers_df.drop(columns=null_cols, inplace=True)

In [None]:
# Re-check dtypes and non-null counts after dropping the row and the null-only columns
payers_df.info()

In [None]:
# Frequency of each distinct 'about' value among payers
payers_df['about'].value_counts()

In [None]:
# Convert the date_joined strings into pandas datetime values.
# The explicit format expects values shaped like 2015-01-31T12:34:56.
payers_df['date_joined'] = pd.to_datetime(payers_df['date_joined'], format='%Y-%m-%dT%H:%M:%S')

In [None]:
# Confirm the date_joined column now has a datetime dtype
payers_df.info()

In [None]:
# Distribution of the is_active flag among payers
payers_df['is_active'].value_counts()

In [None]:
# Distribution of the is_blocked flag among payers
payers_df['is_blocked'].value_counts()

In [None]:
# Distribution of the is_group flag among payers
payers_df['is_group'].value_counts()

In [None]:
# Investigate whether display_name is just first_name and last_name joined by a space:
# build that combination in a temporary new_name column for comparison.
payers_df['new_name'] = (payers_df['first_name'] + ' ' + payers_df['last_name'])

In [None]:
# Number of payers whose display_name differs from "first_name last_name"
(payers_df['display_name'] != payers_df['new_name']).sum()

In [None]:
# Show the rows in which display_name is not equal to the combination of first_name and last_name
payers_df.loc[payers_df['display_name'] != payers_df['new_name']]

In [None]:
# Remove the temporary new_name comparison column again
payers_df.drop('new_name', axis=1, inplace=True)

Another finding: although no null values appear in the first_name and last_name columns, they do contain blank (empty-string) values. Let's investigate this further.

In [None]:
# Payer rows whose first_name is an empty string
payers_df.loc[payers_df['first_name'] == '']

None of the rows that are groups has a first_name or last_name associated with it.

In [None]:
# Frequency of each profile_picture_url value among payers
payers_df['profile_picture_url'].value_counts()

In [None]:
# Frequency of each username; since rows are unique per payer id, a count above 1 means
# the same username appears under more than one id
payers_df['username'].value_counts()

In [None]:
# Look at the rows for one specific username.
# NOTE(review): presumably picked from the value_counts output above — confirm.
payers_df.loc[payers_df['username'] == 'Omar-Sanchez-74']

## Inspecting payee information

In [None]:
# Transform target-user (payee) information into a dataframe with one row per distinct user id
payees = []
payee_ids = set()
counter = 0  # number of transactions whose target carries no user record
for transaction in initial_5pct:
    # .get() also covers a target that has no 'user' key at all
    user = transaction['payment']['target'].get('user')
    # A target without a user (e.g. user is None) is an expected case, so it is checked
    # explicitly instead of relying on a TypeError from subscripting a non-dict value.
    if not isinstance(user, dict):
        counter += 1
        continue
    user_id = user['id']
    if user_id not in payee_ids:
        payee_ids.add(user_id)
        # shallow copy so the row is independent of the source document
        payees.append(dict(user))

payees_df = pd.DataFrame(payees)
# constant flag column marking every row of this frame as a payee
payees_df['payee'] = 1

In [None]:
# Report how many transactions were skipped because their target had no user record
f'The number of transactions made to no users is {counter}'

In [None]:
# Difference between the number of distinct payer ids and distinct payee ids
len(payer_ids)-len(payee_ids)

In [None]:
# Preview the first rows of the payee dataframe
payees_df.head()

In [None]:
# Column dtypes and non-null counts of the payee dataframe
payees_df.info()

As expected from the actors, the email, friend_status, friends_count, identity, phone and trust_request columns contain no values.

In [None]:
# Count the null values per column
payees_df.isna().sum()

In [None]:
# Check for any fully duplicated rows
payees_df.duplicated().sum()

In [None]:
# Remove the columns that contain nothing but null values
null_cols = ['email', 'friend_status', 'friends_count', 'identity', 'phone', 'trust_request']
payees_df.drop(columns=null_cols, inplace=True)

In [None]:
# Frequency of each distinct 'about' value among payees
payees_df['about'].value_counts()

In [None]:
# Convert the date_joined strings into pandas datetime values.
# The explicit format expects values shaped like 2015-01-31T12:34:56.
payees_df['date_joined'] = pd.to_datetime(payees_df['date_joined'], format='%Y-%m-%dT%H:%M:%S')

In [None]:
# Re-check dtypes and non-null counts after the column drop and date conversion
payees_df.info()

In [None]:
# Distribution of the is_active flag among payees
payees_df['is_active'].value_counts()

In [None]:
# Distribution of the is_blocked flag among payees
payees_df['is_blocked'].value_counts()

In [None]:
# Distribution of the is_group flag among payees
payees_df['is_group'].value_counts()

It makes sense that more payees than payers are groups, given that companies can receive Venmo payments but cannot transact with that money, only withdraw it.

In [None]:
# Investigate whether display_name is just first_name and last_name joined by a space:
# build that combination in a temporary new_name column for comparison.
payees_df['new_name'] = (payees_df['first_name'] + ' ' + payees_df['last_name'])

In [None]:
# Show the rows in which display_name is not equal to the combination of first_name and last_name
payees_df.loc[payees_df['display_name'] != payees_df['new_name']]

In [None]:
# Same mismatch check, restricted to payees that are not groups
payees_df.loc[(payees_df['display_name'] != payees_df['new_name'])  & (payees_df['is_group'] == False)]

In [None]:
# Remove the temporary new_name comparison column again
payees_df.drop('new_name', axis=1, inplace=True)

In [None]:
# Non-group payees whose first_name is an empty string
payees_df.loc[(payees_df['first_name'] == '') & (payees_df['is_group'] == False)]

In [None]:
# Frequency of each profile_picture_url value among payees
payees_df['profile_picture_url'].value_counts()

In [None]:
# Frequency of each username among payees
payees_df['username'].value_counts()

## Create a dataframe with unique payer and payee data

In [None]:
# Payee ids that never appear as a payer; these are needed to complete the user list
payees_not_payers = payee_ids - payer_ids

In [None]:
# Keep only the payee rows whose id belongs to the payees-not-payers set
payee_only_mask = payees_df['id'].isin(payees_not_payers)
payees_not_payers_df = payees_df.loc[payee_only_mask]

In [None]:
# Stack the payers and the payees that never paid into one user table.
# Both inputs carry overlapping integer index labels, so ignore_index=True gives the
# combined frame a fresh 0..n-1 index instead of duplicated labels (which would make
# label-based lookups with .loc ambiguous).
unique_users = pd.concat([payers_df, payees_not_payers_df], axis=0, ignore_index=True)

In [None]:
# Sanity check: the combined frame should have one row per payer plus one per payee-only id
len(payers_df) + len(payees_not_payers) == len(unique_users)

In [None]:
# Check for any fully duplicated rows in the combined user table
unique_users.duplicated().sum()

In [None]:
# Preview the first rows of the combined user table
unique_users.head()

### Store said dataframe

In [None]:
# Store the combined unique_users dataframe as a pickle file in the working directory
with open('initial_5pct_unique_users.pkl', 'wb') as f:
    pickle.dump(unique_users, f)