# Performing EDA on Venmo data

In [None]:
import pandas as pd
import numpy as np
import psycopg2
import pymongo
import json
import datetime
import pickle
import functions as fn

The data has to be exported from a MongoDB database.

In [None]:
# Retrieve the venmo transaction collection from the MongoDB
venmo = fn.collection()

In [None]:
# Count all transactions in the venmo data.
# Collection.count() is deprecated since PyMongo 3.7 and removed in PyMongo 4;
# count_documents({}) is the exact-count replacement.
# NOTE(review): assumes fn.collection() returns a pymongo Collection - confirm.
venmo.count_documents({})

In [None]:
# Inspect the first transaction
venmo.find_one()

In [None]:
first = venmo.find_one()

In [None]:
# Run fn.initial_5pct(venmo) once to generate a pickle with the first ~350k transactions
#initial_5pct = fn.initial_5pct(venmo)

In [None]:
# Load the pickle mentioned above into `initial_5pct`; the cells below iterate
# over it as a collection of nested transaction records.
# NOTE(review): pickle.load can execute arbitrary code from the file - only
# load a pickle that this project generated itself.
with open('initial_5pct_transactions.pkl', 'rb') as f:
    initial_5pct = pickle.load(f)

## Inspecting payer information

In [None]:
len(initial_5pct)

In [None]:
# Gather the distinct payer (actor) ids found in the initial_5pct transactions
payer_ids = {record['payment']['actor']['id'] for record in initial_5pct}

In [None]:
f'The number of unique payers in the first 5% of transactions is {len(payer_ids)}'

In [None]:
# Build a dataframe with one row per distinct payer (the payment's actor),
# keeping the first record seen for each actor id
payer_ids = set()
payers = []
for record in initial_5pct:
    actor = record['payment']['actor']
    if actor['id'] not in payer_ids:
        payer_ids.add(actor['id'])
        # shallow copy so the dataframe rows do not alias the raw records
        payers.append(dict(actor))

payers_df = pd.DataFrame(payers)
# Flag column marking every row of this frame as a payer
payers_df['payer'] = 1

In [None]:
payers_df.head()

In [None]:
# Investigate the dataframe
payers_df.info()

In [None]:
# Isolate nulls to see them more clearly
payers_df.isna().sum()

In [None]:
# Check for any duplicates
payers_df.duplicated().sum()

In [None]:
# Inspect the null value for about, looks like it could be the same row as for date_joined and username.
payers_df.loc[payers_df['about'].isna()]

It would be interesting to see whether this user has made any transactions.

In [None]:
# Drop the payer row(s) with no 'about' value (the incomplete record inspected
# above). Selecting by null mask instead of the hard-coded row label 294315
# keeps this cell working when the sample or its row order changes, and makes
# it safe to re-run (a second drop of a fixed label raises KeyError).
payers_df.dropna(subset=['about'], inplace=True)

In [None]:
# Remove the payer columns that contain nothing but nulls
null_cols = ['email', 'friend_status', 'friends_count', 'identity', 'phone', 'trust_request']
payers_df.drop(columns=null_cols, inplace=True)

In [None]:
payers_df.info()

In [None]:
payers_df['about'].value_counts()

In [None]:
# Converting the date joined object into a datetime.datetime field
payers_df['date_joined'] = pd.to_datetime(payers_df['date_joined'], format='%Y-%m-%dT%H:%M:%S')

In [None]:
payers_df.info()

In [None]:
payers_df['is_active'].value_counts()

In [None]:
payers_df['is_blocked'].value_counts()

In [None]:
payers_df['is_group'].value_counts()

In [None]:
# Investigating whether or not display_name is just a sum of first_name and last_name
payers_df['new_name'] = (payers_df['first_name'] + ' ' + payers_df['last_name'])

In [None]:
# Number of payers whose display_name differs from "first_name last_name"
int((payers_df['display_name'] != payers_df['new_name']).sum())

In [None]:
# Investigating the cases in which display name is not equal to the combination of first_name and last_name
payers_df.loc[payers_df['display_name'] != payers_df['new_name']]

In [None]:
payers_df.drop('new_name', axis=1, inplace=True)

Another finding: although no null values appear in the first_name and last_name columns, some rows hold blank values in them. Let's investigate this further.

In [None]:
payers_df.loc[payers_df['first_name'] == '']

None of the rows that are groups has a first_name or last_name associated with it.

In [None]:
payers_df['profile_picture_url'].value_counts()

In [None]:
payers_df['username'].value_counts()

In [None]:
payers_df.loc[payers_df['username'] == 'Omar-Sanchez-74']

## Inspecting payee information

In [None]:
# Build a dataframe with one row per distinct payee (the payment target's user),
# keeping the first record seen for each user id
payee_ids = set()
payees = []
counter = 0  # transactions whose target has no usable user record
for record in initial_5pct:
    target_user = record['payment']['target']['user']
    try:
        user_id = target_user['id']
    except TypeError:
        # the user entry is not subscriptable (presumably None), so there is
        # no payee to record for this transaction
        counter += 1
        continue
    if user_id not in payee_ids:
        payee_ids.add(user_id)
        # shallow copy so the dataframe rows do not alias the raw records
        payees.append(dict(target_user))

payees_df = pd.DataFrame(payees)
# Flag column marking every row of this frame as a payee
payees_df['payee'] = 1

In [None]:
f'The number of transactions made to no users is {counter}'

In [None]:
len(payer_ids)-len(payee_ids)

In [None]:
payees_df.head()

In [None]:
payees_df.info()

As expected from the actors, there are no values for email, friend_status, friends_count, identity, phone, or trust_request.

In [None]:
payees_df.isna().sum()

In [None]:
payees_df.duplicated().sum()

In [None]:
# Remove the payee columns that contain nothing but nulls
null_cols = ['email', 'friend_status', 'friends_count', 'identity', 'phone', 'trust_request']
payees_df.drop(columns=null_cols, inplace=True)

In [None]:
payees_df['about'].value_counts()

In [None]:
# Converting the date joined object into a datetime.datetime field
payees_df['date_joined'] = pd.to_datetime(payees_df['date_joined'], format='%Y-%m-%dT%H:%M:%S')

In [None]:
payees_df.info()

In [None]:
payees_df['is_active'].value_counts()

In [None]:
payees_df['is_blocked'].value_counts()

In [None]:
payees_df['is_group'].value_counts()

It makes sense that there are more payees that are groups, given that companies can receive Venmo payments but cannot transact with that money, only extract it.

In [None]:
# Investigating whether or not display_name is just a sum of first_name and last_name
payees_df['new_name'] = (payees_df['first_name'] + ' ' + payees_df['last_name'])

In [None]:
# Investigating the cases in which display name is not equal to the combination of first_name and last_name
payees_df.loc[payees_df['display_name'] != payees_df['new_name']]

In [None]:
payees_df.loc[(payees_df['display_name'] != payees_df['new_name'])  & (payees_df['is_group'] == False)]

In [None]:
payees_df.drop('new_name', axis=1, inplace=True)

In [None]:
payees_df.loc[(payees_df['first_name'] == '') & (payees_df['is_group'] == False)]

In [None]:
payees_df['profile_picture_url'].value_counts()

In [None]:
payees_df['username'].value_counts()

## Inspecting transaction information


In [None]:
first

In [1]:
# Top-level nested fields whose sub-fields are all kept, named '<field>_<subfield>'
keys = ['mentions', 'likes', 'comments', 'app']
# Fields of 'payment' copied as-is, named 'payment_<field>'
payment_keys = ['amount', 'note', 'action', 'status', 'date_created', 'date_reminded',
                'id', 'date_authorized', 'merchant_split_purchase', 'audience', 'date_completed']
# Nested fields of 'payment' that are unpacked selectively: [target, actor]
payment_inner_keys = ['target', 'actor']
# Fields of the payment target copied as-is
target_keys = ['redeemable_target', 'merchant', 'phone', 'email', 'type']
# Fields of the payment target from which only the nested 'id' is kept
target_user_keys = ['user']
# Fields of the payment actor copied as-is
actor_keys = ["id"]


def flatten_transaction(details):
    """Return a flat dict holding the selected fields of one nested transaction.

    Output keys are the path of nested keys joined by '_' (for example
    'payment_target_user_id'). Fields not selected by the key lists above
    are dropped.
    """
    target_field, actor_field = payment_inner_keys
    flat = {}
    for key, val in details.items():
        if key in keys:
            flat.update({f'{key}_{sub_key}': sub_val for sub_key, sub_val in val.items()})
        elif key == 'payment':
            for pay_key, pay_val in val.items():
                prefix = f'{key}_{pay_key}'
                if pay_key in payment_keys:
                    flat[prefix] = pay_val
                elif pay_key == target_field:
                    for target_key, target_val in pay_val.items():
                        if target_key in target_keys:
                            flat[f'{prefix}_{target_key}'] = target_val
                        elif target_key in target_user_keys:
                            try:
                                flat[f'{prefix}_{target_key}_id'] = target_val['id']
                            except TypeError:
                                # the user entry is not subscriptable (presumably
                                # None); leave the id out of this row
                                pass
                elif pay_key == actor_field:
                    for actor_key, actor_val in pay_val.items():
                        if actor_key in actor_keys:
                            flat[f'{prefix}_{actor_key}'] = actor_val
    return flat


# One flat dict per transaction, in the same order as initial_5pct
transactions = [flatten_transaction(details) for details in initial_5pct]

NameError: name 'initial_5pct' is not defined

In [None]:
transactions_df = pd.DataFrame(transactions)

In [None]:
transactions_df.head().T

In [None]:
transactions_df.info()

In [None]:
transactions_df.isna().sum()

In [None]:
transactions_df['_id'].value_counts()

In [None]:
transactions_df['app_description'].value_counts()

In [None]:
transactions_df['app_id'].value_counts()

In [None]:
transactions_df['app_image_url'].value_counts()

In [None]:
transactions_df['app_name'].value_counts()

In [None]:
transactions_df['audience'].value_counts()

In [None]:
transactions_df['comments_count'].value_counts()

In [None]:
transactions_df.loc[transactions_df['comments_count'] == 1]

In [None]:
transactions_df['date_created'].min()

In [None]:
transactions_df['date_created'].max()

In [None]:
transactions_df['date_created'].value_counts()

In [None]:
transactions_df['id'].value_counts()

In [None]:
transactions_df['likes_count'].value_counts()

In [None]:
transactions_df.loc[transactions_df['likes_count'] == 2].T

In [None]:
transactions_df['note'].value_counts()

In [None]:
transactions_df['payment_action'].value_counts()

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Investigate payers and the number of transactions they have made
val_counts = transactions_df['payment_actor_id'].value_counts()
payed_df = val_counts.rename_axis('unique_values').reset_index(name='counts')
payed_df

In [None]:
payed_more_than_one = len(payed_df.loc[payed_df['counts'] > 1])

In [None]:
f'The number of users that have payed more than 1 transaction in the time period is {payed_more_than_one}'

In [None]:
# Investigate payees and the number of transactions they have received
val_counts = transactions_df['payment_target_user_id'].value_counts()
received_df = val_counts.rename_axis('unique_values').reset_index(name='counts')
received_df

In [None]:
received_more_than_one = len(received_df.loc[received_df['counts'] > 1])

In [None]:
f'The number of users that have received more than 1 transaction in the time period is {received_more_than_one}'

In [None]:
transactions_df['payment_audience'].value_counts()

In [None]:
transactions_df.loc[transactions_df['payment_date_completed'].isna()].T

In [None]:
transactions_df.info()

In [None]:
transactions_df['payment_id'].value_counts()

In [None]:
# Show the transactions that actually carry a merchant split purchase.
# `Series != None` is evaluated by pandas as all-True, so the original filter
# returned every row whether the value was missing or not; notna() performs
# the intended missing-value test.
transactions_df.loc[transactions_df['payment_merchant_split_purchase'].notna()]

In [None]:
# Number of transactions whose top-level note equals the payment note
int((transactions_df['note'] == transactions_df['payment_note']).sum())

In [None]:
# Show the transactions whose payment note starts with 'for'.
# Series has no `.starts` attribute (the original raised AttributeError); the
# string test lives on the .str accessor. na=False makes missing notes count
# as "no match" so the result is a clean boolean mask.
transactions_df.loc[transactions_df['payment_note'].str.startswith('for', na=False)].T

In [None]:
# Collect every payment note that begins with 'for'
payment_note = [text for text in transactions_df['payment_note'] if text.startswith('for')]

In [None]:
len(payment_note)

There seems to be a glitch in the note column. There are 16 transactions whose note and payment_note differ, and that is because the note column adds an extra 'for' string to the 16 payment notes that start with 'for'. Interesting, but nothing conclusive.

In [None]:
transactions_df['payment_status'].value_counts()

The payments that are not settled are those that don't have a user_id either; consider dropping those transactions.

In [None]:
transactions_df.loc[transactions_df['payment_status'] == 'cancelled'].T

In [None]:
transactions_df.loc[transactions_df['payment_status'] == 'pending'].T

In [None]:
transactions_df['payment_target_type'].value_counts()

In [None]:
transactions_df['type'].value_counts()