# Performing EDA on Venmo data

In [1]:
import pandas as pd
import numpy as np
import psycopg2
import pymongo
import json
import datetime
import pickle
import functions as fn

Data has to be exported from a Mongo DB

In [2]:
# Retrieve the venmo transaction collection from the MongoDB
venmo = fn.collection()

In [3]:
# Count all transactions in the venmo data.
# Collection.count() is deprecated since PyMongo 3.7 and removed in PyMongo 4.0.
# With no filter, estimated_document_count() is the metadata-based replacement;
# use venmo.count_documents({}) instead if an exact scan-based count is required.
# NOTE(review): assumes fn.collection() returns a pymongo Collection — confirm.
venmo.estimated_document_count()

  


7076585

In [4]:
# Inspect the first transaction
venmo.find_one()

{'_id': ObjectId('5bb7bdce1bed297da9fcb251'),
 'mentions': {'count': 0, 'data': []},
 'comments': {'count': 0, 'data': []},
 'date_created': datetime.datetime(2018, 7, 26, 18, 48, 10),
 'audience': 'public',
 'type': 'payment',
 'authorization': None,
 'id': '2532209455978775150',
 'payment': {'note': 'for utilities',
  'date_created': '2018-07-26T18:48:10',
  'id': '2532209455660008361',
  'amount': None,
  'date_authorized': None,
  'status': 'settled',
  'actor': {'first_name': 'Nicole',
   'last_name': 'Andrews',
   'phone': None,
   'username': 'Nicole-Andrews-23',
   'id': '2206066431492096327',
   'profile_picture_url': 'https://s3.amazonaws.com/venmo/no-image.gif',
   'identity': None,
   'trust_request': None,
   'email': None,
   'is_blocked': False,
   'display_name': 'Nicole Andrews',
   'friend_status': None,
   'about': ' ',
   'is_group': False,
   'date_joined': '2017-05-02T19:00:12',
   'is_active': True,
   'friends_count': None},
  'merchant_split_purchase': None,
  

In [None]:
# Call the initial_5pct function to generate a pickle with the first ~350k transactions
#initial_5pct = fn.initial_5pct(venmo)

In [5]:
# Load the pickle mentioned above (the cached output of the commented-out
# fn.initial_5pct call) so the sample does not have to be rebuilt on every run.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that were generated locally from a trusted source.
with open('initial_5pct_transactions.pkl', 'rb') as f:
    initial_5pct = pickle.load(f)

In [53]:
first = venmo.find_one()

In [None]:
# Flatten one nested transaction document into a single-level dict whose keys
# are the nested key names joined with underscores.

# Top-level fields whose value is itself a sub-document to be expanded.
keys = ['mentions', 'likes', 'comments','app']
# Scalar fields of the 'payment' sub-document that are kept as-is.
payment_keys = (['amount', 'note', 'action', 'status', 'date_created', 'date_reminded',
                 'id', 'date_authorized', 'merchant_split_purchase', 'audience', 'date_completed'])
# Fields of 'payment' that are sub-documents describing a party.
payment_inner_keys = ['target', 'actor']
# Fields kept from the target / actor sub-documents.
target_keys = ['redeemable_target', 'merchant', 'phone', 'email', 'type']
actor_keys = ["username", "friends_count", "is_active", "display_name", "friend_status", "email",
              "first_name", "identity", "last_name", "is_blocked", "about", "profile_picture_url", "id",
              "phone", "trust_request", "date_joined", "is_group"]


def _expand_into(flat, prefix, nested):
    """Copy every item of `nested` into `flat` under the name '<prefix>_<key>'.

    A missing sub-document (`nested is None`) is recorded as `flat[prefix] = None`
    instead of raising, so documents with absent parts can still be flattened.
    """
    if nested is None:
        flat[prefix] = None
        return
    for sub_key, sub_val in nested.items():
        flat[f'{prefix}_{sub_key}'] = sub_val


def flatten_transaction(doc):
    """Return a flat dict built from one nested transaction document.

    - Fields listed in `keys` are expanded to '<field>_<subfield>'.
    - Inside 'payment', fields in `payment_keys` become 'payment_<field>';
      for 'target' / 'actor', fields in `target_keys` or `actor_keys` become
      'payment_<party>_<field>' and a nested 'user' document is expanded to
      'payment_<party>_user_<field>'. Any other nested field is ignored.
    - Every other top-level field is copied unchanged.

    Sub-documents that are None are stored as None under their flattened name
    rather than raising AttributeError.
    """
    flat = {}
    # The same whitelist is applied to both parties, so a union is sufficient.
    party_keys = set(target_keys) | set(actor_keys)

    for key, val in doc.items():
        if key in keys:
            _expand_into(flat, key, val)
        elif key == 'payment':
            if val is None:
                flat[key] = None
                continue
            for pay_key, pay_val in val.items():
                pay_name = f'{key}_{pay_key}'
                if pay_key in payment_keys:
                    flat[pay_name] = pay_val
                elif pay_key in payment_inner_keys:
                    if pay_val is None:
                        flat[pay_name] = None
                        continue
                    for party_key, party_val in pay_val.items():
                        party_name = f'{pay_name}_{party_key}'
                        if party_key in party_keys:
                            flat[party_name] = party_val
                        elif party_key == 'user':
                            # The user sub-document can be None for some targets.
                            _expand_into(flat, party_name, party_val)
        else:
            flat[key] = val
    return flat


transaction = flatten_transaction(first)
# Despite the name, this is a list of flat records (one dict per transaction).
transaction_df = [transaction.copy()]

In [None]:
transaction_df

## Inspecting payer information

In [6]:
len(initial_5pct)

353829

In [7]:
# Collect the unique ids for each payer in the initial_5pct of transactions
payer_ids = {
    transaction['payment']['actor']['id']
    for transaction in initial_5pct
}

In [8]:
f'The number of unique payers in the first 5% of transactions is {len(payer_ids)}'

'The number of unique payers in the first 5% of transactions is 329112'

In [16]:
# Transform actor information into a dataframe:
# one record per distinct payer id, keeping the first occurrence of each.
unique_payers = {}
for transaction in initial_5pct:
    actor = transaction['payment']['actor']
    actor_id = actor['id']
    if actor_id not in unique_payers:
        # Shallow copy so the dataframe records are independent of the raw documents.
        unique_payers[actor_id] = dict(actor)

payer_ids = set(unique_payers)
payers = list(unique_payers.values())

payers_df = pd.DataFrame(payers)
# Marker column flagging every row of this frame as a payer.
payers_df['payer'] = 1

In [17]:
payers_df.head()

Unnamed: 0,about,date_joined,display_name,email,first_name,friend_status,friends_count,id,identity,is_active,is_blocked,is_group,last_name,phone,profile_picture_url,trust_request,username,payer
0,,2017-05-02T19:00:12,Nicole Andrews,,Nicole,,,2206066431492096327,,True,False,False,Andrews,,https://s3.amazonaws.com/venmo/no-image.gif,,Nicole-Andrews-23,1
1,,2017-04-24T23:57:10,Christy Faw,,Christy,,,2200417693859840681,,True,False,False,Faw,,https://s3.amazonaws.com/venmo/no-image.gif,,Christy-Faw,1
2,,2017-12-19T22:56:09,Vladan Stankovic,,Vladan,,,2373608382922752189,,True,False,False,Stankovic,,https://venmopics.appspot.com/u/v1/s/3c4263b0-...,,Vladan-Stankovic-1,1
3,No Short Bio,2015-04-24T20:33:48,Elizabeth Giordano,,Elizabeth,,,1670504276557824171,,True,False,False,Giordano,,https://venmopics.appspot.com/u/v1/s/215499bc-...,,egiordano,1
4,,2016-05-24T17:22:23,Brenda Carobini,,Brenda,,,1957419122950144676,,True,False,False,Carobini,,https://venmopics.appspot.com/u/v1/s/92d9c5b4-...,,Brenbini,1


In [18]:
# Investigate the dataframe
payers_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 329112 entries, 0 to 329111
Data columns (total 18 columns):
about                  329111 non-null object
date_joined            329111 non-null object
display_name           329112 non-null object
email                  0 non-null object
first_name             329112 non-null object
friend_status          0 non-null object
friends_count          0 non-null object
id                     329112 non-null object
identity               0 non-null object
is_active              329112 non-null bool
is_blocked             329112 non-null bool
is_group               329112 non-null bool
last_name              329112 non-null object
phone                  0 non-null object
profile_picture_url    329112 non-null object
trust_request          0 non-null object
username               329111 non-null object
payer                  329112 non-null int64
dtypes: bool(3), int64(1), object(14)
memory usage: 38.6+ MB


In [19]:
# Isolate nulls to see them more clearly
payers_df.isna().sum()

about                       1
date_joined                 1
display_name                0
email                  329112
first_name                  0
friend_status          329112
friends_count          329112
id                          0
identity               329112
is_active                   0
is_blocked                  0
is_group                    0
last_name                   0
phone                  329112
profile_picture_url         0
trust_request          329112
username                    1
payer                       0
dtype: int64

In [85]:
# Check for any duplicates
payers_df.duplicated().sum()

0

In [20]:
# Inspect the null value for about, looks like it could be the same row as for date_joined and username.
payers_df.loc[payers_df['about'].isna()]

Unnamed: 0,about,date_joined,display_name,email,first_name,friend_status,friends_count,id,identity,is_active,is_blocked,is_group,last_name,phone,profile_picture_url,trust_request,username,payer
294315,,,Colleen Methvin,,Colleen,,,2407520882655232799,,False,False,False,Methvin,,https://s3.amazonaws.com/venmo/no-image.gif,,,1


It would be interesting to see if said user has made any transactions.

In [21]:
# The only row with a null 'about' is also the one missing 'date_joined' and
# 'username', so we drop every row that lacks any of these three fields.
# Filtering on the null condition (rather than the hard-coded index label 294315)
# keeps the cell re-runnable and independent of the row order of the sample.
payers_df.dropna(subset=['about', 'date_joined', 'username'], inplace=True)

In [22]:
# Remove the columns that contain nothing but nulls (see the isna() summary above)
null_cols = ['email', 'friend_status', 'friends_count', 'identity', 'phone', 'trust_request']
payers_df.drop(columns=null_cols, inplace=True)

In [23]:
payers_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 329111 entries, 0 to 329111
Data columns (total 12 columns):
about                  329111 non-null object
date_joined            329111 non-null object
display_name           329111 non-null object
first_name             329111 non-null object
id                     329111 non-null object
is_active              329111 non-null bool
is_blocked             329111 non-null bool
is_group               329111 non-null bool
last_name              329111 non-null object
profile_picture_url    329111 non-null object
username               329111 non-null object
payer                  329111 non-null int64
dtypes: bool(3), int64(1), object(8)
memory usage: 26.1+ MB


In [24]:
payers_df['about'].value_counts()

                                                           316122
No Short Bio                                                11265
No short bio                                                   64
\n                                                             12
 \n                                                             8
                                                                7
No Short Bio\n                                                  7
💸                                                               6
💰💰💰                                                             6
$$$                                                             6
Hi                                                              6
💰                                                               6
 .                                                              5
 hi                                                             5
hi                                                              4
😎         

In [26]:
# Converting the date joined object into a datetime.datetime field
payers_df['date_joined'] = pd.to_datetime(payers_df['date_joined'], format='%Y-%m-%dT%H:%M:%S')

In [29]:
payers_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 329111 entries, 0 to 329111
Data columns (total 12 columns):
about                  329111 non-null object
date_joined            329111 non-null datetime64[ns]
display_name           329111 non-null object
first_name             329111 non-null object
id                     329111 non-null object
is_active              329111 non-null bool
is_blocked             329111 non-null bool
is_group               329111 non-null bool
last_name              329111 non-null object
profile_picture_url    329111 non-null object
username               329111 non-null object
payer                  329111 non-null int64
dtypes: bool(3), datetime64[ns](1), int64(1), object(7)
memory usage: 26.1+ MB


In [31]:
payers_df['is_active'].value_counts()

True    329111
Name: is_active, dtype: int64

In [30]:
payers_df['is_blocked'].value_counts()

False    329111
Name: is_blocked, dtype: int64

In [32]:
payers_df['is_group'].value_counts()

False    329083
True         28
Name: is_group, dtype: int64

In [37]:
# Investigating whether or not display_name is just a sum of first_name and last_name
payers_df['new_name'] = (payers_df['first_name'] + ' ' + payers_df['last_name'])

In [42]:
# Number of rows whose display_name differs from "first_name last_name"
(payers_df['display_name'] != payers_df['new_name']).sum()

35

In [40]:
# Investigating the cases in which display name is not equal to the combination of first_name and last_name
payers_df.loc[payers_df['display_name'] != payers_df['new_name']]

Unnamed: 0,about,date_joined,display_name,first_name,id,is_active,is_blocked,is_group,last_name,profile_picture_url,username,payer,new_name
1822,,2017-06-30 16:04:20,Murray SC Boys U14 Orange,,2248739683041280434,True,False,True,,https://s3.amazonaws.com/venmo/placeholder-ima...,murrayscboysu14,1,
5643,,2017-08-11 18:12:52,Psyclones Valor,,2279244956696576648,True,False,True,,https://venmopics.appspot.com/u/v1/s/b9f644d5-...,PsyclonesValor,1,
16604,,2017-03-09 06:00:12,"Special Guest App, Inc.","Special Guest App, Inc.",2166535955152897010,True,False,False,Account,https://s3.amazonaws.com/venmo/no-image.gif,SpecialGuestAppInc,1,"Special Guest App, Inc. Account"
17467,,2017-07-11 05:00:19,APRL,APRL,2256378005553152080,True,False,False,Account,https://s3.amazonaws.com/venmo/no-image.gif,APRL,1,APRL Account
19578,,2017-02-28 21:08:37,BlachConstruction,,2160470194651136096,True,False,True,,https://venmopics.appspot.com/u/v10/s/8b36720d...,BlachConst-laura-bold,1,
40155,,2018-01-25 18:53:12,Sigma Nu - Gamma Upsilon,,2400302804238336942,True,False,True,,https://venmopics.appspot.com/u/v1/s/aff4c5f8-...,SigmanuGU,1,
44974,,2017-10-12 01:28:41,UCSD Zor,,2323675630010368549,True,False,True,,https://venmopics.appspot.com/u/v1/s/df5539d8-...,ucsd-zor9,1,
71521,,2018-02-12 16:06:17,HHSLPA,,2413264755490816512,True,False,True,,https://venmopics.appspot.com/u/v1/s/624bcc60-...,HHSLPA,1,
74033,,2018-04-21 16:31:08,725 Church,,2462562012626944171,True,False,True,,https://s3.amazonaws.com/venmo/placeholder-ima...,seventwentyfive,1,
77117,,2018-02-02 19:10:32,Dog City Disco,,2406109734240256323,True,False,True,,https://venmopics.appspot.com/u/v1/s/db519f86-...,dogcitydisco,1,


In [102]:
payers_df.drop('new_name', axis=1, inplace=True)

Another finding: although no null values appear in the first_name and last_name columns, they do contain blank (empty-string) values. Let's investigate this further.

In [48]:
payers_df.loc[payers_df['first_name'] == '']

Unnamed: 0,about,date_joined,display_name,first_name,id,is_active,is_blocked,is_group,last_name,profile_picture_url,username,payer
1822,,2017-06-30 16:04:20,Murray SC Boys U14 Orange,,2248739683041280434,True,False,True,,https://s3.amazonaws.com/venmo/placeholder-ima...,murrayscboysu14,1
5643,,2017-08-11 18:12:52,Psyclones Valor,,2279244956696576648,True,False,True,,https://venmopics.appspot.com/u/v1/s/b9f644d5-...,PsyclonesValor,1
19578,,2017-02-28 21:08:37,BlachConstruction,,2160470194651136096,True,False,True,,https://venmopics.appspot.com/u/v10/s/8b36720d...,BlachConst-laura-bold,1
40155,,2018-01-25 18:53:12,Sigma Nu - Gamma Upsilon,,2400302804238336942,True,False,True,,https://venmopics.appspot.com/u/v1/s/aff4c5f8-...,SigmanuGU,1
44974,,2017-10-12 01:28:41,UCSD Zor,,2323675630010368549,True,False,True,,https://venmopics.appspot.com/u/v1/s/df5539d8-...,ucsd-zor9,1
71521,,2018-02-12 16:06:17,HHSLPA,,2413264755490816512,True,False,True,,https://venmopics.appspot.com/u/v1/s/624bcc60-...,HHSLPA,1
74033,,2018-04-21 16:31:08,725 Church,,2462562012626944171,True,False,True,,https://s3.amazonaws.com/venmo/placeholder-ima...,seventwentyfive,1
77117,,2018-02-02 19:10:32,Dog City Disco,,2406109734240256323,True,False,True,,https://venmopics.appspot.com/u/v1/s/db519f86-...,dogcitydisco,1
78015,,2016-10-31 23:15:46,Rescue Paws USA - UGA,,2073561103597568287,True,False,True,,https://venmopics.appspot.com/u/v2/s/378e2682-...,rescuepawsUGA,1
78261,,2016-11-01 14:05:28,PeteyGreeneDC,,2074008904269824481,True,False,True,,https://venmopics.appspot.com/u/v1/s/07d13939-...,PeteyGreeneDC,1


All rows that are groups have neither a first_name nor a last_name associated with them.

In [49]:
payers_df['profile_picture_url'].value_counts()

https://s3.amazonaws.com/venmo/no-image.gif                                   63039
https://s3.amazonaws.com/venmo/placeholder-image/groups-placeholder.svg           6
https://venmopics.appspot.com/u/v1/s/096f397c-f058-416d-9413-33ad8da08107         1
https://venmopics.appspot.com/u/v1/s/f0d6c472-b059-4fdb-bce0-cbb5a13df284         1
https://venmopics.appspot.com/u/v1/s/96fc568e-a242-4c5d-be08-13e790cc3993         1
https://venmopics.appspot.com/u/v1/s/0a0acdf8-b706-4a2f-949a-8b6f9dfa9c10         1
https://venmopics.appspot.com/u/v1/s/4e1dd111-f556-48a7-9eee-4148ed3444f7         1
https://graph.facebook.com/v2.10/1242261862473506/picture?type=square             1
https://venmopics.appspot.com/u/v1/s/42238df3-e75f-4d91-8bd9-6de63e91d369         1
https://venmopics.appspot.com/u/v1/s/9bc77816-bc39-4075-a8b9-6ee0f00a06a3         1
https://venmopics.appspot.com/u/v2/s/7ad5e98d-50f0-4eec-bc26-87ef6ae5fd05         1
https://venmopics.appspot.com/u/v1/f/293fb157-966a-4799-9554-7f15d03edf20   

In [50]:
payers_df['username'].value_counts()

Omar-Sanchez-74         2
jdbooth2                1
Jade-Salzano            1
Ashley-Bargsley         1
beth-rich               1
Chrystina-Andrade       1
RebeccaRedmond          1
John-Gamboa-jag         1
sampankow               1
FranFrutos              1
valencianator           1
jimmy-jam0              1
jeffrey-erickson        1
Megan-Albertini         1
Drew-Martin-19          1
MgoCrabtree             1
James-Yu-15             1
Leina-Nguyen            1
Chancy-Holbrook         1
James-Pimentel-91       1
lacinea                 1
Ricardo1                1
Michael-Lugossy         1
Raymond-Alas            1
Kevin-Radziewicz        1
Qbanbailarin            1
KayKayCorona            1
Hudson-Lorfing          1
Jack-Mangold            1
jared-dupes             1
                       ..
muffysmith              1
Elliott-Moore-5         1
Robert-Hosford          1
kelsey-hutchison        1
cody-comiskey           1
GrantRussell            1
Jofin-Mathai            1
Maya-Familet

In [51]:
payers_df.loc[payers_df['username'] == 'Omar-Sanchez-74']

Unnamed: 0,about,date_joined,display_name,first_name,id,is_active,is_blocked,is_group,last_name,profile_picture_url,username,payer
253027,,2018-07-28 23:44:02,Omar Sanchez,Omar,2533807919988736204,True,False,False,Sanchez,https://venmopics.appspot.com/u/v1/s/c8fe11a8-...,Omar-Sanchez-74,1
263251,,2018-07-29 02:24:07,Omar Sanchez,Omar,2533888492568576142,True,False,False,Sanchez,https://s3.amazonaws.com/venmo/no-image.gif,Omar-Sanchez-74,1


## Inspecting payee information

In [54]:
first

{'_id': ObjectId('5bb7bdce1bed297da9fcb251'),
 'mentions': {'count': 0, 'data': []},
 'comments': {'count': 0, 'data': []},
 'date_created': datetime.datetime(2018, 7, 26, 18, 48, 10),
 'audience': 'public',
 'type': 'payment',
 'authorization': None,
 'id': '2532209455978775150',
 'payment': {'note': 'for utilities',
  'date_created': '2018-07-26T18:48:10',
  'id': '2532209455660008361',
  'amount': None,
  'date_authorized': None,
  'status': 'settled',
  'actor': {'first_name': 'Nicole',
   'last_name': 'Andrews',
   'phone': None,
   'username': 'Nicole-Andrews-23',
   'id': '2206066431492096327',
   'profile_picture_url': 'https://s3.amazonaws.com/venmo/no-image.gif',
   'identity': None,
   'trust_request': None,
   'email': None,
   'is_blocked': False,
   'display_name': 'Nicole Andrews',
   'friend_status': None,
   'about': ' ',
   'is_group': False,
   'date_joined': '2017-05-02T19:00:12',
   'is_active': True,
   'friends_count': None},
  'merchant_split_purchase': None,
  

In [73]:
# Transform user (payee) information into a dataframe:
# one record per distinct payee id, keeping the first occurrence of each.

unique_payees = {}
counter = 0  # transactions whose target has no user document
for transaction in initial_5pct:
    user = transaction['payment']['target']['user']
    try:
        user_id = user['id']
    except TypeError:
        # user is not subscriptable (e.g. None): count it and move on.
        counter += 1
        continue
    if user_id not in unique_payees:
        # Shallow copy so the dataframe records are independent of the raw documents.
        unique_payees[user_id] = dict(user)

payee_ids = set(unique_payees)
payees = list(unique_payees.values())

payees_df = pd.DataFrame(payees)
# Marker column flagging every row of this frame as a payee.
payees_df['payee'] = 1

In [76]:
f'The number of transactions made to no users is {counter}'

'The number of transactions made to no users is 2422'

In [79]:
len(payer_ids)-len(payee_ids)

9210

In [80]:
payees_df.head()

Unnamed: 0,about,date_joined,display_name,email,first_name,friend_status,friends_count,id,identity,is_active,is_blocked,is_group,last_name,phone,profile_picture_url,trust_request,username,payee
0,,2014-12-10T19:59:53,Willem Chillum,,Willem,,,1572642482028544167,,True,False,False,Chillum,,https://venmopics.appspot.com/u/v1/s/ba8923dc-...,,WillemChillum,1
1,,2017-06-22T16:53:39,Jen Eten,,Jen,,,2242966299082752545,,True,False,False,Eten,,https://s3.amazonaws.com/venmo/no-image.gif,,Jen-Eten,1
2,,2016-07-01T02:47:04,Mariana Bornstein,,Mariana,,,1984520039432192983,,True,False,False,Bornstein,,https://venmopics.appspot.com/u/v2/s/7068eb7d-...,,Mariana-Bornstein,1
3,,2015-09-23T15:52:56,Catherine Lopez,,Catherine,,,1780528822878208201,,True,False,False,Lopez,,https://venmopics.appspot.com/u/v1/s/fccca740-...,,Catherine-Lopez140,1
4,,2018-06-07T20:59:37,CCU CandyCart,,CCU,,,2496761604079616999,,True,False,False,CandyCart,,https://venmopics.appspot.com/u/v2/s/d53f6f2c-...,,CCUCandyCart,1


In [81]:
payees_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 319902 entries, 0 to 319901
Data columns (total 18 columns):
about                  319902 non-null object
date_joined            319902 non-null object
display_name           319902 non-null object
email                  0 non-null object
first_name             319902 non-null object
friend_status          0 non-null object
friends_count          0 non-null object
id                     319902 non-null object
identity               0 non-null object
is_active              319902 non-null bool
is_blocked             319902 non-null bool
is_group               319902 non-null bool
last_name              319902 non-null object
phone                  0 non-null object
profile_picture_url    319902 non-null object
trust_request          0 non-null object
username               319902 non-null object
payee                  319902 non-null int64
dtypes: bool(3), int64(1), object(14)
memory usage: 37.5+ MB


As with the actors, the email, friend_status, friends_count, identity, phone and trust_request columns are entirely null.

In [82]:
payees_df.isna().sum()

about                       0
date_joined                 0
display_name                0
email                  319902
first_name                  0
friend_status          319902
friends_count          319902
id                          0
identity               319902
is_active                   0
is_blocked                  0
is_group                    0
last_name                   0
phone                  319902
profile_picture_url         0
trust_request          319902
username                    0
payee                       0
dtype: int64

In [84]:
payees_df.duplicated().sum()

0

In [92]:
# Remove the columns that contain nothing but nulls (see the isna() summary above)
null_cols = ['email', 'friend_status', 'friends_count', 'identity', 'phone', 'trust_request']
payees_df.drop(columns=null_cols, inplace=True)

In [86]:
payees_df['about'].value_counts()

                                                                 307100
No Short Bio                                                      11066
No short bio                                                         49
\n                                                                   16
 .                                                                    7
💸                                                                     6
No Short Bio\n                                                        6
                                                                      4
Pay me                                                                4
 hi                                                                   4
 🌻                                                                    4
💰💰💰                                                                   3
$$$                                                                   3
yo                                                              

In [87]:
# Converting the date joined object into a datetime.datetime field
payees_df['date_joined'] = pd.to_datetime(payees_df['date_joined'], format='%Y-%m-%dT%H:%M:%S')

In [93]:
payees_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 319902 entries, 0 to 319901
Data columns (total 12 columns):
about                  319902 non-null object
date_joined            319902 non-null datetime64[ns]
display_name           319902 non-null object
first_name             319902 non-null object
id                     319902 non-null object
is_active              319902 non-null bool
is_blocked             319902 non-null bool
is_group               319902 non-null bool
last_name              319902 non-null object
profile_picture_url    319902 non-null object
username               319902 non-null object
payee                  319902 non-null int64
dtypes: bool(3), datetime64[ns](1), int64(1), object(7)
memory usage: 22.9+ MB


In [94]:
payees_df['is_active'].value_counts()

True    319902
Name: is_active, dtype: int64

In [95]:
payees_df['is_blocked'].value_counts()

False    319902
Name: is_blocked, dtype: int64

In [96]:
payees_df['is_group'].value_counts()

False    319724
True        178
Name: is_group, dtype: int64

It makes sense that more payees than payers are groups, given that companies can receive Venmo payments but cannot transact with that money, only withdraw it.

In [100]:
# Investigating whether or not display_name is just a sum of first_name and last_name
payees_df['new_name'] = (payees_df['first_name'] + ' ' + payees_df['last_name'])

In [110]:
# Investigating the cases in which display name is not equal to the combination of first_name and last_name
payees_df.loc[payees_df['display_name'] != payees_df['new_name']]

Unnamed: 0,about,date_joined,display_name,first_name,id,is_active,is_blocked,is_group,last_name,profile_picture_url,username,payee,new_name
898,,2015-12-02 22:30:45,Delta Upsilon UVA,,1831463351746560977,True,False,True,,https://s3.amazonaws.com/venmo/placeholder-ima...,DU_UVA,1,
1321,,2017-06-07 03:24:33,Booty Hunters,,2231687429750784977,True,False,True,,https://venmopics.appspot.com/u/v2/s/7686ee94-...,BootyHunters,1,
1373,,2017-11-03 15:03:51,CRRR Holiday Party,,2340030982914048561,True,False,True,,https://venmopics.appspot.com/u/v1/s/9e52d3a0-...,CRHolidayParty,1,
1626,,2018-05-01 21:53:07,JayDoc,,2469971829456896110,True,False,True,,https://s3.amazonaws.com/venmo/placeholder-ima...,JayDoc,1,
3068,,2018-03-17 15:43:27,Housemates,,2437170862227456905,True,False,True,,https://s3.amazonaws.com/venmo/placeholder-ima...,Guerrero-625,1,
3618,,2017-08-27 23:47:14,Hill BEEs,,2291009660649472968,True,False,True,,https://venmopics.appspot.com/u/v1/s/7d72b612-...,HillBees,1,
4452,,2018-02-05 13:17:14,EATS,,2408106239721472198,True,False,True,,https://s3.amazonaws.com/venmo/placeholder-ima...,Eatssnacks,1,
6756,,2016-09-22 15:15:23,ThePinesKeystoneCondo,,2045053065232384163,True,False,True,,https://venmopics.appspot.com/u/v2/s/6dde75f7-...,ThePinesKeystoneCondo,1,
7984,,2017-06-08 22:37:24,Autopilot_Snacks,,2232992453885952810,True,False,True,,https://venmopics.appspot.com/u/v1/s/ccff4c94-...,ap_snacks,1,
8859,,2017-07-26 18:19:30,USC SOM MSA,,2267651883663360462,True,False,True,,https://venmopics.appspot.com/u/v1/s/5a8663f8-...,MSA_USCSOM,1,


In [119]:
# Non-group payees whose display_name differs from "first_name last_name"
name_mismatch = payees_df['display_name'] != payees_df['new_name']
payees_df.loc[name_mismatch & ~payees_df['is_group']]

Unnamed: 0,about,date_joined,display_name,first_name,id,is_active,is_blocked,is_group,last_name,profile_picture_url,username,payee,new_name
211558,,2012-05-27 05:51:06,Jeremy,Jeremy,900348172566528960,True,False,False,,https://venmopics.appspot.com/u/v1/f/a10ae1bc-...,JeremyDox,1,Jeremy


In [120]:
payees_df.drop('new_name', axis=1, inplace=True)

In [122]:
# Non-group payees with an empty first_name
blank_first_name = payees_df['first_name'] == ''
payees_df.loc[blank_first_name & ~payees_df['is_group']]

Unnamed: 0,about,date_joined,display_name,first_name,id,is_active,is_blocked,is_group,last_name,profile_picture_url,username,payee


In [123]:
payees_df['profile_picture_url'].value_counts()

https://s3.amazonaws.com/venmo/no-image.gif                                  60617
https://s3.amazonaws.com/venmo/placeholder-image/groups-placeholder.svg         35
https://venmopics.appspot.com/u/v1/f/e3020d65-55bf-4202-884c-d94c12cc5306        1
https://graph.facebook.com/v2.10/1169280962/picture?type=square                  1
https://venmopics.appspot.com/u/v2/f/30c0a294-f080-4e6b-bb8c-a60679bf8860        1
https://venmopics.appspot.com/u/v4/s/c189b212-0d6d-470b-b30e-3941a5d881dd        1
https://venmopics.appspot.com/u/v1/f/52a0b46c-7e98-47b0-aff7-3a81932ffa76        1
https://venmopics.appspot.com/u/v1/s/3543f346-5ec0-456a-a6da-249a935d6162        1
https://venmopics.appspot.com/u/v1/f/fed47263-8f40-447e-91fc-dcfb6f86c290        1
https://venmopics.appspot.com/u/v1/s/6ffe6d6d-d2c8-46f7-828b-c6789dd78981        1
https://venmopics.appspot.com/u/v4/s/873f562d-f763-45e8-aa22-dc501c817db6        1
https://venmopics.appspot.com/u/v1/s/398d875d-2f30-4a0d-a6c9-8834ce677d78        1
http

In [124]:
payees_df['username'].value_counts()

GarbanzoBeans         1
Isabel-Kaufman-1      1
sammimarie812         1
Augustine-Valdez      1
Grace-Woods-5         1
marshall-Gardner-1    1
Esteban87             1
JacquelynLau          1
Brady-Wagner-2        1
Purvi_Shah            1
Katie-Harrill-1       1
TheKJ4                1
SwapnilBhoite         1
Rachelkrieger12       1
huntercolden          1
toakes12              1
akabanuk              1
Omar-Garcia-97        1
Michael-Ryan-113      1
Scott-Morrow-4        1
haleymei              1
thananun9             1
Katie-Zuman           1
jerardmacalma         1
Andrew-Udkovich       1
Catherine-Fleck       1
Athena-Peterson       1
Taylor_White97        1
Elizabethneary1       1
Kelsey-Kettell        1
                     ..
Austin-Keisling       1
Robert-SteinIII       1
Garrett-Garner-7      1
Lisa-Burton-7         1
SavannahHowell1       1
Haley-Butterfield     1
kdotkeane             1
MC-Wang               1
katrinapersson        1
Sarah-perkins-22      1
Ashley-He       