# Generate a unique user table

In [1]:
import pandas as pd
import numpy as np
import psycopg2
import pymongo
import json
import datetime
import pickle
import functions as fn

In [2]:
# load the above mentioned pickle
with open('initial_5pct_transactions.pkl', 'rb') as f:
    initial_5pct = pickle.load(f)

## Actor (payer) extraction and manipulation

In [3]:
# Transform actor information into a dataframe

# Identifying columns that contain null values
null_columns = (['email', 'friend_status', 'friends_count', 'identity',
                 'phone', 'trust_request']) 

payers = []
payer_ids = set()
for transaction in initial_5pct:
    actor = transaction['payment']['actor']
    actor_id = actor['id']
    if actor_id in payer_ids:
        continue
    else:
        payer_ids.add(actor_id)
        payer = {}
        for key, val in transaction['payment']['actor'].items():
            if key in null_columns:
                continue
            else:
                payer[key] = val
        payers.append(payer.copy())

payers_df = pd.DataFrame(payers)

In [4]:
payers_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 329112 entries, 0 to 329111
Data columns (total 11 columns):
about                  329111 non-null object
date_joined            329111 non-null object
display_name           329112 non-null object
first_name             329112 non-null object
id                     329112 non-null object
is_active              329112 non-null bool
is_blocked             329112 non-null bool
is_group               329112 non-null bool
last_name              329112 non-null object
profile_picture_url    329112 non-null object
username               329111 non-null object
dtypes: bool(3), object(8)
memory usage: 21.0+ MB


In [5]:
# Investigate the only null value
payers_df.loc[payers_df['about'].isna()]

Unnamed: 0,about,date_joined,display_name,first_name,id,is_active,is_blocked,is_group,last_name,profile_picture_url,username
294315,,,Colleen Methvin,Colleen,2407520882655232799,False,False,False,Methvin,https://s3.amazonaws.com/venmo/no-image.gif,


Colleen Methvin's case is strange because even though he doesn't have a username or a date_joined, he had made one successful transaction in the past, so we will not drop him from the table.

In [6]:
# Create a column to determine if they have personalised the about column

# Values for default come after having explored the data in the eda_venmo notebook
about_default = [' ', 'No Short Bio', 'No short bio', '\n', ' \n', '  ', 'No Short Bio\n']
about_personalised = [0 if about in about_default else 1 for about in payers_df['about']]  
payers_df['about_personalised'] = about_personalised

In [7]:
# Convert the date_joined objects into a datetime field
payers_df['date_joined'] = pd.to_datetime(payers_df['date_joined'], format='%Y-%m-%dT%H:%M:%S')

For the vast majority of users, display name is a combination of first and last name. However, for groups the display name is composed of first_name only. As a result, we will eliminate the first and last name columns to reduce the amount of data we load into our database. Moreover, we are not interested in the names for our analysis.

In [8]:
# Drop the first and last name columns
payers_df.drop(['first_name', 'last_name'], axis = 1, inplace=True)

In [9]:
# Create a column to determine if they have included a photo other than the default photo

# Values for default come after having explored the data in the eda_venmo notebook
pic_default = (['https://s3.amazonaws.com/venmo/no-image.gif', 
                'https://s3.amazonaws.com/venmo/placeholder-image/groups-placeholder.svg'])
pic_personalised = [0 if about in pic_default else 1 for about in payers_df['about']]  
payers_df['pic_personalised'] = pic_personalised

A special thought should be given to the pic_personalised column as it can potentially not add that much value. This is because if users sign up through facebook, their profile_picture is updated to their facebook's profile pic. Consequently, this doesn't necessarily lead to more transactions. 

In [10]:
payers_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 329112 entries, 0 to 329111
Data columns (total 11 columns):
about                  329111 non-null object
date_joined            329111 non-null datetime64[ns]
display_name           329112 non-null object
id                     329112 non-null object
is_active              329112 non-null bool
is_blocked             329112 non-null bool
is_group               329112 non-null bool
profile_picture_url    329112 non-null object
username               329111 non-null object
about_personalised     329112 non-null int64
pic_personalised       329112 non-null int64
dtypes: bool(3), datetime64[ns](1), int64(2), object(5)
memory usage: 21.0+ MB


## User (payee) extraction and manipulation

In [11]:
# Transform user information into a dataframe

# Identifying columns that contain null values
null_columns = (['email', 'friend_status', 'friends_count', 'identity',
                 'phone', 'trust_request']) 

payees = []
payee_ids = set()
counter = 0
for transaction in initial_5pct:
    user = transaction['payment']['target']['user']
    try:
        user_id = user['id']
    except TypeError:
        counter += 1
        continue
    if user_id in payee_ids:
        continue
    else:
        payee_ids.add(user_id)
        payee = {}
        for key, val in transaction['payment']['target']['user'].items():
            if key in null_columns:
                continue
            else:
                payee[key] = val
        payees.append(payee.copy())

In [12]:
payees_df = pd.DataFrame(payees)

In [13]:
payees_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 319902 entries, 0 to 319901
Data columns (total 11 columns):
about                  319902 non-null object
date_joined            319902 non-null object
display_name           319902 non-null object
first_name             319902 non-null object
id                     319902 non-null object
is_active              319902 non-null bool
is_blocked             319902 non-null bool
is_group               319902 non-null bool
last_name              319902 non-null object
profile_picture_url    319902 non-null object
username               319902 non-null object
dtypes: bool(3), object(8)
memory usage: 20.4+ MB


In [14]:
# Create a column to determine if they have personalised the about column

# Values for default come after having explored the data in the eda_venmo notebook
about_default = [' ', 'No Short Bio', 'No short bio', '\n', ' \n', '  ', 'No Short Bio\n']
about_personalised = [0 if about in about_default else 1 for about in payees_df['about']]  
payees_df['about_personalised'] = about_personalised

In [15]:
#payees_df.loc[payees_df['display_name'] == 'Eugene Yum']

In [16]:
#interesting = payees_df.loc[payees_df['display_name'] == 'Eugene Yum']['date_joined']

In [17]:
#pd.to_datetime(interesting, format='%Y-%m-%dT%H:%M:%S')

In [18]:
# Convert the date_joined objects into a datetime field
payees_df['date_joined'] = pd.to_datetime(payers_df['date_joined'], format='%Y-%m-%dT%H:%M:%S')

In [19]:
# Drop the first and last name columns
payees_df.drop(['first_name', 'last_name'], axis = 1, inplace=True)

In [20]:
# Create a column to determine if they have included a photo other than the default photo

# Values for default come after having explored the data in the eda_venmo notebook
pic_default = (['https://s3.amazonaws.com/venmo/no-image.gif', 
                'https://s3.amazonaws.com/venmo/placeholder-image/groups-placeholder.svg'])
pic_personalised = [0 if about in pic_default else 1 for about in payees_df['about']]  
payees_df['pic_personalised'] = pic_personalised

In [21]:
payees_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 319902 entries, 0 to 319901
Data columns (total 11 columns):
about                  319902 non-null object
date_joined            319901 non-null datetime64[ns]
display_name           319902 non-null object
id                     319902 non-null object
is_active              319902 non-null bool
is_blocked             319902 non-null bool
is_group               319902 non-null bool
profile_picture_url    319902 non-null object
username               319902 non-null object
about_personalised     319902 non-null int64
pic_personalised       319902 non-null int64
dtypes: bool(3), datetime64[ns](1), int64(2), object(5)
memory usage: 20.4+ MB


## Unique user table

In [22]:
# Identifying the payees that have not been payers to make a complete user list

payees_not_payers = set()
for payee_id in payee_ids:
    if payee_id not in payer_ids:
        payees_not_payers.add(payee_id)

In [23]:
payees_not_payers_df = payees_df.loc[payees_df['id'].apply(lambda x: x in payees_not_payers)]

In [24]:
unique_users = pd.concat([payers_df, payees_not_payers_df], axis=0)
unique_users = unique_users.rename(columns = {"id": "user_id"}) 

In [25]:
len(payers_df) + len(payees_not_payers) == len(unique_users)

True

In [27]:
unique_users.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 624518 entries, 0 to 319901
Data columns (total 11 columns):
about                  624517 non-null object
date_joined            624516 non-null datetime64[ns]
display_name           624518 non-null object
user_id                624518 non-null object
is_active              624518 non-null bool
is_blocked             624518 non-null bool
is_group               624518 non-null bool
profile_picture_url    624518 non-null object
username               624517 non-null object
about_personalised     624518 non-null int64
pic_personalised       624518 non-null int64
dtypes: bool(3), datetime64[ns](1), int64(2), object(5)
memory usage: 44.7+ MB


In [26]:
# Store the user table
with open('initial_5pct_unique_users.pkl', 'wb') as f:
    pickle.dump(unique_users, f)