# Creating a unique transactions table

In [1]:
import pandas as pd
import numpy as np
import psycopg2
import pymongo
import json
import datetime
import pickle
import functions as fn

## Extracting transaction information

In [3]:
# Load the pickled sample of raw transaction records from the working directory.
# The cell below iterates over this object and calls .items() on each element,
# so it is presumably a sequence of dicts, one per transaction -- TODO confirm.
# NOTE(review): pickle.load can execute arbitrary code; only load a file you created or trust.
with open('initial_5pct_transactions.pkl', 'rb') as f:
    initial_5pct = pickle.load(f)

In [24]:
# Flatten every raw transaction record into a single-level dict and collect them
# in `transactions`, ready to be turned into a DataFrame.
transactions = []

# '_id' is not listed in `keys` because it is the object id from Venmo's db.
# 'payment' and 'app' are not listed in `keys` because they are nested and need special handling.
keys = ['note', 'type', 'date_updated', 'id', 'date_created', 'audience']
subkeys = ['mentions', 'likes', 'comments', 'payment', 'app']
payment_keys = ['id', 'date_completed']
app_key = ['id']

for details in initial_5pct:
    transaction = {}
    for key, val in details.items():
        if key in keys:
            # Top-level field: copied over unchanged.
            transaction[key] = val
        elif key in subkeys[:2]:
            # 'mentions' / 'likes': every nested field becomes a '<key>_<subkey>' column.
            for subkey, subval in val.items():
                transaction[f'{key}_{subkey}'] = subval
        elif key == subkeys[3]:
            # 'payment': keep only the fields listed in payment_keys.
            # Compared with == on purpose: `key in 'payment'` would be a substring
            # test and would also match keys such as 'pay' or ''.
            for subkey, subval in val.items():
                if subkey in payment_keys:
                    transaction[f'{key}_{subkey}'] = subval
        elif key == subkeys[4]:
            # 'app': keep only the fields listed in app_key (the app id).
            # A missing field raises KeyError, exactly as the direct lookup would.
            for subkey in app_key:
                transaction[f'{key}_{subkey}'] = val[subkey]
        # Any other key (e.g. '_id', 'comments') is deliberately left out.
    # `transaction` is rebound to a fresh dict on every iteration, so no copy is needed.
    transactions.append(transaction)

In [25]:
# Build a DataFrame with one row per flattened transaction dict.
transactions_df = pd.DataFrame(transactions)

In [26]:
# Preview the first rows, transposed so that each column of the frame reads as one line.
transactions_df.head().T

Unnamed: 0,0,1,2,3,4
app_id,10,1,1,1,1
audience,public,public,public,public,public
date_created,2018-07-26 18:48:10,2018-07-26 18:48:08,2018-07-26 18:48:08,2018-07-26 18:48:08,2018-07-26 18:48:08
date_updated,2018-07-26 18:48:10,2018-07-26 18:48:08,2018-07-26 18:48:08,2018-07-26 18:48:08,2018-07-26 18:48:08
id,2532209455978775150,2532209439948145006,2532209441005110060,2532209444645765720,2532209445644010224
likes_count,0,0,0,0,0
likes_data,[],[],[],[],[]
mentions_count,0,0,0,0,0
mentions_data,[],[],[],[],[]
note,for utilities,👕!,Thank you!,📱💸,Mt Dew & candy


In [27]:
# Inspect column dtypes and non-null counts before any cleaning.
transactions_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 353829 entries, 0 to 353828
Data columns (total 13 columns):
app_id                    353829 non-null int64
audience                  353829 non-null object
date_created              353829 non-null datetime64[ns]
date_updated              353829 non-null datetime64[ns]
id                        353829 non-null object
likes_count               353829 non-null int64
likes_data                353829 non-null object
mentions_count            353829 non-null int64
mentions_data             353829 non-null object
note                      353829 non-null object
payment_date_completed    351407 non-null object
payment_id                353829 non-null object
type                      353829 non-null object
dtypes: datetime64[ns](2), int64(3), object(8)
memory usage: 35.1+ MB


In [29]:
# Give the generic `id` column a self-describing name (transaction_id) so it is
# easier to recognise once the table is in the db
id_column_rename = {"id": "transaction_id"}
transactions_df = transactions_df.rename(columns=id_column_rename)

In [30]:
# Parse the payment_date_completed column (strings laid out as %Y-%m-%dT%H:%M:%S)
# into pandas datetime values; this is the only column converted here
completed_format = '%Y-%m-%dT%H:%M:%S'
transactions_df['payment_date_completed'] = pd.to_datetime(
    transactions_df['payment_date_completed'],
    format=completed_format,
)

In [31]:
# Re-inspect dtypes to confirm the column rename and the datetime conversion took effect.
transactions_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 353829 entries, 0 to 353828
Data columns (total 13 columns):
app_id                    353829 non-null int64
audience                  353829 non-null object
date_created              353829 non-null datetime64[ns]
date_updated              353829 non-null datetime64[ns]
transaction_id            353829 non-null object
likes_count               353829 non-null int64
likes_data                353829 non-null object
mentions_count            353829 non-null int64
mentions_data             353829 non-null object
note                      353829 non-null object
payment_date_completed    351407 non-null datetime64[ns]
payment_id                353829 non-null object
type                      353829 non-null object
dtypes: datetime64[ns](3), int64(3), object(7)
memory usage: 35.1+ MB
