# Pull aggregated table and calculate statistics

In [81]:
import pandas as pd
import numpy as np
import psycopg2
import pymongo
import json
import datetime
import pickle
import functions as fn
import matplotlib.pyplot as plt 
%matplotlib inline

In [2]:
# Retrieve information to connect to the database
def get_keys(path):
    with open(path) as f:
        return json.load(f)
    
keys = get_keys("/Users/jjherranzsarrion/.secret/local_info.json")
username = keys['username']
password = keys['password']

In [126]:
# Connect to the venmo_transactions database
connection = psycopg2.connect(user=f'{username}',
                              password=f'{username}',
                              database='venmo_transactions')
cursor = connection.cursor()

## Extract information to calculate features

### Feature 1: Number of days the account has been opened

In [4]:
q = """SELECT user_id, date_joined::timestamp::date
       FROM users"""
cursor.execute(q)
print("These are the different users and the date they joined venmo")
df = pd.DataFrame(cursor.fetchall())
df.columns = [x[0] for x in cursor.description]
df['days_since_opened_account'] = datetime.date(2018, 8, 4) - df['date_joined']
df.head()

These are the different users and the date they joined venmo


Unnamed: 0,user_id,date_joined,days_since_opened_account
0,2206066431492096327,2017-05-02,459 days
1,2200417693859840681,2017-04-24,467 days
2,2373608382922752189,2017-12-19,228 days
3,1670504276557824171,2015-04-24,1198 days
4,1957419122950144676,2016-05-24,802 days


### Feature 2: Maximum time between previous transaction made

In [5]:
#q = """SELECT p1.actor_id, p1.payment_id, p1.date_completed::timestamp::date,
#              (p2.date_completed::timestamp::date - p1.date_completed::timestamp::date) as date_diff
#       FROM payments p1
#       INNER JOIN payments p2 USING (actor_id)
#       WHERE p1.actor_id = p2.actor_id 
#       AND p1.payment_id <> p2.payment_id
#       ORDER BY p1.date_completed::timestamp::date;"""
#cursor.execute(q)
#print("These are the different users and the date they joined venmo")
#df = pd.DataFrame(cursor.fetchall())
#df.columns = [x[0] for x in cursor.description]
#df.head()

In [40]:
q = """SELECT p.actor_id, p.payment_id, p.date_created
       FROM payments p
       ORDER BY p.date_created;"""
cursor.execute(q)
#print("These are the different users and the date they joined venmo")
df = pd.DataFrame(cursor.fetchall())
df.columns = [x[0] for x in cursor.description]
df.head()

Unnamed: 0,actor_id,payment_id,date_created
0,2060549944770560092,2532208900736811493,2018-07-26 18:47:04
1,2240373455847424319,2532208900367712690,2018-07-26 18:47:04
2,1774156567805952179,2532208907070210710,2018-07-26 18:47:04
3,1775535923396608366,2532208902397756302,2018-07-26 18:47:04
4,2147341310099456314,2532208900720034121,2018-07-26 18:47:04


In [41]:
df['time_diff'] = df.groupby(['actor_id'])['date_created'].diff().fillna(0)

In [9]:
max_time_between_payed_transactions_df = pd.DataFrame(df.groupby(['actor_id'])['time_diff'].max())
max_time_between_payed_transactions_df.reset_index(inplace=True)

In [10]:
max_time_between_payed_transactions_df.sort_values('time_diff', ascending=False).head()

Unnamed: 0,actor_id,time_diff
88039,1783765785378816488,9 days 11:50:52
45698,1602911490539520615,8 days 14:33:55
114165,1878396959195136941,2 days 21:37:17
143629,1977756673900544227,2 days 21:27:14
160632,2026905133907968295,2 days 21:19:39


### Feature 3: Maximum time between previous transaction received

In [11]:
q = """SELECT p.target_user_id, p.payment_id, p.date_created
       FROM payments p
       ORDER BY p.date_created;"""
cursor.execute(q)
#print("These are the different users and the date they joined venmo")
df = pd.DataFrame(cursor.fetchall())
df.columns = [x[0] for x in cursor.description]
df.head()

Unnamed: 0,target_user_id,payment_id,date_created
0,2012536740249600786,2532208900736811493,2018-07-26 18:47:04
1,2054553574834176086,2532208900367712690,2018-07-26 18:47:04
2,2363997714120704529,2532208907070210710,2018-07-26 18:47:04
3,1934351583412224902,2532208902397756302,2018-07-26 18:47:04
4,2377356102598656077,2532208900720034121,2018-07-26 18:47:04


In [12]:
df['time_diff'] = df.groupby(['target_user_id'])['date_created'].diff().fillna(0)

In [13]:
max_time_between_received_transactions_df = pd.DataFrame(df.groupby(['target_user_id'])['time_diff'].max())
max_time_between_received_transactions_df.reset_index(inplace=True)

In [15]:
max_time_between_received_transactions_df.sort_values('time_diff', ascending=False).head()

Unnamed: 0,target_user_id,time_diff
1481,1159040721747968512,10 days 21:58:10
106701,1857555538116608403,9 days 11:50:52
22860,1466121689497600896,9 days 06:01:55
177151,2067898642202624968,2 days 21:23:12
175527,2063808608600064345,2 days 21:18:41


### Feature 4: Mean time (in seconds) between transactions made

In [77]:
q = """SELECT p.actor_id, p.payment_id, p.date_created
       FROM payments p
       ORDER BY p.date_created;"""
cursor.execute(q)
#print("These are the different users and the date they joined venmo")
df = pd.DataFrame(cursor.fetchall())
df.columns = [x[0] for x in cursor.description]
df.head()

Unnamed: 0,actor_id,payment_id,date_created
0,2060549944770560092,2532208900736811493,2018-07-26 18:47:04
1,2240373455847424319,2532208900367712690,2018-07-26 18:47:04
2,1774156567805952179,2532208907070210710,2018-07-26 18:47:04
3,1775535923396608366,2532208902397756302,2018-07-26 18:47:04
4,2147341310099456314,2532208900720034121,2018-07-26 18:47:04


In [82]:
# Decide to not fill nans because if we fill them with zero, it will have an influence in 
# the mean time between transactions
df['time_diff'] = df.groupby(['actor_id'])['date_created'].diff().fillna(0)

In [83]:
df['time_diff'] = [diff.seconds for diff in df['time_diff']]

In [84]:
mean_time_between_transactions_made_df = pd.DataFrame(df.groupby(['actor_id'])['time_diff'].mean())
mean_time_between_transactions_made_df.reset_index(inplace=True)

In [85]:
mean_time_between_transactions_made_df['time_diff'].fillna(0, inplace=True)

In [86]:
mean_time_between_transactions_made_df.sort_values('time_diff', ascending=False).head()

Unnamed: 0,actor_id,time_diff
235373,2241640949350400344,53821.666667
315174,2500382899044352278,53790.333333
52647,1636575721029632364,53469.333333
226836,2214991784574976899,53164.666667
92325,1798101538963456728,51969.0


### Feature 5: Mean time (in seconds) between transactions received

In [87]:
q = """SELECT p.target_user_id, p.payment_id, p.date_created
       FROM payments p
       ORDER BY p.date_created;"""
cursor.execute(q)
#print("These are the different users and the date they joined venmo")
df = pd.DataFrame(cursor.fetchall())
df.columns = [x[0] for x in cursor.description]
df.head()

Unnamed: 0,target_user_id,payment_id,date_created
0,2012536740249600786,2532208900736811493,2018-07-26 18:47:04
1,2054553574834176086,2532208900367712690,2018-07-26 18:47:04
2,2363997714120704529,2532208907070210710,2018-07-26 18:47:04
3,1934351583412224902,2532208902397756302,2018-07-26 18:47:04
4,2377356102598656077,2532208900720034121,2018-07-26 18:47:04


In [90]:
# Decide to dropna because if we fill them with zero, it will have an influence in the mean time between transactions
df['time_diff'] = df.groupby(['target_user_id'])['date_created'].diff().fillna(0)

In [95]:
df['time_diff'] = [diff.seconds for diff in df['time_diff']]

In [96]:
mean_time_between_received_transactions_df = pd.DataFrame(df.groupby(['target_user_id'])['time_diff'].mean())
mean_time_between_received_transactions_df.reset_index(inplace=True)

In [97]:
mean_time_between_received_transactions_df['time_diff'].fillna(0, inplace=True)

In [98]:
mean_time_between_received_transactions_df.sort_values('time_diff', ascending=False).head()

Unnamed: 0,target_user_id,time_diff
198183,2133550170112000107,55037.0
258029,2309533586161664911,53790.333333
128271,1931709155115008424,53734.333333
220482,2205206532063232444,53544.333333
278130,2381033911615488261,53164.666667


### Feature 6: Total number of transactions made

In [110]:
q = """SELECT p.actor_id, COUNT (DISTINCT p.payment_id)
       FROM payments p
       GROUP BY p.actor_id;"""
cursor.execute(q)
#print("These are the different users and the date they joined venmo")
df = pd.DataFrame(cursor.fetchall())
df.columns = [x[0] for x in cursor.description]
df.head()

Unnamed: 0,actor_id,count
0,1000140982714368204,1
1,1000155285291008202,1
2,1000703229165568682,1
3,1000730617970688429,1
4,1000749978877952908,1


### Feature 6: Total number of transactions received

In [116]:
q = """SELECT p.target_user_id, COUNT (DISTINCT p.payment_id)
       FROM payments p
       GROUP BY p.target_user_id;"""
cursor.execute(q)
#print("These are the different users and the date they joined venmo")
df = pd.DataFrame(cursor.fetchall())
df.columns = [x[0] for x in cursor.description]
df.head()

Unnamed: 0,target_user_id,count
0,1000025236701184812,1
1,1000244103872512884,1
2,1000287095488512647,1
3,1000332008095744106,1
4,1000419081846784785,1


### Feature 7: Total number of transactions made in the past week

In [152]:
q = """SELECT p.actor_id, COUNT (DISTINCT p.payment_id)
       FROM payments p
       WHERE p.date_created >= CAST('2018-08-01 00:00:00' AS timestamp)
       GROUP BY p.actor_id;"""
cursor.execute(q)
#print("These are the different users and the date they joined venmo")
df = pd.DataFrame(cursor.fetchall())
df.columns = [x[0] for x in cursor.description]
df.head()

Unnamed: 0,actor_id,count
0,1126773295153152908,1
1,1207231630540800847,1
2,1399274239688704788,1
3,1443673019514880559,1
4,1569252351934464432,1


In [154]:
df.sort_values('count', ascending=False).head()

Unnamed: 0,actor_id,count
0,1126773295153152908,1
37,2302008090427392306,1
27,2171413033123840693,1
28,2181576024653824854,1
29,2205359305392128157,1


Strange, no user made more than one transaction in the period of 2018-08-01 00:00:00 until 2018-08-07 and there were only 50 transactions.

### Feature 8: Total number of transactions received in the past week

In [156]:
q = """SELECT p.target_user_id, COUNT (DISTINCT p.payment_id)
       FROM payments p
       WHERE p.date_created >= CAST('2018-08-01 00:00:00' AS timestamp)
       GROUP BY p.target_user_id;"""
cursor.execute(q)
#print("These are the different users and the date they joined venmo")
df = pd.DataFrame(cursor.fetchall())
df.columns = [x[0] for x in cursor.description]
df.head()

Unnamed: 0,target_user_id,count
0,1159040721747968512,1
1,1315452072493056711,1
2,1350110889902080835,1
3,1380818563891200885,1
4,1398715273183232872,1


In [157]:
df.sort_values('count', ascending=False).head()

Unnamed: 0,target_user_id,count
0,1159040721747968512,1
37,2298266762870784858,1
27,2126534223593472269,1
28,2138586245758976547,1
29,2143842430091264853,1


### Feature 9: Does the user have a personalised about

In [163]:
q = """SELECT user_id, about_personalised
       FROM users"""
cursor.execute(q)
df = pd.DataFrame(cursor.fetchall())
df.columns = [x[0] for x in cursor.description]
df.head()

These are the different users and the date they joined venmo


Unnamed: 0,user_id,about_personalised
0,2206066431492096327,0
1,2200417693859840681,0
2,2373608382922752189,0
3,1670504276557824171,0
4,1957419122950144676,0
