# Pull aggregated table and run first model

In [None]:
import pandas as pd
import numpy as np
import psycopg2
import pymongo
import json
import datetime
import pickle
import functions as fn
import matplotlib.pyplot as plt 
%matplotlib inline

In [None]:
# Retrieve information to connect to the database
def get_keys(path):
    """Load and return the JSON document stored at *path*.

    Args:
        path: Filesystem path of a JSON file.

    Returns:
        The decoded JSON content (whatever ``json.load`` produces for the
        file).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # JSON is UTF-8 by specification; do not depend on the platform's
    # default text encoding when reading it.
    with open(path, encoding="utf-8") as f:
        return json.load(f)
    
# Read the local credentials file and unpack the two entries used to open
# the database connection.
keys = get_keys("/Users/jjherranzsarrion/.secret/local_info.json")
username, password = keys['username'], keys['password']

In [None]:
# Connect to the venmo_transactions database with the credentials loaded
# from the local secrets file. The password argument must be the stored
# password (it was previously filled with the username by mistake).
connection = psycopg2.connect(user=username,
                              password=password,
                              database='venmo_transactions')
# A single cursor is reused by every query below.
cursor = connection.cursor()

## Extract information to calculate features

### Features 1 and 2: Time since the account was opened and personalised bio

In [None]:
# One row per (user_id, about_personalised) pair: the bio flag and the
# interval between the account's date_joined and the cut-off
# 2018-07-28 23:59:59.
q = """SELECT user_id, about_personalised as personalised_bio,
       SUM(CAST('2018-07-28 23:59:59' AS timestamp) - date_joined) as time_since_account_inception
       FROM users
       GROUP BY (user_id, about_personalised);"""
cursor.execute(q)
print("These are the different users and the date they joined venmo")
# Pass the column names to the constructor: assigning `.columns` afterwards
# raises a length-mismatch error when the query returns no rows.
user_info_df = pd.DataFrame(cursor.fetchall(),
                            columns=[col[0] for col in cursor.description])
user_info_df.head()

### Features 3, 4, and 5: Mean and max time between consecutive transactions made, and number of transactions made

In [None]:
# Per paying user (payments.actor_id), over payments created before
# 2018-07-29: the longest and the average gap between one payment and the
# actor's next one (LEAD over date_created), plus the number of distinct
# payments. The actor's latest payment has no successor, so its gap is NULL.
q = """SELECT DISTINCT u.user_id, MAX(p1.diff_time) as max_time_diff_made_trans,
              AVG(p1.diff_time) as mean_time_diff_made_trans,
              COUNT (DISTINCT p1.payment_id) as n_transactions_made
       FROM (SELECT p.actor_id, p.payment_id,
                    (LEAD(p.date_created, 1) OVER (PARTITION BY p.actor_id ORDER BY p.date_created)
                    - p.date_created) as diff_time
             FROM payments p
             WHERE p.date_created < CAST('2018-07-29 00:00:00' AS timestamp)) as p1
       INNER JOIN users u ON u.user_id = p1.actor_id
       GROUP BY (u.user_id);"""

cursor.execute(q)
# Pass the column names to the constructor: assigning `.columns` afterwards
# raises a length-mismatch error when the query returns no rows.
payed_transactions_df = pd.DataFrame(cursor.fetchall(),
                                     columns=[col[0] for col in cursor.description])
payed_transactions_df.sort_values('max_time_diff_made_trans', ascending=False).head()

### Features 6, 7, and 8: Mean and max time between consecutive transactions received, and number of transactions received

In [None]:
# Per receiving user (payments.target_user_id), over payments created before
# 2018-07-29: the longest and the average gap between one received payment
# and the next one (LEAD over date_created), plus the number of distinct
# payments. The latest received payment has no successor, so its gap is NULL.
q = """SELECT DISTINCT u.user_id, MAX(p1.diff_time) as max_time_diff_received_trans,
              AVG(p1.diff_time) as mean_time_diff_received_trans,
              COUNT (DISTINCT p1.payment_id) as n_transactions_received
       FROM (SELECT p.target_user_id, p.payment_id,
                    (LEAD(p.date_created, 1) OVER (PARTITION BY p.target_user_id ORDER BY p.date_created)
                    - p.date_created) as diff_time
             FROM payments p
             WHERE p.date_created < CAST('2018-07-29 00:00:00' AS timestamp)) as p1
       INNER JOIN users u ON u.user_id = p1.target_user_id
       GROUP BY (u.user_id);"""

cursor.execute(q)
# Pass the column names to the constructor: assigning `.columns` afterwards
# raises a length-mismatch error when the query returns no rows.
received_transactions_df = pd.DataFrame(cursor.fetchall(),
                                        columns=[col[0] for col in cursor.description])
received_transactions_df.sort_values('max_time_diff_received_trans', ascending=False).head()

### Feature 9: Total number of transactions made the previous day

In [None]:
# Number of distinct payments each user made (as actor) during 2018-07-28,
# i.e. from 00:00:00 inclusive to 2018-07-29 00:00:00 exclusive.
q = """SELECT u.user_id, COUNT (DISTINCT p.payment_id) as n_trans_made_yest
       FROM payments p
       INNER JOIN users u ON u.user_id = p.actor_id
       WHERE p.date_created >= CAST('2018-07-28 00:00:00' AS timestamp)
       AND p.date_created < CAST('2018-07-29 00:00:00' AS timestamp)
       GROUP BY (u.user_id);"""
cursor.execute(q)
# A one-day window can legitimately return no rows; passing the column names
# to the constructor keeps the frame well-formed in that case, whereas
# assigning `.columns` afterwards raises a length-mismatch error.
trans_made_yest_df = pd.DataFrame(cursor.fetchall(),
                                  columns=[col[0] for col in cursor.description])
trans_made_yest_df.head()

In [None]:
# Show the users with the most payments made on the previous day.
made_yest_ranked = trans_made_yest_df.sort_values(by='n_trans_made_yest', ascending=False)
made_yest_ranked.head()

Strange, no user made more than one transaction in the period of 2018-08-01 00:00:00 until 2018-08-07 and there were only 50 transactions.

### Feature 10: Total number of transactions received in the previous day

In [None]:
# Number of distinct payments each user received (as target) during
# 2018-07-28, i.e. from 00:00:00 inclusive to 2018-07-29 00:00:00 exclusive.
q = """SELECT u.user_id, COUNT (DISTINCT p.payment_id) as n_trans_rec_yest
       FROM payments p
       INNER JOIN users u ON u.user_id = p.target_user_id
       WHERE p.date_created >= CAST('2018-07-28 00:00:00' AS timestamp)
       AND p.date_created < CAST('2018-07-29 00:00:00' AS timestamp)
       GROUP BY (u.user_id);"""
cursor.execute(q)
# A one-day window can legitimately return no rows; passing the column names
# to the constructor keeps the frame well-formed in that case, whereas
# assigning `.columns` afterwards raises a length-mismatch error.
trans_rec_yest_df = pd.DataFrame(cursor.fetchall(),
                                 columns=[col[0] for col in cursor.description])
trans_rec_yest_df.head()

In [None]:
# Show the users with the most payments received on the previous day.
rec_yest_ranked = trans_rec_yest_df.sort_values(by='n_trans_rec_yest', ascending=False)
rec_yest_ranked.head()

## Creating the aggregated statistics table

In [None]:
# Combine the per-user statistics of payments made (payed_transactions_df)
# with the number of payments made on the previous day. The outer join keeps
# users present in either frame; values missing on one side become 0.
trans_made = payed_transactions_df.merge(trans_made_yest_df, how='outer', on='user_id')
trans_made = trans_made.fillna(0)

In [None]:
# Combine the per-user statistics of payments received
# (received_transactions_df) with the number of payments received on the
# previous day. The outer join keeps users present in either frame; values
# missing on one side become 0.
trans_rec = received_transactions_df.merge(trans_rec_yest_df, how='outer', on='user_id')
trans_rec = trans_rec.fillna(0)

In [None]:
# Bring the "made" and "received" statistics together, one row per user that
# appears on either side; statistics missing on one side become 0.
trans = trans_made.merge(trans_rec, how='outer', on='user_id')
trans = trans.fillna(0)

In [None]:
# Preview the combined made/received transaction statistics.
trans.head()

In [None]:
# Attach the account-level information to the transaction statistics. The
# inner join keeps only users that have a row in both frames.
agg_table = user_info_df.merge(trans, how='inner', on='user_id')

In [None]:
# Preview the aggregated per-user table.
agg_table.head()

In [None]:
# Inspect column dtypes and non-null counts before converting the interval
# columns to seconds.
agg_table.info()

In [None]:
def _as_seconds(value):
    """Return *value* as a float number of seconds.

    Interval-like objects (anything exposing ``total_seconds``) are converted
    with that method. Plain numbers are passed through as floats: these are
    the zeros written by the earlier ``fillna(0)`` calls, which — depending on
    the pandas version — may stay integers inside an interval column, and the
    already-converted values seen when this cell is re-run.
    """
    if hasattr(value, 'total_seconds'):
        return value.total_seconds()
    return float(value)


# Columns that hold intervals and are fed to the model as seconds.
interval_columns = [
    'time_since_account_inception',
    'max_time_diff_made_trans',
    'max_time_diff_received_trans',
    'mean_time_diff_made_trans',
    'mean_time_diff_received_trans',
]
for column in interval_columns:
    agg_table[column] = [_as_seconds(value) for value in agg_table[column]]

In [None]:
# Summary statistics of the numeric features after the conversion to seconds.
agg_table.describe()

## Building a logistic regression model

In [None]:
# Extract the target: users who made at least one payment (as actor) on
# 2018-07-29, with the number of distinct payments they made that day.

q = """SELECT u.user_id, COUNT (DISTINCT p.payment_id) as n_transactions_made_29th
       FROM payments p
       INNER JOIN users u ON u.user_id = p.actor_id
       WHERE p.date_created >= CAST('2018-07-29 00:00:00' AS timestamp)
       AND p.date_created < CAST('2018-07-30 00:00:00' AS timestamp)
       GROUP BY (u.user_id);"""
cursor.execute(q)
# Pass the column names to the constructor: assigning `.columns` afterwards
# raises a length-mismatch error when the query returns no rows.
tran_or_not_df = pd.DataFrame(cursor.fetchall(),
                              columns=[col[0] for col in cursor.description])
tran_or_not_df.head()

In [None]:
# Every user returned by the target query made at least one payment on the
# 29th, so the count is collapsed to the positive class label 1.
tran_or_not_df['n_transactions_made_29th'] = 1

In [None]:
# Attach the target to the feature table. A left join keeps exactly the users
# that have features in agg_table; users without a payment on the 29th get a
# missing label, which the fillna below turns into the negative class 0.
# (An outer join would also add users that exist only in tran_or_not_df as
# rows whose features - including account age - are all zero-filled.)

complete_table = pd.merge(agg_table, tran_or_not_df, 'left', on='user_id')
complete_table.fillna(0, inplace=True)

In [None]:
# Separate features from the target. user_id is only the join key: it is
# dropped together with the target so the identifier is not fed to the model
# as if it were a feature.
X = complete_table.drop(['user_id', 'n_transactions_made_29th'], axis=1)
y = complete_table['n_transactions_made_29th']

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

In [None]:
# Hold out a test set; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# Standardise the features. The scaler is fitted on the training split only
# and the same fitted transformation is then applied to both splits.
ss = StandardScaler()
ss.fit(X_train)
X_train_sc = ss.transform(X_train)
X_test_sc = ss.transform(X_test)

In [None]:
# Fit a logistic regression on the standardised training features.
# C is set very high (1e12); in scikit-learn C is the inverse regularisation
# strength, so the penalty is presumably meant to be negligible.
# NOTE(review): fit_intercept=False on standardised features leaves the model
# without a bias term - confirm this is intended, especially if the classes
# are imbalanced.
logreg = LogisticRegression(fit_intercept = False, C = 1e12, solver='liblinear')
model_log = logreg.fit(X_train_sc, y_train)
model_log

In [None]:
# Predict class labels for the held-out, standardised test features.
y_hat_test = logreg.predict(X_test_sc)

In [None]:
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

# Report the test-set metrics, separated from one another by blank lines.
metric_reports = (
    ('Testing Precision: ', precision_score),
    ('Testing Recall: ', recall_score),
    ('Testing Accuracy: ', accuracy_score),
    ('Testing F1-Score: ', f1_score),
)
for position, (label, metric) in enumerate(metric_reports):
    if position:
        # Separator goes between reports only, not after the last one.
        print('\n')
    print(label, metric(y_test, y_hat_test))