# Pull aggregated table and run first model

In [1]:
import pandas as pd
import numpy as np
import psycopg2
import pymongo
import json
import datetime
import pickle
import functions as fn
import matplotlib.pyplot as plt 
%matplotlib inline

In [3]:
# Read the database credentials through fn.get_keys (the path points at a local JSON secrets file)
keys = fn.get_keys("/Users/jjherranzsarrion/.secret/local_info.json")
username, password = keys['username'], keys['password']

In [6]:
# Time windows used by the cells below, listed in chronological order.
# Feature queries stop at train_window_end; the target is read from the test window.
previous_day_start = '2018-07-28 00:00:00'
train_window_end = '2018-07-28 23:59:59'
test_window_start = '2018-07-29 00:00:00'
test_window_end = '2018-07-29 23:59:59'

## Extract information to calculate features

### Features 1 and 2: Number of days the account has been opened and personalised bio

In [None]:
def user_info(username, password, train_window_end):
    """Return, per user, the account age and the personalised-bio flag.

    Args:
        username: Database user name, forwarded to ``extracting_cursor``.
        password: Database password, forwarded to ``extracting_cursor``.
        train_window_end: Timestamp string; the account age is measured from
            ``date_joined`` up to this moment.

    Returns:
        DataFrame whose columns are named after the query output
        (``user_id``, ``personalised_bio``, ``time_since_account_inception``).
        An empty result set yields an empty frame that still has those columns.
    """
    cursor = extracting_cursor(username, password)
    # NOTE(review): the timestamp is interpolated straight into the SQL text.
    # That is fine for trusted notebook constants; switch to bound parameters
    # if the value can ever come from user input.
    q = f"""SELECT user_id, about_personalised as personalised_bio,
            SUM(CAST('{train_window_end}' AS timestamp) - date_joined) as time_since_account_inception
            FROM users
            GROUP BY (user_id, about_personalised);"""
    cursor.execute(q)
    rows = cursor.fetchall()
    column_names = [col[0] for col in cursor.description]
    # Naming the columns in the constructor (instead of assigning .columns
    # afterwards) avoids a length-mismatch error when no rows come back.
    return pd.DataFrame(rows, columns=column_names)

In [7]:
# Per-user account information as of train_window_end, fetched through the fn module
# NOTE(review): fn.user_info is presumably the packaged copy of user_info shown above — confirm
user_info_df = fn.user_info(username, password, train_window_end)

### Features 3, 4, and 5: Mean and Max time between previous transaction made and number of transactions made

In [None]:
def payed_transactions(username, password, train_window_end):
    """Return payer statistics for all payments up to ``train_window_end``.

    For every user that appears as ``actor_id`` the query computes the maximum
    and mean gap between consecutive payments made, plus the number of
    distinct payments made. The last payment of each user has no successor,
    so its gap is NULL.

    Args:
        username: Database user name, forwarded to ``extracting_cursor``.
        password: Database password, forwarded to ``extracting_cursor``.
        train_window_end: Timestamp string; later payments are ignored.

    Returns:
        DataFrame with columns ``user_id``, ``max_time_diff_made_trans``,
        ``mean_time_diff_made_trans`` and ``n_transactions_made``. An empty
        result set yields an empty frame that still has those columns.
    """
    cursor = extracting_cursor(username, password)
    # NOTE(review): the timestamp is interpolated straight into the SQL text.
    # That is fine for trusted notebook constants; switch to bound parameters
    # if the value can ever come from user input.
    q = f"""SELECT DISTINCT u.user_id, MAX(p1.diff_time) as max_time_diff_made_trans,
                   AVG(p1.diff_time) as mean_time_diff_made_trans,
                   COUNT (DISTINCT p1.payment_id) as n_transactions_made
            FROM (SELECT p.actor_id, p.payment_id,
                         (LEAD(p.date_created, 1) OVER (PARTITION BY p.actor_id ORDER BY p.date_created)
                         - p.date_created) as diff_time
                  FROM payments p
                  WHERE p.date_created <= CAST('{train_window_end}' AS timestamp)) as p1
            INNER JOIN users u ON u.user_id = p1.actor_id
            GROUP BY (u.user_id);"""
    cursor.execute(q)
    rows = cursor.fetchall()
    column_names = [col[0] for col in cursor.description]
    # Naming the columns in the constructor (instead of assigning .columns
    # afterwards) avoids a length-mismatch error when no rows come back.
    return pd.DataFrame(rows, columns=column_names)

In [8]:
# Payer statistics up to train_window_end, fetched through the fn module
# NOTE(review): fn.payed_transactions is presumably the packaged copy of payed_transactions shown above — confirm
payed_transactions_df = fn.payed_transactions(username, password, train_window_end)

### Features 6, 7, and 8: Mean and Max time between previous transaction received and n transactions received

In [None]:
def received_transactions(username, password, train_window_end):
    """Return payee statistics for all payments up to ``train_window_end``.

    For every user that appears as ``target_user_id`` the query computes the
    maximum and mean gap between consecutive payments received, plus the
    number of distinct payments received. The last payment received by each
    user has no successor, so its gap is NULL.

    Args:
        username: Database user name, forwarded to ``extracting_cursor``.
        password: Database password, forwarded to ``extracting_cursor``.
        train_window_end: Timestamp string; later payments are ignored.

    Returns:
        DataFrame with columns ``user_id``, ``max_time_diff_received_trans``,
        ``mean_time_diff_received_trans`` and ``n_transactions_received``. An
        empty result set yields an empty frame that still has those columns.
    """
    cursor = extracting_cursor(username, password)
    # NOTE(review): the timestamp is interpolated straight into the SQL text.
    # That is fine for trusted notebook constants; switch to bound parameters
    # if the value can ever come from user input.
    q = f"""SELECT DISTINCT u.user_id, MAX(p1.diff_time) as max_time_diff_received_trans,
                   AVG(p1.diff_time) as mean_time_diff_received_trans,
                   COUNT (DISTINCT p1.payment_id) as n_transactions_received
            FROM (SELECT p.target_user_id, p.payment_id,
                         (LEAD(p.date_created, 1) OVER (PARTITION BY p.target_user_id ORDER BY p.date_created)
                         - p.date_created) as diff_time
                  FROM payments p
                  WHERE p.date_created <= CAST('{train_window_end}' AS timestamp)) as p1
            INNER JOIN users u ON u.user_id = p1.target_user_id
            GROUP BY (u.user_id);"""
    cursor.execute(q)
    rows = cursor.fetchall()
    column_names = [col[0] for col in cursor.description]
    # Naming the columns in the constructor (instead of assigning .columns
    # afterwards) avoids a length-mismatch error when no rows come back.
    return pd.DataFrame(rows, columns=column_names)

In [9]:
# Payee statistics up to train_window_end, fetched through the fn module
# NOTE(review): fn.received_transactions is presumably the packaged copy of received_transactions shown above — confirm
received_transactions_df = fn.received_transactions(username, password, train_window_end)

### Feature 9: Total number of transactions made the previous day

In [None]:
def transactions_made_previous_day(username, password, previous_day_start, train_window_end):
    """Return the number of payments each user made on the previous day.

    The "previous day" is the closed interval
    ``[previous_day_start, train_window_end]``, i.e. the day right before the
    testing time frame.

    Args:
        username: Database user name, forwarded to ``extracting_cursor``.
        password: Database password, forwarded to ``extracting_cursor``.
        previous_day_start: Timestamp string, inclusive lower bound.
        train_window_end: Timestamp string, inclusive upper bound.

    Returns:
        DataFrame with columns ``user_id`` and ``n_trans_made_yest``. Only
        users with at least one payment in the interval appear; an empty
        result set yields an empty frame that still has both columns.
    """
    cursor = extracting_cursor(username, password)
    # NOTE(review): the timestamps are interpolated straight into the SQL text.
    # That is fine for trusted notebook constants; switch to bound parameters
    # if the values can ever come from user input.
    q = f"""SELECT u.user_id, COUNT (DISTINCT p.payment_id) as n_trans_made_yest
            FROM payments p
            INNER JOIN users u ON u.user_id = p.actor_id
            WHERE p.date_created >= CAST('{previous_day_start}' AS timestamp)
            AND p.date_created <= CAST('{train_window_end}' AS timestamp)
            GROUP BY (u.user_id);"""
    cursor.execute(q)
    rows = cursor.fetchall()
    column_names = [col[0] for col in cursor.description]
    # A single quiet day returns no rows; naming the columns in the
    # constructor keeps that case from raising a length-mismatch error.
    return pd.DataFrame(rows, columns=column_names)

In [10]:
# Payments made per user between previous_day_start and train_window_end, fetched through the fn module
# NOTE(review): presumably the packaged copy of transactions_made_previous_day shown above — confirm
transactions_made_previous_day_df = fn.transactions_made_previous_day(username, password,
                                                                      previous_day_start, 
                                                                      train_window_end)

Strange: no user made more than one transaction between 2018-08-01 00:00:00 and 2018-08-07, and there were only 50 transactions in that period.

### Feature 10: Total number of transactions received in the previous day

In [None]:
def transactions_rec_previous_day(username, password, previous_day_start, train_window_end):
    """Return the number of payments each user received on the previous day.

    The "previous day" is the closed interval
    ``[previous_day_start, train_window_end]``, i.e. the day right before the
    testing time frame.

    Args:
        username: Database user name, forwarded to ``extracting_cursor``.
        password: Database password, forwarded to ``extracting_cursor``.
        previous_day_start: Timestamp string, inclusive lower bound.
        train_window_end: Timestamp string, inclusive upper bound.

    Returns:
        DataFrame with columns ``user_id`` and ``n_trans_rec_yest``. Only
        users with at least one received payment in the interval appear; an
        empty result set yields an empty frame that still has both columns.
    """
    cursor = extracting_cursor(username, password)
    # NOTE(review): the timestamps are interpolated straight into the SQL text.
    # That is fine for trusted notebook constants; switch to bound parameters
    # if the values can ever come from user input.
    q = f"""SELECT u.user_id, COUNT (DISTINCT p.payment_id) as n_trans_rec_yest
            FROM payments p
            INNER JOIN users u ON u.user_id = p.target_user_id
            WHERE p.date_created >= CAST('{previous_day_start}' AS timestamp)
            AND p.date_created <= CAST('{train_window_end}' AS timestamp)
            GROUP BY (u.user_id);"""
    cursor.execute(q)
    rows = cursor.fetchall()
    column_names = [col[0] for col in cursor.description]
    # A single quiet day returns no rows; naming the columns in the
    # constructor keeps that case from raising a length-mismatch error.
    return pd.DataFrame(rows, columns=column_names)

In [11]:
# Payments received per user between previous_day_start and train_window_end, fetched through the fn module
# NOTE(review): presumably the packaged copy of transactions_rec_previous_day shown above — confirm
transactions_rec_previous_day_df = fn.transactions_rec_previous_day(username, password,
                                                                    previous_day_start, 
                                                                    train_window_end)

## Creating the aggregated statistics table

In [None]:
# Merge the dataframes for users that payed transactions

def made(username, password, previous_day_start, train_window_end):
    """Combine the payer statistics up to the training cut-off with the previous-day payment counts.

    Returns one row per user found in either source; values missing on
    one side are replaced with 0.
    """
    all_time = payed_transactions(username, password, train_window_end)
    previous_day = transactions_made_previous_day(
        username, password, previous_day_start, train_window_end
    )
    # An outer join keeps payers who were inactive on the previous day
    combined = all_time.merge(previous_day, how='outer', on='user_id')
    # Anything absent from one side (e.g. no payment yesterday) becomes 0
    return combined.fillna(0)

In [None]:
# Merge the dataframes for users that received transactions

def received(username, password, previous_day_start, train_window_end):
    """Combine the payee statistics up to the training cut-off with the previous-day received counts.

    Returns one row per user found in either source; values missing on
    one side are replaced with 0.
    """
    all_time = received_transactions(username, password, train_window_end)
    previous_day = transactions_rec_previous_day(
        username, password, previous_day_start, train_window_end
    )
    # An outer join keeps payees who received nothing on the previous day
    combined = all_time.merge(previous_day, how='outer', on='user_id')
    # Anything absent from one side (e.g. nothing received yesterday) becomes 0
    return combined.fillna(0)

In [None]:
# Merge the dataframes for users that made and/or received transactions

def transactions(username, password, previous_day_start, train_window_end):
    """Combine payer and payee statistics into a single per-user frame.

    Returns one row per user that made or received at least one
    transaction; columns from the side a user is missing on are set to 0.
    """
    payer_stats = made(username, password, previous_day_start, train_window_end)
    payee_stats = received(username, password, previous_day_start, train_window_end)
    # An outer join keeps users who only paid as well as users who only received
    combined = payer_stats.merge(payee_stats, how='outer', on='user_id')
    # The side a user never appeared on is filled with 0
    return combined.fillna(0)

In [None]:
# Aggregate the user statistics with the user information

def agg_table(username, password, previous_day_start, train_window_end):
    """Return user information joined with the transaction statistics.

    Args:
        username: Database user name, forwarded to the query helpers.
        password: Database password, forwarded to the query helpers.
        previous_day_start: Timestamp string, start of the previous-day window.
        train_window_end: Timestamp string, end of the training window.

    Returns:
        DataFrame with one row per user present in both ``user_info`` and
        ``transactions``. The five time-difference columns are converted to
        seconds (float).
    """
    user_df = user_info(username, password, train_window_end)
    trans_df = transactions(username, password, previous_day_start, train_window_end)
    # Inner join: only users that made or received a transaction have
    # statistics, so users without any are dropped here.
    aggregated = pd.merge(user_df, trans_df, how='inner', on='user_id')

    time_delta_cols = [
        'time_since_account_inception',
        'max_time_diff_made_trans',
        'max_time_diff_received_trans',
        'mean_time_diff_made_trans',
        'mean_time_diff_received_trans',
    ]

    for col in time_delta_cols:
        # The upstream fillna(0) can leave plain integer 0 in these columns
        # (depending on the pandas version), and an int has no
        # .total_seconds(). to_timedelta normalises those placeholders to a
        # zero-length timedelta before the vectorised conversion.
        aggregated[col] = pd.to_timedelta(aggregated[col]).dt.total_seconds()
    return aggregated

In [13]:
# Build the aggregated per-user feature table through the fn module
# NOTE(review): fn.get_aggregated_user_statistics is not shown in this notebook;
# presumably it is the packaged counterpart of agg_table above — confirm
user_statistics = fn.get_aggregated_user_statistics(username, password, previous_day_start, train_window_end)

In [None]:
# Print the column dtypes and non-null counts of the feature table
user_statistics.info()

In [None]:
# Show summary statistics of the feature table
user_statistics.describe()

## Building a logistic regression model

In [None]:
# Extracting my y value

def extract_target(username, password, test_window_start, test_window_end):
    """Return the target: which users made a payment during the test window.

    Args:
        username: Database user name, forwarded to ``extracting_cursor``.
        password: Database password, forwarded to ``extracting_cursor``.
        test_window_start: Timestamp string, inclusive lower bound.
        test_window_end: Timestamp string, inclusive upper bound.

    Returns:
        DataFrame with columns ``user_id`` and ``n_transactions_made_29th``.
        Only users with a payment in the window appear, and the second column
        is always 1 for them. An empty result set yields an empty frame that
        still has both columns.
    """
    cursor = extracting_cursor(username, password)
    # NOTE(review): the timestamps are interpolated straight into the SQL text.
    # That is fine for trusted notebook constants; switch to bound parameters
    # if the values can ever come from user input.
    q = f"""SELECT u.user_id, COUNT (DISTINCT p.payment_id) as n_transactions_made_29th
            FROM payments p
            INNER JOIN users u ON u.user_id = p.actor_id
            WHERE p.date_created >= CAST('{test_window_start}' AS timestamp)
            AND p.date_created <= CAST('{test_window_end}' AS timestamp)
            GROUP BY (u.user_id);"""
    cursor.execute(q)
    rows = cursor.fetchall()
    column_names = [col[0] for col in cursor.description]
    # Naming the columns in the constructor (instead of assigning .columns
    # afterwards) avoids a length-mismatch error when no rows come back.
    tran_or_not_df = pd.DataFrame(rows, columns=column_names)
    # Every user returned by the query is flagged with 1; the count itself
    # is discarded because the target is binary.
    tran_or_not_df['n_transactions_made_29th'] = 1
    return tran_or_not_df

In [None]:
# Target frame: one row per user who made a payment in the test window, flagged with 1
tran_or_not_df = extract_target(username, password, test_window_start, test_window_end)

In [None]:
# Attach the target to the aggregated feature table. `agg_table` is the builder
# function defined in this notebook, not a DataFrame, so the merge has to use
# `user_statistics`. The outer join keeps users present on only one side, and
# the resulting nulls (e.g. no payment in the test window) become 0.
complete_table = pd.merge(user_statistics, tran_or_not_df, how='outer', on='user_id')
complete_table.fillna(0, inplace=True)

In [None]:
# user_id only identifies the row, so it is dropped together with the target
# rather than being fed to the model as a numeric feature.
X = complete_table.drop(columns=['user_id', 'n_transactions_made_29th'])
y = complete_table['n_transactions_made_29th']

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

In [None]:
# Random train/test split with scikit-learn's default test size; random_state pins the shuffle
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same transform to both splits
ss = StandardScaler().fit(X_train)
X_train_sc = ss.transform(X_train)
X_test_sc = ss.transform(X_test)

In [None]:
# Logistic regression without an intercept term; C = 1e12 makes the regularisation penalty negligible.
# NOTE(review): the inputs were standardised in the previous cell, so with fit_intercept=False
# the model cannot shift its baseline probability — confirm this is intended.
logreg = LogisticRegression(fit_intercept = False, C = 1e12, solver='liblinear')
model_log = logreg.fit(X_train_sc, y_train)
# Bare expression so the notebook displays the fitted estimator
model_log

In [None]:
# Predicted class labels for the scaled test split
y_hat_test = logreg.predict(X_test_sc)

In [None]:
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

# (label, scorer) pairs, reported in this order on the held-out split
test_metrics = [
    ('Testing Precision: ', precision_score),
    ('Testing Recall: ', recall_score),
    ('Testing Accuracy: ', accuracy_score),
    ('Testing F1-Score: ', f1_score),
]

for position, (label, scorer) in enumerate(test_metrics):
    if position:
        # Blank separator between consecutive metrics (none after the last one)
        print('\n')
    print(label, scorer(y_test, y_hat_test))