# Pipeline for the independent model

In [1]:
import pandas as pd
import numpy as np
import psycopg2
import pymongo
import json
import datetime
import pickle
import functions as fn
import matplotlib.pyplot as plt 
%matplotlib inline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

## Extract raw transaction details from the MongoDB database

In [None]:
# Retrieve the connection and access the Venmo transactions in MongoDB.
# The connection details are encapsulated in the project-local `functions`
# module (imported as `fn`); presumably this returns a pymongo collection
# handle - confirm in `functions.collection`.
venmo_collection = fn.collection()

In [None]:
# Extract the initial 5% of transactions.
# Called only for its side effect: per the notes in this notebook the helper
# returns nothing and instead writes a pickle, which the next section loads
# from 'initial_5pct_transactions.pkl'.
fn.initial_5pct(venmo_collection)

No variable is assigned to the result of the function above because it doesn't return anything; its only effect is to generate a pickle file.
- This will have to change when running on AWS, where there will be more computing power to handle larger files.

## Process the transactions, sort and generate the tables for our database

In [None]:
# Read back the transactions pickle produced by the extraction step above.
# NOTE: unpickling can execute arbitrary code, so only load files that this
# pipeline generated itself.
with open('initial_5pct_transactions.pkl', 'rb') as pickle_file:
    initial_5pct = pickle.loads(pickle_file.read())

In [None]:
# Run to process all transactions and extract high level transaction information.
# The result is later written to the 'transactions' table via `.to_sql`, so it
# is expected to be a pandas DataFrame.
high_level_transaction_info = fn.get_transaction_specific_information(initial_5pct)

In [None]:
# Run to process all transactions and extract all the users' details.
# The result is later written to the 'users' table via `.to_sql`, so it is
# expected to be a pandas DataFrame (presumably one row per user - verify
# against `functions.get_unique_user_table`).
unique_users = fn.get_unique_user_table(initial_5pct)

In [None]:
# Sanity check: report the sizes of the two tables built above. As the last
# expression of the cell, the formatted string is shown as the cell output.
f'There are currently {len(unique_users)} unique users and {len(high_level_transaction_info)} unique transactions.'

In [None]:
# Run to process all transactions and retrieve the different app details.
# The result is later written to the 'app_info' table via `.to_sql`, so it is
# expected to be a pandas DataFrame.
app_details = fn.get_app_specific_information(initial_5pct)

## Store the tables for our database

In [None]:
# Read the database credentials from the local secrets file (a JSON file in a
# hidden `.secret` directory) so they never appear in the notebook itself.
keys = fn.get_keys("/Users/jjherranzsarrion/.secret/local_info.json")
username, password = keys['username'], keys['password']

In [None]:
# Move the different tables with transaction info into the venmo database.
#
# NOTE(review): `create_engine` is not imported anywhere in the visible
# notebook. It is SQLAlchemy's engine factory and must be in scope
# (`from sqlalchemy import create_engine`) before this cell is run.
from urllib.parse import quote

# Percent-encode the credentials before embedding them in the connection URL:
# characters such as '@', ':' or '/' in a username or password would otherwise
# be parsed as URL delimiters and break (or misdirect) the connection.
# `safe=''` makes sure '/' is encoded as well.
db_user = quote(str(username), safe='')
db_password = quote(str(password), safe='')
engine = create_engine(
    f'postgresql://{db_user}:{db_password}@localhost/venmo_transactions'
)

# One table per DataFrame. `to_sql` is left on its default `if_exists='fail'`,
# so re-running this cell against already-populated tables raises instead of
# silently overwriting or duplicating data.
high_level_transaction_info.to_sql('transactions', engine)
unique_users.to_sql('users', engine)
app_details.to_sql('app_info', engine)

Payments information is missing until Monday

In [None]:
# Time ranges used by the rest of the pipeline, as 'YYYY-MM-DD HH:MM:SS' strings.

# Window passed to the aggregated user statistics query (start, end).
previous_day_start = '2018-07-28 00:00:00'
train_window_end = '2018-07-28 23:59:59'

# Window passed to the target extraction query (start, end).
test_window_start = '2018-07-29 00:00:00'
test_window_end = '2018-07-29 23:59:59'

## Creating the aggregated statistics table

In [None]:
# Build the per-user aggregated statistics table for the window
# [previous_day_start, train_window_end]. The database credentials are handed
# to the helper, so presumably it queries the PostgreSQL tables stored above -
# verify in `functions.get_aggregated_user_statistics`.
user_statistics = fn.get_aggregated_user_statistics(username, password, previous_day_start, train_window_end)

In [None]:
# Inspect column dtypes and non-null counts of the aggregated table.
user_statistics.info()

In [None]:
# Summary statistics of the aggregated features, to spot odd ranges or outliers.
user_statistics.describe()

## Building a logistic regression model

In [None]:
# Build the target table for the window [test_window_start, test_window_end].
# It is merged on 'user_id' below and supplies the 'n_transactions_made_29th'
# column used as the label. Presumably it only contains users who transacted
# in that window - verify in `functions.extract_target`.
tran_or_not_df = fn.extract_target(username, password, test_window_start, test_window_end)

In [None]:
# Outer-join the aggregated statistics with the target table on 'user_id' so
# that users without a transaction in the testing period are kept; the nulls
# the join produces for them are then replaced with 0.
complete_table = (
    user_statistics
    .merge(tran_or_not_df, how='outer', on='user_id')
    .fillna(0)
)

In [None]:
# Separate the label column from the feature columns.
# NOTE(review): only the target is dropped, so if 'user_id' (the merge key) is
# a regular column it remains in X and is fed to the model as a feature -
# confirm that this is intended.
target_col = 'n_transactions_made_29th'
y = complete_table[target_col]
X = complete_table.drop(columns=[target_col])

In [None]:
# Hold out a test set; with no explicit size, scikit-learn's documented default
# keeps 25% of the rows for testing. The fixed random_state makes the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# Learn the scaling parameters from the training rows only, then apply the
# same transformation to both splits so that no test-set information leaks
# into the scaler.
ss = StandardScaler().fit(X_train)
X_train_sc, X_test_sc = ss.transform(X_train), ss.transform(X_test)

In [None]:
# Fit an effectively unregularised logistic regression (C=1e12 makes the
# penalty negligible) on the scaled training data.
# NOTE(review): the features were centred by StandardScaler, so with
# fit_intercept=False the decision boundary is forced through the feature
# means - confirm that omitting the intercept is intended.
logreg = LogisticRegression(solver='liblinear', C=1e12, fit_intercept=False)
logreg.fit(X_train_sc, y_train)
# `fit` returns the estimator itself, so both names refer to the fitted model.
model_log = logreg
model_log

In [None]:
# Predict class labels for the held-out rows, using the features scaled with
# the scaler fitted on the training split.
y_hat_test = logreg.predict(X_test_sc)

In [None]:
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

# Report the held-out classification metrics. The first three are each
# followed by a blank-line separator; the F1 score closes the report.
for label, metric in (
    ('Testing Precision: ', precision_score),
    ('Testing Recall: ', recall_score),
    ('Testing Accuracy: ', accuracy_score),
):
    print(label, metric(y_test, y_hat_test))
    print('\n')

print('Testing F1-Score: ', f1_score(y_test, y_hat_test))