# Querying Data From AWS Redshift V1

In [1]:
import pandas as pd
import numpy as np

import sqlalchemy
import psycopg2
import simplejson
%load_ext sql
%config SqlMagic.displaylimit = 5

import sys
reload(sys)
sys.setdefaultencoding('utf8')

## Connecting with AWS database

In [2]:
# Read the Redshift's credentials file 
with open("redshift_creds.json.nogit") as fh:
    creds = simplejson.loads(fh.read())

# Connect to the Redshift
connect_to_db = 'postgresql+psycopg2://' + \
                creds['user_name'] + ':' + creds['password'] + '@' + \
                creds['host_name'] + ':' + creds['port_num'] + '/' + creds['db_name'];
%sql $connect_to_db

u'Connected: dwahid@data_depot'

## 1. Periodic Report Systems, Invoice and Client Creation Activities: All Accounts

In [14]:
# SQL query: one row per account in report_systems that signed up between
# 2018-08-01 and 2019-07-30, with its account attributes plus
#   * invoice_count            - all joined invoices, regardless of age
#   * invoice_count_day_N      - invoices created 0..N days after signup
#   * client_count             - all joined rows of coalesced_live_shards."user"
#   * client_count_day_N       - those rows with signup 0..N days after the account signup
# for N in 7, 15, 30, 45, 60, 75, 90. Invoices and clients are aggregated in
# separate CTEs before the final join, so the two LEFT JOINs do not multiply
# each other's rows.
sql_rs_invoices_clients_activities_all_accounts = '''WITH periodic_report_system_activities AS (
    SELECT systemid, signup_date, is_freshbooks_account_active, is_new_trial_from_accountant_invite,
           freshbooks_account_status, is_paying, base_subscription_amount_first_upgrade,
           subscription_package_name, upgrade_ever, signup_ip_address
    FROM report_systems rs
    WHERE signup_date BETWEEN '2018-08-01' and '2019-07-30'
), invoice_create_date AS (
    SELECT
           pic.systemid,
           inv.invoiceid,
           inv.create_date,
           inv.created_at,
           DATEDIFF(days, pic.signup_date, inv.created_at) AS days_to_invoice_creation
    FROM periodic_report_system_activities AS pic
    LEFT JOIN coalesced_live_shards.invoice_stable as inv USING (systemid)
), invoice_grouping AS (
    SELECT
           systemid,
           COUNT(invoiceid) as invoice_count,
           SUM(CASE WHEN days_to_invoice_creation BETWEEN 0 AND 7 THEN 1 ELSE 0 END) AS invoice_count_day_7,
           SUM(CASE WHEN days_to_invoice_creation BETWEEN 0 AND 15 THEN 1 ELSE 0 END) AS invoice_count_day_15,
           SUM(CASE WHEN days_to_invoice_creation BETWEEN 0 AND 30 THEN 1 ELSE 0 END) AS invoice_count_day_30,
           SUM(CASE WHEN days_to_invoice_creation BETWEEN 0 AND 45 THEN 1 ELSE 0 END) AS invoice_count_day_45,
           SUM(CASE WHEN days_to_invoice_creation BETWEEN 0 AND 60 THEN 1 ELSE 0 END) AS invoice_count_day_60,
           SUM(CASE WHEN days_to_invoice_creation BETWEEN 0 AND 75 THEN 1 ELSE 0 END) AS invoice_count_day_75,
           SUM(CASE WHEN days_to_invoice_creation BETWEEN 0 AND 90 THEN 1 ELSE 0 END) AS invoice_count_day_90
    FROM invoice_create_date
    GROUP BY systemid
), client_crate_date AS (
     SELECT
            pic.systemid,
            usr.userid,
            usr.signup_date,
            DATEDIFF(days, pic.signup_date, usr.signup_date) AS days_to_client_creation
    FROM periodic_report_system_activities  AS pic
    LEFT JOIN coalesced_live_shards."user" as usr USING (systemid)
), client_grouping AS (
    SELECT
           systemid,
           count(userid) AS client_count,
           SUM(CASE WHEN days_to_client_creation BETWEEN 0 AND 7 THEN 1 ELSE 0 END) AS client_count_day_7,
           SUM(CASE WHEN days_to_client_creation BETWEEN 0 AND 15 THEN 1 ELSE 0 END) AS client_count_day_15,
           SUM(CASE WHEN days_to_client_creation BETWEEN 0 AND 30 THEN 1 ELSE 0 END) AS client_count_day_30,
           SUM(CASE WHEN days_to_client_creation BETWEEN 0 AND 45 THEN 1 ELSE 0 END) AS client_count_day_45,
           SUM(CASE WHEN days_to_client_creation BETWEEN 0 AND 60 THEN 1 ELSE 0 END) AS client_count_day_60,
           SUM(CASE WHEN days_to_client_creation BETWEEN 0 AND 75 THEN 1 ELSE 0 END) AS client_count_day_75,
           SUM(CASE WHEN days_to_client_creation BETWEEN 0 AND 90 THEN 1 ELSE 0 END) AS client_count_day_90
    FROM  client_crate_date
    GROUP BY systemid
)

SELECT
       systemid, signup_date,
       is_freshbooks_account_active, is_new_trial_from_accountant_invite,
       freshbooks_account_status, is_paying, base_subscription_amount_first_upgrade,
       subscription_package_name, upgrade_ever, signup_ip_address,
       inv_gr.invoice_count,
       inv_gr.invoice_count_day_7,
       inv_gr.invoice_count_day_15,
       inv_gr.invoice_count_day_30,
       inv_gr.invoice_count_day_45,
       inv_gr.invoice_count_day_60,
       inv_gr.invoice_count_day_75,
       inv_gr.invoice_count_day_90,
       cl_gr.client_count,
       cl_gr.client_count_day_7,
       cl_gr.client_count_day_15,
       cl_gr.client_count_day_30,
       cl_gr.client_count_day_45,
       cl_gr.client_count_day_60,
       cl_gr.client_count_day_75,
       cl_gr.client_count_day_90
FROM periodic_report_system_activities
LEFT JOIN invoice_grouping as inv_gr USING (systemid)
LEFT JOIN client_grouping AS cl_gr USING (systemid);
'''

# Import as dataframe from Redshift
df_rs_invoices_clients_activities_all_accounts = pd.read_sql_query(
    sql_rs_invoices_clients_activities_all_accounts, connect_to_db)

# Export as a tab-separated file. The encoding is passed explicitly so the
# export does not depend on the interpreter's default string encoding
# (pandas falls back to ASCII on Python 2 when none is given).
df_rs_invoices_clients_activities_all_accounts.to_csv(
    "/Users/dwahid/Documents/GitHub/fraud_detection/data_v1/report_systems_invoices_clients_all_activities.csv",
    sep="\t", index=False, encoding="utf-8")

In [15]:
# Show the last five rows of the account-activity frame as a sanity check
df_rs_invoices_clients_activities_all_accounts.tail(n=5)

Unnamed: 0,systemid,signup_date,is_freshbooks_account_active,is_new_trial_from_accountant_invite,freshbooks_account_status,is_paying,base_subscription_amount_first_upgrade,subscription_package_name,upgrade_ever,signup_ip_address,...,invoice_count_day_75,invoice_count_day_90,client_count,client_count_day_7,client_count_day_15,client_count_day_30,client_count_day_45,client_count_day_60,client_count_day_75,client_count_day_90
452621,4501746,2019-07-30,1,0,trial_expired,0,0.0,FB Trial Monthly Q1 2016,0,134.41.39.101,...,1,1,2,2,2,2,2,2,2,2
452622,4502010,2019-07-30,1,0,active,1,25.0,FBSMUX 25 Q3 2016,1,99.165.106.137,...,22,22,23,15,15,20,23,23,23,23
452623,4502032,2019-07-30,1,0,trial_expired,0,0.0,FB Trial Monthly Q1 2016,0,157.45.38.208,...,0,0,1,1,1,1,1,1,1,1
452624,4502088,2019-07-30,1,0,trial_expired,0,0.0,FB Trial Monthly Q1 2016,0,157.43.5.44,...,0,0,1,1,1,1,1,1,1,1
452625,4502116,2019-07-30,1,0,trial_expired,0,0.0,FB Trial Monthly Q1 2016,0,216.50.143.2,...,1,1,2,2,2,2,2,2,2,2


## 2. Periodic Invoices Descriptions, Notes, Terms, Address: All Accounts

### 2.1 Invoices created within 7 days after account signup date

In [25]:
# SQL for importing the invoices created up to 7 days after signup_date, for
# accounts that signed up between 2018-08-01 and 2019-07-30. Each row carries
# the invoice's description, notes, terms and address.
# NOTE(review): the window filter is `<= 7` with no lower bound, whereas the
# section 1 counts use `BETWEEN 0 AND N`. It is also applied after the
# LEFT JOIN, so an account whose only invoices fall outside the window is
# dropped, while an account with no invoices at all is kept as an all-NULL
# row - confirm both are intended.
sql_invoices_7day_all_accounts = '''WITH invoices_in_a_period AS (
    SELECT systemid, signup_date
    FROM report_systems rs
    WHERE signup_date BETWEEN '2018-08-01' and '2019-07-30'
), invoice_created_at AS (
    SELECT
           pic.systemid,
           inv.invoiceid,
           pic.signup_date,
           inv.create_date,
           inv.created_at,
           inv.description,
           inv.notes,
           inv.terms,
           inv.address,
           DATEDIFF(days, pic.signup_date, inv.created_at) AS days_to_invoice_creation
    FROM invoices_in_a_period AS pic
    LEFT JOIN coalesced_live_shards.invoice_stable as inv USING (systemid)
    WHERE ( days_to_invoice_creation <= 7 OR days_to_invoice_creation IS NULL)
)


SELECT *
FROM invoice_created_at;'''

# Import as dataframe from Redshift
df_invoices_7day_all_accounts = pd.read_sql_query(sql_invoices_7day_all_accounts, connect_to_db)

# Export as a tab-separated file. The encoding is passed explicitly so that
# non-ASCII text in the free-text invoice columns is written as UTF-8 without
# relying on the interpreter's default string encoding (pandas falls back to
# ASCII on Python 2 when none is given).
df_invoices_7day_all_accounts.to_csv(
    "/Users/dwahid/Documents/GitHub/fraud_detection/data_v1/invoices_7days_all_accounts.csv",
    sep="\t", index=False, encoding="utf-8")

In [26]:
# Show the last five rows of the 7-day invoice frame as a sanity check
df_invoices_7day_all_accounts.tail(n=5)

Unnamed: 0,systemid,invoiceid,signup_date,create_date,created_at,description,notes,terms,address,days_to_invoice_creation
631386,4359910,,2019-06-07,,NaT,,,,,
631387,3628323,,2018-08-16,,NaT,,,,,
631388,3863002,,2018-11-16,,NaT,,,,,
631389,3905366,,2018-12-04,,NaT,,,,,
631390,3967988,,2019-01-04,,NaT,,,,,


### 2.2 Invoices created within 15 days after account signup date

In [17]:
# SQL for importing the invoices created up to 15 days after signup_date, for
# accounts that signed up between 2018-08-01 and 2019-07-30. Each row carries
# the invoice's description, notes, terms and address.
# NOTE(review): the window filter is `<= 15` with no lower bound, whereas the
# section 1 counts use `BETWEEN 0 AND N`. It is also applied after the
# LEFT JOIN, so an account whose only invoices fall outside the window is
# dropped, while an account with no invoices at all is kept as an all-NULL
# row - confirm both are intended.
sql_invoices_15day_all_accounts = '''WITH invoices_in_a_period AS (
    SELECT systemid, signup_date
    FROM report_systems rs
    WHERE signup_date BETWEEN '2018-08-01' and '2019-07-30'
), invoice_created_at AS (
    SELECT
           pic.systemid,
           inv.invoiceid,
           pic.signup_date,
           inv.create_date,
           inv.created_at,
           inv.description,
           inv.notes,
           inv.terms,
           inv.address,
           DATEDIFF(days, pic.signup_date, inv.created_at) AS days_to_invoice_creation
    FROM invoices_in_a_period AS pic
    LEFT JOIN coalesced_live_shards.invoice_stable as inv USING (systemid)
    WHERE ( days_to_invoice_creation <= 15 OR days_to_invoice_creation IS NULL)
)


SELECT *
FROM invoice_created_at;'''

# Import as dataframe from Redshift
df_invoices_15day_all_accounts = pd.read_sql_query(sql_invoices_15day_all_accounts, connect_to_db)

# Export as a tab-separated file. The encoding is passed explicitly so that
# non-ASCII text in the free-text invoice columns is written as UTF-8 without
# relying on the interpreter's default string encoding (pandas falls back to
# ASCII on Python 2 when none is given).
df_invoices_15day_all_accounts.to_csv(
    "/Users/dwahid/Documents/GitHub/fraud_detection/data_v1/invoices_15days_all_accounts.csv",
    sep="\t", index=False, encoding="utf-8")

In [18]:
# Show the last five rows of the 15-day invoice frame as a sanity check
df_invoices_15day_all_accounts.tail(n=5)

Unnamed: 0,systemid,invoiceid,signup_date,create_date,created_at,description,days_to_invoice_creation
707517,4103504,,2019-02-25,,NaT,,
707518,4218290,,2019-04-10,,NaT,,
707519,3873560,,2018-11-21,,NaT,,
707520,4130204,,2019-03-06,,NaT,,
707521,4481760,,2019-07-23,,NaT,,


### 2.3 Invoices created within 30 days after account signup date

In [20]:
# SQL for importing the invoices created up to 30 days after signup_date, for
# accounts that signed up between 2018-08-01 and 2019-07-30. Each row carries
# the invoice's description, notes, terms and address.
# NOTE(review): the window filter is `<= 30` with no lower bound, whereas the
# section 1 counts use `BETWEEN 0 AND N`. It is also applied after the
# LEFT JOIN, so an account whose only invoices fall outside the window is
# dropped, while an account with no invoices at all is kept as an all-NULL
# row - confirm both are intended.
sql_invoices_30day_all_accounts = '''WITH invoices_in_a_period AS (
    SELECT systemid, signup_date
    FROM report_systems rs
    WHERE signup_date BETWEEN '2018-08-01' and '2019-07-30'
), invoice_created_at AS (
    SELECT
           pic.systemid,
           inv.invoiceid,
           pic.signup_date,
           inv.create_date,
           inv.created_at,
           inv.description,
           inv.notes,
           inv.terms,
           inv.address,
           DATEDIFF(days, pic.signup_date, inv.created_at) AS days_to_invoice_creation
    FROM invoices_in_a_period AS pic
    LEFT JOIN coalesced_live_shards.invoice_stable as inv USING (systemid)
    WHERE ( days_to_invoice_creation <= 30 OR days_to_invoice_creation IS NULL)
)


SELECT *
FROM invoice_created_at;'''

# Import as dataframe from Redshift
df_invoices_30day_all_accounts = pd.read_sql_query(sql_invoices_30day_all_accounts, connect_to_db)

# Export as a tab-separated file. The encoding is passed explicitly so that
# non-ASCII text in the free-text invoice columns is written as UTF-8 without
# relying on the interpreter's default string encoding (pandas falls back to
# ASCII on Python 2 when none is given).
df_invoices_30day_all_accounts.to_csv(
    "/Users/dwahid/Documents/GitHub/fraud_detection/data_v1/invoices_30days_all_accounts.csv",
    sep="\t", index=False, encoding="utf-8")

In [21]:
# Show the last five rows of the 30-day invoice frame as a sanity check
df_invoices_30day_all_accounts.tail(n=5)

Unnamed: 0,systemid,invoiceid,signup_date,create_date,created_at,description,days_to_invoice_creation
838638,4032778,,2019-01-29,,NaT,,
838639,4083610,,2019-02-17,,NaT,,
838640,3619813,,2018-08-13,,NaT,,
838641,3779825,,2018-10-17,,NaT,,
838642,4094492,,2019-02-21,,NaT,,


### 2.4 Invoices created within 45 days after account signup date

In [24]:
# SQL for importing the invoices created up to 45 days after signup_date, for
# accounts that signed up between 2018-08-01 and 2019-07-30. Each row carries
# the invoice's description, notes, terms and address.
# NOTE(review): the window filter is `<= 45` with no lower bound, whereas the
# section 1 counts use `BETWEEN 0 AND N`. It is also applied after the
# LEFT JOIN, so an account whose only invoices fall outside the window is
# dropped, while an account with no invoices at all is kept as an all-NULL
# row - confirm both are intended.
sql_invoices_45day_all_accounts = '''WITH invoices_in_a_period AS (
    SELECT systemid, signup_date
    FROM report_systems rs
    WHERE signup_date BETWEEN '2018-08-01' and '2019-07-30'
), invoice_created_at AS (
    SELECT
           pic.systemid,
           inv.invoiceid,
           pic.signup_date,
           inv.create_date,
           inv.created_at,
           inv.description,
           inv.notes,
           inv.terms,
           inv.address,
           DATEDIFF(days, pic.signup_date, inv.created_at) AS days_to_invoice_creation
    FROM invoices_in_a_period AS pic
    LEFT JOIN coalesced_live_shards.invoice_stable as inv USING (systemid)
    WHERE ( days_to_invoice_creation <= 45 OR days_to_invoice_creation IS NULL)
)


SELECT *
FROM invoice_created_at;'''

# Import as dataframe from Redshift
df_invoices_45day_all_accounts = pd.read_sql_query(sql_invoices_45day_all_accounts, connect_to_db)

# Export as a tab-separated file. The encoding is passed explicitly so that
# non-ASCII text in the free-text invoice columns is written as UTF-8 without
# relying on the interpreter's default string encoding (pandas falls back to
# ASCII on Python 2 when none is given).
df_invoices_45day_all_accounts.to_csv(
    "/Users/dwahid/Documents/GitHub/fraud_detection/data_v1/invoices_45days_all_accounts.csv",
    sep="\t", index=False, encoding="utf-8")