# Fraud Accounts Lifespan Data Collection

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.pyplot as plt
# Use matplotlib's built-in 'ggplot' style sheet for all figures.
plt.style.use('ggplot')
from scipy import stats
# Turn on greedy tab completion in the IPython completer.
# NOTE(review): get_ipython() is only defined inside an IPython/Jupyter
# session, so this line raises NameError when run as a plain Python script.
get_ipython().magic(u'config IPCompleter.greedy=True')

# Apply seaborn's theme settings with color_codes enabled.
# NOTE(review): this runs after plt.style.use('ggplot') and presumably
# overrides parts of that style — confirm which look is intended.
sns.set(color_codes=True)

In [3]:
## Amazon Redshift connection
from contextlib import closing


import psycopg2
import pandas as pd
import simplejson

# Connection settings used when the caller does not supply its own.
DEFAULT_DB = 'data_depot'
DEFAULT_HOST = 'freshbooks-data.c8exzn6geij3.us-east-1.redshift.amazonaws.com'
DEFAULT_PORT = 5439


class PsycopgConnector:
    '''
    A database connector that uses Psycopg to connect to Redshift.

    Usage:

        psy_conn = PsycopgConnector(username, password)
        df = psy_conn.run_query(sql=sql, return_data=True)

    A new connection is opened for every call to ``run_query`` and closed
    again before the call returns.

    NOTE: Every query runs inside a transaction that is committed on
    success, so with return_data=False statements such as INSERT, DROP and
    TRUNCATE take effect against the DB.
    '''

    def __init__(
        self,
        username=None,
        password=None,
        db=DEFAULT_DB,
        host=DEFAULT_HOST,
        port=DEFAULT_PORT,
    ):
        '''
        Store the connection settings; no connection is opened here.

        Args:
            username: Database user name.
            password: Password for ``username``.
            db: Database name (defaults to DEFAULT_DB).
            host: Cluster host name (defaults to DEFAULT_HOST).
            port: Cluster port (defaults to DEFAULT_PORT).
        '''
        # Use the caller-supplied values; previously these arguments were
        # accepted but silently replaced by the module-level defaults.
        self.db = db
        self.host = host
        self.port = port

        self.username = username
        self.password = password

    def _get_connection(self):
        '''
        Open a new psycopg2 connection from the stored settings.

        The connection is also kept on ``self.conn``.

        Returns:
            The newly opened psycopg2 connection.
        '''
        self.conn = psycopg2.connect(
            dbname=self.db,
            user=self.username,
            password=self.password,
            host=self.host,
            port=self.port
        )

        return self.conn

    def run_query(self, sql, return_data=False):
        '''
        Run ``sql`` on a fresh connection.

        Args:
            sql: The SQL statement to execute.
            return_data: When True, return the result set as a DataFrame
                (via ``pd.read_sql``); when False, just execute the
                statement.

        Returns:
            A pandas DataFrame if ``return_data`` is True, otherwise None.
        '''
        # closing() guarantees the connection is closed; psycopg2's own
        # ``with conn`` only ends the transaction (commit on success,
        # rollback on error) and leaves the connection open.
        with closing(self._get_connection()) as conn:
            with conn:
                if return_data:
                    return pd.read_sql(sql=sql, con=conn)

                # A cursor is only needed when we execute the statement
                # ourselves; pd.read_sql manages its own.
                with conn.cursor() as cur:
                    cur.execute(sql)

        return None
                    

# Load the Redshift credentials from the local JSON file.
with open("redshift_creds.json.nogit") as fh:
    creds = simplejson.load(fh)

# .get() yields None when a key is absent, so a malformed credentials file
# surfaces later as a connection error rather than a KeyError here.
username = creds.get("user_name")
password = creds.get("password")

# Connector used by the query cells below (default database/host/port).
pig = PsycopgConnector(username, password)

In [4]:
# Test the connection: fetch a small sample from report_systems to confirm
# that the credentials and the connector work end to end.
query = "SELECT * FROM report_systems LIMIT 10"
df = pig.run_query(query, return_data=True)
# Display the first two rows as the cell output.
df.head(2)

Unnamed: 0,systemid,business_id,admin_identity_id,subdomain,is_freshbooks_account_active,is_modern,most_recent_migrated_to_smux_at,is_contractor,currency_code,timezone,...,staff_count,staff_deleted_count,contractor_count,contractor_deleted_count,user_contact_count,enabled_gateway_count,google_sso_first_linked_date,google_sso_most_recent_linked_date,google_sso_first_removal_date,google_sso_most_recent_removal_date
0,3723,,,https://shane.freshbooks.com,0,0,,0,USD,US/Eastern,...,0,0,0,0,0,0,,,,
1,3809,,,https://ASA.freshbooks.com,0,0,,0,USD,US/Eastern,...,1,0,0,0,0,0,,,,


In [8]:
## Fraud Accounts Lifespan Data Collection
# The query returns one row per (systemid, days_before_ban):
#   * fra_rs   - systemid and signup_date taken from report_systems.
#   * fra_list - every account in data_science.fraud_accounts_list, with
#                days_before_ban = DATEDIFF in days from signup_date to
#                status_date. The LEFT JOIN means days_before_ban is NULL
#                when an account has no report_systems row.
#   * The outer JOIN keeps only the systemids that also appear in
#     data_science.fraud_accounts_final.
# NOTE(review): status_date is presumably the date the account was banned
# (going by the days_before_ban alias) — confirm against the table definition.
sql_fraud_account_lifespan = '''select *
from (
         with fra_rs as (
             select systemid
                  , rs.signup_date
             from report_systems as rs
         ),
              fra_list as (
                  select fra.systemid
                       , DATEDIFF(days, fr.signup_date, fra.status_date::date) as days_before_ban
                  from data_science.fraud_accounts_list as fra
                           left join fra_rs as fr using (systemid)
              )

         select *
         from fra_list
     )
join (select systemid from data_science.fraud_accounts_final) using (systemid);'''

# Run the query on Redshift and load the result set into a DataFrame.
df_fraud_account_lifespan = pig.run_query(sql_fraud_account_lifespan, return_data=True)


In [9]:
## Sanity check: display the last rows of the query result
df_fraud_account_lifespan.tail()

Unnamed: 0,systemid,days_before_ban
357,1758644,690
358,1826920,742
359,2732755,631
360,732140,1542
361,1271218,1124


In [10]:
# Export as a tab-separated (.tsv) file, without the DataFrame index column.
# NOTE(review): the destination is an absolute path on one user's machine;
# it must be adjusted before running this cell anywhere else.
df_fraud_account_lifespan.to_csv(
    "/Users/dwahid/Documents/GitHub/fraud_detection/data/analyzing_fraud_accounts/fraud_accounts_lifespan.tsv", 
                                      sep="\t", index=False)