# Script 01: Identifying Fraud Risk Accounts

In [1]:
## 
import pandas as pd
import numpy as np
from datetime import date

from scipy import stats
# Turn on the IPCompleter "greedy" option for this session.
# run_line_magic(name, line) is the supported replacement for the deprecated
# InteractiveShell.magic("name line") entry point.
get_ipython().run_line_magic(u'config', u'IPCompleter.greedy=True')

In [2]:
## Testing AWS Connectivity
from contextlib import closing


import psycopg2
import simplejson
import sys
# Python 2 only: site.py removes sys.setdefaultencoding at start-up, so the
# module has to be reloaded before the default str<->unicode codec can be
# switched to UTF-8. On Python 3 neither the reload() builtin nor
# sys.setdefaultencoding exists (and the default is already UTF-8), so the
# hack is skipped there instead of raising NameError.
if sys.version_info[0] == 2:
    reload(sys)
    sys.setdefaultencoding('utf8')

DEFAULT_DB = 'data_depot'
DEFAULT_HOST = 'freshbooks-data.c8exzn6geij3.us-east-1.redshift.amazonaws.com'
DEFAULT_PORT = 5439


class PsycopgConnector:
    '''
    A database connector that uses Psycopg to connect to Redshift.

    How to play:

        psy_conn = PsycopgConnector(username, password)
        df = psy_conn.run_query(sql=sql, return_data=True)

    A new connection is opened for every run_query() call and closed
    again before the call returns.

    NOTE: This class commits queries to redshift if return_data=False.
    This means INSERT, DROP, TRUNCATE, etc. all work against the DB.
    '''

    def __init__(
        self,
        username=None,
        password=None,
        db=DEFAULT_DB,
        host=DEFAULT_HOST,
        port=DEFAULT_PORT,
    ):
        '''
        Store the connection settings; no connection is opened here.

        username, password -- credentials passed to psycopg2.connect
        db, host, port     -- connection target; default to the module-level
                              DEFAULT_DB / DEFAULT_HOST / DEFAULT_PORT
        '''

        # Honour the caller-supplied target. Previously these were always
        # overwritten with the DEFAULT_* constants, so db/host/port
        # arguments were silently ignored.
        self.db = db
        self.host = host
        self.port = port

        self.username = username
        self.password = password

    def _get_connection(self):
        '''
        Open a new psycopg2 connection from the stored settings.

        The connection is also kept on self.conn and returned.
        '''

        self.conn = psycopg2.connect(
            dbname=self.db,
            user=self.username,
            password=self.password,
            host=self.host,
            port=self.port
        )

        return self.conn

    def run_query(self, sql, return_data=False):
        '''
        Run `sql` on a fresh connection.

        sql         -- SQL text to execute
        return_data -- when True, return the result of pd.read_sql for the
                       query; when False, execute the statement and return
                       None

        closing() guarantees the connection is closed on the way out, and
        the inner `with conn` block wraps the work in a transaction.
        '''

        with closing(self._get_connection()) as conn:
            with conn:
                if return_data:
                    return pd.read_sql(sql=sql, con=conn)

                # Only the execute path needs its own cursor.
                with conn.cursor() as cur:
                    cur.execute(sql)
                    

# Load the Redshift login details from the JSON credentials file.
with open("redshift_creds.json.nogit") as fh:
    raw_creds = fh.read()
creds = simplejson.loads(raw_creds)

# .get() yields None for a missing key instead of raising.
username, password = (creds.get(key) for key in ("user_name", "password"))

# Connector pointed at the default database / host / port.
pig = PsycopgConnector(username=username, password=password)

# Connectivity smoke test: pull a few rows from report_systems.
sql_test = '''SELECT * FROM report_systems LIMIT 5'''
df_test = pig.run_query(sql=sql_test, return_data=True)

In [3]:
# Display the sample rows fetched by the connectivity test query.
df_test

Unnamed: 0,systemid,business_id,admin_identity_id,subdomain,is_freshbooks_account_active,is_modern,most_recent_migrated_to_smux_at,is_contractor,currency_code,timezone,...,staff_count,staff_deleted_count,contractor_count,contractor_deleted_count,user_contact_count,enabled_gateway_count,google_sso_first_linked_date,google_sso_most_recent_linked_date,google_sso_first_removal_date,google_sso_most_recent_removal_date
0,76,,,https://rimages.freshbooks.com,1,0,,0,EUR,US/Eastern,...,0,0,0,0,18,1,,,,
1,2865,,,https://mooredesigns.freshbooks.com,0,0,,0,USD,US/Mountain,...,1,0,0,0,86,1,,,,
2,3050,,,https://thrifty.freshbooks.com,1,0,,0,USD,US/Eastern,...,0,0,0,0,3,2,,,,
3,5490,61687.0,90905.0,https://jonom.freshbooks.com,1,1,2016-10-11,0,CAD,US/Pacific,...,0,0,0,0,0,1,,,,
4,5900,,,https://nrgomes.freshbooks.com,1,0,,0,EUR,Europe/London,...,1,0,0,0,0,0,,,,


## Number of Fraud Risk Accounts (N)

Please enter the number **(N)** of Fraud Risk Accounts that you want to check today.

In [4]:
#------------------------------------------------------------
# PLEASE READ AND ANSWER THIS QUESTION
#
# Question: How many accounts do you want to check today?
# Answer: set N below to that number of accounts.
#
# N is the number of rows taken from the top of the scored
# accounts table and exported to the support-labeling CSV.
#-----------------------------------------------------------
N = 25
#-------------------------------------------------------------

## Computing Fraud Risk Score (FRS)

The following script queries new accounts (those not already labeled) that signed up within the last 91 days (counted from today) and computes the corresponding **Fraud Risk Score (FRS)** for each of them.

In [5]:
## Pull new user accounts and compute Fraud Risk Score (FRS)
## Executes the companion notebook located next to this one.
## NOTE(review): the next cell loads a .tsv stamped with today's date --
## presumably that file is written by this run; confirm.
%run ./fraud_risk_score_computing.ipynb

In [6]:
## Load the new user accounts together with their Fraud Risk Score (FRS)
path = "/Users/dwahid/Documents/GitHub/fraud_detection/data/fraud_risk_acc_to_be_labeled_all_features/"

## The input file name carries today's date (YYYY-MM-DD).
## For the undated variant use file_name = "new_fraud_risk_acc_tbl_all_features"
## and data = path + file_name + ".tsv" instead.
file_name = "new_fraud_risk_acc_tbl_all_features_"
today = date.today().isoformat()
data = "".join([path, file_name, today, ".tsv"])

## Tab-separated table; FRA = Fraud Risk Accounts
df_fra_all_features = pd.read_csv(data, sep="\t")

In [7]:
# Inspect the last rows of the loaded feature table.
df_fra_all_features.tail()

Unnamed: 0,activateexpense,activateotherincome,activatepayment,admin_email,admindeactivation,adminonlinepaymentattempt,adminpayinvoiceonlineinvoice,adminpayinvoiceonlinelistview,archiveclient,archiveexpense,...,updatecontractor,updatecreditnote,updateestimate,updateexpense,updateinvoicesample,updateitem,updateservice,fraud_risk_score,fraud_label,support_note
55160,0.0,0.0,0.0,dwahinya@icloud.com,0.0,0,0,0.0,0.0,0.0,...,0.0,0,0.0,0.0,1.0,0.0,0.0,0.0,TBL,
55161,0.0,0.0,0.0,aya.1919113@stemgharbiya.moe.edu.eg,0.0,0,0,0.0,0.0,0.0,...,0.0,0,0.0,0.0,1.0,0.0,0.0,0.0,TBL,
55162,0.0,0.0,0.0,ofaydavis@yahoo.com,0.0,0,0,0.0,0.0,0.0,...,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,TBL,
55163,0.0,0.0,0.0,gautamsun784@gmail.com,0.0,0,0,0.0,0.0,0.0,...,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,TBL,
55164,0.0,0.0,0.0,arielsoto379@gmail.com,0.0,0,0,0.0,0.0,0.0,...,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,TBL,


In [8]:
# (rows, columns) of the loaded feature table.
df_fra_all_features.shape

(55165, 97)

## Return Top N Fraud Risk Accounts for Support Labeling
This step returns the top N Fraud Risk Accounts (FRA) for labeling by the support team.

In [9]:
## First N rows of the scored table, for support team reporting.
## NOTE(review): this relies on the loaded file already being ordered so
## that the riskiest accounts come first -- confirm against the scoring notebook.
df_fra_topN_all_features = df_fra_all_features.iloc[:N]

## Columns handed over to the support team
support_columns = [
    'systemid',
    'admin_email',
    'signup_date',
    'effective_date',
    'days_on_platform',
    'fraud_label',
    'support_note',
]
df_fra_topN_for_support = df_fra_topN_all_features[support_columns]

## Destination for the accounts that the support team needs to label.
## The file name carries today's date (YYYY-MM-DD); for the undated variant use
## file_name = "new_fraud_risk_acc_tbl_for_support" and
## path_fra_topN_for_support = path + file_name + ".csv" instead.
path = "/Users/dwahid/Documents/GitHub/fraud_detection/data/fraud_risk_acc_to_be_labeled_for_support/"
file_name = "new_fraud_risk_acc_tbl_for_support_"
today = date.today().isoformat()
path_fra_topN_for_support = "".join([path, file_name, today, ".csv"])

## Write the selection as a comma-separated file without the index column
df_fra_topN_for_support.to_csv(path_fra_topN_for_support, sep=",", index=False)

In [10]:
# Preview the accounts exported for the support team. Uses the configured N
# rather than a hard-coded 25, so the preview tracks the requested count.
df_fra_topN_for_support.head(N)

Unnamed: 0,systemid,admin_email,signup_date,effective_date,days_on_platform,fraud_label,support_note
0,5423977,jkpttreasures@gmail.com,2020-06-24,2020-07-06,12,TBL,
1,5420121,abel@foliocomunic.com,2020-06-23,2020-07-06,13,TBL,
2,5377715,lui.nathan@gmail.com,2020-06-12,2020-07-06,24,TBL,
3,5417697,saajith.samoon@themugshotlk.com,2020-06-23,2020-07-06,13,TBL,
4,5434377,ben@bayintegration.com,2020-06-26,2020-07-06,10,TBL,
5,5433889,katerbugs@gmail.com,2020-06-26,2020-07-06,10,TBL,
6,5341301,pauladlynn29@gmail.com,2020-06-03,2020-07-06,33,TBL,
7,5433763,brandon.cedarmfg@gmail.com,2020-06-26,2020-07-06,10,TBL,
8,5435493,anthonyholroyd1991@gmail.com,2020-06-27,2020-07-06,9,TBL,
9,5410469,rishitaadani@gmail.com,2020-06-20,2020-07-06,16,TBL,


In [11]:
# (rows, columns) of the table exported for the support team.
df_fra_topN_for_support.shape

(25, 7)

In [12]:
"You data file for Support labeling is ready"

'You data file for Support labeling is ready'