# Word Counts: Periodic Invoices - All Accounts

In [1]:
import pandas as pd
import numpy as np

from scipy import stats
get_ipython().magic(u'config IPCompleter.greedy=True')


In [2]:
# !pip install simplejson

## Connect with the Redshift Database

In [3]:
from contextlib import closing


import psycopg2
import simplejson
import sys
reload(sys)
sys.setdefaultencoding('utf8')

# Connection defaults used when the caller does not override them.
DEFAULT_DB = 'data_depot'
DEFAULT_HOST = 'freshbooks-data.c8exzn6geij3.us-east-1.redshift.amazonaws.com'
DEFAULT_PORT = 5439


class PsycopgConnector:
    '''
    A database connector that uses Psycopg to connect to Redshift.

    How to play:

        psy_conn = PsycopgConnector(username, password)
        df = psy_conn.run_query(sql=sql, return_data=True)

    A different database, host or port can be targeted by passing
    `db`, `host` and/or `port` to the constructor.

    NOTE: This class commits queries to redshift if return_data=False.
    This means INSERT, DROP, TRUNCATE, etc. all work against the DB.
    '''

    def __init__(
        self,
        username=None,
        password=None,
        db=DEFAULT_DB,
        host=DEFAULT_HOST,
        port=DEFAULT_PORT,
    ):
        '''
        Store the connection settings; no connection is opened here.

        username, password -- credentials passed to psycopg2.connect
        db, host, port     -- connection target; default to the module-level
                              DEFAULT_DB / DEFAULT_HOST / DEFAULT_PORT
        '''
        # Use the values the caller passed in. Previously the DEFAULT_*
        # constants were assigned unconditionally, so overrides were ignored.
        self.db = db
        self.host = host
        self.port = port

        self.username = username
        self.password = password

    def _get_connection(self):
        '''
        Open a new psycopg2 connection from the stored settings.

        The connection is also kept on `self.conn` and returned to the caller.
        '''
        self.conn = psycopg2.connect(
            dbname=self.db,
            user=self.username,
            password=self.password,
            host=self.host,
            port=self.port
        )

        return self.conn

    def run_query(self, sql, return_data=False):
        '''
        Run `sql` on a fresh connection.

        sql         -- the SQL text to execute
        return_data -- when True, return the result of pd.read_sql (a
                       DataFrame); when False, execute the statement through
                       a cursor and return None
        '''
        # closing() guarantees the connection is closed when the block exits.
        # Per the psycopg2 docs, `with conn` ends the transaction (commit on
        # success, rollback on exception) but does not close the connection.
        with closing(self._get_connection()) as conn:
            with conn, conn.cursor() as cur:
                if return_data:
                    return pd.read_sql(sql=sql, con=conn)
                else:
                    cur.execute(sql)
                    

# Load the Redshift credentials from the local JSON file
with open("redshift_creds.json.nogit") as fh:
    creds = simplejson.load(fh)

# Missing keys come back as None rather than raising
username, password = creds.get("user_name"), creds.get("password")

# Shared connector used by every query cell below
pig = PsycopgConnector(username=username, password=password)

In [4]:
# Smoke-test the connection by pulling a handful of rows from report_systems
sql_test = 'SELECT * FROM report_systems LIMIT 5'
df_test = pig.run_query(sql=sql_test, return_data=True)

In [5]:
df_test

Unnamed: 0,systemid,business_id,admin_identity_id,subdomain,is_freshbooks_account_active,is_modern,most_recent_migrated_to_smux_at,is_contractor,currency_code,timezone,...,staff_count,staff_deleted_count,contractor_count,contractor_deleted_count,user_contact_count,enabled_gateway_count,google_sso_first_linked_date,google_sso_most_recent_linked_date,google_sso_first_removal_date,google_sso_most_recent_removal_date
0,848,,,https://systemdt.freshbooks.com,1,0,,1,CAD,Etc/GMT+5,...,3,3,4,6,9,2,,,,
1,2154,,,https://IntercodeTechnologiesInc.freshbooks.com,1,0,,0,USD,US/Eastern,...,0,0,0,0,0,0,,,,
2,2332,91460.0,122105.0,https://cstoneweb.freshbooks.com,1,1,2016-11-14,0,USD,US/Eastern,...,0,0,0,0,1,1,,,,
3,6359,,,https://paul.freshbooks.com,0,0,,0,GBP,Etc/GMT,...,9,9,0,0,2,0,,,,
4,7541,,,https://Prological1.freshbooks.com,1,0,,0,USD,US/Eastern,...,0,0,0,0,0,0,,,,


## Functions

In [6]:
# Word count function
import re
def words_count(strg):
    """
    Return the number of words in `strg`.

    A word is a maximal run of word characters and/or apostrophes
    (regex ``[\\w']+``). An empty string or a null value (as judged by
    ``pd.isnull``) counts as zero words.
    """
    # Nothing to tokenise for empty or missing text
    is_blank = strg == '' or pd.isnull(strg)
    if is_blank:
        return 0

    return len(re.findall(r"[\w']+", strg))
    

# 1. Import Invoice Data & Extract Avg Word Counts Features

## 1.1 Invoice within 7 days

In [7]:
# SQL for importing all invoices created within 7 days after signup_date
# NOTE(review): the window filter is applied in WHERE after the LEFT JOIN, so an
# account whose invoices all fall outside the window is dropped entirely rather
# than kept with NULL invoice columns -- confirm this is intended.
sql_invoices_7days_all_accounts = '''WITH invoices_in_a_period AS (
    SELECT 
            systemid, 
            signup_date
    FROM report_systems rs
    WHERE signup_date BETWEEN '2019-08-01' and '2019-10-31'
), invoice_created_at AS (
    SELECT
           pic.systemid,
           pic.signup_date,
           inv.invoiceid,
           inv.create_date,
           inv.created_at,
           inv.description,
           inv.notes,
           inv.terms,
           inv.address,
           DATEDIFF(days, pic.signup_date, inv.created_at) AS days_to_invoice_creation
    FROM invoices_in_a_period AS pic
    LEFT JOIN coalesced_live_shards.invoice_stable as inv USING (systemid)
    WHERE ((days_to_invoice_creation BETWEEN 0 AND 7) OR days_to_invoice_creation IS NULL)
)

SELECT *
FROM invoice_created_at;'''

# Import as dataframe from redshift
df_invoices_7days_all_accounts = pig.run_query(sql_invoices_7days_all_accounts, return_data=True)

# Per-invoice word counts for description, notes, terms and address.
# Series.apply calls words_count once per value; this replaces a row-wise
# DataFrame.apply(axis=1) that built a full row object just to read one field.
df_invoices_7days_all_accounts['avg_wc_description_day_7'] = df_invoices_7days_all_accounts['description'].apply(words_count)
df_invoices_7days_all_accounts['avg_wc_notes_day_7'] = df_invoices_7days_all_accounts['notes'].apply(words_count)
df_invoices_7days_all_accounts['avg_wc_terms_day_7'] = df_invoices_7days_all_accounts['terms'].apply(words_count)
df_invoices_7days_all_accounts['avg_wc_address_day_7'] = df_invoices_7days_all_accounts['address'].apply(words_count)

# Drop the raw text columns, keeping identifiers, dates and the word counts
df_invoices_7days_all_accounts_fil = df_invoices_7days_all_accounts.filter([
    'systemid',
    'invoiceid',
    'signup_date',
    'create_date',
    'created_at',
    'days_to_invoice_creation',
    'avg_wc_description_day_7',
    'avg_wc_notes_day_7',
    'avg_wc_terms_day_7',
    'avg_wc_address_day_7',
])

# Average (not sum) over all invoices of each 'systemid'.
# numeric_only=True skips the date columns explicitly instead of relying on
# pandas silently dropping them, which newer pandas versions no longer do.
df_word_count_7days_all_accounts_total = df_invoices_7days_all_accounts_fil.groupby('systemid').mean(numeric_only=True)

# Final word count table; 'systemid' is the index after the groupby
df_word_count_7days_all_accounts_final = df_word_count_7days_all_accounts_total.filter([
    'avg_wc_description_day_7',
    'avg_wc_notes_day_7',
    'avg_wc_terms_day_7',
    'avg_wc_address_day_7',
])

In [8]:
# Export as a tab-separated file.
# The frame is indexed by 'systemid' (it comes from groupby('systemid')), so
# the index has to be written out; with index=False the file contained only
# the four averages and no way to tie a row back to its account.
df_word_count_7days_all_accounts_final.to_csv(
    "/Users/dwahid/Documents/GitHub/fraud_detection/data_new/df_word_count_7days_new_accounts_final.tsv",
    sep="\t", index=True)


## 1.2 Invoice within 14 days 

In [9]:
# SQL for importing all invoices created within 14 days after signup_date
# NOTE(review): the window filter is applied in WHERE after the LEFT JOIN, so an
# account whose invoices all fall outside the window is dropped entirely rather
# than kept with NULL invoice columns -- confirm this is intended.
sql_invoices_14days_all_accounts = '''WITH invoices_in_a_period AS (
    SELECT systemid, signup_date
    FROM report_systems rs
    WHERE signup_date BETWEEN '2019-08-01' and '2019-10-31'
), invoice_created_at AS (
    SELECT
           pic.systemid,
           inv.invoiceid,
           pic.signup_date,
           inv.create_date,
           inv.created_at,
           inv.description,
           inv.notes,
           inv.terms,
           inv.address,
           DATEDIFF(days, pic.signup_date, inv.created_at) AS days_to_invoice_creation
    FROM invoices_in_a_period AS pic
    LEFT JOIN coalesced_live_shards.invoice_stable as inv USING (systemid)
    WHERE ((days_to_invoice_creation BETWEEN 0 AND 14) OR days_to_invoice_creation IS NULL)
)

SELECT *
FROM invoice_created_at;'''

# Import as dataframe from redshift
df_invoices_14days_all_accounts = pig.run_query(sql_invoices_14days_all_accounts, return_data=True)

# Per-invoice word counts for description, notes, terms and address.
# Series.apply calls words_count once per value; this replaces a row-wise
# DataFrame.apply(axis=1) that built a full row object just to read one field.
df_invoices_14days_all_accounts['avg_wc_description_day_14'] = df_invoices_14days_all_accounts['description'].apply(words_count)
df_invoices_14days_all_accounts['avg_wc_notes_day_14'] = df_invoices_14days_all_accounts['notes'].apply(words_count)
df_invoices_14days_all_accounts['avg_wc_terms_day_14'] = df_invoices_14days_all_accounts['terms'].apply(words_count)
df_invoices_14days_all_accounts['avg_wc_address_day_14'] = df_invoices_14days_all_accounts['address'].apply(words_count)

# Drop the raw text columns, keeping identifiers, dates and the word counts
df_invoices_14days_all_accounts_fil = df_invoices_14days_all_accounts.filter([
    'systemid',
    'invoiceid',
    'signup_date',
    'create_date',
    'created_at',
    'days_to_invoice_creation',
    'avg_wc_description_day_14',
    'avg_wc_notes_day_14',
    'avg_wc_terms_day_14',
    'avg_wc_address_day_14',
])

# Average (not sum) over all invoices of each 'systemid'.
# numeric_only=True skips the date columns explicitly instead of relying on
# pandas silently dropping them, which newer pandas versions no longer do.
df_word_count_14days_all_accounts_total = df_invoices_14days_all_accounts_fil.groupby('systemid').mean(numeric_only=True)

# Final word count table; 'systemid' is the index after the groupby
df_word_count_14days_all_accounts_final = df_word_count_14days_all_accounts_total.filter([
    'avg_wc_description_day_14',
    'avg_wc_notes_day_14',
    'avg_wc_terms_day_14',
    'avg_wc_address_day_14',
])

# Export as a tab-separated file. The index ('systemid') must be written,
# otherwise the rows cannot be tied back to an account.
df_word_count_14days_all_accounts_final.to_csv(
    "/Users/dwahid/Documents/GitHub/fraud_detection/data_new/df_word_count_14days_new_accounts_final.tsv",
    sep="\t", index=True)



In [10]:
df_word_count_14days_all_accounts_final.head()

Unnamed: 0_level_0,avg_wc_description_day_14,avg_wc_notes_day_14,avg_wc_terms_day_14,avg_wc_address_day_14
systemid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
4504870,0.0,0.0,0.0,0.0
4504872,1.0,0.0,0.0,0.0
4504874,0.0,0.0,0.0,0.0
4504876,0.0,0.0,0.0,0.0
4504878,0.0,0.0,0.0,0.0


## 1.3 Invoice within 21 days 

In [12]:
# SQL for importing all invoices created within 21 days after signup_date
# NOTE(review): the window filter is applied in WHERE after the LEFT JOIN, so an
# account whose invoices all fall outside the window is dropped entirely rather
# than kept with NULL invoice columns -- confirm this is intended.
sql_invoices_21days_all_accounts = '''WITH invoices_in_a_period AS (
    SELECT systemid, signup_date
    FROM report_systems rs
    WHERE signup_date BETWEEN '2019-08-01' and '2019-10-31'
), invoice_created_at AS (
    SELECT
           pic.systemid,
           inv.invoiceid,
           pic.signup_date,
           inv.create_date,
           inv.created_at,
           inv.description,
           inv.notes,
           inv.terms,
           inv.address,
           DATEDIFF(days, pic.signup_date, inv.created_at) AS days_to_invoice_creation
    FROM invoices_in_a_period AS pic
    LEFT JOIN coalesced_live_shards.invoice_stable as inv USING (systemid)
    WHERE ((days_to_invoice_creation BETWEEN 0 AND 21) OR days_to_invoice_creation IS NULL)
)

SELECT *
FROM invoice_created_at;'''

# Import as dataframe from redshift
df_invoices_21days_all_accounts = pig.run_query(sql_invoices_21days_all_accounts, return_data=True)

# Per-invoice word counts for description, notes, terms and address.
# Series.apply calls words_count once per value; this replaces a row-wise
# DataFrame.apply(axis=1) that built a full row object just to read one field.
df_invoices_21days_all_accounts['avg_wc_description_day_21'] = df_invoices_21days_all_accounts['description'].apply(words_count)
df_invoices_21days_all_accounts['avg_wc_notes_day_21'] = df_invoices_21days_all_accounts['notes'].apply(words_count)
df_invoices_21days_all_accounts['avg_wc_terms_day_21'] = df_invoices_21days_all_accounts['terms'].apply(words_count)
df_invoices_21days_all_accounts['avg_wc_address_day_21'] = df_invoices_21days_all_accounts['address'].apply(words_count)

# Drop the raw text columns, keeping identifiers, dates and the word counts
df_invoices_21days_all_accounts_fil = df_invoices_21days_all_accounts.filter([
    'systemid',
    'invoiceid',
    'signup_date',
    'create_date',
    'created_at',
    'days_to_invoice_creation',
    'avg_wc_description_day_21',
    'avg_wc_notes_day_21',
    'avg_wc_terms_day_21',
    'avg_wc_address_day_21',
])

# Average (not sum) over all invoices of each 'systemid'.
# numeric_only=True skips the date columns explicitly instead of relying on
# pandas silently dropping them, which newer pandas versions no longer do.
df_word_count_21days_all_accounts_total = df_invoices_21days_all_accounts_fil.groupby('systemid').mean(numeric_only=True)

# Final word count table; 'systemid' is the index after the groupby
df_word_count_21days_all_accounts_final = df_word_count_21days_all_accounts_total.filter([
    'avg_wc_description_day_21',
    'avg_wc_notes_day_21',
    'avg_wc_terms_day_21',
    'avg_wc_address_day_21',
])

# Export as a tab-separated file. The index ('systemid') must be written,
# otherwise the rows cannot be tied back to an account.
df_word_count_21days_all_accounts_final.to_csv(
    "/Users/dwahid/Documents/GitHub/fraud_detection/data_new/df_word_count_21days_new_accounts_final.tsv",
    sep="\t", index=True)


In [13]:
df_word_count_21days_all_accounts_final.head()

Unnamed: 0_level_0,avg_wc_description_day_21,avg_wc_notes_day_21,avg_wc_terms_day_21,avg_wc_address_day_21
systemid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
4504870,0.0,0.0,0.0,0.0
4504872,1.0,0.0,0.0,0.0
4504874,0.0,0.0,0.0,0.0
4504876,0.0,0.0,0.0,0.0
4504878,0.0,0.0,0.0,0.0


## 1.4 Invoice within 28 days 

In [14]:
# SQL for importing all invoices created within 28 days after signup_date
# NOTE(review): the window filter is applied in WHERE after the LEFT JOIN, so an
# account whose invoices all fall outside the window is dropped entirely rather
# than kept with NULL invoice columns -- confirm this is intended.
sql_invoices_28days_all_accounts = '''WITH invoices_in_a_period AS (
    SELECT systemid, signup_date
    FROM report_systems rs
    WHERE signup_date BETWEEN '2019-08-01' and '2019-10-31'
), invoice_created_at AS (
    SELECT
           pic.systemid,
           inv.invoiceid,
           pic.signup_date,
           inv.create_date,
           inv.created_at,
           inv.description,
           inv.notes,
           inv.terms,
           inv.address,
           DATEDIFF(days, pic.signup_date, inv.created_at) AS days_to_invoice_creation
    FROM invoices_in_a_period AS pic
    LEFT JOIN coalesced_live_shards.invoice_stable as inv USING (systemid)
    WHERE ((days_to_invoice_creation BETWEEN 0 AND 28) OR days_to_invoice_creation IS NULL)
)

SELECT *
FROM invoice_created_at;'''

# Import as dataframe from redshift
df_invoices_28days_all_accounts = pig.run_query(sql_invoices_28days_all_accounts, return_data=True)

# Per-invoice word counts for description, notes, terms and address.
# Series.apply calls words_count once per value; this replaces a row-wise
# DataFrame.apply(axis=1) that built a full row object just to read one field.
df_invoices_28days_all_accounts['avg_wc_description_day_28'] = df_invoices_28days_all_accounts['description'].apply(words_count)
df_invoices_28days_all_accounts['avg_wc_notes_day_28'] = df_invoices_28days_all_accounts['notes'].apply(words_count)
df_invoices_28days_all_accounts['avg_wc_terms_day_28'] = df_invoices_28days_all_accounts['terms'].apply(words_count)
df_invoices_28days_all_accounts['avg_wc_address_day_28'] = df_invoices_28days_all_accounts['address'].apply(words_count)

# Drop the raw text columns, keeping identifiers, dates and the word counts
df_invoices_28days_all_accounts_fil = df_invoices_28days_all_accounts.filter([
    'systemid',
    'invoiceid',
    'signup_date',
    'create_date',
    'created_at',
    'days_to_invoice_creation',
    'avg_wc_description_day_28',
    'avg_wc_notes_day_28',
    'avg_wc_terms_day_28',
    'avg_wc_address_day_28',
])

# Average (not sum) over all invoices of each 'systemid'.
# numeric_only=True skips the date columns explicitly instead of relying on
# pandas silently dropping them, which newer pandas versions no longer do.
df_word_count_28days_all_accounts_total = df_invoices_28days_all_accounts_fil.groupby('systemid').mean(numeric_only=True)

# Final word count table; 'systemid' is the index after the groupby
df_word_count_28days_all_accounts_final = df_word_count_28days_all_accounts_total.filter([
    'avg_wc_description_day_28',
    'avg_wc_notes_day_28',
    'avg_wc_terms_day_28',
    'avg_wc_address_day_28',
])

# Export as a tab-separated file. The index ('systemid') must be written,
# otherwise the rows cannot be tied back to an account.
df_word_count_28days_all_accounts_final.to_csv(
    "/Users/dwahid/Documents/GitHub/fraud_detection/data_new/df_word_count_28days_new_accounts_final.tsv",
    sep="\t", index=True)


In [15]:
df_word_count_28days_all_accounts_final.head()

Unnamed: 0_level_0,avg_wc_description_day_28,avg_wc_notes_day_28,avg_wc_terms_day_28,avg_wc_address_day_28
systemid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
4504870,0.0,0.0,0.0,0.0
4504872,1.0,0.0,0.0,0.0
4504874,0.0,0.0,0.0,0.0
4504876,0.0,0.0,0.0,0.0
4504878,0.0,0.0,0.0,0.0


## 1.5 Invoice within 35 days 

In [16]:
# SQL for importing all invoices created within 35 days after signup_date
# NOTE(review): the window filter is applied in WHERE after the LEFT JOIN, so an
# account whose invoices all fall outside the window is dropped entirely rather
# than kept with NULL invoice columns -- confirm this is intended.
sql_invoices_35days_all_accounts = '''WITH invoices_in_a_period AS (
    SELECT systemid, signup_date
    FROM report_systems rs
    WHERE signup_date BETWEEN '2019-08-01' and '2019-10-31'
), invoice_created_at AS (
    SELECT
           pic.systemid,
           inv.invoiceid,
           pic.signup_date,
           inv.create_date,
           inv.created_at,
           inv.description,
           inv.notes,
           inv.terms,
           inv.address,
           DATEDIFF(days, pic.signup_date, inv.created_at) AS days_to_invoice_creation
    FROM invoices_in_a_period AS pic
    LEFT JOIN coalesced_live_shards.invoice_stable as inv USING (systemid)
    WHERE ((days_to_invoice_creation BETWEEN 0 AND 35) OR days_to_invoice_creation IS NULL)
)

SELECT *
FROM invoice_created_at;'''

# Import as dataframe from redshift
df_invoices_35days_all_accounts = pig.run_query(sql_invoices_35days_all_accounts, return_data=True)

# Per-invoice word counts for description, notes, terms and address.
# Series.apply calls words_count once per value; this replaces a row-wise
# DataFrame.apply(axis=1) that built a full row object just to read one field.
df_invoices_35days_all_accounts['avg_wc_description_day_35'] = df_invoices_35days_all_accounts['description'].apply(words_count)
df_invoices_35days_all_accounts['avg_wc_notes_day_35'] = df_invoices_35days_all_accounts['notes'].apply(words_count)
df_invoices_35days_all_accounts['avg_wc_terms_day_35'] = df_invoices_35days_all_accounts['terms'].apply(words_count)
df_invoices_35days_all_accounts['avg_wc_address_day_35'] = df_invoices_35days_all_accounts['address'].apply(words_count)

# Drop the raw text columns, keeping identifiers, dates and the word counts
df_invoices_35days_all_accounts_fil = df_invoices_35days_all_accounts.filter([
    'systemid',
    'invoiceid',
    'signup_date',
    'create_date',
    'created_at',
    'days_to_invoice_creation',
    'avg_wc_description_day_35',
    'avg_wc_notes_day_35',
    'avg_wc_terms_day_35',
    'avg_wc_address_day_35',
])

# Average (not sum) over all invoices of each 'systemid'.
# numeric_only=True skips the date columns explicitly instead of relying on
# pandas silently dropping them, which newer pandas versions no longer do.
df_word_count_35days_all_accounts_total = df_invoices_35days_all_accounts_fil.groupby('systemid').mean(numeric_only=True)

# Final word count table; 'systemid' is the index after the groupby
df_word_count_35days_all_accounts_final = df_word_count_35days_all_accounts_total.filter([
    'avg_wc_description_day_35',
    'avg_wc_notes_day_35',
    'avg_wc_terms_day_35',
    'avg_wc_address_day_35',
])

# Export as a tab-separated file. The index ('systemid') must be written,
# otherwise the rows cannot be tied back to an account.
df_word_count_35days_all_accounts_final.to_csv(
    "/Users/dwahid/Documents/GitHub/fraud_detection/data_new/df_word_count_35days_new_accounts_final.tsv",
    sep="\t", index=True)


In [17]:
df_word_count_35days_all_accounts_final.head()

Unnamed: 0_level_0,avg_wc_description_day_35,avg_wc_notes_day_35,avg_wc_terms_day_35,avg_wc_address_day_35
systemid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
4504870,0.0,0.0,0.0,0.0
4504872,1.0,0.0,0.0,0.0
4504874,0.0,0.0,0.0,0.0
4504876,0.0,0.0,0.0,0.0
4504878,0.0,0.0,0.0,0.0


## 1.6 Invoice within 42 days 

In [18]:
# SQL for importing all invoices created within 42 days after signup_date
# NOTE(review): the window filter is applied in WHERE after the LEFT JOIN, so an
# account whose invoices all fall outside the window is dropped entirely rather
# than kept with NULL invoice columns -- confirm this is intended.
sql_invoices_42days_all_accounts = '''WITH invoices_in_a_period AS (
    SELECT systemid, signup_date
    FROM report_systems rs
    WHERE signup_date BETWEEN '2019-08-01' and '2019-10-31'
), invoice_created_at AS (
    SELECT
           pic.systemid,
           inv.invoiceid,
           pic.signup_date,
           inv.create_date,
           inv.created_at,
           inv.description,
           inv.notes,
           inv.terms,
           inv.address,
           DATEDIFF(days, pic.signup_date, inv.created_at) AS days_to_invoice_creation
    FROM invoices_in_a_period AS pic
    LEFT JOIN coalesced_live_shards.invoice_stable as inv USING (systemid)
    WHERE ((days_to_invoice_creation BETWEEN 0 AND 42) OR days_to_invoice_creation IS NULL)
)

SELECT *
FROM invoice_created_at;'''

# Import as dataframe from redshift
df_invoices_42days_all_accounts = pig.run_query(sql_invoices_42days_all_accounts, return_data=True)

# Per-invoice word counts for description, notes, terms and address.
# Series.apply calls words_count once per value; this replaces a row-wise
# DataFrame.apply(axis=1) that built a full row object just to read one field.
df_invoices_42days_all_accounts['avg_wc_description_day_42'] = df_invoices_42days_all_accounts['description'].apply(words_count)
df_invoices_42days_all_accounts['avg_wc_notes_day_42'] = df_invoices_42days_all_accounts['notes'].apply(words_count)
df_invoices_42days_all_accounts['avg_wc_terms_day_42'] = df_invoices_42days_all_accounts['terms'].apply(words_count)
df_invoices_42days_all_accounts['avg_wc_address_day_42'] = df_invoices_42days_all_accounts['address'].apply(words_count)

# Drop the raw text columns, keeping identifiers, dates and the word counts
df_invoices_42days_all_accounts_fil = df_invoices_42days_all_accounts.filter([
    'systemid',
    'invoiceid',
    'signup_date',
    'create_date',
    'created_at',
    'days_to_invoice_creation',
    'avg_wc_description_day_42',
    'avg_wc_notes_day_42',
    'avg_wc_terms_day_42',
    'avg_wc_address_day_42',
])

# Average (not sum) over all invoices of each 'systemid'.
# numeric_only=True skips the date columns explicitly instead of relying on
# pandas silently dropping them, which newer pandas versions no longer do.
df_word_count_42days_all_accounts_total = df_invoices_42days_all_accounts_fil.groupby('systemid').mean(numeric_only=True)

# Final word count table; 'systemid' is the index after the groupby
df_word_count_42days_all_accounts_final = df_word_count_42days_all_accounts_total.filter([
    'avg_wc_description_day_42',
    'avg_wc_notes_day_42',
    'avg_wc_terms_day_42',
    'avg_wc_address_day_42',
])

# Export as a tab-separated file. The index ('systemid') must be written,
# otherwise the rows cannot be tied back to an account.
df_word_count_42days_all_accounts_final.to_csv(
    "/Users/dwahid/Documents/GitHub/fraud_detection/data_new/df_word_count_42days_new_accounts_final.tsv",
    sep="\t", index=True)


In [19]:
df_word_count_42days_all_accounts_final.head()

Unnamed: 0_level_0,avg_wc_description_day_42,avg_wc_notes_day_42,avg_wc_terms_day_42,avg_wc_address_day_42
systemid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
4504870,0.0,0.0,0.0,0.0
4504872,1.0,0.0,0.0,0.0
4504874,0.0,0.0,0.0,0.0
4504876,0.0,0.0,0.0,0.0
4504878,0.0,0.0,0.0,0.0


## 1.7 Invoice within 49 days

In [20]:
# SQL for importing all invoices created within 49 days after signup_date
# NOTE(review): the window filter is applied in WHERE after the LEFT JOIN, so an
# account whose invoices all fall outside the window is dropped entirely rather
# than kept with NULL invoice columns -- confirm this is intended.
sql_invoices_49days_all_accounts = '''WITH invoices_in_a_period AS (
    SELECT systemid, signup_date
    FROM report_systems rs
    WHERE signup_date BETWEEN '2019-08-01' and '2019-10-31'
), invoice_created_at AS (
    SELECT
           pic.systemid,
           inv.invoiceid,
           pic.signup_date,
           inv.create_date,
           inv.created_at,
           inv.description,
           inv.notes,
           inv.terms,
           inv.address,
           DATEDIFF(days, pic.signup_date, inv.created_at) AS days_to_invoice_creation
    FROM invoices_in_a_period AS pic
    LEFT JOIN coalesced_live_shards.invoice_stable as inv USING (systemid)
    WHERE ((days_to_invoice_creation BETWEEN 0 AND 49) OR days_to_invoice_creation IS NULL)
)

SELECT *
FROM invoice_created_at;'''

# Import as dataframe from redshift
df_invoices_49days_all_accounts = pig.run_query(sql_invoices_49days_all_accounts, return_data=True)

# Per-invoice word counts for description, notes, terms and address.
# Series.apply calls words_count once per value; this replaces a row-wise
# DataFrame.apply(axis=1) that built a full row object just to read one field.
df_invoices_49days_all_accounts['avg_wc_description_day_49'] = df_invoices_49days_all_accounts['description'].apply(words_count)
df_invoices_49days_all_accounts['avg_wc_notes_day_49'] = df_invoices_49days_all_accounts['notes'].apply(words_count)
df_invoices_49days_all_accounts['avg_wc_terms_day_49'] = df_invoices_49days_all_accounts['terms'].apply(words_count)
df_invoices_49days_all_accounts['avg_wc_address_day_49'] = df_invoices_49days_all_accounts['address'].apply(words_count)

# Drop the raw text columns, keeping identifiers, dates and the word counts
df_invoices_49days_all_accounts_fil = df_invoices_49days_all_accounts.filter([
    'systemid',
    'invoiceid',
    'signup_date',
    'create_date',
    'created_at',
    'days_to_invoice_creation',
    'avg_wc_description_day_49',
    'avg_wc_notes_day_49',
    'avg_wc_terms_day_49',
    'avg_wc_address_day_49',
])

# Average (not sum) over all invoices of each 'systemid'.
# numeric_only=True skips the date columns explicitly instead of relying on
# pandas silently dropping them, which newer pandas versions no longer do.
df_word_count_49days_all_accounts_total = df_invoices_49days_all_accounts_fil.groupby('systemid').mean(numeric_only=True)

# Final word count table; 'systemid' is the index after the groupby
df_word_count_49days_all_accounts_final = df_word_count_49days_all_accounts_total.filter([
    'avg_wc_description_day_49',
    'avg_wc_notes_day_49',
    'avg_wc_terms_day_49',
    'avg_wc_address_day_49',
])

# Export as a tab-separated file. The index ('systemid') must be written,
# otherwise the rows cannot be tied back to an account.
df_word_count_49days_all_accounts_final.to_csv(
    "/Users/dwahid/Documents/GitHub/fraud_detection/data_new/df_word_count_49days_new_accounts_final.tsv",
    sep="\t", index=True)


In [21]:
# Preview the first rows of the per-account average word counts (49-day window)
df_word_count_49days_all_accounts_final.head()

Unnamed: 0_level_0,avg_wc_description_day_49,avg_wc_notes_day_49,avg_wc_terms_day_49,avg_wc_address_day_49
systemid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
4504870,0.0,0.0,0.0,0.0
4504872,1.0,0.0,0.0,0.0
4504874,0.0,0.0,0.0,0.0
4504876,0.0,0.0,0.0,0.0
4504878,0.0,0.0,0.0,0.0


## 1.8 Invoice within 56 days

In [23]:
# SQL for importing all invoices created within 56 days after signup_date
# NOTE(review): the WHERE clause is applied after the LEFT JOIN, so an account whose
# invoices were all created outside the 0-56 day window is dropped from the result;
# only rows with a NULL days_to_invoice_creation survive without an in-window invoice.
# Confirm this is the intended population.
sql_invoices_56days_all_accounts = '''WITH invoices_in_a_period AS (
    SELECT systemid, signup_date
    FROM report_systems rs
    WHERE signup_date BETWEEN '2019-08-01' and '2019-10-31'
), invoice_created_at AS (
    SELECT
           pic.systemid,
           inv.invoiceid,
           pic.signup_date,
           inv.create_date,
           inv.created_at,
           inv.description,
           inv.notes,
           inv.terms,
           inv.address,
           DATEDIFF(days, pic.signup_date, inv.created_at) AS days_to_invoice_creation
    FROM invoices_in_a_period AS pic
    LEFT JOIN coalesced_live_shards.invoice_stable as inv USING (systemid)
    WHERE ((days_to_invoice_creation BETWEEN 0 AND 56) OR days_to_invoice_creation IS NULL)
)

SELECT *
FROM invoice_created_at;'''

# Import as dataframe from redshift
df_invoices_56days_all_accounts = pig.run_query(sql_invoices_56days_all_accounts, return_data=True)

# Word count of each invoice text field (description, notes, terms, address).
# Series.apply hands every cell value straight to words_count, which avoids
# building a full row object per invoice as DataFrame.apply(axis=1) does.
wc_columns_day_56 = []
for text_field in ['description', 'notes', 'terms', 'address']:
    wc_column = 'avg_wc_{}_day_56'.format(text_field)
    df_invoices_56days_all_accounts[wc_column] = df_invoices_56days_all_accounts[text_field].apply(words_count)
    wc_columns_day_56.append(wc_column)

# Keep the identifying/date columns plus the word counts (drops the raw text columns)
df_invoices_56days_all_accounts_fil = df_invoices_56days_all_accounts.filter(
    ['systemid', 'invoiceid', 'signup_date', 'create_date', 'created_at',
     'days_to_invoice_creation'] + wc_columns_day_56)

# Average (not sum) over all invoices of a 'systemid';
# the groupby moves 'systemid' into the index of the result
df_word_count_56days_all_accounts_total = df_invoices_56days_all_accounts_fil.groupby('systemid').mean()

# Final word count table: only the average word counts, indexed by 'systemid'
df_word_count_56days_all_accounts_final = df_word_count_56days_all_accounts_total.filter(wc_columns_day_56)

# Export as tab-separated file. 'systemid' lives in the index after the groupby,
# so the index has to be written: with index=False the account id was lost
# and the rows of the file could no longer be attributed to an account.
df_word_count_56days_all_accounts_final.to_csv(
    "/Users/dwahid/Documents/GitHub/fraud_detection/data_new/df_word_count_56days_new_accounts_final.tsv",
    sep="\t", index=True)


In [24]:
# Preview the first rows of the per-account average word counts (56-day window)
df_word_count_56days_all_accounts_final.head()

Unnamed: 0_level_0,avg_wc_description_day_56,avg_wc_notes_day_56,avg_wc_terms_day_56,avg_wc_address_day_56
systemid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
4504870,0.0,0.0,0.0,0.0
4504872,1.0,0.0,0.0,0.0
4504874,0.0,0.0,0.0,0.0
4504876,0.0,0.0,0.0,0.0
4504878,0.0,0.0,0.0,0.0


## 1.9 Invoice within 63 days 

In [26]:
# SQL for importing all invoices created within 63 days after signup_date
# NOTE(review): the WHERE clause is applied after the LEFT JOIN, so an account whose
# invoices were all created outside the 0-63 day window is dropped from the result;
# only rows with a NULL days_to_invoice_creation survive without an in-window invoice.
# Confirm this is the intended population.
sql_invoices_63days_all_accounts = '''WITH invoices_in_a_period AS (
    SELECT systemid, signup_date
    FROM report_systems rs
    WHERE signup_date BETWEEN '2019-08-01' and '2019-10-31'
), invoice_created_at AS (
    SELECT
           pic.systemid,
           inv.invoiceid,
           pic.signup_date,
           inv.create_date,
           inv.created_at,
           inv.description,
           inv.notes,
           inv.terms,
           inv.address,
           DATEDIFF(days, pic.signup_date, inv.created_at) AS days_to_invoice_creation
    FROM invoices_in_a_period AS pic
    LEFT JOIN coalesced_live_shards.invoice_stable as inv USING (systemid)
    WHERE ((days_to_invoice_creation BETWEEN 0 AND 63) OR days_to_invoice_creation IS NULL)
)

SELECT *
FROM invoice_created_at;'''

# Import as dataframe from redshift
df_invoices_63days_all_accounts = pig.run_query(sql_invoices_63days_all_accounts, return_data=True)

# Word count of each invoice text field (description, notes, terms, address).
# Series.apply hands every cell value straight to words_count, which avoids
# building a full row object per invoice as DataFrame.apply(axis=1) does.
wc_columns_day_63 = []
for text_field in ['description', 'notes', 'terms', 'address']:
    wc_column = 'avg_wc_{}_day_63'.format(text_field)
    df_invoices_63days_all_accounts[wc_column] = df_invoices_63days_all_accounts[text_field].apply(words_count)
    wc_columns_day_63.append(wc_column)

# Keep the identifying/date columns plus the word counts (drops the raw text columns)
df_invoices_63days_all_accounts_fil = df_invoices_63days_all_accounts.filter(
    ['systemid', 'invoiceid', 'signup_date', 'create_date', 'created_at',
     'days_to_invoice_creation'] + wc_columns_day_63)

# Average (not sum) over all invoices of a 'systemid';
# the groupby moves 'systemid' into the index of the result
df_word_count_63days_all_accounts_total = df_invoices_63days_all_accounts_fil.groupby('systemid').mean()

# Final word count table: only the average word counts, indexed by 'systemid'
df_word_count_63days_all_accounts_final = df_word_count_63days_all_accounts_total.filter(wc_columns_day_63)

# Export as tab-separated file. 'systemid' lives in the index after the groupby,
# so the index has to be written: with index=False the account id was lost
# and the rows of the file could no longer be attributed to an account.
df_word_count_63days_all_accounts_final.to_csv(
    "/Users/dwahid/Documents/GitHub/fraud_detection/data_new/df_word_count_63days_new_accounts_final.tsv",
    sep="\t", index=True)


In [27]:
# Preview the first rows of the per-account average word counts (63-day window)
df_word_count_63days_all_accounts_final.head()

Unnamed: 0_level_0,avg_wc_description_day_63,avg_wc_notes_day_63,avg_wc_terms_day_63,avg_wc_address_day_63
systemid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
4504870,0.0,0.0,0.0,0.0
4504872,1.0,0.0,0.0,0.0
4504874,0.0,0.0,0.0,0.0
4504876,0.0,0.0,0.0,0.0
4504878,0.0,0.0,0.0,0.0


## 1.10 Invoice within 70 days 

In [28]:
# SQL for importing all invoices created within 70 days after signup_date
# NOTE(review): the WHERE clause is applied after the LEFT JOIN, so an account whose
# invoices were all created outside the 0-70 day window is dropped from the result;
# only rows with a NULL days_to_invoice_creation survive without an in-window invoice.
# Confirm this is the intended population.
sql_invoices_70days_all_accounts = '''WITH invoices_in_a_period AS (
    SELECT systemid, signup_date
    FROM report_systems rs
    WHERE signup_date BETWEEN '2019-08-01' and '2019-10-31'
), invoice_created_at AS (
    SELECT
           pic.systemid,
           inv.invoiceid,
           pic.signup_date,
           inv.create_date,
           inv.created_at,
           inv.description,
           inv.notes,
           inv.terms,
           inv.address,
           DATEDIFF(days, pic.signup_date, inv.created_at) AS days_to_invoice_creation
    FROM invoices_in_a_period AS pic
    LEFT JOIN coalesced_live_shards.invoice_stable as inv USING (systemid)
    WHERE ((days_to_invoice_creation BETWEEN 0 AND 70) OR days_to_invoice_creation IS NULL)
)

SELECT *
FROM invoice_created_at;'''

# Import as dataframe from redshift
df_invoices_70days_all_accounts = pig.run_query(sql_invoices_70days_all_accounts, return_data=True)

# Word count of each invoice text field (description, notes, terms, address).
# Series.apply hands every cell value straight to words_count, which avoids
# building a full row object per invoice as DataFrame.apply(axis=1) does.
wc_columns_day_70 = []
for text_field in ['description', 'notes', 'terms', 'address']:
    wc_column = 'avg_wc_{}_day_70'.format(text_field)
    df_invoices_70days_all_accounts[wc_column] = df_invoices_70days_all_accounts[text_field].apply(words_count)
    wc_columns_day_70.append(wc_column)

# Keep the identifying/date columns plus the word counts (drops the raw text columns)
df_invoices_70days_all_accounts_fil = df_invoices_70days_all_accounts.filter(
    ['systemid', 'invoiceid', 'signup_date', 'create_date', 'created_at',
     'days_to_invoice_creation'] + wc_columns_day_70)

# Average (not sum) over all invoices of a 'systemid';
# the groupby moves 'systemid' into the index of the result
df_word_count_70days_all_accounts_total = df_invoices_70days_all_accounts_fil.groupby('systemid').mean()

# Final word count table: only the average word counts, indexed by 'systemid'
df_word_count_70days_all_accounts_final = df_word_count_70days_all_accounts_total.filter(wc_columns_day_70)

# Export as tab-separated file. 'systemid' lives in the index after the groupby,
# so the index has to be written: with index=False the account id was lost
# and the rows of the file could no longer be attributed to an account.
df_word_count_70days_all_accounts_final.to_csv(
    "/Users/dwahid/Documents/GitHub/fraud_detection/data_new/df_word_count_70days_new_accounts_final.tsv",
    sep="\t", index=True)


In [29]:
# Preview the first rows of the per-account average word counts (70-day window)
df_word_count_70days_all_accounts_final.head()

Unnamed: 0_level_0,avg_wc_description_day_70,avg_wc_notes_day_70,avg_wc_terms_day_70,avg_wc_address_day_70
systemid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
4504870,0.0,0.0,0.0,0.0
4504872,1.0,0.0,0.0,0.0
4504874,0.0,0.0,0.0,0.0
4504876,0.0,0.0,0.0,0.0
4504878,0.0,0.0,0.0,0.0


## 1.11 Invoice within 77 days 

In [30]:
# SQL for importing all invoices created within 77 days after signup_date
# NOTE(review): the WHERE clause is applied after the LEFT JOIN, so an account whose
# invoices were all created outside the 0-77 day window is dropped from the result;
# only rows with a NULL days_to_invoice_creation survive without an in-window invoice.
# Confirm this is the intended population.
sql_invoices_77days_all_accounts = '''WITH invoices_in_a_period AS (
    SELECT systemid, signup_date
    FROM report_systems rs
    WHERE signup_date BETWEEN '2019-08-01' and '2019-10-31'
), invoice_created_at AS (
    SELECT
           pic.systemid,
           inv.invoiceid,
           pic.signup_date,
           inv.create_date,
           inv.created_at,
           inv.description,
           inv.notes,
           inv.terms,
           inv.address,
           DATEDIFF(days, pic.signup_date, inv.created_at) AS days_to_invoice_creation
    FROM invoices_in_a_period AS pic
    LEFT JOIN coalesced_live_shards.invoice_stable as inv USING (systemid)
    WHERE ((days_to_invoice_creation BETWEEN 0 AND 77) OR days_to_invoice_creation IS NULL)
)

SELECT *
FROM invoice_created_at;'''

# Import as dataframe from redshift
df_invoices_77days_all_accounts = pig.run_query(sql_invoices_77days_all_accounts, return_data=True)

# Word count of each invoice text field (description, notes, terms, address).
# Series.apply hands every cell value straight to words_count, which avoids
# building a full row object per invoice as DataFrame.apply(axis=1) does.
wc_columns_day_77 = []
for text_field in ['description', 'notes', 'terms', 'address']:
    wc_column = 'avg_wc_{}_day_77'.format(text_field)
    df_invoices_77days_all_accounts[wc_column] = df_invoices_77days_all_accounts[text_field].apply(words_count)
    wc_columns_day_77.append(wc_column)

# Keep the identifying/date columns plus the word counts (drops the raw text columns)
df_invoices_77days_all_accounts_fil = df_invoices_77days_all_accounts.filter(
    ['systemid', 'invoiceid', 'signup_date', 'create_date', 'created_at',
     'days_to_invoice_creation'] + wc_columns_day_77)

# Average (not sum) over all invoices of a 'systemid';
# the groupby moves 'systemid' into the index of the result
df_word_count_77days_all_accounts_total = df_invoices_77days_all_accounts_fil.groupby('systemid').mean()

# Final word count table: only the average word counts, indexed by 'systemid'
df_word_count_77days_all_accounts_final = df_word_count_77days_all_accounts_total.filter(wc_columns_day_77)

# Export as tab-separated file. 'systemid' lives in the index after the groupby,
# so the index has to be written: with index=False the account id was lost
# and the rows of the file could no longer be attributed to an account.
df_word_count_77days_all_accounts_final.to_csv(
    "/Users/dwahid/Documents/GitHub/fraud_detection/data_new/df_word_count_77days_new_accounts_final.tsv",
    sep="\t", index=True)


In [31]:
# Preview the first rows of the per-account average word counts (77-day window)
df_word_count_77days_all_accounts_final.head()

Unnamed: 0_level_0,avg_wc_description_day_77,avg_wc_notes_day_77,avg_wc_terms_day_77,avg_wc_address_day_77
systemid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
4504870,0.0,0.0,0.0,0.0
4504872,1.0,0.0,0.0,0.0
4504874,0.0,0.0,0.0,0.0
4504876,0.0,0.0,0.0,0.0
4504878,0.0,0.0,0.0,0.0


## 1.12 Invoice within 84 days 

In [32]:
# SQL for importing all invoices created within 84 days after signup_date
# NOTE(review): the WHERE clause is applied after the LEFT JOIN, so an account whose
# invoices were all created outside the 0-84 day window is dropped from the result;
# only rows with a NULL days_to_invoice_creation survive without an in-window invoice.
# Confirm this is the intended population.
sql_invoices_84days_all_accounts = '''WITH invoices_in_a_period AS (
    SELECT systemid, signup_date
    FROM report_systems rs
    WHERE signup_date BETWEEN '2019-08-01' and '2019-10-31'
), invoice_created_at AS (
    SELECT
           pic.systemid,
           inv.invoiceid,
           pic.signup_date,
           inv.create_date,
           inv.created_at,
           inv.description,
           inv.notes,
           inv.terms,
           inv.address,
           DATEDIFF(days, pic.signup_date, inv.created_at) AS days_to_invoice_creation
    FROM invoices_in_a_period AS pic
    LEFT JOIN coalesced_live_shards.invoice_stable as inv USING (systemid)
    WHERE ((days_to_invoice_creation BETWEEN 0 AND 84) OR days_to_invoice_creation IS NULL)
)

SELECT *
FROM invoice_created_at;'''

# Import as dataframe from redshift
df_invoices_84days_all_accounts = pig.run_query(sql_invoices_84days_all_accounts, return_data=True)

# Word count of each invoice text field (description, notes, terms, address).
# Series.apply hands every cell value straight to words_count, which avoids
# building a full row object per invoice as DataFrame.apply(axis=1) does.
wc_columns_day_84 = []
for text_field in ['description', 'notes', 'terms', 'address']:
    wc_column = 'avg_wc_{}_day_84'.format(text_field)
    df_invoices_84days_all_accounts[wc_column] = df_invoices_84days_all_accounts[text_field].apply(words_count)
    wc_columns_day_84.append(wc_column)

# Keep the identifying/date columns plus the word counts (drops the raw text columns)
df_invoices_84days_all_accounts_fil = df_invoices_84days_all_accounts.filter(
    ['systemid', 'invoiceid', 'signup_date', 'create_date', 'created_at',
     'days_to_invoice_creation'] + wc_columns_day_84)

# Average (not sum) over all invoices of a 'systemid';
# the groupby moves 'systemid' into the index of the result
df_word_count_84days_all_accounts_total = df_invoices_84days_all_accounts_fil.groupby('systemid').mean()

# Final word count table: only the average word counts, indexed by 'systemid'
df_word_count_84days_all_accounts_final = df_word_count_84days_all_accounts_total.filter(wc_columns_day_84)

# Export as tab-separated file. 'systemid' lives in the index after the groupby,
# so the index has to be written: with index=False the account id was lost
# and the rows of the file could no longer be attributed to an account.
df_word_count_84days_all_accounts_final.to_csv(
    "/Users/dwahid/Documents/GitHub/fraud_detection/data_new/df_word_count_84days_new_accounts_final.tsv",
    sep="\t", index=True)


In [33]:
# Preview the first rows of the per-account average word counts (84-day window)
df_word_count_84days_all_accounts_final.head()

Unnamed: 0_level_0,avg_wc_description_day_84,avg_wc_notes_day_84,avg_wc_terms_day_84,avg_wc_address_day_84
systemid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
4504870,0.0,0.0,0.0,0.0
4504872,1.0,0.0,0.0,0.0
4504874,0.0,0.0,0.0,0.0
4504876,0.0,0.0,0.0,0.0
4504878,0.0,0.0,0.0,0.0


## 1.13 Invoice within 91 days 

In [34]:
# SQL for importing all invoices created within 91 days after signup_date
# NOTE(review): the WHERE clause is applied after the LEFT JOIN, so an account whose
# invoices were all created outside the 0-91 day window is dropped from the result;
# only rows with a NULL days_to_invoice_creation survive without an in-window invoice.
# Confirm this is the intended population.
sql_invoices_91days_all_accounts = '''WITH invoices_in_a_period AS (
    SELECT systemid, signup_date
    FROM report_systems rs
    WHERE signup_date BETWEEN '2019-08-01' and '2019-10-31'
), invoice_created_at AS (
    SELECT
           pic.systemid,
           inv.invoiceid,
           pic.signup_date,
           inv.create_date,
           inv.created_at,
           inv.description,
           inv.notes,
           inv.terms,
           inv.address,
           DATEDIFF(days, pic.signup_date, inv.created_at) AS days_to_invoice_creation
    FROM invoices_in_a_period AS pic
    LEFT JOIN coalesced_live_shards.invoice_stable as inv USING (systemid)
    WHERE ((days_to_invoice_creation BETWEEN 0 AND 91) OR days_to_invoice_creation IS NULL)
)

SELECT *
FROM invoice_created_at;'''

# Import as dataframe from redshift
df_invoices_91days_all_accounts = pig.run_query(sql_invoices_91days_all_accounts, return_data=True)

# Word count of each invoice text field (description, notes, terms, address).
# Series.apply hands every cell value straight to words_count, which avoids
# building a full row object per invoice as DataFrame.apply(axis=1) does.
wc_columns_day_91 = []
for text_field in ['description', 'notes', 'terms', 'address']:
    wc_column = 'avg_wc_{}_day_91'.format(text_field)
    df_invoices_91days_all_accounts[wc_column] = df_invoices_91days_all_accounts[text_field].apply(words_count)
    wc_columns_day_91.append(wc_column)

# Keep the identifying/date columns plus the word counts (drops the raw text columns)
df_invoices_91days_all_accounts_fil = df_invoices_91days_all_accounts.filter(
    ['systemid', 'invoiceid', 'signup_date', 'create_date', 'created_at',
     'days_to_invoice_creation'] + wc_columns_day_91)

# Average (not sum) over all invoices of a 'systemid';
# the groupby moves 'systemid' into the index of the result
df_word_count_91days_all_accounts_total = df_invoices_91days_all_accounts_fil.groupby('systemid').mean()

# Final word count table: only the average word counts, indexed by 'systemid'
df_word_count_91days_all_accounts_final = df_word_count_91days_all_accounts_total.filter(wc_columns_day_91)

# Export as tab-separated file. 'systemid' lives in the index after the groupby,
# so the index has to be written: with index=False the account id was lost
# and the rows of the file could no longer be attributed to an account.
df_word_count_91days_all_accounts_final.to_csv(
    "/Users/dwahid/Documents/GitHub/fraud_detection/data_new/df_word_count_91days_new_accounts_final.tsv",
    sep="\t", index=True)


In [35]:
# Preview the first rows of the per-account average word counts (91-day window)
df_word_count_91days_all_accounts_final.head()

Unnamed: 0_level_0,avg_wc_description_day_91,avg_wc_notes_day_91,avg_wc_terms_day_91,avg_wc_address_day_91
systemid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
4504870,0.0,0.0,0.0,0.0
4504872,1.0,0.0,0.0,0.0
4504874,0.0,0.0,0.0,0.0
4504876,0.0,0.0,0.0,0.0
4504878,0.0,0.0,0.0,0.0


## 1.14 Joining All Periodic Average Words Counts Features Data

In [36]:
# Per-account average word count tables, ordered from the 7-day to the 91-day window
word_count_tables_by_window = [
    df_word_count_7days_all_accounts_final,
    df_word_count_14days_all_accounts_final,
    df_word_count_21days_all_accounts_final,
    df_word_count_28days_all_accounts_final,
    df_word_count_35days_all_accounts_final,
    df_word_count_42days_all_accounts_final,
    df_word_count_49days_all_accounts_final,
    df_word_count_56days_all_accounts_final,
    df_word_count_63days_all_accounts_final,
    df_word_count_70days_all_accounts_final,
    df_word_count_77days_all_accounts_final,
    df_word_count_84days_all_accounts_final,
    df_word_count_91days_all_accounts_final,
]

# Start from the 7-day table and left-join every later window onto it by 'systemid'.
# NOTE(review): because every join is a left join, only accounts present in the
# 7-day table can appear in the result - confirm that this is the intended population.
df_avg_invoice_word_count = word_count_tables_by_window[0]
for window_table in word_count_tables_by_window[1:]:
    df_avg_invoice_word_count = pd.merge(df_avg_invoice_word_count, window_table,
                                         on='systemid', how='left')

In [37]:
# Export as tab-separated file. The joined table carries 'systemid' as its index
# (it is shown as the index in the preview of this table), so the index has to be
# written: with index=False the exported features had no account identifier.
df_avg_invoice_word_count.to_csv(
    "/Users/dwahid/Documents/GitHub/fraud_detection/data_new/df_avg_invoice_word_count.tsv",
    sep="\t", index=True)

In [38]:
# Sanity check: inspect the last rows of the joined word count feature table
df_avg_invoice_word_count.tail()
# Optional size check (disabled): df_avg_invoice_word_count.shape

Unnamed: 0_level_0,avg_wc_description_day_7,avg_wc_notes_day_7,avg_wc_terms_day_7,avg_wc_address_day_7,avg_wc_description_day_14,avg_wc_notes_day_14,avg_wc_terms_day_14,avg_wc_address_day_14,avg_wc_description_day_21,avg_wc_notes_day_21,...,avg_wc_terms_day_77,avg_wc_address_day_77,avg_wc_description_day_84,avg_wc_notes_day_84,avg_wc_terms_day_84,avg_wc_address_day_84,avg_wc_description_day_91,avg_wc_notes_day_91,avg_wc_terms_day_91,avg_wc_address_day_91
systemid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4735844,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4735846,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4735848,1.055556,0.0,0.0,0.0,1.038462,0.0,0.0,0.0,1.038462,0.0,...,0.0,0.0,1.038462,0.0,0.0,0.0,1.038462,0.0,0.0,0.0
4735850,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4735852,0.0,0.0,0.0,0.0,13.0,16.0,0.0,0.0,13.0,16.0,...,0.0,0.0,13.0,16.0,0.0,0.0,13.0,16.0,0.0,0.0


# 2. Report Systems, Invoice & Client Counts Features


In [39]:
################# Import RSystems, Periodic Invoices & Client Counts Data ###############

# SQL query 
sql_rs_invoices_clients_activities_all_accounts = '''WITH periodic_report_system_activities AS (
    SELECT
        systemid,
        signup_date,
        admin_email,
        is_sales_managed,
        is_freshbooks_account_active,
        is_paying,
        signup_ip_address
    FROM report_systems rs
    WHERE signup_date BETWEEN '2019-08-01' and '2019-10-31'
), invoice_create_date AS (
    SELECT
           pic.systemid,
           inv.invoiceid,
           inv.create_date,
           inv.created_at,
           DATEDIFF(days, pic.signup_date, inv.created_at) AS days_to_invoice_creation
    FROM periodic_report_system_activities AS pic
    LEFT JOIN coalesced_live_shards.invoice_stable as inv USING (systemid)
), invoice_grouping AS (
    SELECT
           systemid,
           COUNT(invoiceid) as invoice_count,
           SUM(CASE WHEN days_to_invoice_creation BETWEEN 0 AND 7 THEN 1 ELSE 0 END) AS invoice_count_day_7,
           SUM(CASE WHEN days_to_invoice_creation BETWEEN 0 AND 14 THEN 1 ELSE 0 END) AS invoice_count_day_14,
           SUM(CASE WHEN days_to_invoice_creation BETWEEN 0 AND 21 THEN 1 ELSE 0 END) AS invoice_count_day_21,
           SUM(CASE WHEN days_to_invoice_creation BETWEEN 0 AND 28 THEN 1 ELSE 0 END) AS invoice_count_day_28,
           SUM(CASE WHEN days_to_invoice_creation BETWEEN 0 AND 35 THEN 1 ELSE 0 END) AS invoice_count_day_35,
           SUM(CASE WHEN days_to_invoice_creation BETWEEN 0 AND 42 THEN 1 ELSE 0 END) AS invoice_count_day_42,
           SUM(CASE WHEN days_to_invoice_creation BETWEEN 0 AND 49 THEN 1 ELSE 0 END) AS invoice_count_day_49,
           SUM(CASE WHEN days_to_invoice_creation BETWEEN 0 AND 56 THEN 1 ELSE 0 END) AS invoice_count_day_56,
           SUM(CASE WHEN days_to_invoice_creation BETWEEN 0 AND 63 THEN 1 ELSE 0 END) AS invoice_count_day_63,
           SUM(CASE WHEN days_to_invoice_creation BETWEEN 0 AND 70 THEN 1 ELSE 0 END) AS invoice_count_day_70,
           SUM(CASE WHEN days_to_invoice_creation BETWEEN 0 AND 77 THEN 1 ELSE 0 END) AS invoice_count_day_77,
           SUM(CASE WHEN days_to_invoice_creation BETWEEN 0 AND 84 THEN 1 ELSE 0 END) AS invoice_count_day_84,
           SUM(CASE WHEN days_to_invoice_creation BETWEEN 0 AND 91 THEN 1 ELSE 0 END) AS invoice_count_day_91
    FROM invoice_create_date
    GROUP BY systemid
), client_crate_date AS (
     SELECT
            pic.systemid,
            usr.userid,
            usr.signup_date,
            DATEDIFF(days, pic.signup_date, usr.signup_date) AS days_to_client_creation
    FROM periodic_report_system_activities  AS pic
    LEFT JOIN coalesced_live_shards."user" as usr USING (systemid)
), client_grouping AS (
    SELECT
           systemid,
           count(userid) AS client_count,
           SUM(CASE WHEN days_to_client_creation BETWEEN 0 AND 7 THEN 1 ELSE 0 END) AS client_count_day_7,
           SUM(CASE WHEN days_to_client_creation BETWEEN 0 AND 14 THEN 1 ELSE 0 END) AS client_count_day_14,
           SUM(CASE WHEN days_to_client_creation BETWEEN 0 AND 21 THEN 1 ELSE 0 END) AS client_count_day_21,
           SUM(CASE WHEN days_to_client_creation BETWEEN 0 AND 28 THEN 1 ELSE 0 END) AS client_count_day_28,
           SUM(CASE WHEN days_to_client_creation BETWEEN 0 AND 35 THEN 1 ELSE 0 END) AS client_count_day_35,
           SUM(CASE WHEN days_to_client_creation BETWEEN 0 AND 42 THEN 1 ELSE 0 END) AS client_count_day_42,
           SUM(CASE WHEN days_to_client_creation BETWEEN 0 AND 49 THEN 1 ELSE 0 END) AS client_count_day_49,
           SUM(CASE WHEN days_to_client_creation BETWEEN 0 AND 56 THEN 1 ELSE 0 END) AS client_count_day_56,
           SUM(CASE WHEN days_to_client_creation BETWEEN 0 AND 63 THEN 1 ELSE 0 END) AS client_count_day_63,
           SUM(CASE WHEN days_to_client_creation BETWEEN 0 AND 70 THEN 1 ELSE 0 END) AS client_count_day_70,
           SUM(CASE WHEN days_to_client_creation BETWEEN 0 AND 77 THEN 1 ELSE 0 END) AS client_count_day_77,
           SUM(CASE WHEN days_to_client_creation BETWEEN 0 AND 84 THEN 1 ELSE 0 END) AS client_count_day_84,
           SUM(CASE WHEN days_to_client_creation BETWEEN 0 AND 91 THEN 1 ELSE 0 END) AS client_count_day_91
    FROM  client_crate_date
    GROUP BY systemid
)

SELECT
       systemid,
       signup_date,
       admin_email,
       is_sales_managed,
       is_freshbooks_account_active,
       is_paying,
       signup_ip_address,
       inv_gr.invoice_count,
       inv_gr.invoice_count_day_7,
       inv_gr.invoice_count_day_14,
       inv_gr.invoice_count_day_21,
       inv_gr.invoice_count_day_28,
       inv_gr.invoice_count_day_35,
       inv_gr.invoice_count_day_42,
       inv_gr.invoice_count_day_49,
       inv_gr.invoice_count_day_56,
       inv_gr.invoice_count_day_63,
       inv_gr.invoice_count_day_70,
       inv_gr.invoice_count_day_77,
       inv_gr.invoice_count_day_84,
       inv_gr.invoice_count_day_91,
       cl_gr.client_count,
       cl_gr.client_count_day_7,
       cl_gr.client_count_day_14,
       cl_gr.client_count_day_21,
       cl_gr.client_count_day_28,
       cl_gr.client_count_day_35,
       cl_gr.client_count_day_42,
       cl_gr.client_count_day_49,
       cl_gr.client_count_day_56,
       cl_gr.client_count_day_63,
       cl_gr.client_count_day_70,
       cl_gr.client_count_day_77,
       cl_gr.client_count_day_84,
       cl_gr.client_count_day_91
FROM periodic_report_system_activities
LEFT JOIN invoice_grouping as inv_gr USING (systemid)
LEFT JOIN client_grouping AS cl_gr USING (systemid);
'''

# Import as dataframe from redshift
df_rs_invoices_clients_activities_all_accounts = pig.run_query(sql_rs_invoices_clients_activities_all_accounts, return_data=True)


In [40]:
# Persist the invoice/client activity frame as a tab-separated file
# (the file name says "new_accounts" although the frame is named "all_accounts")
df_rs_invoices_clients_activities_all_accounts.to_csv(
    path_or_buf="/Users/dwahid/Documents/GitHub/fraud_detection/data_new/df_rs_invoices_clients_activities_new_accounts.tsv",
    index=False,
    sep="\t",
)

In [41]:
# checking 
df_rs_invoices_clients_activities_all_accounts.tail()

Unnamed: 0,systemid,signup_date,admin_email,is_sales_managed,is_freshbooks_account_active,is_paying,signup_ip_address,invoice_count,invoice_count_day_7,invoice_count_day_14,...,client_count_day_28,client_count_day_35,client_count_day_42,client_count_day_49,client_count_day_56,client_count_day_63,client_count_day_70,client_count_day_77,client_count_day_84,client_count_day_91
114965,4666258,2019-10-02,bill@kleanroute.com,0,1,0,107.77.207.58,2,1,1,...,3,3,3,3,3,3,3,3,3,3
114966,4695854,2019-10-15,soraiya.n@gmail.com,0,1,0,176.249.99.174,1,1,1,...,2,2,2,2,2,2,2,2,2,2
114967,4698906,2019-10-16,rameylr@gmail.com,0,1,0,108.202.9.98,1,1,1,...,2,2,2,2,2,2,2,2,2,2
114968,4710470,2019-10-21,sapna@littlestepsasia.com,0,1,0,58.177.129.86,0,0,0,...,1,1,1,1,1,1,1,1,1,1
114969,4718224,2019-10-24,menk8@outlook.com,0,1,0,118.149.144.116,0,0,0,...,1,1,1,1,1,1,1,1,1,1


In [42]:
df_rs_invoices_clients_activities_all_accounts.shape

(114970, 35)

## 3. Join Avg Word counts and Invoice & Client Counts

In [43]:
############### Join Avg Word Counts and Invoice & Client Counts ########################

# Left join on systemid: every account present in the average word-count frame
# (description, notes, terms, address) is kept and gets its invoice & client counts attached
df_periodic_invoice_all_counts = df_avg_invoice_word_count.merge(
    df_rs_invoices_clients_activities_all_accounts,
    how='left',
    on='systemid',
)

In [44]:
# Checking: preview the last rows of the merged frame
df_periodic_invoice_all_counts.tail()

Unnamed: 0,systemid,avg_wc_description_day_7,avg_wc_notes_day_7,avg_wc_terms_day_7,avg_wc_address_day_7,avg_wc_description_day_14,avg_wc_notes_day_14,avg_wc_terms_day_14,avg_wc_address_day_14,avg_wc_description_day_21,...,client_count_day_28,client_count_day_35,client_count_day_42,client_count_day_49,client_count_day_56,client_count_day_63,client_count_day_70,client_count_day_77,client_count_day_84,client_count_day_91
112134,4735844,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,1,1,1,1,1,1,1,1,1
112135,4735846,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,1,1,1,1,1,1,1,1,1
112136,4735848,1.055556,0.0,0.0,0.0,1.038462,0.0,0.0,0.0,1.038462,...,25,25,25,25,25,25,25,25,25,25
112137,4735850,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,1,1,1,1,1,1,1,1,1
112138,4735852,0.0,0.0,0.0,0.0,13.0,16.0,0.0,0.0,13.0,...,3,3,3,3,3,3,3,3,3,3


In [45]:
df_periodic_invoice_all_counts.shape

(112139, 87)

In [46]:
# Persist the merged frame; the content is tab-separated even though the file ends in .csv
df_periodic_invoice_all_counts.to_csv(
    path_or_buf="/Users/dwahid/Documents/GitHub/fraud_detection/data_new/periodic_invoice_all_counts_new_accounts.csv",
    index=False,
    sep="\t",
)

# To reload the saved file instead of rebuilding the frame:
# df_periodic_invoice_all_counts = pd.read_csv(
#     "/Users/dwahid/Documents/GitHub/fraud_detection/data_new/periodic_invoice_all_counts_new_accounts.csv", 
#                                       sep="\t")

In [47]:
# checking
df_periodic_invoice_all_counts.shape

(112139, 87)

In [48]:
list(df_periodic_invoice_all_counts)

['systemid',
 'avg_wc_description_day_7',
 'avg_wc_notes_day_7',
 'avg_wc_terms_day_7',
 'avg_wc_address_day_7',
 'avg_wc_description_day_14',
 'avg_wc_notes_day_14',
 'avg_wc_terms_day_14',
 'avg_wc_address_day_14',
 'avg_wc_description_day_21',
 'avg_wc_notes_day_21',
 'avg_wc_terms_day_21',
 'avg_wc_address_day_21',
 'avg_wc_description_day_28',
 'avg_wc_notes_day_28',
 'avg_wc_terms_day_28',
 'avg_wc_address_day_28',
 'avg_wc_description_day_35',
 'avg_wc_notes_day_35',
 'avg_wc_terms_day_35',
 'avg_wc_address_day_35',
 'avg_wc_description_day_42',
 'avg_wc_notes_day_42',
 'avg_wc_terms_day_42',
 'avg_wc_address_day_42',
 'avg_wc_description_day_49',
 'avg_wc_notes_day_49',
 'avg_wc_terms_day_49',
 'avg_wc_address_day_49',
 'avg_wc_description_day_56',
 'avg_wc_notes_day_56',
 'avg_wc_terms_day_56',
 'avg_wc_address_day_56',
 'avg_wc_description_day_63',
 'avg_wc_notes_day_63',
 'avg_wc_terms_day_63',
 'avg_wc_address_day_63',
 'avg_wc_description_day_70',
 'avg_wc_notes_day_70',
 

# 4. Import and Extract Features from Events Data
## 4.1 Event data collection 

In [49]:
############################### Event Features Extraction ################################

# SQL for events. The query is built from three CTEs:
#   selected_accounts_events - rows of report_systems whose signup_date lies
#                              between 2019-08-01 and 2019-10-31
#   events_activities        - LEFT JOINs those accounts to event_counts, d_date and
#                              d_event, lower-cases the event name and computes
#                              days_to_event = datediff(days, signup_date, dd.date)
#   event_groupings          - DISTINCT rows with one day_N_event column per window;
#                              each holds ea.count when days_to_event is BETWEEN 0 AND N
#                              and is NULL otherwise (no ELSE branch)
# The final SELECT sums every window column, grouped by
# (systemid, signup_date, date, event, count) and ordered by systemid, count DESC.
# Because the joins are LEFT JOINs, an account without matching event rows still
# yields a row, with NULL in the event-side columns.
sql_events = '''WITH selected_accounts_events AS (
    SELECT systemid,
           signup_date,
           signup_datetime
    FROM report_systems
    WHERE signup_date BETWEEN '2019-08-01' and '2019-10-31'
), events_activities AS (
    SELECT sae.systemid,
           signup_date,
           dd.date,
           datediff(days, signup_date, dd.date) as days_to_event,
           lower(e.event) as event,
           ec.count
    FROM selected_accounts_events AS sae
    LEFT JOIN event_counts AS ec USING (systemid)
    LEFT JOIN d_date AS dd USING (date_key)
    LEFT JOIN d_event e on ec.event_key = e.event_key
), event_groupings AS (
    SELECT distinct  ea.systemid,
                    ea.signup_date,
                    ea.date,
                    ea.event,
                    ea.count,
                    (CASE WHEN days_to_event BETWEEN 0 AND 7 THEN ea.count END) AS day_7_event,
                    (CASE WHEN days_to_event BETWEEN 0 AND 14 THEN ea.count END) AS day_14_event,
                    (CASE WHEN days_to_event BETWEEN 0 AND 21 THEN ea.count END) AS day_21_event,
                    (CASE WHEN days_to_event BETWEEN 0 AND 28 THEN ea.count END) AS day_28_event,
                    (CASE WHEN days_to_event BETWEEN 0 AND 35 THEN ea.count END) AS day_35_event,
                    (CASE WHEN days_to_event BETWEEN 0 AND 42 THEN ea.count END) AS day_42_event,
                    (CASE WHEN days_to_event BETWEEN 0 AND 49 THEN ea.count END) AS day_49_event,
                    (CASE WHEN days_to_event BETWEEN 0 AND 56 THEN ea.count END) AS day_56_event,
                    (CASE WHEN days_to_event BETWEEN 0 AND 63 THEN ea.count END) AS day_63_event,
                    (CASE WHEN days_to_event BETWEEN 0 AND 70 THEN ea.count END) AS day_70_event,
                    (CASE WHEN days_to_event BETWEEN 0 AND 77 THEN ea.count END) AS day_77_event,
                    (CASE WHEN days_to_event BETWEEN 0 AND 84 THEN ea.count END) AS day_84_event,
                    (CASE WHEN days_to_event BETWEEN 0 AND 91 THEN ea.count END) AS day_91_event
    FROM events_activities AS ea
)
SELECT systemid,
       signup_date,
       date,
       event,
       count,
       sum(day_7_event) AS event_count_day_7,
       sum(day_14_event) AS event_count_day_14,
       sum(day_21_event) AS event_count_day_21,
       sum(day_28_event) AS event_count_day_28,
       sum(day_35_event) AS event_count_day_35,
       sum(day_42_event) AS event_count_day_42,
       sum(day_49_event) AS event_count_day_49,
       sum(day_56_event) AS event_count_day_56,
       sum(day_63_event) AS event_count_day_63,
       sum(day_70_event) AS event_count_day_70,
       sum(day_77_event) AS event_count_day_77,
       sum(day_84_event) AS event_count_day_84,
       sum(day_91_event) AS event_count_day_91
From event_groupings
GROUP BY systemid, signup_date, date, event, count
ORDER BY systemid, count DESC;'''

# Run the query through the PsycopgConnector instance and load the result as a DataFrame
# (previous approach, kept for reference):
# df_events_all_accounts = pd.read_sql_query(sql_events, connect_to_db)
df_events_all_accounts = pig.run_query(sql_events, return_data=True)


In [50]:
# checking
df_events_all_accounts.head()

Unnamed: 0,systemid,signup_date,date,event,count,event_count_day_7,event_count_day_14,event_count_day_21,event_count_day_28,event_count_day_35,event_count_day_42,event_count_day_49,event_count_day_56,event_count_day_63,event_count_day_70,event_count_day_77,event_count_day_84,event_count_day_91
0,4504870,2019-08-01,2019-08-01,subscription details changed,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0
1,4504870,2019-08-01,2019-08-01,update identity,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0
2,4504870,2019-08-01,2019-08-01,aria supplemental plan replaced,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0
3,4504870,2019-08-01,2019-08-07,aria subscription suspended,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,4504870,2019-08-01,2019-08-01,aria supplemental plan upgraded,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [51]:
df_events_all_accounts.shape

(4068118, 18)

## 4.2 Removing whitespace from the event strings

In [52]:
# Drop every row in which any cell is None (str(None) == 'None'), then replace
# the remaining NaN cells by zero. `axis=1` is passed by keyword because the
# positional form `any(1)` is no longer accepted by recent pandas, and the
# non-in-place fillna returns a fresh frame, so the column assignment below
# does not write into a slice of the original (no SettingWithCopyWarning).
df_events_all_accounts = df_events_all_accounts[
    ~df_events_all_accounts.astype(str).eq('None').any(axis=1)
].fillna(0)

# Strip the spaces out of the event name with the vectorised string accessor
# instead of a Python-level lambda applied to every row
df_events_all_accounts['event_name'] = df_events_all_accounts['event'].str.replace(' ', '')

In [53]:
# checking
df_events_all_accounts.shape

(4068054, 19)

In [54]:
df_events_all_accounts.head()

Unnamed: 0,systemid,signup_date,date,event,count,event_count_day_7,event_count_day_14,event_count_day_21,event_count_day_28,event_count_day_35,event_count_day_42,event_count_day_49,event_count_day_56,event_count_day_63,event_count_day_70,event_count_day_77,event_count_day_84,event_count_day_91,event_name
0,4504870,2019-08-01,2019-08-01,subscription details changed,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,subscriptiondetailschanged
1,4504870,2019-08-01,2019-08-01,update identity,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,updateidentity
2,4504870,2019-08-01,2019-08-01,aria supplemental plan replaced,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,ariasupplementalplanreplaced
3,4504870,2019-08-01,2019-08-07,aria subscription suspended,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,ariasubscriptionsuspended
4,4504870,2019-08-01,2019-08-01,aria supplemental plan upgraded,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,ariasupplementalplanupgraded


# 5. Final Features Extraction: Day 7

## 5.1 Filter Only Events for Day 7

In [55]:
########################## Final Features Extraction: Day 7 ##############################

# Narrow the event frame to the account id, the day-7 count and the event name
df_events_all_accounts_day_7 = df_events_all_accounts.loc[
    :, ['systemid', 'event_count_day_7', 'event_name']
]

In [56]:
df_events_all_accounts_day_7.tail()

Unnamed: 0,systemid,event_count_day_7,event_name
4068113,4735852,1.0,ariatrialsubscriptioncreated
4068114,4735852,1.0,refreshtokenused
4068115,4735852,1.0,createfirstinvoice
4068116,4735852,1.0,createartifactcomment
4068117,4735852,1.0,teamsizeset


In [57]:
df_events_all_accounts_day_7.shape

(4068054, 3)

## 5.2 Pivot the Day 7 Events (Each Unique Event Becomes a Column)

In [58]:
### Pivot the Day 7 Events (Each Unique Event Becomes a Column) ###
# One row per systemid, one column per unique 'event_name', cells hold the summed
# day-7 counts. The pivot reads from df_events_all_accounts rather than from
# df_events_all_accounts_day_7 so the cell can be re-run: pivoting the variable it
# overwrites fails on the second run because 'event_name' is no longer a column.
# fill_value=0 zero-fills absent (systemid, event) pairs, so no fillna pass is needed.
df_events_all_accounts_day_7 = df_events_all_accounts.pivot_table(
    values='event_count_day_7',
    columns='event_name',
    index='systemid',
    aggfunc=np.sum,
    fill_value=0,
)

# Drop the leftover 'event_name' label on the column axis
df_events_all_accounts_day_7.columns.name = None

# Turn systemid back into a regular column
df_events_all_accounts_day_7 = df_events_all_accounts_day_7.reset_index()

In [59]:
# checking
df_events_all_accounts_day_7.tail()

Unnamed: 0,systemid,acceptestimate,accesstokencreated,activateclient,activatecontractor,activateestimate,activateexpense,activateinvoice,activateitem,activateotherincome,...,upgradeform-submitted,uploadexpensereceipt,uploadhi-reslogo,verifymigration,viewedcreupgradepage,viewestimate,viewinvoice,welcomeaccount,zendesksupporte-mail,zero-amountinvoicefromrecurringprofile
114901,4735844,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
114902,4735846,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
114903,4735848,0,7,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
114904,4735850,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
114905,4735852,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,7,1,0,0


In [60]:
df_events_all_accounts_day_7.shape

(114906, 393)

In [61]:
# Checking duplicate systemid presense 
# duplicateSystemID = pd.concat(g for _, g in df_events_all_accounts_day_7.groupby('systemid') if len(g) > 1)
# duplicateSystemID

In [62]:
# Persist the pivoted day-7 event matrix as a tab-separated file
df_events_all_accounts_day_7.to_csv(
    path_or_buf="/Users/dwahid/Documents/GitHub/fraud_detection/data_new/events_new_accounts_day_7.tsv",
    index=False,
    sep="\t",
)

# To reload the saved matrix instead of rebuilding it:
# df_events_all_accounts_day_7 = pd.read_csv("/Users/dwahid/Documents/GitHub/fraud_detection/data_new/events_new_accounts_day_7.tsv", sep="\t")

## 5.3 Events Important Features Selection

### 5.3.1 Adding missing features columns in the event features dataframe

In [63]:
# Load the important-feature list.
# NOTE(review): a separator longer than one character is interpreted by pandas as a
# regular expression and forces the Python parser engine; presumably the file holds
# one feature name per line under an 'important_feature' header -- confirm.
important_features = pd.read_csv(
    "/Users/dwahid/Documents/GitHub/fraud_detection/data_v1/important_features.csv",
    sep="\n,",
)

# Plain Python list of the feature names
imp_features_list = important_features['important_feature'].tolist()

In [64]:
# Checking
# important_features.head()
# imp_features_list

In [65]:
# Ensure every important feature exists as a column: features absent from the
# day-7 event matrix are created as all-zero columns ("False" is printed for those)
for feature_name in imp_features_list:
    if feature_name not in df_events_all_accounts_day_7.columns:
        print("False")
        df_events_all_accounts_day_7[feature_name] = 0
    else:
        print("True")


In [66]:
# Checking
# df_events_all_accounts_day_7.head()

### 5.3.2 Filtering only important features

In [67]:
# Keep the join key plus exactly the columns named in the important-feature list.
# Membership is tested with isin (exact match). The earlier
# str.contains('|'.join(...)) form did regex *substring* matching, so any column
# whose name merely contained a feature name was kept as well, and regex
# metacharacters inside a feature name were interpreted instead of matched literally.
# Column order of the source frame is preserved.
df_events_imp_features_all_accounts_day_7 = df_events_all_accounts_day_7.loc[
    :, df_events_all_accounts_day_7.columns.isin(['systemid'] + list(imp_features_list))
]


In [68]:
# Checking
df_events_imp_features_all_accounts_day_7.head()

Unnamed: 0,systemid,acceptestimate,accesstokencreated,activateclient,activateestimate,activateexpense,activateinvoice,activateotherincome,activatepayment,activateproject,...,zendesksupporte-mail,activatestaff,clientimportcsvsucceeded,convertpaymenttocredit,createbanktransfer,createfolder,emailcreditnote,exportjournalentries,updatefolderpermissions,verifycallback
0,4504870,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,4504872,0,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,4504874,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4504876,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4504878,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## 5.4 Filtering average word-count features from invoice data

In [69]:
### Selecting the account attributes and day-7 invoice features
# Account flags, the four day-7 average word counts, and the day-7 invoice/client counts
df_invoice_features_all_accounts_day_7 = df_periodic_invoice_all_counts.loc[:, [
    'systemid',
    'admin_email',
    'is_sales_managed',
    'is_freshbooks_account_active',
    'is_paying',
    'avg_wc_description_day_7',
    'avg_wc_notes_day_7',
    'avg_wc_terms_day_7',
    'avg_wc_address_day_7',
    'invoice_count_day_7',
    'client_count_day_7',
]]

# Persist the selection (tab-separated despite the .csv suffix)
df_invoice_features_all_accounts_day_7.to_csv(
    path_or_buf="/Users/dwahid/Documents/GitHub/fraud_detection/data_new/invoice_features_new_accounts_day_7.csv",
    index=False,
    sep="\t",
)

# To reload the saved file instead of rebuilding it:
# df_invoice_features_all_accounts_day_7 = pd.read_csv(
#     "/Users/dwahid/Documents/GitHub/fraud_detection/data_new/invoice_features_new_accounts_day_7.csv", sep="\t")

In [70]:
# checking
# df_invoice_features_all_accounts_day_7.shape
# list(df_periodic_invoice_all_counts)

## 5.5 Merging events' and Invoice features

In [71]:
# Left join on systemid: every account in the event-feature frame is kept and its
# invoice features are attached (accounts without a match get NaN in those columns)
df_final_features_day_7 = df_events_imp_features_all_accounts_day_7.merge(
    df_invoice_features_all_accounts_day_7,
    how='left',
    on='systemid',
)


In [72]:
# Head of the dataframe
df_final_features_day_7.head()

Unnamed: 0,systemid,acceptestimate,accesstokencreated,activateclient,activateestimate,activateexpense,activateinvoice,activateotherincome,activatepayment,activateproject,...,admin_email,is_sales_managed,is_freshbooks_account_active,is_paying,avg_wc_description_day_7,avg_wc_notes_day_7,avg_wc_terms_day_7,avg_wc_address_day_7,invoice_count_day_7,client_count_day_7
0,4504870,0,1,0,0,0,0,0,0,0,...,gdfggdgd12@gmail.com,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,4504872,0,2,0,0,0,0,0,0,0,...,jamaicahamilton71@gmail.com,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,2.0
2,4504874,0,0,0,0,0,0,0,0,0,...,lillosnx@gmail.com,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,4504876,0,0,0,0,0,0,0,0,0,...,solutions@okanaganorganizer.com,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,4504878,0,0,0,0,0,0,0,0,0,...,reed.bianca26@gmail.com,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [73]:
# Check the dimension 
df_final_features_day_7.shape

(114906, 242)

## 5.6 Filtering FreshBooks test accounts 

In [79]:
################# Filtering FreshBooks Test Accounts #############################################################

# Read the tab-separated file of FreshBooks test-account emails (non-freshbooks addresses)
fb_test_emails = pd.read_csv(
    "/Users/dwahid/Documents/GitHub/fraud_detection/data_fb_test_email/non-fb-testing-emails.tsv",
    sep="\t",
)

# Plain Python list of the values in the 'email' column
fb_test_email_list = fb_test_emails['email'].tolist()

In [80]:
# fb_test_email_list

In [81]:
# Function: Filtering FB test account by using admin email
from difflib import SequenceMatcher

def email_match(em, email_list):
    """Return the best similarity between ``em`` and any entry of ``email_list``.

    Similarity is ``difflib.SequenceMatcher(None, em, candidate).ratio()``,
    a float in [0, 1] where 1.0 means the two strings are identical.

    Parameters
    ----------
    em : str or null
        Email address to score. A null value (None / NaN) scores 0.
    email_list : sequence of str
        Reference addresses to compare against. Null entries are skipped.

    Returns
    -------
    int or float
        0 when ``em`` is null or nothing could be compared, otherwise the
        highest ratio found.
    """
    # A missing address cannot match anything; decide once, not on every iteration.
    if pd.isnull(em):
        return 0

    match_score = 0
    for candidate in email_list:
        # A null reference entry (e.g. an empty row in the source file) would
        # make SequenceMatcher raise, so it is ignored.
        if pd.isnull(candidate):
            continue
        match_score = max(match_score, SequenceMatcher(None, em, candidate).ratio())
        # The ratio cannot exceed 1.0, so a perfect match ends the search.
        if match_score == 1.0:
            break

    return match_score
    


In [82]:
# Filtering final data from the FreshBooks Test emails (currently disabled)
# NOTE(review): as written, the expression below KEEPS only the rows whose
# admin_email scores above 0.9 against the test-email list; if the goal is to
# remove test accounts the comparison presumably needs inverting -- confirm
# before enabling.
# df_final_features_day_7 = df_final_features_day_7[
#     df_final_features_day_7.apply(lambda x: email_match(x['admin_email'], fb_test_email_list) > 0.9, axis=1)]

In [83]:
# Persist the final day-7 feature table as a tab-separated file
df_final_features_day_7.to_csv(
    path_or_buf="/Users/dwahid/Documents/GitHub/fraud_detection/data_new_final/new_final_features_day_7.tsv",
    index=False,
    sep="\t",
)

# To reload the saved table instead of rebuilding it:
# df_final_features_day_7 = pd.read_csv(
#     "/Users/dwahid/Documents/GitHub/fraud_detection/data_new_final/new_final_features_day_7.tsv", sep="\t")

In [84]:
# Checking
df_final_features_day_7.shape

(114906, 242)

# 6. Final Features Data: Day 14

In [85]:
####### Final Features Data: Day 14 Model ###########

# Narrow the event frame to the account id, the day-14 count and the event name
df_events_all_accounts_day_14 = df_events_all_accounts[['systemid', 'event_count_day_14', 'event_name']]

### Pivot the Day 14 Events (Each Unique Event Becomes a Column) ###
# One row per systemid, one column per unique 'event_name', cells hold the summed
# day-14 counts. fill_value=0 zero-fills absent (systemid, event) pairs, so no
# separate fillna pass is needed.
df_events_all_accounts_day_14 = df_events_all_accounts_day_14.pivot_table(
    values='event_count_day_14', columns='event_name', index='systemid', aggfunc=np.sum, fill_value=0)

# Drop the leftover 'event_name' label on the column axis
df_events_all_accounts_day_14.columns.name = None

# Turn systemid back into a regular column
df_events_all_accounts_day_14 = df_events_all_accounts_day_14.reset_index()

# CSV export 
df_events_all_accounts_day_14.to_csv(
    "/Users/dwahid/Documents/GitHub/fraud_detection/data_new/events_new_accounts_day_14.tsv", sep="\t", index=False)

# Import CSV
# df_events_all_accounts_day_14 = pd.read_csv("/Users/dwahid/Documents/GitHub/fraud_detection/data_new/events_new_accounts_day_14.tsv", sep="\t")


# Load the important-feature list (re-read here so the cell does not rely on an earlier one)
important_features = pd.read_csv( 
    "/Users/dwahid/Documents/GitHub/fraud_detection/data_v1/important_features.csv", sep="\n,")

# Get the important features as a list
imp_features_list = list(important_features['important_feature'])

# Ensure every important feature exists as a column; absent ones become all-zero columns
for feature_name in imp_features_list:
    if feature_name in df_events_all_accounts_day_14.columns:
        print("True")
    else:
        print("False")
        df_events_all_accounts_day_14[feature_name] = 0

# Keep the join key plus exactly the columns named in the important-feature list.
# isin tests exact membership; the earlier str.contains('|'.join(...)) form did
# regex substring matching and therefore also kept any column whose name merely
# contained a feature name.
df_events_imp_features_all_accounts_day_14 = df_events_all_accounts_day_14.loc[
    :, df_events_all_accounts_day_14.columns.isin(['systemid'] + list(imp_features_list))]


### Selecting the account attributes and day-14 invoice features
df_invoice_features_all_accounts_day_14 = df_periodic_invoice_all_counts[[
        'systemid',
        'admin_email',
        'is_sales_managed', 
        'is_freshbooks_account_active',
        'is_paying',
        'avg_wc_description_day_14',
        'avg_wc_notes_day_14',
        'avg_wc_terms_day_14',
        'avg_wc_address_day_14',
        'invoice_count_day_14',
        'client_count_day_14'
        ]]

# CSV export (tab-separated despite the .csv suffix)
df_invoice_features_all_accounts_day_14.to_csv(
    "/Users/dwahid/Documents/GitHub/fraud_detection/data_new/invoice_features_new_accounts_day_14.csv", sep="\t", index=False)

# Import CSV
# df_invoice_features_all_accounts_day_14 = pd.read_csv(
#     "/Users/dwahid/Documents/GitHub/fraud_detection/data_v1/invoice_features_all_accounts_day_14.csv", sep="\t")


# Left join on systemid: every account in the event-feature frame is kept
df_final_features_day_14 = pd.merge(df_events_imp_features_all_accounts_day_14, 
                                   df_invoice_features_all_accounts_day_14,
                                     on='systemid', how='left')


# Filtering final data from the FreshBooks Test emails (currently disabled)
# df_final_features_day_14 = df_final_features_day_14[
#     df_final_features_day_14.apply(lambda x: email_match(x['admin_email'], fb_test_email_list) > 0.9, axis=1)]

# CSV export 
df_final_features_day_14.to_csv(
    "/Users/dwahid/Documents/GitHub/fraud_detection/data_new_final/new_final_features_day_14.tsv", sep="\t", index=False)

# Import CSV
# df_final_features_day_14 = pd.read_csv(
#     "/Users/dwahid/Documents/GitHub/fraud_detection/data_new_final/new_final_features_day_14.tsv", sep="\t")


# 7. Final Features Data: Day 21


In [86]:
####### Final Features Data: Day 21 Model ###########

# Narrow the event frame to the account id, the day-21 count and the event name
df_events_all_accounts_day_21 = df_events_all_accounts[['systemid', 'event_count_day_21', 'event_name']]

### Pivot the Day 21 Events (Each Unique Event Becomes a Column) ###
# One row per systemid, one column per unique 'event_name', cells hold the summed
# day-21 counts. fill_value=0 zero-fills absent (systemid, event) pairs, so no
# separate fillna pass is needed.
df_events_all_accounts_day_21 = df_events_all_accounts_day_21.pivot_table(
    values='event_count_day_21', columns='event_name', index='systemid', aggfunc=np.sum, fill_value=0)

# Drop the leftover 'event_name' label on the column axis
df_events_all_accounts_day_21.columns.name = None

# Turn systemid back into a regular column
df_events_all_accounts_day_21 = df_events_all_accounts_day_21.reset_index()

# CSV export 
df_events_all_accounts_day_21.to_csv(
    "/Users/dwahid/Documents/GitHub/fraud_detection/data_new/events_new_accounts_day_21.tsv", sep="\t", index=False)

# Import CSV
# df_events_all_accounts_day_21 = pd.read_csv("/Users/dwahid/Documents/GitHub/fraud_detection/data_v1/events_all_accounts_day_21.tsv", sep="\t")


# Load the important-feature list (re-read here so the cell does not rely on an earlier one)
important_features = pd.read_csv( 
    "/Users/dwahid/Documents/GitHub/fraud_detection/data_v1/important_features.csv", sep="\n,")

# Get the important features as a list
imp_features_list = list(important_features['important_feature'])

# Ensure every important feature exists as a column; absent ones become all-zero columns
for feature_name in imp_features_list:
    if feature_name in df_events_all_accounts_day_21.columns:
        print("True")
    else:
        print("False")
        df_events_all_accounts_day_21[feature_name] = 0

# Keep the join key plus exactly the columns named in the important-feature list.
# isin tests exact membership; the earlier str.contains('|'.join(...)) form did
# regex substring matching and therefore also kept any column whose name merely
# contained a feature name.
df_events_imp_features_all_accounts_day_21 = df_events_all_accounts_day_21.loc[
    :, df_events_all_accounts_day_21.columns.isin(['systemid'] + list(imp_features_list))]


### Selecting the account attributes and day-21 invoice features
df_invoice_features_all_accounts_day_21 = df_periodic_invoice_all_counts[[
        'systemid',
        'admin_email',
        'is_sales_managed', 
        'is_freshbooks_account_active',
        'is_paying',
        'avg_wc_description_day_21',
        'avg_wc_notes_day_21',
        'avg_wc_terms_day_21',
        'avg_wc_address_day_21',
        'invoice_count_day_21',
        'client_count_day_21'
        ]]

# CSV export (tab-separated despite the .csv suffix)
df_invoice_features_all_accounts_day_21.to_csv(
    "/Users/dwahid/Documents/GitHub/fraud_detection/data_new/invoice_features_new_accounts_day_21.csv", sep="\t", index=False)

# Import CSV
# df_invoice_features_all_accounts_day_21 = pd.read_csv(
#     "/Users/dwahid/Documents/GitHub/fraud_detection/data_v1/invoice_features_all_accounts_day_21.csv", sep="\t")


# Left join on systemid: every account in the event-feature frame is kept
df_final_features_day_21 = pd.merge(df_events_imp_features_all_accounts_day_21, 
                                   df_invoice_features_all_accounts_day_21,
                                     on='systemid', how='left')


# Filtering final data from the FreshBooks Test emails (currently disabled)
# df_final_features_day_21 = df_final_features_day_21[
#     df_final_features_day_21.apply(lambda x: email_match(x['admin_email'], fb_test_email_list) > 0.9, axis=1)]

# CSV export 
df_final_features_day_21.to_csv(
    "/Users/dwahid/Documents/GitHub/fraud_detection/data_new_final/new_final_features_day_21.tsv", sep="\t", index=False)

# Import CSV
# df_final_features_day_21 = pd.read_csv(
#     "/Users/dwahid/Documents/GitHub/fraud_detection/data_final/final_features_day_21.tsv", sep="\t")


# 8. Final Features Extraction: Day 28

In [87]:
####### Final Features Data: Day 28 Model ###########

# Narrow the event frame to the account id, the day-28 count and the event name
df_events_all_accounts_day_28 = df_events_all_accounts[['systemid', 'event_count_day_28', 'event_name']]

### Pivot the Day 28 Events (Each Unique Event Becomes a Column) ###
# One row per systemid, one column per unique 'event_name', cells hold the summed
# day-28 counts. fill_value=0 zero-fills absent (systemid, event) pairs, so no
# separate fillna pass is needed.
df_events_all_accounts_day_28 = df_events_all_accounts_day_28.pivot_table(
    values='event_count_day_28', columns='event_name', index='systemid', aggfunc=np.sum, fill_value=0)

# Drop the leftover 'event_name' label on the column axis
df_events_all_accounts_day_28.columns.name = None

# Turn systemid back into a regular column
df_events_all_accounts_day_28 = df_events_all_accounts_day_28.reset_index()

# CSV export 
df_events_all_accounts_day_28.to_csv(
    "/Users/dwahid/Documents/GitHub/fraud_detection/data_new/events_new_accounts_day_28.tsv", sep="\t", index=False)

# Import CSV
# df_events_all_accounts_day_28 = pd.read_csv("/Users/dwahid/Documents/GitHub/fraud_detection/data_v1/events_all_accounts_day_28.tsv", sep="\t")


# Load the important-feature list (re-read here so the cell does not rely on an earlier one)
important_features = pd.read_csv( 
    "/Users/dwahid/Documents/GitHub/fraud_detection/data_v1/important_features.csv", sep="\n,")

# Get the important features as a list
imp_features_list = list(important_features['important_feature'])

# Ensure every important feature exists as a column; absent ones become all-zero columns
for feature_name in imp_features_list:
    if feature_name in df_events_all_accounts_day_28.columns:
        print("True")
    else:
        print("False")
        df_events_all_accounts_day_28[feature_name] = 0

# Keep the join key plus exactly the columns named in the important-feature list.
# isin tests exact membership; the earlier str.contains('|'.join(...)) form did
# regex substring matching and therefore also kept any column whose name merely
# contained a feature name.
df_events_imp_features_all_accounts_day_28 = df_events_all_accounts_day_28.loc[
    :, df_events_all_accounts_day_28.columns.isin(['systemid'] + list(imp_features_list))]


### Selecting the account attributes and day-28 invoice features
df_invoice_features_all_accounts_day_28 = df_periodic_invoice_all_counts[[
        'systemid',
        'admin_email', 
        'is_sales_managed', 
        'is_freshbooks_account_active',
        'is_paying',
        'avg_wc_description_day_28',
        'avg_wc_notes_day_28',
        'avg_wc_terms_day_28',
        'avg_wc_address_day_28',
        'invoice_count_day_28',
        'client_count_day_28'
        ]]

# CSV export (tab-separated despite the .csv suffix)
df_invoice_features_all_accounts_day_28.to_csv(
    "/Users/dwahid/Documents/GitHub/fraud_detection/data_new/invoice_features_new_accounts_day_28.csv", sep="\t", index=False)

# Import CSV
# df_invoice_features_all_accounts_day_28 = pd.read_csv(
#     "/Users/dwahid/Documents/GitHub/fraud_detection/data_v1/invoice_features_all_accounts_day_28.csv", sep="\t")


# Left join on systemid: every account in the event-feature frame is kept
df_final_features_day_28 = pd.merge(df_events_imp_features_all_accounts_day_28, 
                                   df_invoice_features_all_accounts_day_28,
                                     on='systemid', how='left')

# Filtering final data from the FreshBooks Test emails (currently disabled)
# df_final_features_day_28 = df_final_features_day_28[
#     df_final_features_day_28.apply(lambda x: email_match(x['admin_email'], fb_test_email_list) > 0.9, axis=1)]

# CSV export 
df_final_features_day_28.to_csv(
    "/Users/dwahid/Documents/GitHub/fraud_detection/data_new_final/new_final_features_day_28.tsv", sep="\t", index=False)

# Import CSV
# df_final_features_day_28 = pd.read_csv(
#     "/Users/dwahid/Documents/GitHub/fraud_detection/data_final/final_features_day_28.tsv", sep="\t")


# 9. Final Features Extraction: Day 35

In [88]:
####### Final Features Data: Day 35 Model ###########

# --- Events: one row per account, one column per event name ------------------
# Keep only the account id, the event name and the day-35 event count
df_events_all_accounts_day_35 = df_events_all_accounts[['systemid', 'event_count_day_35', 'event_name']]

# Pivot the Day 35 events: each unique 'event_name' becomes a column holding the
# summed 'event_count_day_35' per 'systemid'; missing combinations are filled with 0
df_events_all_accounts_day_35 = df_events_all_accounts_day_35.pivot_table(
    values='event_count_day_35',
    index='systemid',
    columns='event_name',
    aggfunc=np.sum,
    fill_value=0)

# Remove the 'event_name' label that the pivot leaves on the column axis
df_events_all_accounts_day_35.columns.name = None

# Move 'systemid' from the index back into a regular column
df_events_all_accounts_day_35 = df_events_all_accounts_day_35.reset_index()

# Replace any remaining 'NaN' with zero
df_events_all_accounts_day_35 = df_events_all_accounts_day_35.fillna(0)

# TSV export of the pivoted events
df_events_all_accounts_day_35.to_csv(
    "/Users/dwahid/Documents/GitHub/fraud_detection/data_new/events_new_accounts_day_35.tsv",
    sep="\t", index=False)

# Import CSV
# df_events_all_accounts_day_35 = pd.read_csv("/Users/dwahid/Documents/GitHub/fraud_detection/data_v1/events_all_accounts_day_35.tsv", sep="\t")


# --- Restrict the Day 35 events to the important features --------------------
# Importing the important features list
# NOTE(review): the separator "\n," is longer than one character, so pandas
# treats it as a regular expression and uses its Python parsing engine.
# Presumably the file is a single 'important_feature' column and a plain
# pd.read_csv(path) would read the same data - confirm before changing.
important_features = pd.read_csv(
    "/Users/dwahid/Documents/GitHub/fraud_detection/data_v1/important_features.csv", sep="\n,")

# Get the important features as a list
imp_features_list = list(important_features['important_feature'])

# Add every important feature that is not yet a column of the pivoted events as
# an all-zero column; prints "True" when the feature was already present and
# "False" when it had to be added
for feature_name in imp_features_list:
    if feature_name in df_events_all_accounts_day_35.columns:
        print("True")
    else:
        print("False")
        df_events_all_accounts_day_35[feature_name] = 0

# Keep only the important features, plus the 'systemid' key needed by the merge.
# Columns are matched by exact name (isin) instead of a regex built with
# '|'.join(...): a regex also selects columns that merely contain a feature
# name and misreads characters such as '.' or '(' inside a feature name.
columns_to_keep = imp_features_list + ['systemid']
df_events_imp_features_all_accounts_day_35 = \
    df_events_all_accounts_day_35.loc[:, df_events_all_accounts_day_35.columns.isin(columns_to_keep)]


# --- Invoice features at day 35 ----------------------------------------------
# Account attributes plus the average word counts and invoice/client counts
invoice_columns_day_35 = [
    'systemid',
    'admin_email',
    'is_sales_managed',
    'is_freshbooks_account_active',
    'is_paying',
    'avg_wc_description_day_35',
    'avg_wc_notes_day_35',
    'avg_wc_terms_day_35',
    'avg_wc_address_day_35',
    'invoice_count_day_35',
    'client_count_day_35',
]
df_invoice_features_all_accounts_day_35 = df_periodic_invoice_all_counts[invoice_columns_day_35]

# Export (the file is tab-separated even though its extension is .csv)
df_invoice_features_all_accounts_day_35.to_csv(
    "/Users/dwahid/Documents/GitHub/fraud_detection/data_new/invoice_features_new_accounts_day_35.csv",
    sep="\t", index=False)

# Import CSV
# df_invoice_features_all_accounts_day_35 = pd.read_csv(
#     "/Users/dwahid/Documents/GitHub/fraud_detection/data_v1/invoice_features_all_accounts_day_35.csv", sep="\t")


# --- Final features: events + invoice features -------------------------------
# Left merge on 'systemid': every account in the events table is kept, and
# accounts without a matching invoice row get NaN in the invoice columns
df_final_features_day_35 = pd.merge(
    df_events_imp_features_all_accounts_day_35,
    df_invoice_features_all_accounts_day_35,
    on='systemid', how='left')

# Filtering final data from the FreshBooks Test emails
# NOTE(review): disabled here but active in the Day 56-91 cells - confirm which is intended.
# df_final_features_day_35 = df_final_features_day_35[
#     df_final_features_day_35.apply(lambda x: email_match(x['admin_email'], fb_test_email_list) > 0.9, axis=1)]

# TSV export
df_final_features_day_35.to_csv(
    "/Users/dwahid/Documents/GitHub/fraud_detection/data_new_final/new_final_features_day_35.tsv",
    sep="\t", index=False)

# Import CSV
# df_final_features_day_35 = pd.read_csv(
#     "/Users/dwahid/Documents/GitHub/fraud_detection/data_final/final_features_day_35.tsv", sep="\t")


# 10. Final Features Extraction: Day 42

In [89]:
####### Final Features Data: Day 42 Model ###########

# --- Events: one row per account, one column per event name ------------------
# Keep only the account id, the event name and the day-42 event count
df_events_all_accounts_day_42 = df_events_all_accounts[['systemid', 'event_count_day_42', 'event_name']]

# Pivot the Day 42 events: each unique 'event_name' becomes a column holding the
# summed 'event_count_day_42' per 'systemid'; missing combinations are filled with 0
df_events_all_accounts_day_42 = df_events_all_accounts_day_42.pivot_table(
    values='event_count_day_42',
    index='systemid',
    columns='event_name',
    aggfunc=np.sum,
    fill_value=0)

# Remove the 'event_name' label that the pivot leaves on the column axis
df_events_all_accounts_day_42.columns.name = None

# Move 'systemid' from the index back into a regular column
df_events_all_accounts_day_42 = df_events_all_accounts_day_42.reset_index()

# Replace any remaining 'NaN' with zero
df_events_all_accounts_day_42 = df_events_all_accounts_day_42.fillna(0)

# TSV export of the pivoted events
df_events_all_accounts_day_42.to_csv(
    "/Users/dwahid/Documents/GitHub/fraud_detection/data_new/events_new_accounts_day_42.tsv",
    sep="\t", index=False)

# Import CSV
# df_events_all_accounts_day_42 = pd.read_csv("/Users/dwahid/Documents/GitHub/fraud_detection/data_v1/events_all_accounts_day_42.tsv", sep="\t")


# --- Restrict the Day 42 events to the important features --------------------
# Importing the important features list
# NOTE(review): the separator "\n," is longer than one character, so pandas
# treats it as a regular expression and uses its Python parsing engine.
# Presumably the file is a single 'important_feature' column and a plain
# pd.read_csv(path) would read the same data - confirm before changing.
important_features = pd.read_csv(
    "/Users/dwahid/Documents/GitHub/fraud_detection/data_v1/important_features.csv", sep="\n,")

# Get the important features as a list
imp_features_list = list(important_features['important_feature'])

# Add every important feature that is not yet a column of the pivoted events as
# an all-zero column; prints "True" when the feature was already present and
# "False" when it had to be added
for feature_name in imp_features_list:
    if feature_name in df_events_all_accounts_day_42.columns:
        print("True")
    else:
        print("False")
        df_events_all_accounts_day_42[feature_name] = 0

# Keep only the important features, plus the 'systemid' key needed by the merge.
# Columns are matched by exact name (isin) instead of a regex built with
# '|'.join(...): a regex also selects columns that merely contain a feature
# name and misreads characters such as '.' or '(' inside a feature name.
columns_to_keep = imp_features_list + ['systemid']
df_events_imp_features_all_accounts_day_42 = \
    df_events_all_accounts_day_42.loc[:, df_events_all_accounts_day_42.columns.isin(columns_to_keep)]


# --- Invoice features at day 42 ----------------------------------------------
# Account attributes plus the average word counts and invoice/client counts
invoice_columns_day_42 = [
    'systemid',
    'admin_email',
    'is_sales_managed',
    'is_freshbooks_account_active',
    'is_paying',
    'avg_wc_description_day_42',
    'avg_wc_notes_day_42',
    'avg_wc_terms_day_42',
    'avg_wc_address_day_42',
    'invoice_count_day_42',
    'client_count_day_42',
]
df_invoice_features_all_accounts_day_42 = df_periodic_invoice_all_counts[invoice_columns_day_42]

# Export (the file is tab-separated even though its extension is .csv)
df_invoice_features_all_accounts_day_42.to_csv(
    "/Users/dwahid/Documents/GitHub/fraud_detection/data_new/invoice_features_new_accounts_day_42.csv",
    sep="\t", index=False)

# Import CSV
# df_invoice_features_all_accounts_day_42 = pd.read_csv(
#     "/Users/dwahid/Documents/GitHub/fraud_detection/data_v1/invoice_features_all_accounts_day_42.csv", sep="\t")


# --- Final features: events + invoice features -------------------------------
# Left merge on 'systemid': every account in the events table is kept, and
# accounts without a matching invoice row get NaN in the invoice columns
df_final_features_day_42 = pd.merge(
    df_events_imp_features_all_accounts_day_42,
    df_invoice_features_all_accounts_day_42,
    on='systemid', how='left')

# Filtering final data from the FreshBooks Test emails
# NOTE(review): disabled here but active in the Day 56-91 cells - confirm which is intended.
# df_final_features_day_42 = df_final_features_day_42[
#     df_final_features_day_42.apply(lambda x: email_match(x['admin_email'], fb_test_email_list) > 0.9, axis=1)]

# TSV export
df_final_features_day_42.to_csv(
    "/Users/dwahid/Documents/GitHub/fraud_detection/data_new_final/new_final_features_day_42.tsv",
    sep="\t", index=False)

# Import CSV
# df_final_features_day_42 = pd.read_csv(
#     "/Users/dwahid/Documents/GitHub/fraud_detection/data_final/final_features_day_42.tsv", sep="\t")


# 10. Final Features Data: Day 49 Model

In [90]:
####### Final Features Data: Day 49 Model ###########

# --- Events: one row per account, one column per event name ------------------
# Keep only the account id, the event name and the day-49 event count
df_events_all_accounts_day_49 = df_events_all_accounts[['systemid', 'event_count_day_49', 'event_name']]

# Pivot the Day 49 events: each unique 'event_name' becomes a column holding the
# summed 'event_count_day_49' per 'systemid'; missing combinations are filled with 0
df_events_all_accounts_day_49 = df_events_all_accounts_day_49.pivot_table(
    values='event_count_day_49',
    index='systemid',
    columns='event_name',
    aggfunc=np.sum,
    fill_value=0)

# Remove the 'event_name' label that the pivot leaves on the column axis
df_events_all_accounts_day_49.columns.name = None

# Move 'systemid' from the index back into a regular column
df_events_all_accounts_day_49 = df_events_all_accounts_day_49.reset_index()

# Replace any remaining 'NaN' with zero
df_events_all_accounts_day_49 = df_events_all_accounts_day_49.fillna(0)

# TSV export of the pivoted events
df_events_all_accounts_day_49.to_csv(
    "/Users/dwahid/Documents/GitHub/fraud_detection/data_new/events_new_accounts_day_49.tsv",
    sep="\t", index=False)

# Import CSV
# df_events_all_accounts_day_49 = pd.read_csv("/Users/dwahid/Documents/GitHub/fraud_detection/data_v1/events_all_accounts_day_49.tsv", sep="\t")


# --- Restrict the Day 49 events to the important features --------------------
# Importing the important features list
# NOTE(review): the separator "\n," is longer than one character, so pandas
# treats it as a regular expression and uses its Python parsing engine.
# Presumably the file is a single 'important_feature' column and a plain
# pd.read_csv(path) would read the same data - confirm before changing.
important_features = pd.read_csv(
    "/Users/dwahid/Documents/GitHub/fraud_detection/data_v1/important_features.csv", sep="\n,")

# Get the important features as a list
imp_features_list = list(important_features['important_feature'])

# Add every important feature that is not yet a column of the pivoted events as
# an all-zero column; prints "True" when the feature was already present and
# "False" when it had to be added
for feature_name in imp_features_list:
    if feature_name in df_events_all_accounts_day_49.columns:
        print("True")
    else:
        print("False")
        df_events_all_accounts_day_49[feature_name] = 0

# Keep only the important features, plus the 'systemid' key needed by the merge.
# Columns are matched by exact name (isin) instead of a regex built with
# '|'.join(...): a regex also selects columns that merely contain a feature
# name and misreads characters such as '.' or '(' inside a feature name.
columns_to_keep = imp_features_list + ['systemid']
df_events_imp_features_all_accounts_day_49 = \
    df_events_all_accounts_day_49.loc[:, df_events_all_accounts_day_49.columns.isin(columns_to_keep)]


# --- Invoice features at day 49 ----------------------------------------------
# Account attributes plus the average word counts and invoice/client counts
invoice_columns_day_49 = [
    'systemid',
    'admin_email',
    'is_sales_managed',
    'is_freshbooks_account_active',
    'is_paying',
    'avg_wc_description_day_49',
    'avg_wc_notes_day_49',
    'avg_wc_terms_day_49',
    'avg_wc_address_day_49',
    'invoice_count_day_49',
    'client_count_day_49',
]
df_invoice_features_all_accounts_day_49 = df_periodic_invoice_all_counts[invoice_columns_day_49]

# Export (the file is tab-separated even though its extension is .csv)
df_invoice_features_all_accounts_day_49.to_csv(
    "/Users/dwahid/Documents/GitHub/fraud_detection/data_new/invoice_features_new_accounts_day_49.csv",
    sep="\t", index=False)

# Import CSV
# df_invoice_features_all_accounts_day_49 = pd.read_csv(
#     "/Users/dwahid/Documents/GitHub/fraud_detection/data_v1/invoice_features_all_accounts_day_49.csv", sep="\t")


# --- Final features: events + invoice features -------------------------------
# Left merge on 'systemid': every account in the events table is kept, and
# accounts without a matching invoice row get NaN in the invoice columns
df_final_features_day_49 = pd.merge(
    df_events_imp_features_all_accounts_day_49,
    df_invoice_features_all_accounts_day_49,
    on='systemid', how='left')

# Filtering final data from the FreshBooks Test emails
# NOTE(review): disabled here but active in the Day 56-91 cells - confirm which is intended.
# df_final_features_day_49 = df_final_features_day_49[
#     df_final_features_day_49.apply(lambda x: email_match(x['admin_email'], fb_test_email_list) > 0.9, axis=1)]

# TSV export
df_final_features_day_49.to_csv(
    "/Users/dwahid/Documents/GitHub/fraud_detection/data_new_final/new_final_features_day_49.tsv",
    sep="\t", index=False)

# Import CSV
# df_final_features_day_49 = pd.read_csv(
#     "/Users/dwahid/Documents/GitHub/fraud_detection/data_final/final_features_day_49.tsv", sep="\t")


# 11. Final Features Data: Day 56 Model

In [None]:
####### Final Features Data: Day 56 Model ###########

# --- Events: one row per account, one column per event name ------------------
# Keep only the account id, the event name and the day-56 event count
df_events_all_accounts_day_56 = df_events_all_accounts[['systemid', 'event_count_day_56', 'event_name']]

# Pivot the Day 56 events: each unique 'event_name' becomes a column holding the
# summed 'event_count_day_56' per 'systemid'; missing combinations are filled with 0
df_events_all_accounts_day_56 = df_events_all_accounts_day_56.pivot_table(
    values='event_count_day_56',
    index='systemid',
    columns='event_name',
    aggfunc=np.sum,
    fill_value=0)

# Remove the 'event_name' label that the pivot leaves on the column axis
df_events_all_accounts_day_56.columns.name = None

# Move 'systemid' from the index back into a regular column
df_events_all_accounts_day_56 = df_events_all_accounts_day_56.reset_index()

# Replace any remaining 'NaN' with zero
df_events_all_accounts_day_56 = df_events_all_accounts_day_56.fillna(0)

# TSV export of the pivoted events
df_events_all_accounts_day_56.to_csv(
    "/Users/dwahid/Documents/GitHub/fraud_detection/data_new/events_new_accounts_day_56.tsv",
    sep="\t", index=False)

# Import CSV
# df_events_all_accounts_day_56 = pd.read_csv("/Users/dwahid/Documents/GitHub/fraud_detection/data_v1/events_all_accounts_day_56.tsv", sep="\t")


# --- Restrict the Day 56 events to the important features --------------------
# Importing the important features list
# NOTE(review): the separator "\n," is longer than one character, so pandas
# treats it as a regular expression and uses its Python parsing engine.
# Presumably the file is a single 'important_feature' column and a plain
# pd.read_csv(path) would read the same data - confirm before changing.
important_features = pd.read_csv(
    "/Users/dwahid/Documents/GitHub/fraud_detection/data_v1/important_features.csv", sep="\n,")

# Get the important features as a list
imp_features_list = list(important_features['important_feature'])

# Add every important feature that is not yet a column of the pivoted events as
# an all-zero column; prints "True" when the feature was already present and
# "False" when it had to be added
for feature_name in imp_features_list:
    if feature_name in df_events_all_accounts_day_56.columns:
        print("True")
    else:
        print("False")
        df_events_all_accounts_day_56[feature_name] = 0

# Keep only the important features, plus the 'systemid' key needed by the merge.
# Columns are matched by exact name (isin) instead of a regex built with
# '|'.join(...): a regex also selects columns that merely contain a feature
# name and misreads characters such as '.' or '(' inside a feature name.
columns_to_keep = imp_features_list + ['systemid']
df_events_imp_features_all_accounts_day_56 = \
    df_events_all_accounts_day_56.loc[:, df_events_all_accounts_day_56.columns.isin(columns_to_keep)]


# --- Invoice features at day 56 ----------------------------------------------
# Account attributes plus the average word counts and invoice/client counts
invoice_columns_day_56 = [
    'systemid',
    'admin_email',
    'is_sales_managed',
    'is_freshbooks_account_active',
    'is_paying',
    'avg_wc_description_day_56',
    'avg_wc_notes_day_56',
    'avg_wc_terms_day_56',
    'avg_wc_address_day_56',
    'invoice_count_day_56',
    'client_count_day_56',
]
df_invoice_features_all_accounts_day_56 = df_periodic_invoice_all_counts[invoice_columns_day_56]

# Export (the file is tab-separated even though its extension is .csv)
df_invoice_features_all_accounts_day_56.to_csv(
    "/Users/dwahid/Documents/GitHub/fraud_detection/data_new/invoice_features_new_accounts_day_56.csv",
    sep="\t", index=False)

# Import CSV
# df_invoice_features_all_accounts_day_56 = pd.read_csv(
#     "/Users/dwahid/Documents/GitHub/fraud_detection/data_v1/invoice_features_all_accounts_day_56.csv", sep="\t")


# --- Final features: events + invoice features -------------------------------
# Left merge on 'systemid': every account in the events table is kept, and
# accounts without a matching invoice row get NaN in the invoice columns
df_final_features_day_56 = pd.merge(
    df_events_imp_features_all_accounts_day_56,
    df_invoice_features_all_accounts_day_56,
    on='systemid', how='left')

# Filtering final data from the FreshBooks Test emails: keep the rows whose
# email_match score against fb_test_email_list is above 0.9.
# NOTE(review): email_match and fb_test_email_list are defined elsewhere in the
# notebook - confirm that "> 0.9" keeps real accounts rather than test accounts.
# The Day 28-49 cells have this same filter commented out; confirm which is intended.
keep_row_mask_day_56 = df_final_features_day_56.apply(
    lambda row: email_match(row['admin_email'], fb_test_email_list) > 0.9, axis=1)
df_final_features_day_56 = df_final_features_day_56[keep_row_mask_day_56]

# TSV export
df_final_features_day_56.to_csv(
    "/Users/dwahid/Documents/GitHub/fraud_detection/data_new_final/new_final_features_day_56.tsv",
    sep="\t", index=False)

# Import CSV
# df_final_features_day_56 = pd.read_csv(
#     "/Users/dwahid/Documents/GitHub/fraud_detection/data_final/final_features_day_56.tsv", sep="\t")


# 12. Final Features Data: Day 63 Model

In [None]:
####### Final Features Data: Day 63 Model ###########

# --- Events: one row per account, one column per event name ------------------
# Keep only the account id, the event name and the day-63 event count
df_events_all_accounts_day_63 = df_events_all_accounts[['systemid', 'event_count_day_63', 'event_name']]

# Pivot the Day 63 events: each unique 'event_name' becomes a column holding the
# summed 'event_count_day_63' per 'systemid'; missing combinations are filled with 0
df_events_all_accounts_day_63 = df_events_all_accounts_day_63.pivot_table(
    values='event_count_day_63',
    index='systemid',
    columns='event_name',
    aggfunc=np.sum,
    fill_value=0)

# Remove the 'event_name' label that the pivot leaves on the column axis
df_events_all_accounts_day_63.columns.name = None

# Move 'systemid' from the index back into a regular column
df_events_all_accounts_day_63 = df_events_all_accounts_day_63.reset_index()

# Replace any remaining 'NaN' with zero
df_events_all_accounts_day_63 = df_events_all_accounts_day_63.fillna(0)

# TSV export of the pivoted events
df_events_all_accounts_day_63.to_csv(
    "/Users/dwahid/Documents/GitHub/fraud_detection/data_new/events_new_accounts_day_63.tsv",
    sep="\t", index=False)

# Import CSV
# df_events_all_accounts_day_63 = pd.read_csv("/Users/dwahid/Documents/GitHub/fraud_detection/data_v1/events_all_accounts_day_63.tsv", sep="\t")


# --- Restrict the Day 63 events to the important features --------------------
# Importing the important features list
# NOTE(review): the separator "\n," is longer than one character, so pandas
# treats it as a regular expression and uses its Python parsing engine.
# Presumably the file is a single 'important_feature' column and a plain
# pd.read_csv(path) would read the same data - confirm before changing.
important_features = pd.read_csv(
    "/Users/dwahid/Documents/GitHub/fraud_detection/data_v1/important_features.csv", sep="\n,")

# Get the important features as a list
imp_features_list = list(important_features['important_feature'])

# Add every important feature that is not yet a column of the pivoted events as
# an all-zero column; prints "True" when the feature was already present and
# "False" when it had to be added
for feature_name in imp_features_list:
    if feature_name in df_events_all_accounts_day_63.columns:
        print("True")
    else:
        print("False")
        df_events_all_accounts_day_63[feature_name] = 0

# Keep only the important features, plus the 'systemid' key needed by the merge.
# Columns are matched by exact name (isin) instead of a regex built with
# '|'.join(...): a regex also selects columns that merely contain a feature
# name and misreads characters such as '.' or '(' inside a feature name.
columns_to_keep = imp_features_list + ['systemid']
df_events_imp_features_all_accounts_day_63 = \
    df_events_all_accounts_day_63.loc[:, df_events_all_accounts_day_63.columns.isin(columns_to_keep)]


# --- Invoice features at day 63 ----------------------------------------------
# Account attributes plus the average word counts and invoice/client counts
invoice_columns_day_63 = [
    'systemid',
    'admin_email',
    'is_sales_managed',
    'is_freshbooks_account_active',
    'is_paying',
    'avg_wc_description_day_63',
    'avg_wc_notes_day_63',
    'avg_wc_terms_day_63',
    'avg_wc_address_day_63',
    'invoice_count_day_63',
    'client_count_day_63',
]
df_invoice_features_all_accounts_day_63 = df_periodic_invoice_all_counts[invoice_columns_day_63]

# Export (the file is tab-separated even though its extension is .csv)
df_invoice_features_all_accounts_day_63.to_csv(
    "/Users/dwahid/Documents/GitHub/fraud_detection/data_new/invoice_features_new_accounts_day_63.csv",
    sep="\t", index=False)

# Import CSV
# df_invoice_features_all_accounts_day_63 = pd.read_csv(
#     "/Users/dwahid/Documents/GitHub/fraud_detection/data_v1/invoice_features_all_accounts_day_63.csv", sep="\t")


# --- Final features: events + invoice features -------------------------------
# Left merge on 'systemid': every account in the events table is kept, and
# accounts without a matching invoice row get NaN in the invoice columns
df_final_features_day_63 = pd.merge(
    df_events_imp_features_all_accounts_day_63,
    df_invoice_features_all_accounts_day_63,
    on='systemid', how='left')

# Filtering final data from the FreshBooks Test emails: keep the rows whose
# email_match score against fb_test_email_list is above 0.9.
# NOTE(review): email_match and fb_test_email_list are defined elsewhere in the
# notebook - confirm that "> 0.9" keeps real accounts rather than test accounts.
# The Day 28-49 cells have this same filter commented out; confirm which is intended.
keep_row_mask_day_63 = df_final_features_day_63.apply(
    lambda row: email_match(row['admin_email'], fb_test_email_list) > 0.9, axis=1)
df_final_features_day_63 = df_final_features_day_63[keep_row_mask_day_63]

# TSV export
df_final_features_day_63.to_csv(
    "/Users/dwahid/Documents/GitHub/fraud_detection/data_new_final/new_final_features_day_63.tsv",
    sep="\t", index=False)

# Import CSV
# df_final_features_day_63 = pd.read_csv(
#     "/Users/dwahid/Documents/GitHub/fraud_detection/data_final/final_features_day_63.tsv", sep="\t")


# 13. Final Features Data: Day 70 Model

In [None]:
####### Final Features Data: Day 70 Model ###########

# --- Events: one row per account, one column per event name ------------------
# Keep only the account id, the event name and the day-70 event count
df_events_all_accounts_day_70 = df_events_all_accounts[['systemid', 'event_count_day_70', 'event_name']]

# Pivot the Day 70 events: each unique 'event_name' becomes a column holding the
# summed 'event_count_day_70' per 'systemid'; missing combinations are filled with 0
df_events_all_accounts_day_70 = df_events_all_accounts_day_70.pivot_table(
    values='event_count_day_70',
    index='systemid',
    columns='event_name',
    aggfunc=np.sum,
    fill_value=0)

# Remove the 'event_name' label that the pivot leaves on the column axis
df_events_all_accounts_day_70.columns.name = None

# Move 'systemid' from the index back into a regular column
df_events_all_accounts_day_70 = df_events_all_accounts_day_70.reset_index()

# Replace any remaining 'NaN' with zero
df_events_all_accounts_day_70 = df_events_all_accounts_day_70.fillna(0)

# TSV export of the pivoted events
df_events_all_accounts_day_70.to_csv(
    "/Users/dwahid/Documents/GitHub/fraud_detection/data_new/events_new_accounts_day_70.tsv",
    sep="\t", index=False)

# Import CSV
# df_events_all_accounts_day_70 = pd.read_csv("/Users/dwahid/Documents/GitHub/fraud_detection/data_v1/events_all_accounts_day_70.tsv", sep="\t")


# --- Restrict the Day 70 events to the important features --------------------
# Importing the important features list
# NOTE(review): the separator "\n," is longer than one character, so pandas
# treats it as a regular expression and uses its Python parsing engine.
# Presumably the file is a single 'important_feature' column and a plain
# pd.read_csv(path) would read the same data - confirm before changing.
important_features = pd.read_csv(
    "/Users/dwahid/Documents/GitHub/fraud_detection/data_v1/important_features.csv", sep="\n,")

# Get the important features as a list
imp_features_list = list(important_features['important_feature'])

# Add every important feature that is not yet a column of the pivoted events as
# an all-zero column; prints "True" when the feature was already present and
# "False" when it had to be added
for feature_name in imp_features_list:
    if feature_name in df_events_all_accounts_day_70.columns:
        print("True")
    else:
        print("False")
        df_events_all_accounts_day_70[feature_name] = 0

# Keep only the important features, plus the 'systemid' key needed by the merge.
# Columns are matched by exact name (isin) instead of a regex built with
# '|'.join(...): a regex also selects columns that merely contain a feature
# name and misreads characters such as '.' or '(' inside a feature name.
columns_to_keep = imp_features_list + ['systemid']
df_events_imp_features_all_accounts_day_70 = \
    df_events_all_accounts_day_70.loc[:, df_events_all_accounts_day_70.columns.isin(columns_to_keep)]


# --- Invoice features at day 70 ----------------------------------------------
# Account attributes plus the average word counts and invoice/client counts
invoice_columns_day_70 = [
    'systemid',
    'admin_email',
    'is_sales_managed',
    'is_freshbooks_account_active',
    'is_paying',
    'avg_wc_description_day_70',
    'avg_wc_notes_day_70',
    'avg_wc_terms_day_70',
    'avg_wc_address_day_70',
    'invoice_count_day_70',
    'client_count_day_70',
]
df_invoice_features_all_accounts_day_70 = df_periodic_invoice_all_counts[invoice_columns_day_70]

# Export (the file is tab-separated even though its extension is .csv)
df_invoice_features_all_accounts_day_70.to_csv(
    "/Users/dwahid/Documents/GitHub/fraud_detection/data_new/invoice_features_new_accounts_day_70.csv",
    sep="\t", index=False)

# Import CSV
# df_invoice_features_all_accounts_day_70 = pd.read_csv(
#     "/Users/dwahid/Documents/GitHub/fraud_detection/data_v1/invoice_features_all_accounts_day_70.csv", sep="\t")


# --- Final features: events + invoice features -------------------------------
# Left merge on 'systemid': every account in the events table is kept, and
# accounts without a matching invoice row get NaN in the invoice columns
df_final_features_day_70 = pd.merge(
    df_events_imp_features_all_accounts_day_70,
    df_invoice_features_all_accounts_day_70,
    on='systemid', how='left')

# Filtering final data from the FreshBooks Test emails: keep the rows whose
# email_match score against fb_test_email_list is above 0.9.
# NOTE(review): email_match and fb_test_email_list are defined elsewhere in the
# notebook - confirm that "> 0.9" keeps real accounts rather than test accounts.
# The Day 28-49 cells have this same filter commented out; confirm which is intended.
keep_row_mask_day_70 = df_final_features_day_70.apply(
    lambda row: email_match(row['admin_email'], fb_test_email_list) > 0.9, axis=1)
df_final_features_day_70 = df_final_features_day_70[keep_row_mask_day_70]

# TSV export
df_final_features_day_70.to_csv(
    "/Users/dwahid/Documents/GitHub/fraud_detection/data_new_final/new_final_features_day_70.tsv",
    sep="\t", index=False)

# Import CSV
# df_final_features_day_70 = pd.read_csv(
#     "/Users/dwahid/Documents/GitHub/fraud_detection/data_final/final_features_day_70.tsv", sep="\t")


# 14. Final Features Data: Day 77 Model

In [None]:
####### Final Features Data: Day 77 Model ###########

# --- Events: one row per account, one column per event name ------------------
# Keep only the account id, the event name and the day-77 event count
df_events_all_accounts_day_77 = df_events_all_accounts[['systemid', 'event_count_day_77', 'event_name']]

# Pivot the Day 77 events: each unique 'event_name' becomes a column holding the
# summed 'event_count_day_77' per 'systemid'; missing combinations are filled with 0
df_events_all_accounts_day_77 = df_events_all_accounts_day_77.pivot_table(
    values='event_count_day_77',
    index='systemid',
    columns='event_name',
    aggfunc=np.sum,
    fill_value=0)

# Remove the 'event_name' label that the pivot leaves on the column axis
df_events_all_accounts_day_77.columns.name = None

# Move 'systemid' from the index back into a regular column
df_events_all_accounts_day_77 = df_events_all_accounts_day_77.reset_index()

# Replace any remaining 'NaN' with zero
df_events_all_accounts_day_77 = df_events_all_accounts_day_77.fillna(0)

# TSV export of the pivoted events
df_events_all_accounts_day_77.to_csv(
    "/Users/dwahid/Documents/GitHub/fraud_detection/data_new/events_new_accounts_day_77.tsv",
    sep="\t", index=False)

# Import CSV
# df_events_all_accounts_day_77 = pd.read_csv("/Users/dwahid/Documents/GitHub/fraud_detection/data_v1/events_all_accounts_day_77.tsv", sep="\t")


# --- Restrict the Day 77 events to the important features --------------------
# Importing the important features list
# NOTE(review): the separator "\n," is longer than one character, so pandas
# treats it as a regular expression and uses its Python parsing engine.
# Presumably the file is a single 'important_feature' column and a plain
# pd.read_csv(path) would read the same data - confirm before changing.
important_features = pd.read_csv(
    "/Users/dwahid/Documents/GitHub/fraud_detection/data_v1/important_features.csv", sep="\n,")

# Get the important features as a list
imp_features_list = list(important_features['important_feature'])

# Add every important feature that is not yet a column of the pivoted events as
# an all-zero column; prints "True" when the feature was already present and
# "False" when it had to be added
for feature_name in imp_features_list:
    if feature_name in df_events_all_accounts_day_77.columns:
        print("True")
    else:
        print("False")
        df_events_all_accounts_day_77[feature_name] = 0

# Keep only the important features, plus the 'systemid' key needed by the merge.
# Columns are matched by exact name (isin) instead of a regex built with
# '|'.join(...): a regex also selects columns that merely contain a feature
# name and misreads characters such as '.' or '(' inside a feature name.
columns_to_keep = imp_features_list + ['systemid']
df_events_imp_features_all_accounts_day_77 = \
    df_events_all_accounts_day_77.loc[:, df_events_all_accounts_day_77.columns.isin(columns_to_keep)]


# --- Invoice features at day 77 ----------------------------------------------
# Account attributes plus the average word counts and invoice/client counts
invoice_columns_day_77 = [
    'systemid',
    'admin_email',
    'is_sales_managed',
    'is_freshbooks_account_active',
    'is_paying',
    'avg_wc_description_day_77',
    'avg_wc_notes_day_77',
    'avg_wc_terms_day_77',
    'avg_wc_address_day_77',
    'invoice_count_day_77',
    'client_count_day_77',
]
df_invoice_features_all_accounts_day_77 = df_periodic_invoice_all_counts[invoice_columns_day_77]

# Export (the file is tab-separated even though its extension is .csv)
df_invoice_features_all_accounts_day_77.to_csv(
    "/Users/dwahid/Documents/GitHub/fraud_detection/data_new/invoice_features_new_accounts_day_77.csv",
    sep="\t", index=False)

# Import CSV
# df_invoice_features_all_accounts_day_77 = pd.read_csv(
#     "/Users/dwahid/Documents/GitHub/fraud_detection/data_v1/invoice_features_all_accounts_day_77.csv", sep="\t")


# --- Final features: events + invoice features -------------------------------
# Left merge on 'systemid': every account in the events table is kept, and
# accounts without a matching invoice row get NaN in the invoice columns
df_final_features_day_77 = pd.merge(
    df_events_imp_features_all_accounts_day_77,
    df_invoice_features_all_accounts_day_77,
    on='systemid', how='left')

# Filtering final data from the FreshBooks Test emails: keep the rows whose
# email_match score against fb_test_email_list is above 0.9.
# NOTE(review): email_match and fb_test_email_list are defined elsewhere in the
# notebook - confirm that "> 0.9" keeps real accounts rather than test accounts.
# The Day 28-49 cells have this same filter commented out; confirm which is intended.
keep_row_mask_day_77 = df_final_features_day_77.apply(
    lambda row: email_match(row['admin_email'], fb_test_email_list) > 0.9, axis=1)
df_final_features_day_77 = df_final_features_day_77[keep_row_mask_day_77]

# TSV export
df_final_features_day_77.to_csv(
    "/Users/dwahid/Documents/GitHub/fraud_detection/data_new_final/new_final_features_day_77.tsv",
    sep="\t", index=False)

# Import CSV
# df_final_features_day_77 = pd.read_csv(
#     "/Users/dwahid/Documents/GitHub/fraud_detection/data_final/final_features_day_77.tsv", sep="\t")


## 15. Final Features Data: Day 84 Model

In [None]:
####### Final Features Data: Day 84 Model ###########

# --- Events: one row per account, one column per event name ------------------
# Keep only the account id, the event name and the day-84 event count
df_events_all_accounts_day_84 = df_events_all_accounts[['systemid', 'event_count_day_84', 'event_name']]

# Pivot the Day 84 events: each unique 'event_name' becomes a column holding the
# summed 'event_count_day_84' per 'systemid'; missing combinations are filled with 0
df_events_all_accounts_day_84 = df_events_all_accounts_day_84.pivot_table(
    values='event_count_day_84',
    index='systemid',
    columns='event_name',
    aggfunc=np.sum,
    fill_value=0)

# Remove the 'event_name' label that the pivot leaves on the column axis
df_events_all_accounts_day_84.columns.name = None

# Move 'systemid' from the index back into a regular column
df_events_all_accounts_day_84 = df_events_all_accounts_day_84.reset_index()

# Replace any remaining 'NaN' with zero
df_events_all_accounts_day_84 = df_events_all_accounts_day_84.fillna(0)

# TSV export of the pivoted events
df_events_all_accounts_day_84.to_csv(
    "/Users/dwahid/Documents/GitHub/fraud_detection/data_new/events_new_accounts_day_84.tsv",
    sep="\t", index=False)

# Import CSV
# df_events_all_accounts_day_84 = pd.read_csv("/Users/dwahid/Documents/GitHub/fraud_detection/data_v1/events_all_accounts_day_84.tsv", sep="\t")


# --- Restrict the Day 84 events to the important features --------------------
# Importing the important features list
# NOTE(review): the separator "\n," is longer than one character, so pandas
# treats it as a regular expression and uses its Python parsing engine.
# Presumably the file is a single 'important_feature' column and a plain
# pd.read_csv(path) would read the same data - confirm before changing.
important_features = pd.read_csv(
    "/Users/dwahid/Documents/GitHub/fraud_detection/data_v1/important_features.csv", sep="\n,")

# Get the important features as a list
imp_features_list = list(important_features['important_feature'])

# Add every important feature that is not yet a column of the pivoted events as
# an all-zero column; prints "True" when the feature was already present and
# "False" when it had to be added
for feature_name in imp_features_list:
    if feature_name in df_events_all_accounts_day_84.columns:
        print("True")
    else:
        print("False")
        df_events_all_accounts_day_84[feature_name] = 0

# Keep only the important features, plus the 'systemid' key needed by the merge.
# Columns are matched by exact name (isin) instead of a regex built with
# '|'.join(...): a regex also selects columns that merely contain a feature
# name and misreads characters such as '.' or '(' inside a feature name.
columns_to_keep = imp_features_list + ['systemid']
df_events_imp_features_all_accounts_day_84 = \
    df_events_all_accounts_day_84.loc[:, df_events_all_accounts_day_84.columns.isin(columns_to_keep)]


# --- Invoice features at day 84 ----------------------------------------------
# Account attributes plus the average word counts and invoice/client counts
invoice_columns_day_84 = [
    'systemid',
    'admin_email',
    'is_sales_managed',
    'is_freshbooks_account_active',
    'is_paying',
    'avg_wc_description_day_84',
    'avg_wc_notes_day_84',
    'avg_wc_terms_day_84',
    'avg_wc_address_day_84',
    'invoice_count_day_84',
    'client_count_day_84',
]
df_invoice_features_all_accounts_day_84 = df_periodic_invoice_all_counts[invoice_columns_day_84]

# Export (the file is tab-separated even though its extension is .csv)
df_invoice_features_all_accounts_day_84.to_csv(
    "/Users/dwahid/Documents/GitHub/fraud_detection/data_new/invoice_features_new_accounts_day_84.csv",
    sep="\t", index=False)

# Import CSV
# df_invoice_features_all_accounts_day_84 = pd.read_csv(
#     "/Users/dwahid/Documents/GitHub/fraud_detection/data_v1/invoice_features_all_accounts_day_84.csv", sep="\t")


# --- Final features: events + invoice features -------------------------------
# Left merge on 'systemid': every account in the events table is kept, and
# accounts without a matching invoice row get NaN in the invoice columns
df_final_features_day_84 = pd.merge(
    df_events_imp_features_all_accounts_day_84,
    df_invoice_features_all_accounts_day_84,
    on='systemid', how='left')

# Filtering final data from the FreshBooks Test emails: keep the rows whose
# email_match score against fb_test_email_list is above 0.9.
# NOTE(review): email_match and fb_test_email_list are defined elsewhere in the
# notebook - confirm that "> 0.9" keeps real accounts rather than test accounts.
# The Day 28-49 cells have this same filter commented out; confirm which is intended.
keep_row_mask_day_84 = df_final_features_day_84.apply(
    lambda row: email_match(row['admin_email'], fb_test_email_list) > 0.9, axis=1)
df_final_features_day_84 = df_final_features_day_84[keep_row_mask_day_84]

# TSV export
df_final_features_day_84.to_csv(
    "/Users/dwahid/Documents/GitHub/fraud_detection/data_new_final/new_final_features_day_84.tsv",
    sep="\t", index=False)

# Import CSV
# df_final_features_day_84 = pd.read_csv(
#     "/Users/dwahid/Documents/GitHub/fraud_detection/data_final/final_features_day_84.tsv", sep="\t")


## 16. Final Features Data: Day 91 Model

In [None]:
####### Final Features Data: Day 91 Model ###########

# --- Events: one row per account, one column per event name ------------------
# Keep only the account id, the event name and the day-91 event count
df_events_all_accounts_day_91 = df_events_all_accounts[['systemid', 'event_count_day_91', 'event_name']]

# Pivot the Day 91 events: each unique 'event_name' becomes a column holding the
# summed 'event_count_day_91' per 'systemid'; missing combinations are filled with 0
df_events_all_accounts_day_91 = df_events_all_accounts_day_91.pivot_table(
    values='event_count_day_91',
    index='systemid',
    columns='event_name',
    aggfunc=np.sum,
    fill_value=0)

# Remove the 'event_name' label that the pivot leaves on the column axis
df_events_all_accounts_day_91.columns.name = None

# Move 'systemid' from the index back into a regular column
df_events_all_accounts_day_91 = df_events_all_accounts_day_91.reset_index()

# Replace any remaining 'NaN' with zero
df_events_all_accounts_day_91 = df_events_all_accounts_day_91.fillna(0)

# TSV export of the pivoted events
df_events_all_accounts_day_91.to_csv(
    "/Users/dwahid/Documents/GitHub/fraud_detection/data_new/events_new_accounts_day_91.tsv",
    sep="\t", index=False)

# Import CSV
# df_events_all_accounts_day_91 = pd.read_csv("/Users/dwahid/Documents/GitHub/fraud_detection/data_v1/events_all_accounts_day_91.tsv", sep="\t")


# --- Restrict the Day 91 events to the important features --------------------
# Importing the important features list
# NOTE(review): the separator "\n," is longer than one character, so pandas
# treats it as a regular expression and uses its Python parsing engine.
# Presumably the file is a single 'important_feature' column and a plain
# pd.read_csv(path) would read the same data - confirm before changing.
important_features = pd.read_csv(
    "/Users/dwahid/Documents/GitHub/fraud_detection/data_v1/important_features.csv", sep="\n,")

# Get the important features as a list
imp_features_list = list(important_features['important_feature'])

# Add every important feature that is not yet a column of the pivoted events as
# an all-zero column; prints "True" when the feature was already present and
# "False" when it had to be added
for feature_name in imp_features_list:
    if feature_name in df_events_all_accounts_day_91.columns:
        print("True")
    else:
        print("False")
        df_events_all_accounts_day_91[feature_name] = 0

# Keep only the important features, plus the 'systemid' key needed by the merge.
# Columns are matched by exact name (isin) instead of a regex built with
# '|'.join(...): a regex also selects columns that merely contain a feature
# name and misreads characters such as '.' or '(' inside a feature name.
columns_to_keep = imp_features_list + ['systemid']
df_events_imp_features_all_accounts_day_91 = \
    df_events_all_accounts_day_91.loc[:, df_events_all_accounts_day_91.columns.isin(columns_to_keep)]


# --- Invoice features at day 91 ----------------------------------------------
# Account attributes plus the average word counts and invoice/client counts
invoice_columns_day_91 = [
    'systemid',
    'admin_email',
    'is_sales_managed',
    'is_freshbooks_account_active',
    'is_paying',
    'avg_wc_description_day_91',
    'avg_wc_notes_day_91',
    'avg_wc_terms_day_91',
    'avg_wc_address_day_91',
    'invoice_count_day_91',
    'client_count_day_91',
]
df_invoice_features_all_accounts_day_91 = df_periodic_invoice_all_counts[invoice_columns_day_91]

# Export (the file is tab-separated even though its extension is .csv)
df_invoice_features_all_accounts_day_91.to_csv(
    "/Users/dwahid/Documents/GitHub/fraud_detection/data_new/invoice_features_new_accounts_day_91.csv",
    sep="\t", index=False)

# Import CSV
# df_invoice_features_all_accounts_day_91 = pd.read_csv(
#     "/Users/dwahid/Documents/GitHub/fraud_detection/data_v1/invoice_features_all_accounts_day_91.csv", sep="\t")


# --- Final features: events + invoice features -------------------------------
# Left merge on 'systemid': every account in the events table is kept, and
# accounts without a matching invoice row get NaN in the invoice columns
df_final_features_day_91 = pd.merge(
    df_events_imp_features_all_accounts_day_91,
    df_invoice_features_all_accounts_day_91,
    on='systemid', how='left')

# Filtering final data from the FreshBooks Test emails: keep the rows whose
# email_match score against fb_test_email_list is above 0.9.
# NOTE(review): email_match and fb_test_email_list are defined elsewhere in the
# notebook - confirm that "> 0.9" keeps real accounts rather than test accounts.
# The Day 28-49 cells have this same filter commented out; confirm which is intended.
keep_row_mask_day_91 = df_final_features_day_91.apply(
    lambda row: email_match(row['admin_email'], fb_test_email_list) > 0.9, axis=1)
df_final_features_day_91 = df_final_features_day_91[keep_row_mask_day_91]

# TSV export
df_final_features_day_91.to_csv(
    "/Users/dwahid/Documents/GitHub/fraud_detection/data_new_final/new_final_features_day_91.tsv",
    sep="\t", index=False)

# Import CSV
# df_final_features_day_91 = pd.read_csv(
#     "/Users/dwahid/Documents/GitHub/fraud_detection/data_final/final_features_day_91.tsv", sep="\t")
