# Non Fraud Accounts 20180801 To 20190730: Querying and Features Extraction (Day 07)

We use this script to query and extract features for non-fraud accounts (paying, active, sales-managed accounts that signed up between 2018-08-01 and 2019-07-30).

In [1]:
import pandas as pd
import numpy as np
from datetime import date

from scipy import stats
# Set the IPCompleter.greedy option to True through the `config` line magic.
# NOTE(review): `get_ipython` is not imported anywhere in this file, so this
# presumably only works when the cell runs inside an IPython/Jupyter kernel
# -- confirm before running the file as a plain script.
get_ipython().magic(u'config IPCompleter.greedy=True')


## Connect with the Redshift Database

In [2]:
from contextlib import closing


import psycopg2
import simplejson
import sys

# Python 2 workaround: force the implicit str/unicode conversions to use
# UTF-8.  `reload` is a builtin and `sys.setdefaultencoding` exists only on
# Python 2; on Python 3 the default encoding is already UTF-8, so the hack is
# skipped there instead of failing with a NameError.
if sys.version_info[0] < 3:
    reload(sys)  # noqa: F821 -- builtin on Python 2 only
    sys.setdefaultencoding('utf8')

# Connection defaults used when the caller does not override them.
DEFAULT_DB = 'data_depot'
DEFAULT_HOST = 'freshbooks-data.c8exzn6geij3.us-east-1.redshift.amazonaws.com'
DEFAULT_PORT = 5439


class PsycopgConnector:
    '''
    A database connector that uses Psycopg to connect to Redshift.

    How to play:

        psy_conn = PsycopgConnector(username, password)
        df = psy_conn.run_query(sql=sql, return_data=True)

    NOTE: This class commits queries to redshift if return_data=False.
    This means INSERT, DROP, TRUNCATE, etc. all work against the DB.
    '''

    def __init__(
        self,
        username=None,
        password=None,
        db=DEFAULT_DB,
        host=DEFAULT_HOST,
        port=DEFAULT_PORT,
    ):
        '''
        Store the connection settings; no connection is opened here.

        username, password: credentials passed to psycopg2.connect.
        db, host, port: target database; default to the DEFAULT_* constants.
        '''
        # Honour the caller's overrides (previously the DEFAULT_* constants
        # were always stored, so these arguments were silently ignored).
        self.db = db
        self.host = host
        self.port = port

        self.username = username
        self.password = password

    def _get_connection(self):
        '''Open a new psycopg2 connection, keep it on self.conn and return it.'''
        self.conn = psycopg2.connect(
            dbname=self.db,
            user=self.username,
            password=self.password,
            host=self.host,
            port=self.port
        )

        return self.conn

    def run_query(self, sql, return_data=False):
        '''
        Run `sql` on a fresh connection that is closed afterwards.

        sql: the SQL text to execute.
        return_data: when True, the result set is returned as a pandas
            DataFrame (via pd.read_sql); otherwise the statement is executed
            through a cursor and None is returned.
        '''
        with closing(self._get_connection()) as conn:
            # `with conn` is psycopg2's transaction scope: commit when the
            # block succeeds, roll back when it raises.
            with conn:
                if return_data:
                    return pd.read_sql(sql=sql, con=conn)
                # A cursor is only needed when we execute the statement
                # ourselves.
                with conn.cursor() as cur:
                    cur.execute(sql)
                    

# Load the Redshift credentials from the JSON credentials file
with open("redshift_creds.json.nogit") as fh:
    creds = simplejson.load(fh)

username, password = creds.get("user_name"), creds.get("password")

# Connector shared by all the queries in this notebook
pig = PsycopgConnector(username=username, password=password)

In [3]:
# Smoke test: pull five rows from report_systems to confirm the connection works
sql_test = '''SELECT * FROM report_systems LIMIT 5'''
df_test = pig.run_query(sql=sql_test, return_data=True)
df_test

Unnamed: 0,systemid,business_id,admin_identity_id,subdomain,is_freshbooks_account_active,is_modern,most_recent_migrated_to_smux_at,is_contractor,currency_code,timezone,...,staff_count,staff_deleted_count,contractor_count,contractor_deleted_count,user_contact_count,enabled_gateway_count,google_sso_first_linked_date,google_sso_most_recent_linked_date,google_sso_first_removal_date,google_sso_most_recent_removal_date
0,2991,,,https://Dollface.freshbooks.com,1,0,,0,USD,US/Eastern,...,0,1,0,0,0,1,,,,
1,3627,,,https://mjbcomputers.freshbooks.com,0,0,,0,USD,US/Mountain,...,0,0,0,0,0,1,,,,
2,5623,,,https://BamboDansoko.freshbooks.com,1,0,,0,USD,Africa/Casablanca,...,0,0,0,0,0,0,,,,
3,6309,,,https://jw850.freshbooks.com,0,0,,0,USD,US/Eastern,...,0,0,0,0,0,0,,,,
4,7028,,,https://pinetreehost.freshbooks.com,1,0,,0,USD,US/Eastern,...,0,1,0,0,3,1,,,,


## Functions

In [4]:
# Word count function
import re
def words_count(strg):
    """Return how many word tokens `strg` contains.

    A token is a maximal run of word characters and apostrophes (see the
    pattern below).  An empty string, or a value that `pd.isnull` reports as
    null, counts as zero words.
    """
    has_no_text = strg == '' or pd.isnull(strg)
    if has_no_text:
        return 0

    return len(re.findall(r"[\w']+", strg))
    

# 1. Import Invoice Data & Extract Avg Word Counts Features

## 1.01 Invoice within 7 days

In [5]:
# SQL for importing all invoices created within 7 days after signup_date.
#   * invoices_in_a_period: accounts from report_systems that signed up between
#     2018-08-01 and 2019-07-30 and are sales-managed, active and paying.
#   * invoice_created_at: LEFT JOIN of those accounts to invoice_stable, keeping
#     invoices created 0-7 days after signup plus rows with no invoice at all
#     (days_to_invoice_creation IS NULL).
# NOTE(review): days_to_invoice_creation is computed from inv.created_at, but
# the column selected for output is inv.create_date -- confirm these are
# meant to be two different columns.
sql_invoices_7days_all_accounts = '''WITH invoices_in_a_period AS (
    SELECT
            systemid,
            signup_date
    , is_sales_managed
    , is_freshbooks_account_active
    , freshbooks_account_status
    , is_paying
    FROM report_systems rs
    WHERE signup_date BETWEEN '2018-08-01' and '2019-07-30'
    AND is_sales_managed = '1' AND is_freshbooks_account_active = '1' AND is_paying ='1'
    AND freshbooks_account_status = 'active'
), invoice_created_at AS (
    SELECT
           pic.systemid,
           pic.signup_date,
           inv.invoiceid,
           inv.create_date,
           inv.description,
           inv.notes,
           inv.terms,
           inv.address,
           DATEDIFF(days, pic.signup_date, inv.created_at) AS days_to_invoice_creation
    FROM invoices_in_a_period AS pic
    LEFT JOIN coalesced_live_shards.invoice_stable as inv USING (systemid)
    WHERE ((days_to_invoice_creation BETWEEN 0 AND 7) OR days_to_invoice_creation IS NULL)
)

SELECT *
FROM invoice_created_at;
'''

# Run the query on Redshift and load the result as a dataframe
# df_invoices_7days_all_accounts = pd.read_sql_query(sql_invoices_7days_all_accounts, connect_to_db)
df_invoices_7days_all_accounts = pig.run_query(sql_invoices_7days_all_accounts, return_data=True)


# Per-invoice word counts for the description, notes, terms and address text.
# (The columns carry the 'avg_' prefix already because they are averaged per
# account further down.)
wc_feature_by_text_column = [
    ('description', 'avg_wc_description_day_7'),
    ('notes', 'avg_wc_notes_day_7'),
    ('terms', 'avg_wc_terms_day_7'),
    ('address', 'avg_wc_address_day_7'),
]
for text_column, feature_column in wc_feature_by_text_column:
    # Column-wise apply: same values as the row-wise version, without
    # building a Series for every row.
    df_invoices_7days_all_accounts[feature_column] = \
        df_invoices_7days_all_accounts[text_column].apply(words_count)

wc_feature_columns = [feature_column for _, feature_column in wc_feature_by_text_column]

# Drop the raw text columns from the dataframe (labels that are not present
# in the frame are ignored by DataFrame.filter)
df_invoices_7days_all_accounts_fil = df_invoices_7days_all_accounts.filter(
    ['systemid',
     'invoiceid',
     'signup_date',
     'create_date',
     'created_at',
     'days_to_invoice_creation'] + wc_feature_columns)

# Average the per-invoice word counts over all invoices of a 'systemid'.
# Only the word-count columns are aggregated: taking the mean of the date
# columns is meaningless and raises on pandas >= 2.0.
df_word_count_7days_all_accounts_total = \
    df_invoices_7days_all_accounts_fil.groupby('systemid')[wc_feature_columns].mean()

# Final word count table (indexed by 'systemid')
df_word_count_7days_all_accounts_final = df_word_count_7days_all_accounts_total.filter(wc_feature_columns)

In [6]:
df_word_count_7days_all_accounts_final.head()

Unnamed: 0_level_0,avg_wc_description_day_7,avg_wc_notes_day_7,avg_wc_terms_day_7,avg_wc_address_day_7
systemid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3594215,0.0,2.0,4.0,0.0
3597113,18.25,9.875,8.9375,0.0
3597541,0.0,2.166667,8.0,0.0
3599743,10.0,0.0,0.0,0.0
3602563,32.225352,41.408451,0.591549,0.0


In [7]:
df_word_count_7days_all_accounts_final.shape

(297, 4)

# 2. Report Systems Features


In [8]:
################# Import RSystems, Periodic Invoices & Client Counts Data ###############

# SQL query.
#   * periodic_report_system_activities: accounts from report_systems that
#     signed up between 2018-08-01 and 2019-07-30 and are sales-managed,
#     active and paying.
#   * invoice_create_date / invoice_grouping: per-account invoice count and the
#     number of invoices created 0-7 days after signup.
#   * client_crate_date / client_grouping: per-account row count from the
#     "user" table and the number created 0-7 days after signup.
# The final SELECT returns one row per account with the day-7 counts only;
# invoice_count and client_count are computed but not selected.
# days_on_platform is measured against current_date, so it depends on the day
# the query is run.
sql_rs_invoices_clients_activities_all_accounts = '''WITH periodic_report_system_activities AS (
    SELECT
        systemid,
        signup_date,
        admin_email,
        is_sales_managed,
        is_freshbooks_account_active,
        freshbooks_account_status,
        is_paying
    FROM report_systems rs
    WHERE signup_date BETWEEN '2018-08-01' and '2019-07-30'
    AND is_sales_managed = '1' AND is_freshbooks_account_active = '1' AND is_paying ='1'
    AND freshbooks_account_status = 'active'
), invoice_create_date AS (
    SELECT
           pic.systemid,
           inv.invoiceid,
           inv.create_date,
           inv.created_at,
           DATEDIFF(days, pic.signup_date, inv.created_at) AS days_to_invoice_creation
    FROM periodic_report_system_activities AS pic
    LEFT JOIN coalesced_live_shards.invoice_stable as inv USING (systemid)
), invoice_grouping AS (
    SELECT
           systemid,
           COUNT(invoiceid) as invoice_count,
           SUM(CASE WHEN days_to_invoice_creation BETWEEN 0 AND 7 THEN 1 ELSE 0 END) AS invoice_count_day_7
    FROM invoice_create_date
    GROUP BY systemid
), client_crate_date AS (
     SELECT
            pic.systemid,
            usr.userid,
            usr.signup_date,
            DATEDIFF(days, pic.signup_date, usr.signup_date) AS days_to_client_creation
    FROM periodic_report_system_activities  AS pic
    LEFT JOIN coalesced_live_shards."user" as usr USING (systemid)
), client_grouping AS (
    SELECT
           systemid,
           count(userid) AS client_count,
           SUM(CASE WHEN days_to_client_creation BETWEEN 0 AND 7 THEN 1 ELSE 0 END) AS client_count_day_7

    FROM  client_crate_date
    GROUP BY systemid
)

SELECT
       systemid,
       signup_date,
       current_date as effective_date,
       DATEDIFF(days, signup_date, current_date) as days_on_platform,
       admin_email,
       is_sales_managed,
       is_freshbooks_account_active,
       is_paying,
       inv_gr.invoice_count_day_7,
       cl_gr.client_count_day_7
FROM periodic_report_system_activities
LEFT JOIN invoice_grouping as inv_gr USING (systemid)
LEFT JOIN client_grouping AS cl_gr USING (systemid);
'''

# Run the query on Redshift and load the result as a dataframe
df_rs_invoices_clients_activities_all_accounts = pig.run_query(sql_rs_invoices_clients_activities_all_accounts, return_data=True)


In [9]:
# checking 
df_rs_invoices_clients_activities_all_accounts.tail()

Unnamed: 0,systemid,signup_date,effective_date,days_on_platform,admin_email,is_sales_managed,is_freshbooks_account_active,is_paying,invoice_count_day_7,client_count_day_7
386,4213392,2019-04-08,2020-02-24,322,proflame@shaw.ca,1,1,1,82,73
387,3664645,2018-08-31,2020-02-24,542,rruckman@nabcblues.org,1,1,1,3,108
388,3793865,2018-10-23,2020-02-24,489,dave@davecoffin.com,1,1,1,0,2
389,4161136,2019-03-18,2020-02-24,343,stuart@sbmedia.co,1,1,1,4,2
390,4166000,2019-03-20,2020-02-24,341,alewis@realhrsolutions.com,1,1,1,0,3


In [10]:
df_rs_invoices_clients_activities_all_accounts.shape

(391, 10)

# 4. Import and Extract Features from Events Data
## 4.1 Event data collection 

In [12]:
############################### Event Features Extraction ################################

# SQL for events.
#   * selected_accounts_events: accounts from report_systems that signed up
#     between 2018-08-01 and 2019-07-30 and are sales-managed, active and paying.
#   * events_activities: LEFT JOIN of those accounts to event_counts, d_date and
#     d_event; the event name is lower-cased and days_to_event is the number of
#     days between signup_date and the event date.
#   * event_groupings: distinct rows; day_7_event holds the count only for
#     events 0-7 days after signup and is NULL otherwise.
# The final SELECT sums day_7_event per (systemid, event), so an event that
# only happened outside the 0-7 day window yields a NULL event_count_day_7.
sql_events = '''WITH selected_accounts_events AS (
    SELECT 
            systemid,
            signup_date,
            signup_datetime, 
            is_sales_managed,
            is_freshbooks_account_active,
            freshbooks_account_status,
            is_paying
    FROM report_systems
    WHERE signup_date BETWEEN '2018-08-01' and '2019-07-30'
    AND is_sales_managed = '1' AND is_freshbooks_account_active = '1' AND is_paying ='1'
    AND freshbooks_account_status = 'active'
), events_activities AS (
    SELECT sae.systemid,
           signup_date,
           dd.date,
           datediff(days, signup_date, dd.date) as days_to_event,
           lower(e.event) as event,
           ec.count
    FROM selected_accounts_events AS sae
    LEFT JOIN event_counts AS ec USING (systemid)
    LEFT JOIN d_date AS dd USING (date_key)
    LEFT JOIN d_event e on ec.event_key = e.event_key
), event_groupings AS (
    SELECT distinct  ea.systemid,
                    ea.signup_date,
                    ea.date,
                    ea.event,
                    ea.count,
                    (CASE WHEN days_to_event BETWEEN 0 AND 7 THEN ea.count END) AS day_7_event
    FROM events_activities AS ea
)
SELECT systemid,
       event,
       sum(day_7_event) AS event_count_day_7
From event_groupings
GROUP BY systemid, signup_date, event
ORDER BY systemid, event_count_day_7 DESC;'''

# Run the query on Redshift and load the result as a dataframe
# df_events_all_accounts = pd.read_sql_query(sql_events, connect_to_db)
df_events_all_accounts = pig.run_query(sql_events, return_data=True)


In [13]:
# checking
df_events_all_accounts.head()

Unnamed: 0,systemid,event,event_count_day_7
0,3594215,create bank transaction,
1,3594215,activate expense,
2,3594215,email third late reminder,
3,3594215,credit card client access granted,
4,3594215,first client role email,


In [14]:
df_events_all_accounts.shape

(31486, 3)

## 4.2 Removing whitespace from the event strings

In [15]:
# Drop every row in which any cell renders as the string 'None'
# (i.e. rows whose event is missing).  `axis=1` is passed by keyword: the
# positional form `any(1)` is no longer accepted by pandas 2.x.
has_none_cell = df_events_all_accounts.astype(str).eq('None').any(axis=1)

# .copy() so the changes below are made on an independent frame rather than
# on a filtered slice of the query result (avoids SettingWithCopyWarning).
df_events_all_accounts = df_events_all_accounts[~has_none_cell].copy()

# Replace the remaining missing values (NaN counts) by zero
df_events_all_accounts = df_events_all_accounts.fillna(0)

# Build a compact event name by removing spaces, hyphens and slashes
df_events_all_accounts['event_name'] = df_events_all_accounts['event'].apply(
    lambda event: event.replace(' ', '').replace('-', '').replace('/', ''))

In [16]:
# checking
df_events_all_accounts.head()

Unnamed: 0,systemid,event,event_count_day_7,event_name
0,3594215,create bank transaction,0.0,createbanktransaction
1,3594215,activate expense,0.0,activateexpense
2,3594215,email third late reminder,0.0,emailthirdlatereminder
3,3594215,credit card client access granted,0.0,creditcardclientaccessgranted
4,3594215,first client role email,0.0,firstclientroleemail


In [17]:
# list(df_events_all_accounts['event_name'])

In [18]:
# Keep only the account id, the day-7 event count and the cleaned event name
day_7_event_columns = ['systemid', 'event_count_day_7', 'event_name']
df_events_all_accounts_day_7 = df_events_all_accounts[day_7_event_columns]

In [19]:
df_events_all_accounts_day_7.tail()

Unnamed: 0,systemid,event_count_day_7,event_name
31481,4500658,1.0,finishedansweringsurveyforbusiness
31482,4500658,1.0,ariasubscriptioncreated
31483,4500658,1.0,emailinvoicesample
31484,4500658,1.0,teamsizeset
31485,4500658,1.0,updateinvoicesample


In [20]:
df_events_all_accounts_day_7.shape

(31486, 3)

## 4.3 Pivot the events (each unique event becomes a column)

In [21]:
### Pivot the day-7 events: one row per account, one column per unique event ###

# Sum the day-7 counts for every (systemid, event_name) pair; pairs that do
# not occur are filled with 0
df_events_all_accounts_day_7 = pd.pivot_table(
    df_events_all_accounts_day_7,
    index='systemid',
    columns='event_name',
    values='event_count_day_7',
    aggfunc=np.sum,
    fill_value=0
)

# Clear the 'event_name' label that the pivot leaves on the column axis
df_events_all_accounts_day_7.columns.name = None

# Turn 'systemid' back into an ordinary column
df_events_all_accounts_day_7 = df_events_all_accounts_day_7.reset_index()

# Zero-fill anything that is still missing
df_events_all_accounts_day_7 = df_events_all_accounts_day_7.fillna(0)

In [22]:
# checking
df_events_all_accounts_day_7.tail()

Unnamed: 0,systemid,acceptestimate,accesstokencreated,activateclient,activateestimate,activateexpense,activateinvoice,activateitem,activateotherincome,activatepayment,...,uploadexpensereceipt,uploadhireslogo,verifycallback,verifymigration,viewedcreupgradepage,viewestimate,viewinvoice,welcomeaccount,zendesksupportemail,zeroamountinvoicefromrecurringprofile
386,4488836,0,11,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
387,4489036,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
388,4498040,0,6,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
389,4499052,0,5,0,0,0,0,0,0,0,...,0,0,0,0,0,0,21,1,0,0
390,4500658,0,7,0,0,0,0,0,0,0,...,0,0,0,0,0,2,0,1,0,0


In [23]:
df_events_all_accounts_day_7.shape

(391, 403)

# 5. Merging all data: Report system, average word count and event data

In [24]:
# Attach the day-7 event counts to the report-system rows (every account is kept)
df_rs_events_day_7 = df_rs_invoices_clients_activities_all_accounts.merge(
    df_events_all_accounts_day_7, how='left', on='systemid')

# Attach the average word counts; accounts without a word-count row get NaN
df_rs_events_avg_wc_day_7 = df_rs_events_day_7.merge(
    df_word_count_7days_all_accounts_final, how='left', on='systemid')

In [25]:
df_rs_events_avg_wc_day_7.head()

Unnamed: 0,systemid,signup_date,effective_date,days_on_platform,admin_email,is_sales_managed,is_freshbooks_account_active,is_paying,invoice_count_day_7,client_count_day_7,...,viewedcreupgradepage,viewestimate,viewinvoice,welcomeaccount,zendesksupportemail,zeroamountinvoicefromrecurringprofile,avg_wc_description_day_7,avg_wc_notes_day_7,avg_wc_terms_day_7,avg_wc_address_day_7
0,3836507,2018-11-05,2020-02-24,476,corey.calhoun@dkgconceptsllc.com,1,1,1,1,3,...,0,0,0,2,0,0,0.0,23.0,95.0,0.0
1,4094202,2019-02-21,2020-02-24,368,amccluskey@law-dmc.com,1,1,1,0,2,...,0,0,0,1,0,0,,,,
2,4129986,2019-03-06,2020-02-24,355,stephanie@sraadmin.com,1,1,1,6,5,...,0,0,4,1,0,0,0.0,0.0,4.0,0.0
3,4131556,2019-03-06,2020-02-24,355,kels@kelseyzander.com,1,1,1,1,2,...,0,0,2,1,0,0,5.0,0.0,0.0,0.0
4,4149724,2019-03-13,2020-02-24,348,gtddamico@gmail.com,1,1,1,5,5,...,0,0,9,1,0,0,3.8,2.6,4.2,0.0


In [26]:
# df_rs_events_avg_wc_day_7['signup_date']

# 6. Filtering out FreshBooks test accounts 

In [27]:
################# Filtering Out FreshBooks Test Accounts #############################################################

# Read the FreshBooks test-account e-mails (non-freshbooks email) from the TSV
# file and keep the 'email' column as a plain list
fb_test_emails = pd.read_csv(
    "/Users/dwahid/Documents/GitHub/fraud_detection/data/freshbooks_test_email/non-fb-testing-emails.tsv",
    sep="\t",
)
fb_test_email_list = fb_test_emails['email'].tolist()

In [28]:
# fb_test_email_list

In [29]:
# Function: Filtering FB test account by using admin email
from difflib import SequenceMatcher

def email_match(em, email_list):
    """Return the best similarity between `em` and any entry of `email_list`.

    Similarity is `difflib.SequenceMatcher(None, em, candidate).ratio()`,
    a value between 0 and 1.

    em: the e-mail address to check; a null value (per `pd.isnull`) scores 0.
    email_list: iterable of known e-mail addresses; null entries (e.g. blank
        rows read from a file) are skipped instead of crashing the matcher.

    Returns 0 when `em` is null or there is nothing to compare against,
    otherwise the highest ratio found.
    """
    # The null check does not depend on the candidate, so do it once up front.
    if pd.isnull(em):
        return 0

    match_score = 0
    for candidate in email_list:
        if pd.isnull(candidate):
            continue
        match_score = max(match_score, SequenceMatcher(None, em, candidate).ratio())

    return match_score
    


In [30]:
# Keep only the accounts whose admin e-mail is less than 95% similar to every
# known FreshBooks test e-mail.  The check only needs the 'admin_email'
# column, so it is applied to that column instead of to whole rows.
is_not_fb_test_account = df_rs_events_avg_wc_day_7['admin_email'].apply(
    lambda email: email_match(email, fb_test_email_list) < 0.95)

# .copy() because new columns are added to this frame later on; without it
# those assignments would target a slice of df_rs_events_avg_wc_day_7.
df_rs_events_avg_wc_day_7_noFBtest = df_rs_events_avg_wc_day_7[is_not_fb_test_account].copy()

In [31]:
df_rs_events_avg_wc_day_7_noFBtest['signup_date'].head()

0    2018-11-05
1    2019-02-21
2    2019-03-06
3    2019-03-06
4    2019-03-13
Name: signup_date, dtype: object

In [32]:
# list(df_rs_events_avg_wc_day_7_noFBtest)

# 7. Filtering only important features: Day 7

In [33]:
# Import the list of important features from file.
# NOTE(review): sep="\n," is an unusual separator for a .tsv file; it is
# presumably there so that each line is read as a single 'important_feature'
# value -- confirm it still parses as intended on the pandas version in use.
important_features = pd.read_csv( 
    "/Users/dwahid/Documents/GitHub/fraud_detection/src/important_features/important_features_day_07_new_accounts.tsv", sep="\n,")

# Get the important features as a list
imp_features_list = list(important_features['important_feature'])

In [34]:
len(imp_features_list)

93

In [35]:
# Make sure every important feature exists as a column: any feature that is
# absent from the data is reported and added as an all-zero column
for feature in imp_features_list:
    if feature not in df_rs_events_avg_wc_day_7_noFBtest.columns:
        print("False: ", feature)
        df_rs_events_avg_wc_day_7_noFBtest[feature] = 0


In [36]:
df_rs_events_avg_wc_day_7_noFBtest.shape

(391, 418)

In [37]:
# list(df_rs_events_avg_wc_day_7_noFBtest)

In [38]:
# df_rs_events_avg_wc_day_7_noFBtest['signup_date']

In [39]:
# Keep only the columns that appear in the important-features list
important_columns = df_rs_events_avg_wc_day_7_noFBtest.columns.intersection(imp_features_list)
df_imp_features_new_accounts_day_7 = df_rs_events_avg_wc_day_7_noFBtest[important_columns]

In [40]:
df_imp_features_new_accounts_day_7.head()

Unnamed: 0,systemid,signup_date,effective_date,days_on_platform,admin_email,is_sales_managed,is_freshbooks_account_active,is_paying,invoice_count_day_7,client_count_day_7,...,updateexpense,updateinvoicesample,updateitem,updateservice,avg_wc_description_day_7,avg_wc_notes_day_7,avg_wc_terms_day_7,avg_wc_address_day_7,disconnectbankaccount,expenseimportsucceeded
0,3836507,2018-11-05,2020-02-24,476,corey.calhoun@dkgconceptsllc.com,1,1,1,1,3,...,0,2,0,0,0.0,23.0,95.0,0.0,0,0
1,4094202,2019-02-21,2020-02-24,368,amccluskey@law-dmc.com,1,1,1,0,2,...,0,0,0,0,,,,,0,0
2,4129986,2019-03-06,2020-02-24,355,stephanie@sraadmin.com,1,1,1,6,5,...,0,1,1,0,0.0,0.0,4.0,0.0,0,0
3,4131556,2019-03-06,2020-02-24,355,kels@kelseyzander.com,1,1,1,1,2,...,0,0,0,0,5.0,0.0,0.0,0.0,0,0
4,4149724,2019-03-13,2020-02-24,348,gtddamico@gmail.com,1,1,1,5,5,...,0,0,1,0,3.8,2.6,4.2,0.0,0,0


In [41]:
# Put the columns in alphabetical order
alphabetical_columns = sorted(df_imp_features_new_accounts_day_7.columns)
df_imp_features_new_accounts_day_7 = df_imp_features_new_accounts_day_7.reindex(
    columns=alphabetical_columns)

In [42]:
# list(df_final_features_new_accounts_day_7)

In [43]:
# df_imp_features_new_accounts_day_7['signup_date']

In [44]:
# Discard every account (row) that has at least one missing value
df_imp_features_new_accounts_day_7 = df_imp_features_new_accounts_day_7.dropna(axis=0, how='any')

# 8. Filtering inactive users' accounts

In [45]:
# Columns that describe the account itself and are left out of the activity total
ex_cols_list = ['admin_email','days_on_platform', 'effective_date', 'signup_date', 'systemid']
cols_list = df_imp_features_new_accounts_day_7.columns.tolist()
# Every remaining column counts as an activity feature (order is not significant)
cols = list(set(cols_list).difference(ex_cols_list))

In [46]:
# cols

In [47]:
# Function for aggregating selected column values
def cell_value_sum(row, cols):
    """Add up the values that `row` holds under each label in `cols`.

    row: any object indexable by the labels in `cols` (e.g. a dataframe row).
    cols: iterable of labels to include in the total.

    Returns the total, or 0 when `cols` is empty.
    """
    return sum(row[label] for label in cols)


In [48]:
# Filter out the inactive accounts: keep a row only when the total over its
# activity columns is greater than zero
has_activity = df_imp_features_new_accounts_day_7.apply(
    lambda account: cell_value_sum(account, cols) > 0, axis=1)
df_final_features_new_accounts_day_7 = df_imp_features_new_accounts_day_7[has_activity]

In [49]:
df_final_features_new_accounts_day_7.head()

Unnamed: 0,activateexpense,activateotherincome,activatepayment,admin_email,admindeactivation,adminonlinepaymentattempt,adminpayinvoiceonlineinvoice,adminpayinvoiceonlinelistview,archiveclient,archiveexpense,...,updatecategory,updateclient,updatecompanyprofile,updatecontractor,updatecreditnote,updateestimate,updateexpense,updateinvoicesample,updateitem,updateservice
0,0,0,0,corey.calhoun@dkgconceptsllc.com,0,0,0,0,0,0,...,0,7,0,0,0,0,0,2,0,0
2,0,0,0,stephanie@sraadmin.com,0,0,0,0,0,0,...,0,4,0,0,0,0,0,1,1,0
3,0,0,0,kels@kelseyzander.com,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,0,0,0,gtddamico@gmail.com,0,0,0,0,0,0,...,0,10,0,0,0,0,0,0,1,0
7,0,0,0,squait@180drinks.ca,0,0,0,0,0,0,...,0,1,0,0,0,9,0,0,0,0


In [50]:
# df_final_features_new_accounts_day_7['signup_date']

# 9. Saving the filtered features data for new accounts

In [54]:
# Export the filtered features for the new accounts as a tab-separated file
today = date.today().isoformat()
path = "/Users/dwahid/Documents/GitHub/fraud_detection/data/analyzing_fraud_accounts/fbooks_non_fraud_accounts_features_day_07_20180801_20190730.tsv"
df_final_features_new_accounts_day_7.to_csv(path_or_buf=path, sep="\t", index=False)

In [52]:
list(df_final_features_new_accounts_day_7)

['activateexpense',
 'activateotherincome',
 'activatepayment',
 'admin_email',
 'admindeactivation',
 'adminonlinepaymentattempt',
 'adminpayinvoiceonlineinvoice',
 'adminpayinvoiceonlinelistview',
 'archiveclient',
 'archiveexpense',
 'archiveotherincome',
 'archiveproject',
 'archivetask',
 'autobillpayment',
 'avg_wc_address_day_7',
 'avg_wc_description_day_7',
 'avg_wc_notes_day_7',
 'avg_wc_terms_day_7',
 'bulkimportclientscomplete',
 'client_count_day_7',
 'clientimportcsvsucceeded',
 'clientlimitupgradenudge',
 'createbankaccount',
 'createbanktransaction',
 'createbanktransfer',
 'createcategory',
 'createcontact',
 'createcontractor',
 'createcreditnote',
 'createdexpense',
 'createestimate',
 'createexpense',
 'createitem',
 'createotherincome',
 'createreceipt',
 'createservice',
 'creditcardclientaccessgranted',
 'customemailsignature',
 'days_on_platform',
 'declinedonlinepaymentnotification',
 'deletebusinesspartner',
 'deletecollaborator',
 'deletecreditnote',
 'deletee

In [53]:
df_final_features_new_accounts_day_7.shape

(297, 93)