# Querying and Feature Extraction: Accounts in the Last 7 Days

We use this script to query and extract features for accounts that signed up in the last 7 days.

In [1]:
import pandas as pd
import numpy as np
from datetime import date

from scipy import stats
# Notebook convenience only: switch IPython's tab-completer to "greedy" mode for this session.
get_ipython().magic(u'config IPCompleter.greedy=True')


## Connect with the Redshift Database

In [2]:
from contextlib import closing


import psycopg2
import simplejson
import sys
# Python 2 only: force the interpreter-wide default string encoding to UTF-8.
# `reload` is not a builtin on Python 3 and `sys.setdefaultencoding` is not
# available there, so the hack is skipped instead of raising NameError.
if sys.version_info[0] < 3:
    reload(sys)
    sys.setdefaultencoding('utf8')

# Connection settings used when the caller does not supply its own.
DEFAULT_DB = 'data_depot'
DEFAULT_HOST = 'freshbooks-data.c8exzn6geij3.us-east-1.redshift.amazonaws.com'
DEFAULT_PORT = 5439


class PsycopgConnector:
    '''
    A database connector that uses Psycopg to connect to Redshift.

    How to play:

        psy_conn = PsycopgConnector(username, password)
        df = psy_conn.run_query(sql=sql, return_data=True)

    NOTE: This class commits queries to redshift if return_data=False.
    This means INSERT, DROP, TRUNCATE, etc. all work against the DB.
    '''

    def __init__(
        self,
        username=None,
        password=None,
        db=DEFAULT_DB,
        host=DEFAULT_HOST,
        port=DEFAULT_PORT,
    ):
        '''
        Store the connection settings; no connection is opened here.

        username, password: credentials handed to psycopg2.connect.
        db, host, port: target database; each falls back to the matching
            DEFAULT_* constant when omitted.
        '''
        # Use the values the caller passed in. Previously the DEFAULT_*
        # constants were stored unconditionally, so db/host/port arguments
        # were silently ignored.
        self.db = db
        self.host = host
        self.port = port

        self.username = username
        self.password = password

    def _get_connection(self):
        '''Open a new psycopg2 connection, remember it on self.conn and return it.'''
        self.conn = psycopg2.connect(
            dbname=self.db,
            user=self.username,
            password=self.password,
            host=self.host,
            port=self.port
        )

        return self.conn

    def run_query(self, sql, return_data=False):
        '''
        Run `sql` on a fresh connection that is closed afterwards.

        With return_data=True the result set is returned as a pandas
        DataFrame (via pd.read_sql); otherwise the statement is executed
        through a cursor and None is returned.
        '''
        # closing() guarantees the connection is closed on every exit path;
        # the inner `with conn` wraps the work in the connection's own
        # context manager.
        with closing(self._get_connection()) as conn:
            with conn, conn.cursor() as cur:
                if return_data:
                    return pd.read_sql(sql=sql, con=conn)
                cur.execute(sql)
                    

# Load the Redshift login details from the local JSON credentials file
with open("redshift_creds.json.nogit") as fh:
    creds = simplejson.load(fh)

# A key missing from the file yields None instead of raising
username, password = creds.get("user_name"), creds.get("password")

# Shared connector used by every query in this notebook
pig = PsycopgConnector(username, password)

In [3]:
# Testing connection: fetch five rows from report_systems and display them
sql_test = '''SELECT * FROM report_systems LIMIT 5'''
df_test = pig.run_query(sql_test, return_data=True)
df_test

Unnamed: 0,systemid,business_id,admin_identity_id,subdomain,is_freshbooks_account_active,is_modern,most_recent_migrated_to_smux_at,is_contractor,currency_code,timezone,...,staff_count,staff_deleted_count,contractor_count,contractor_deleted_count,user_contact_count,enabled_gateway_count,google_sso_first_linked_date,google_sso_most_recent_linked_date,google_sso_first_removal_date,google_sso_most_recent_removal_date
0,2024,,,https://NETERGY.freshbooks.com,1,0,,0,AUD,Australia/Melbourne,...,0,0,0,0,0,0,,,,
1,2646,,,https://ifixit.freshbooks.com,1,0,,0,USD,US/Central,...,0,1,0,0,2,2,,,,
2,3360,,,https://syndicatelabs.billingarm.com,0,0,,0,USD,US/Pacific,...,0,2,0,0,104,2,,,,
3,3387,,,https://IowaLink2.freshbooks.com,0,0,,0,USD,US/Central,...,0,0,0,0,0,1,,,,
4,4498,,,https://ProgrammedConcepts.freshbooks.com,0,0,,0,USD,US/Central,...,0,0,0,0,0,1,,,,


## Functions

In [4]:
# Word count function
import re
def words_count(strg):
    """Return the number of words in *strg*.

    A word is a maximal run of word characters and/or apostrophes, so
    "don't" counts as one word. An empty string or a null value (as
    judged by ``pd.isnull``) yields 0.
    """
    # Nothing to count for empty or missing text
    if strg == '' or pd.isnull(strg):
        return 0

    return len(re.findall(r"[\w']+", strg))
    

# 1. Import Invoice Data & Extract Avg Word Counts Features

## 1.01 Invoice within 7 days

In [5]:
# SQL for importing all invoices created within 7 days after signup_date,
# for accounts that signed up between 13 and 7 days ago.
# NOTE(review): the day-range filter sits in WHERE, after the LEFT JOIN, so an
# account whose invoices ALL fall outside days 0-7 yields no row at all here
# (it is not kept with NULL invoice fields) - confirm this is intended.
sql_invoices_7days_all_accounts = '''WITH invoices_in_a_period AS (
    SELECT
            systemid,
            signup_date
    FROM report_systems rs
    WHERE signup_date between (current_date - interval '13 days') and (current_date - interval '7 days')
), invoice_created_at AS (
    SELECT
           pic.systemid,
           pic.signup_date,
           inv.invoiceid,
           inv.create_date,
           inv.description,
           inv.notes,
           inv.terms,
           inv.address,
           DATEDIFF(days, pic.signup_date, inv.created_at) AS days_to_invoice_creation
    FROM invoices_in_a_period AS pic
    LEFT JOIN coalesced_live_shards.invoice_stable as inv USING (systemid)
    WHERE ((days_to_invoice_creation BETWEEN 0 AND 7) OR days_to_invoice_creation IS NULL)
)

SELECT *
FROM invoice_created_at;'''

# Import as dataframe from redshift
df_invoices_7days_all_accounts = pig.run_query(sql_invoices_7days_all_accounts, return_data=True)


# Per-invoice word counts of the description, notes, terms and address text.
# Each count depends on a single column, so apply words_count column-wise
# instead of building a full row object per invoice with apply(axis=1).
df_invoices_7days_all_accounts['avg_wc_description_day_7'] = df_invoices_7days_all_accounts['description'].apply(words_count)
df_invoices_7days_all_accounts['avg_wc_notes_day_7'] = df_invoices_7days_all_accounts['notes'].apply(words_count)
df_invoices_7days_all_accounts['avg_wc_terms_day_7'] = df_invoices_7days_all_accounts['terms'].apply(words_count)
df_invoices_7days_all_accounts['avg_wc_address_day_7'] = df_invoices_7days_all_accounts['address'].apply(words_count)


# Drop the raw text columns from the dataframe.
# DataFrame.filter silently skips labels that are absent ('created_at' is not
# selected by the query above).
df_invoices_7days_all_accounts_fil = df_invoices_7days_all_accounts.filter([
    'systemid',
    'invoiceid',
    'signup_date',
    'create_date',
    'created_at',
    'days_to_invoice_creation',
    'avg_wc_description_day_7',
    'avg_wc_notes_day_7',
    'avg_wc_terms_day_7',
    'avg_wc_address_day_7',
])

# Averaging (not summing) the per-invoice values over all invoices of a 'systemid'.
# numeric_only=True leaves the date columns out of the mean explicitly; without
# it pandas >= 2.0 raises on the non-numeric columns instead of dropping them.
df_word_count_7days_all_accounts_total = (
    df_invoices_7days_all_accounts_fil.groupby('systemid').mean(numeric_only=True)
)

# Final word count table: one row per 'systemid' (the index) holding the four
# average word counts. 'systemid' and 'signup_date' are no longer columns at
# this point, so filter() simply ignores them.
df_word_count_7days_all_accounts_final = df_word_count_7days_all_accounts_total.filter([
    'systemid',
    'signup_date',
    'avg_wc_description_day_7',
    'avg_wc_notes_day_7',
    'avg_wc_terms_day_7',
    'avg_wc_address_day_7',
])

In [6]:
df_word_count_7days_all_accounts_final.head()

Unnamed: 0_level_0,avg_wc_description_day_7,avg_wc_notes_day_7,avg_wc_terms_day_7,avg_wc_address_day_7
systemid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
4963285,0.0,51.0,0.0,0.0
4963287,0.0,0.0,0.0,0.0
4963289,0.0,0.0,0.0,0.0
4963291,0.0,0.0,0.0,0.0
4963293,0.0,0.0,0.0,0.0


In [7]:
df_word_count_7days_all_accounts_final.shape

(9756, 4)

# 2. Report Systems Features


In [8]:
################# Import RSystems, Periodic Invoices & Client Counts Data ###############

# SQL query: one row per account that signed up between 13 and 7 days ago, is not
# sales managed and is active, together with the number of invoices and clients
# created within days 0-7 after signup.
# The overall invoice_count / client_count are computed in the grouping CTEs but
# are not part of the final SELECT.
sql_rs_invoices_clients_activities_all_accounts = '''WITH periodic_report_system_activities AS (
    SELECT
        systemid,
        signup_date,
        admin_email,
        is_sales_managed,
        is_freshbooks_account_active,
        freshbooks_account_status,
        is_paying
    FROM report_systems rs
    WHERE signup_date between (current_date - interval '13 days') and (current_date - interval '7 days') 
        AND is_sales_managed ='0' AND is_freshbooks_account_active = '1' 
), invoice_create_date AS (
    SELECT
           pic.systemid,
           inv.invoiceid,
           inv.create_date,
           inv.created_at,
           DATEDIFF(days, pic.signup_date, inv.created_at) AS days_to_invoice_creation
    FROM periodic_report_system_activities AS pic
    LEFT JOIN coalesced_live_shards.invoice_stable as inv USING (systemid)
), invoice_grouping AS (
    SELECT
           systemid,
           COUNT(invoiceid) as invoice_count,
           SUM(CASE WHEN days_to_invoice_creation BETWEEN 0 AND 7 THEN 1 ELSE 0 END) AS invoice_count_day_7
    FROM invoice_create_date
    GROUP BY systemid
), client_crate_date AS (
     SELECT
            pic.systemid,
            usr.userid,
            usr.signup_date,
            DATEDIFF(days, pic.signup_date, usr.signup_date) AS days_to_client_creation
    FROM periodic_report_system_activities  AS pic
    LEFT JOIN coalesced_live_shards."user" as usr USING (systemid)
), client_grouping AS (
    SELECT
           systemid,
           count(userid) AS client_count,
           SUM(CASE WHEN days_to_client_creation BETWEEN 0 AND 7 THEN 1 ELSE 0 END) AS client_count_day_7

    FROM  client_crate_date
    GROUP BY systemid
)

SELECT
       systemid,
       signup_date,
       current_date as effective_date,
       DATEDIFF(days, signup_date, current_date) as days_on_platform,
       admin_email,
       is_sales_managed,
       is_freshbooks_account_active,
       is_paying,
       inv_gr.invoice_count_day_7,
       cl_gr.client_count_day_7
FROM periodic_report_system_activities 
LEFT JOIN invoice_grouping as inv_gr USING (systemid)
LEFT JOIN client_grouping AS cl_gr USING (systemid);
'''

# Import as dataframe from redshift
df_rs_invoices_clients_activities_all_accounts = pig.run_query(sql_rs_invoices_clients_activities_all_accounts, return_data=True)


In [9]:
# checking 
df_rs_invoices_clients_activities_all_accounts.tail()

Unnamed: 0,systemid,signup_date,effective_date,days_on_platform,admin_email,is_sales_managed,is_freshbooks_account_active,is_paying,invoice_count_day_7,client_count_day_7
9637,4981839,2020-02-06,2020-02-13,7,jcarlton79@gmail.com,0,1,0,0,1
9638,4982035,2020-02-06,2020-02-13,7,karlapalaciospacheco@gmail.com,0,1,0,0,1
9639,4982077,2020-02-06,2020-02-13,7,fmadysonb@gmail.com,0,1,0,0,1
9640,4982337,2020-02-06,2020-02-13,7,keckelberg@yahoo.com,0,1,0,0,1
9641,4982787,2020-02-06,2020-02-13,7,believesinaangel@comcast.net,0,1,0,0,1


In [10]:
df_rs_invoices_clients_activities_all_accounts.shape

(9642, 10)

# 4. Import and Extract Features from Events Data
## 4.1 Event data collection 

In [11]:
############################### Event Features Extraction ################################

# SQL for events: for every non-sales-managed account that signed up between 13 and
# 7 days ago, the total count of each (lower-cased) event recorded within days 0-7
# after signup. Events outside that window contribute NULL to the sum.
# NOTE(review): unlike the report-systems query, this cohort is not restricted to
# is_freshbooks_account_active = '1' - confirm the difference is intended.
sql_events = '''WITH selected_accounts_events AS (
    SELECT systemid,
           signup_date,
           signup_datetime
    FROM report_systems
    WHERE signup_date between (current_date - interval '13 days') and (current_date - interval '7 days') and is_sales_managed ='0'
), events_activities AS (
    SELECT sae.systemid,
           signup_date,
           dd.date,
           datediff(days, signup_date, dd.date) as days_to_event,
           lower(e.event) as event,
           ec.count
    FROM selected_accounts_events AS sae
    LEFT JOIN event_counts AS ec USING (systemid)
    LEFT JOIN d_date AS dd USING (date_key)
    LEFT JOIN d_event e on ec.event_key = e.event_key
), event_groupings AS (
    SELECT distinct  ea.systemid,
                    ea.signup_date,
                    ea.date,
                    ea.event,
                    ea.count,
                    (CASE WHEN days_to_event BETWEEN 0 AND 7 THEN ea.count END) AS day_7_event
    FROM events_activities AS ea
)
SELECT systemid,
       event,
       sum(day_7_event) AS event_count_day_7
From event_groupings
GROUP BY systemid, signup_date, event
ORDER BY systemid, event_count_day_7 DESC;'''

# Import as dataframe from redshift
# df_events_all_accounts = pd.read_sql_query(sql_events, connect_to_db)
df_events_all_accounts = pig.run_query(sql_events, return_data=True)


In [12]:
# checking
df_events_all_accounts.head()

Unnamed: 0,systemid,event,event_count_day_7
0,4963285,access token created,
1,4963285,update identity,12.0
2,4963285,subscription details changed,10.0
3,4963285,create item,9.0
4,4963285,survey question answered,7.0


In [13]:
df_events_all_accounts.shape

(184589, 3)

## 4.2 Removing whitespace from the event strings

In [14]:
# Remove every row in which any cell renders as the string 'None' (rows without an event).
# axis=1 is passed by keyword because the positional form `.any(1)` is rejected
# by pandas >= 2.0. `.copy()` detaches the result from the original frame so
# the edits below do not operate on a slice (SettingWithCopyWarning).
rows_with_none = df_events_all_accounts.astype(str).eq('None').any(axis=1)
df_events_all_accounts = df_events_all_accounts[~rows_with_none].copy()

# Replace the 'NaN' cells by zero
df_events_all_accounts = df_events_all_accounts.fillna(0)

# Remove spaces, hyphens and slashes from the event name so it can serve as a
# compact column label. Only the 'event' column is needed, so map over that
# column instead of applying a function to every full row.
df_events_all_accounts['event_name'] = df_events_all_accounts['event'].map(
    lambda name: name.replace(' ', '').replace('-', '').replace('/', ''))

In [15]:
# checking
df_events_all_accounts.head()

Unnamed: 0,systemid,event,event_count_day_7,event_name
0,4963285,access token created,0.0,accesstokencreated
1,4963285,update identity,12.0,updateidentity
2,4963285,subscription details changed,10.0,subscriptiondetailschanged
3,4963285,create item,9.0,createitem
4,4963285,survey question answered,7.0,surveyquestionanswered


In [16]:
# list(df_events_all_accounts['event_name'])

In [17]:
# Keep only the columns needed for the day-7 pivot: account id, event count and cleaned event name
df_events_all_accounts_day_7 = df_events_all_accounts[['systemid', 'event_count_day_7', 'event_name']]

In [18]:
df_events_all_accounts_day_7.tail()

Unnamed: 0,systemid,event_count_day_7,event_name
184584,4983015,1.0,createbusiness
184585,4983015,1.0,identitysignedupwithgoogle
184586,4983015,1.0,createservice
184587,4983015,1.0,createsystemgateway
184588,4983015,1.0,createtask


In [19]:
df_events_all_accounts_day_7.shape

(184581, 3)

## 4.3 Pivot the events (each unique event becomes a column)

In [20]:
### Pivot the Day 7 Events (Each Unique Event Becomes a Column) ###

# Pivot table based on the unique column value in 'event_name': one row per
# 'systemid', one column per event, cells holding the summed day-7 count.
# aggfunc is given as the string 'sum' (same aggregation as np.sum; recent
# pandas warns when the numpy callable is passed). fill_value=0 already puts a
# zero in every account/event combination that has no rows, so no separate
# fillna step is needed afterwards.
df_events_all_accounts_day_7 = df_events_all_accounts_day_7.pivot_table(
    values='event_count_day_7',
    columns='event_name',
    index='systemid',
    aggfunc='sum',
    fill_value=0,
)

# Drop the old column name
df_events_all_accounts_day_7.columns.name = None

# Reset the index so 'systemid' becomes an ordinary column again
df_events_all_accounts_day_7 = df_events_all_accounts_day_7.reset_index()

In [21]:
# checking
df_events_all_accounts_day_7.tail()

Unnamed: 0,systemid,acceptestimate,accesstokencreated,activateclient,activateestimate,activateexpense,activateinvoice,activateitem,activatesystem,activatetask,...,updatetax,updatetimeentry,updateuser,uploadexpensereceipt,viewedcreupgradepage,viewestimate,viewinvoice,welcomeaccount,zendesksupportemail,zeroamountinvoicefromrecurringprofile
9768,4983007,0,5,0,0,0,0,0,0,0,...,0,0,3,0,0,0,0,1,0,0
9769,4983009,0,0,0,0,0,0,0,0,0,...,0,0,2,0,0,0,0,1,0,0
9770,4983011,0,0,0,0,0,0,0,0,0,...,0,0,2,0,0,0,0,1,0,0
9771,4983013,0,0,0,0,0,0,0,0,0,...,0,0,3,0,0,0,0,1,0,0
9772,4983015,0,0,0,0,0,0,0,0,0,...,0,0,2,0,0,0,0,1,0,0


In [22]:
df_events_all_accounts_day_7.shape

(9773, 276)

# 5. Merging all data: Report system, average word count and event data

In [23]:
# Attach the pivoted day-7 event counts to the report-system rows.
# A left join keeps every report-system account, matched or not.
df_rs_events_day_7 = df_rs_invoices_clients_activities_all_accounts.merge(
    df_events_all_accounts_day_7, how='left', on='systemid')

# Attach the average word counts to 'df_rs_events_day_7' the same way
df_rs_events_avg_wc_day_7 = df_rs_events_day_7.merge(
    df_word_count_7days_all_accounts_final, how='left', on='systemid')

In [24]:
df_rs_events_avg_wc_day_7.head()

Unnamed: 0,systemid,signup_date,effective_date,days_on_platform,admin_email,is_sales_managed,is_freshbooks_account_active,is_paying,invoice_count_day_7,client_count_day_7,...,viewedcreupgradepage,viewestimate,viewinvoice,welcomeaccount,zendesksupportemail,zeroamountinvoicefromrecurringprofile,avg_wc_description_day_7,avg_wc_notes_day_7,avg_wc_terms_day_7,avg_wc_address_day_7
0,4963293,2020-01-31,2020-02-13,13,ytadno1@gmail.com,0,1,0,0,1,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4963307,2020-01-31,2020-02-13,13,shawnsdependlawncare14@gmail.com,0,1,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4963647,2020-01-31,2020-02-13,13,closekwan000@gmail.com,0,1,0,0,1,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4963659,2020-01-31,2020-02-13,13,abbasaid@live.com,0,1,0,7,7,...,0.0,0.0,1.0,1.0,0.0,0.0,2.714286,0.0,0.0,0.0
4,4963749,2020-01-31,2020-02-13,13,monicitcarlo100@gmail.com,0,1,0,0,1,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
# df_rs_events_avg_wc_day_7['signup_date']

# 6. Filtering out FreshBooks test accounts 

In [26]:
################# Filtering Out FreshBooks Test Accounts #############################################################

# Read the tab-separated file of FreshBooks test-account e-mails (non-freshbooks addresses)
fb_test_emails = pd.read_csv(
    "/Users/dwahid/Documents/GitHub/fraud_detection/data/freshbooks_test_email/non-fb-testing-emails.tsv",
    sep="\t",
)

# Plain Python list of the 'email' column, used for the similarity matching
fb_test_email_list = fb_test_emails['email'].tolist()

In [27]:
# fb_test_email_list

In [28]:
# Function: Filtering FB test account by using admin email
from difflib import SequenceMatcher

def email_match(em, email_list):
    """Return the best similarity score between *em* and the entries of *email_list*.

    The score of one pair is ``SequenceMatcher(None, em, candidate).ratio()``;
    the maximum over all candidates is returned. The result is 0 when *em* is
    null (``pd.isnull``), when *email_list* is empty, or when no candidate
    shares anything with *em*.

    Null entries in *email_list* are skipped.
    """
    # A missing e-mail matches nothing; decide once instead of per iteration.
    if pd.isnull(em):
        return 0

    match_score = 0
    for candidate in email_list:
        # A null entry (e.g. NaN) has no length, so SequenceMatcher would
        # raise TypeError on it - ignore such entries.
        if pd.isnull(candidate):
            continue
        match_score = max(match_score, SequenceMatcher(None, em, candidate).ratio())

    return match_score
    


In [29]:
# Filtering final data from the FreshBooks test emails: keep an account only when
# its admin e-mail is less than 95% similar to every known test e-mail.
# The score depends on 'admin_email' alone, so apply over that column rather
# than over full rows. astype(bool) keeps the mask boolean even for an empty frame.
is_not_fb_test = df_rs_events_avg_wc_day_7['admin_email'].apply(
    lambda em: email_match(em, fb_test_email_list) < 0.95).astype(bool)

# .copy() makes the filtered frame independent of the source, because columns
# are added to it further down in this notebook.
df_rs_events_avg_wc_day_7_noFBtest = df_rs_events_avg_wc_day_7[is_not_fb_test].copy()

In [30]:
# df_rs_events_avg_wc_day_7_noFBtest['signup_date']

In [31]:
# list(df_rs_events_avg_wc_day_7_noFBtest)

# 7. Filtering only important features: Day 7

In [32]:
# Import the list of important features
# NOTE(review): sep="\n," is an unusual multi-character separator for a .tsv file;
# presumably the file is a single-column list read one feature name per line - confirm.
important_features = pd.read_csv( 
    "/Users/dwahid/Documents/GitHub/fraud_detection/src/important_features/important_features_day_07_new_accounts.tsv", sep="\n,")

# Get the important features (column 'important_feature') as a list
imp_features_list = list(important_features['important_feature'])

In [33]:
len(imp_features_list)

93

In [34]:
# Give the data an all-zero column for every important feature it lacks (if any),
# reporting each added feature name
for feature_name in imp_features_list:
    if feature_name not in df_rs_events_avg_wc_day_7_noFBtest.columns:
        print("False: ", feature_name)
        df_rs_events_avg_wc_day_7_noFBtest[feature_name] = 0


In [35]:
df_rs_events_avg_wc_day_7_noFBtest.shape

(9642, 312)

In [36]:
# list(df_rs_events_avg_wc_day_7_noFBtest)

In [37]:
# df_rs_events_avg_wc_day_7_noFBtest['signup_date']

In [38]:
# Keep only those columns whose name appears in the important-features list
df_imp_features_new_accounts_day_7 = df_rs_events_avg_wc_day_7_noFBtest[
    df_rs_events_avg_wc_day_7_noFBtest.columns.intersection(imp_features_list)
]

In [39]:
df_imp_features_new_accounts_day_7.head()

Unnamed: 0,systemid,signup_date,effective_date,days_on_platform,admin_email,is_sales_managed,is_freshbooks_account_active,is_paying,invoice_count_day_7,client_count_day_7,...,deletestaff,disconnectbankaccount,disconnectpaymentgateway,emailcreditnote,enableautobilling,expenseimportsucceeded,fbpayupdatedacceptedcreditcards,hitpaywallclientlimit,stripepaymentsuccessful,updatecreditnote
0,4963293,2020-01-31,2020-02-13,13,ytadno1@gmail.com,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,4963307,2020-01-31,2020-02-13,13,shawnsdependlawncare14@gmail.com,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,4963647,2020-01-31,2020-02-13,13,closekwan000@gmail.com,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,4963659,2020-01-31,2020-02-13,13,abbasaid@live.com,0,1,0,7,7,...,0,0,0,0,0,0,0,0,0,0
4,4963749,2020-01-31,2020-02-13,13,monicitcarlo100@gmail.com,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [40]:
# Put the columns in sorted (alphabetical) order by name so the layout is stable
df_imp_features_new_accounts_day_7 = df_imp_features_new_accounts_day_7.reindex(
    sorted(df_imp_features_new_accounts_day_7.columns), axis=1)

In [41]:
# list(df_final_features_new_accounts_day_7)

In [42]:
# df_imp_features_new_accounts_day_7['signup_date']

In [43]:
# Drop every row that has a NaN in any column.
# NOTE(review): the event and word-count columns were attached with left merges, so an
# account without a match in those tables carries NaN there and is removed here -
# confirm that dropping such accounts is intended.
df_imp_features_new_accounts_day_7 = df_imp_features_new_accounts_day_7.dropna()

# 8. Filtering inactive users' accounts

In [44]:
# Identifier / date columns that must not count towards an account's activity
ex_cols_list = ['admin_email','days_on_platform', 'effective_date', 'signup_date', 'systemid']
cols_list = list(df_imp_features_new_accounts_day_7)

# Every remaining column, kept in the dataframe's own column order. A plain
# set difference returns the same names but in an arbitrary order that can
# change between runs.
_excluded_cols = set(ex_cols_list)
cols = [col for col in cols_list if col not in _excluded_cols]

In [45]:
# cols

In [46]:
# Function for aggregating selected column values
def cell_value_sum(row, cols):
    """Return the sum of ``row[c]`` for every label ``c`` in *cols*.

    *row* is anything indexable by the labels (e.g. a dataframe row);
    an empty *cols* gives 0.
    """
    # Use the builtin sum directly; the previous local accumulator was itself
    # named `sum` and shadowed the builtin inside this function.
    return sum(row[col] for col in cols)


In [47]:
# Filtering out all inactive accounts: keep a row only when its activity
# columns ('cols') add up to more than zero
has_activity = df_imp_features_new_accounts_day_7.apply(
    lambda account_row: cell_value_sum(account_row, cols) > 0, axis=1)
df_final_features_new_accounts_day_7 = df_imp_features_new_accounts_day_7[has_activity]

In [48]:
df_final_features_new_accounts_day_7.head()

Unnamed: 0,activateexpense,activateotherincome,activatepayment,admin_email,admindeactivation,adminonlinepaymentattempt,adminpayinvoiceonlineinvoice,adminpayinvoiceonlinelistview,archiveclient,archiveexpense,...,updatecategory,updateclient,updatecompanyprofile,updatecontractor,updatecreditnote,updateestimate,updateexpense,updateinvoicesample,updateitem,updateservice
0,0.0,0,0,ytadno1@gmail.com,0,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0
1,0.0,0,0,shawnsdependlawncare14@gmail.com,0,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0
2,0.0,0,0,closekwan000@gmail.com,0,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0
3,0.0,0,0,abbasaid@live.com,0,0,0,0,0.0,0.0,...,0.0,6.0,0.0,0.0,0,0.0,0.0,0.0,2.0,1.0
4,0.0,0,0,monicitcarlo100@gmail.com,0,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0


In [49]:
# df_final_features_new_accounts_day_7['signup_date']

# 9. Saving the filtered features data for new accounts

In [50]:
# Export the filtered features data for new accounts as a tab-separated file
# (no index column) whose name ends with today's date
today = str(date.today())
path = "/Users/dwahid/Documents/GitHub/fraud_detection/data/new_users_features/new_users_features_day_07_" + today + ".tsv"
df_final_features_new_accounts_day_7.to_csv(path, sep="\t", index=False)