# Module - 1 - Recap - Building DataSet
---

In [23]:

import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
%matplotlib inline
sns.set_style('darkgrid', {'axes.facecolor': '1'})
# Run Module-1 Key functions
%run Module-1-functions.py

In [24]:
(customer_profiles_df, station_profiles_df, transactions_df)=Simulate_dataset(n_customers = 10000, n_stations = 10000, 
                     nb_days=150, 
                     start_date="2022-01-01", 
                     r=7)

In [25]:
transactions_df = Simulate_frauds(customer_profiles_df, station_profiles_df, transactions_df)

Number of frauds from scenario 1: 3149
Number of frauds from scenario 2: 9728
Number of frauds from scenario 3: 2469


In [26]:
transactions_df

Unnamed: 0,TRANSACTION_ID,Trans_DATETIME,CUSTOMER_ID,STORE_ID,Trans_AMOUNT,Trans_TIME_SECONDS,Trans_TIME_DAYS,Trans_FRAUD,Trans_FRAUD_SCENARIO
0,0,2022-01-01 00:00:17,5820,2647,17.99,17,0,0,0
1,1,2022-01-01 00:00:17,6160,2980,44.48,17,0,0,0
2,2,2022-01-01 00:00:30,356,752,86.87,30,0,0,0
3,3,2022-01-01 00:00:31,1829,2266,16.65,31,0,0,0
4,4,2022-01-01 00:00:31,596,2344,98.33,31,0,0,0
...,...,...,...,...,...,...,...,...,...
3631920,3631920,2022-05-30 23:59:08,3779,6482,63.78,12959948,149,0,0
3631921,3631921,2022-05-30 23:59:11,2051,7172,17.76,12959951,149,0,0
3631922,3631922,2022-05-30 23:59:27,6519,3400,54.42,12959967,149,0,0
3631923,3631923,2022-05-30 23:59:39,304,338,54.93,12959979,149,0,0


# Module - 2 - Feature Engineering
---

In [27]:
%run Module-2-helper.py

In [28]:
transactions_df.head()

Unnamed: 0,TRANSACTION_ID,Trans_DATETIME,CUSTOMER_ID,STORE_ID,Trans_AMOUNT,Trans_TIME_SECONDS,Trans_TIME_DAYS,Trans_FRAUD,Trans_FRAUD_SCENARIO
0,0,2022-01-01 00:00:17,5820,2647,17.99,17,0,0,0
1,1,2022-01-01 00:00:17,6160,2980,44.48,17,0,0,0
2,2,2022-01-01 00:00:30,356,752,86.87,30,0,0,0
3,3,2022-01-01 00:00:31,1829,2266,16.65,31,0,0,0
4,4,2022-01-01 00:00:31,596,2344,98.33,31,0,0,0


In [29]:
transactions_df['Trans_DURING_WEEKEND']=transactions_df.Trans_DATETIME.apply(is_weekend)
transactions_df['Trans_DURING_NIGHT']=transactions_df.Trans_DATETIME.apply(is_night)
transactions_df[transactions_df.Trans_TIME_DAYS>=40]

Unnamed: 0,TRANSACTION_ID,Trans_DATETIME,CUSTOMER_ID,STORE_ID,Trans_AMOUNT,Trans_TIME_SECONDS,Trans_TIME_DAYS,Trans_FRAUD,Trans_FRAUD_SCENARIO,Trans_DURING_WEEKEND,Trans_DURING_NIGHT
968873,968873,2022-02-10 00:00:23,7888,1642,23.07,3456023,40,0,0,0,1
968874,968874,2022-02-10 00:00:30,3066,6134,116.18,3456030,40,0,0,0,1
968875,968875,2022-02-10 00:01:07,7948,2572,68.97,3456067,40,0,0,0,1
968876,968876,2022-02-10 00:01:10,9791,9134,45.8,3456070,40,0,0,0,1
968877,968877,2022-02-10 00:02:25,1702,4933,80.24,3456145,40,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
3631920,3631920,2022-05-30 23:59:08,3779,6482,63.78,12959948,149,0,0,0,0
3631921,3631921,2022-05-30 23:59:11,2051,7172,17.76,12959951,149,0,0,0,0
3631922,3631922,2022-05-30 23:59:27,6519,3400,54.42,12959967,149,0,0,0,0
3631923,3631923,2022-05-30 23:59:39,304,338,54.93,12959979,149,0,0,0,0


In [30]:
transactions_df=transactions_df.groupby('CUSTOMER_ID').apply(lambda x: get_customer_spending_behaviour_features(x, windows_size_in_days=[1,7,30]))
transactions_df=transactions_df.sort_values('Trans_DATETIME').reset_index(drop=True)
transactions_df.head()

Unnamed: 0,TRANSACTION_ID,Trans_DATETIME,CUSTOMER_ID,STORE_ID,Trans_AMOUNT,Trans_TIME_SECONDS,Trans_TIME_DAYS,Trans_FRAUD,Trans_FRAUD_SCENARIO,Trans_DURING_WEEKEND,Trans_DURING_NIGHT,CUSTOMER_ID_NB_Trans_1DAY_WINDOW,CUSTOMER_ID_AVG_AMOUNT_1DAY_WINDOW,CUSTOMER_ID_NB_Trans_7DAY_WINDOW,CUSTOMER_ID_AVG_AMOUNT_7DAY_WINDOW,CUSTOMER_ID_NB_Trans_30DAY_WINDOW,CUSTOMER_ID_AVG_AMOUNT_30DAY_WINDOW
0,0,2022-01-01 00:00:17,5820,2647,17.99,17,0,0,0,1,1,1.0,17.99,1.0,17.99,1.0,17.99
1,1,2022-01-01 00:00:17,6160,2980,44.48,17,0,0,0,1,1,1.0,44.48,1.0,44.48,1.0,44.48
2,2,2022-01-01 00:00:30,356,752,86.87,30,0,0,0,1,1,1.0,86.87,1.0,86.87,1.0,86.87
3,3,2022-01-01 00:00:31,1829,2266,16.65,31,0,0,0,1,1,1.0,16.65,1.0,16.65,1.0,16.65
4,5,2022-01-01 00:00:31,6820,8046,128.04,31,0,0,0,1,1,1.0,128.04,1.0,128.04,1.0,128.04


In [31]:
transactions_df=transactions_df.groupby('STORE_ID').apply(lambda x: get_count_risk_rolling_window(x, delay_period=7, windows_size_in_days=[1,7,30], feature="STORE_ID"))
transactions_df=transactions_df.sort_values('Trans_DATETIME').reset_index(drop=True)

# Output Data

In [32]:
DIR_OUTPUT = "./simulated-data-transformed/"
if not os.path.exists(DIR_OUTPUT):
    os.makedirs(DIR_OUTPUT)
start_date = datetime.datetime.strptime("2022-01-01", "%Y-%m-%d")
for day in range(transactions_df.Trans_TIME_DAYS.max()+1):
    transactions_day = transactions_df[transactions_df.Trans_TIME_DAYS==day].sort_values('Trans_TIME_SECONDS')
    date = start_date + datetime.timedelta(days=day)
    filename_output = date.strftime("%Y-%m-%d")+'.pkl'
    # Protocol=4 required for Google Colab
    transactions_day.to_pickle(DIR_OUTPUT+filename_output, protocol=4)