# Module - 2 - Building DataSet + Feature Engineering
---

In [1]:

import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
%matplotlib inline
sns.set_style('darkgrid', {'axes.facecolor': '1'})
# Run Module-1 Key functions
%run Module-1-functions.py

In [2]:
(customer_profiles_df, 
 station_profiles_df, 
 transactions_df)=Simulate_dataset(
     n_customers = 10000, 
     n_stations = 10000, 
    nb_days=150, 
    start_date="2022-01-01", 
    r=7)

In [3]:
transactions_df = Simulate_frauds(customer_profiles_df, 
                                  station_profiles_df, 
                                  transactions_df)

In [4]:
transactions_df

Unnamed: 0,TRANSACTION_ID,Trans_DATETIME,CUSTOMER_ID,STORE_ID,Trans_AMOUNT,Trans_TIME_SECONDS,Trans_TIME_DAYS,Trans_FRAUD
0,0,2022-01-01 00:00:17,5820,2647,17.99,17,0,0
1,1,2022-01-01 00:00:17,6160,2980,44.48,17,0,0
2,2,2022-01-01 00:00:30,356,752,86.87,30,0,0
3,3,2022-01-01 00:00:31,1829,2266,16.65,31,0,0
4,4,2022-01-01 00:00:31,596,2344,98.33,31,0,0
...,...,...,...,...,...,...,...,...
3631920,3631920,2022-05-30 23:59:08,3779,6482,63.78,12959948,149,0
3631921,3631921,2022-05-30 23:59:11,2051,7172,17.76,12959951,149,0
3631922,3631922,2022-05-30 23:59:27,6519,3400,54.42,12959967,149,0
3631923,3631923,2022-05-30 23:59:39,304,338,54.93,12959979,149,0


# Module - 2 - Feature Engineering
---

In [5]:
%run Module-2-helper.py
# Below is the Module-2-helper function to build the new features
# --------------------------------------------------------------------------------- Feature Engineering
# ----------------------------------------------------
# Binary Output : Whether a day is during weekend or during weekday
def is_weekend(tx_datetime):
    weekday = tx_datetime.weekday()
    is_weekend = weekday>=5
    return int(is_weekend)

# ----------------------------------------------------
# Binary Output: Whether the transaction happens during night
def is_night(tx_datetime):
    tx_hour = tx_datetime.hour
    is_night = tx_hour<=8
    return int(is_night)

# ----------------------------------------------------
# define a function computing the average transaction amount in each window size (Customer Views)
def compute_avg_amt(C_T, window):
    for window_size in window:
        # Compute the SUM
        _SUM = C_T['Trans_AMOUNT'].rolling(str(window_size)+'d').sum()
        _WIND = C_T['Trans_AMOUNT'].rolling(str(window_size)+'d').count()
        # Compute the AVG
        _AVG = _SUM/_WIND
        # Saving
        C_T['WIND_Trans_'+str(window_size)+'DAY']=list(_WIND)
        C_T['AVG_AMOUNT_'+str(window_size)+'DAY']=list(_AVG)

def get_customer_spending_behaviour_features(C_T, window=[1,7,30]):
    # Order transactions chronologically
    C_T=C_T.sort_values('Trans_DATETIME')
    C_T.index=C_T.Trans_DATETIME
    compute_avg_amt(C_T, window)
    # Reindex according to transaction IDs
    C_T.index=C_T.TRANSACTION_ID
    # And return the dataframe with the new features
    return C_T

# ----------------------------------------------------
# define a function computing the average transaction amount in each window size (STORE Views)
def update_features(store_T, delay_period, window, feature, NB_FRAUD_DELAY, NB_Trans_DELAY):
    for window_size in window:
        NB_FRAUD=store_T['Trans_FRAUD'].rolling(str(delay_period+window_size)+'d').sum()
        NB_DELAY=store_T['Trans_FRAUD'].rolling(str(delay_period+window_size)+'d').count()
        NB_FRAUD_WINDOW=NB_FRAUD-NB_FRAUD_DELAY
        NB_Trans_WINDOW=NB_DELAY-NB_Trans_DELAY
        RISK_WINDOW=NB_FRAUD_WINDOW/NB_Trans_WINDOW
        store_T[feature+'_NB_Trans_'+str(window_size)+'DAY_WINDOW']=list(NB_Trans_WINDOW)
        store_T[feature+'_RISK_'+str(window_size)+'DAY_WINDOW']=list(RISK_WINDOW)
        
def get_count_risk_rolling_window(store_T, delay_period=7, window=[1,7,30], feature="STORE_ID"):
    store_T=store_T.sort_values('Trans_DATETIME')
    store_T.index=store_T.Trans_DATETIME
    NB_FRAUD_DELAY=store_T['Trans_FRAUD'].rolling(str(delay_period)+'d').sum()
    NB_Trans_DELAY=store_T['Trans_FRAUD'].rolling(str(delay_period)+'d').count()
    update_features(store_T, delay_period, window, feature, NB_FRAUD_DELAY, NB_Trans_DELAY)
    store_T.index=store_T.TRANSACTION_ID
    # Replace NA values with 0 (all undefined risk scores where NB_Trans_WINDOW is 0) 
    store_T.fillna(0,inplace=True)
    return store_T


In [6]:
transactions_df.head()

Unnamed: 0,TRANSACTION_ID,Trans_DATETIME,CUSTOMER_ID,STORE_ID,Trans_AMOUNT,Trans_TIME_SECONDS,Trans_TIME_DAYS,Trans_FRAUD
0,0,2022-01-01 00:00:17,5820,2647,17.99,17,0,0
1,1,2022-01-01 00:00:17,6160,2980,44.48,17,0,0
2,2,2022-01-01 00:00:30,356,752,86.87,30,0,0
3,3,2022-01-01 00:00:31,1829,2266,16.65,31,0,0
4,4,2022-01-01 00:00:31,596,2344,98.33,31,0,0


In [7]:
transactions_df['Trans_DURING_WEEKEND']=transactions_df.Trans_DATETIME.apply(is_weekend)
transactions_df['Trans_DURING_NIGHT']=transactions_df.Trans_DATETIME.apply(is_night)
transactions_df[transactions_df.Trans_TIME_DAYS>=40]

Unnamed: 0,TRANSACTION_ID,Trans_DATETIME,CUSTOMER_ID,STORE_ID,Trans_AMOUNT,Trans_TIME_SECONDS,Trans_TIME_DAYS,Trans_FRAUD,Trans_DURING_WEEKEND,Trans_DURING_NIGHT
968873,968873,2022-02-10 00:00:23,7888,1642,23.07,3456023,40,0,0,1
968874,968874,2022-02-10 00:00:30,3066,6134,116.18,3456030,40,0,0,1
968875,968875,2022-02-10 00:01:07,7948,2572,68.97,3456067,40,0,0,1
968876,968876,2022-02-10 00:01:10,9791,9134,45.8,3456070,40,0,0,1
968877,968877,2022-02-10 00:02:25,1702,4933,80.24,3456145,40,0,0,1
...,...,...,...,...,...,...,...,...,...,...
3631920,3631920,2022-05-30 23:59:08,3779,6482,63.78,12959948,149,0,0,0
3631921,3631921,2022-05-30 23:59:11,2051,7172,17.76,12959951,149,0,0,0
3631922,3631922,2022-05-30 23:59:27,6519,3400,54.42,12959967,149,0,0,0
3631923,3631923,2022-05-30 23:59:39,304,338,54.93,12959979,149,0,0,0


In [8]:
transactions_df=transactions_df.groupby('CUSTOMER_ID').apply(lambda x: get_customer_spending_behaviour_features(x, windows_size_in_days=[1,7,30]))
transactions_df=transactions_df.sort_values('Trans_DATETIME').reset_index(drop=True)
transactions_df.head()

Unnamed: 0,TRANSACTION_ID,Trans_DATETIME,CUSTOMER_ID,STORE_ID,Trans_AMOUNT,Trans_TIME_SECONDS,Trans_TIME_DAYS,Trans_FRAUD,Trans_DURING_WEEKEND,Trans_DURING_NIGHT,CUSTOMER_ID_NB_Trans_1DAY_WINDOW,CUSTOMER_ID_AVG_AMOUNT_1DAY_WINDOW,CUSTOMER_ID_NB_Trans_7DAY_WINDOW,CUSTOMER_ID_AVG_AMOUNT_7DAY_WINDOW,CUSTOMER_ID_NB_Trans_30DAY_WINDOW,CUSTOMER_ID_AVG_AMOUNT_30DAY_WINDOW
0,0,2022-01-01 00:00:17,5820,2647,17.99,17,0,0,1,1,1.0,17.99,1.0,17.99,1.0,17.99
1,1,2022-01-01 00:00:17,6160,2980,44.48,17,0,0,1,1,1.0,44.48,1.0,44.48,1.0,44.48
2,2,2022-01-01 00:00:30,356,752,86.87,30,0,0,1,1,1.0,86.87,1.0,86.87,1.0,86.87
3,3,2022-01-01 00:00:31,1829,2266,16.65,31,0,0,1,1,1.0,16.65,1.0,16.65,1.0,16.65
4,5,2022-01-01 00:00:31,6820,8046,128.04,31,0,0,1,1,1.0,128.04,1.0,128.04,1.0,128.04


In [9]:
transactions_df=transactions_df.groupby('STORE_ID').apply(lambda x: get_count_risk_rolling_window(x, delay_period=7, windows_size_in_days=[1,7,30], feature="STORE_ID"))
transactions_df=transactions_df.sort_values('Trans_DATETIME').reset_index(drop=True)

# Output Data

In [10]:
# Outputing the data for future anaylysis, for the model building, we are not using the entire data
# we only use part of the data for training, and a valid gap period, and then a test set
OUTPUT = "./simulated-data-transformed/"
if not os.path.exists(OUTPUT):
    os.makedirs(OUTPUT)
start_date = datetime.datetime.strptime("2022-01-01", "%Y-%m-%d")
for day in range(transactions_df.Trans_TIME_DAYS.max()+1):
    transactions_day = transactions_df[
        transactions_df.Trans_TIME_DAYS==day].sort_values('Trans_TIME_SECONDS')
    date = start_date + datetime.timedelta(days=day)
    filename_output = date.strftime("%Y-%m-%d")+'.pkl'
    transactions_day.to_pickle(OUTPUT+filename_output)

In [11]:
transactions_df

Unnamed: 0,TRANSACTION_ID,Trans_DATETIME,CUSTOMER_ID,STORE_ID,Trans_AMOUNT,Trans_TIME_SECONDS,Trans_TIME_DAYS,Trans_FRAUD,Trans_DURING_WEEKEND,Trans_DURING_NIGHT,...,CUSTOMER_ID_NB_Trans_7DAY_WINDOW,CUSTOMER_ID_AVG_AMOUNT_7DAY_WINDOW,CUSTOMER_ID_NB_Trans_30DAY_WINDOW,CUSTOMER_ID_AVG_AMOUNT_30DAY_WINDOW,STORE_ID_NB_Trans_1DAY_WINDOW,STORE_ID_RISK_1DAY_WINDOW,STORE_ID_NB_Trans_7DAY_WINDOW,STORE_ID_RISK_7DAY_WINDOW,STORE_ID_NB_Trans_30DAY_WINDOW,STORE_ID_RISK_30DAY_WINDOW
0,0,2022-01-01 00:00:17,5820,2647,17.99,17,0,0,1,1,...,1.0,17.990000,1.0,17.990000,0.0,0.0,0.0,0.0,0.0,0.000000
1,1,2022-01-01 00:00:17,6160,2980,44.48,17,0,0,1,1,...,1.0,44.480000,1.0,44.480000,0.0,0.0,0.0,0.0,0.0,0.000000
2,2,2022-01-01 00:00:30,356,752,86.87,30,0,0,1,1,...,1.0,86.870000,1.0,86.870000,0.0,0.0,0.0,0.0,0.0,0.000000
3,5,2022-01-01 00:00:31,6820,8046,128.04,31,0,0,1,1,...,1.0,128.040000,1.0,128.040000,0.0,0.0,0.0,0.0,0.0,0.000000
4,4,2022-01-01 00:00:31,596,2344,98.33,31,0,0,1,1,...,1.0,98.330000,1.0,98.330000,0.0,0.0,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3631920,3631920,2022-05-30 23:59:08,3779,6482,63.78,12959948,149,0,0,0,...,39.0,50.307949,133.0,56.797594,1.0,0.0,11.0,0.0,59.0,0.016949
3631921,3631921,2022-05-30 23:59:11,2051,7172,17.76,12959951,149,0,0,0,...,18.0,12.837222,83.0,14.233614,0.0,0.0,16.0,0.0,72.0,0.000000
3631922,3631922,2022-05-30 23:59:27,6519,3400,54.42,12959967,149,0,0,0,...,25.0,72.389200,115.0,70.241217,2.0,0.0,15.0,0.0,73.0,0.000000
3631923,3631923,2022-05-30 23:59:39,304,338,54.93,12959979,149,0,0,0,...,17.0,75.247059,94.0,78.893191,1.0,0.0,12.0,0.0,77.0,0.000000
