In [1]:
import os
import pandas as pd
import numpy as np
import math
import tensorflow as tf

In [2]:
# Clone the TabFormer repository, pull its Git LFS objects, and unpack the
# credit-card transactions archive from inside the TabFormer/ directory
# (the next cell reads the CSV from ./TabFormer/).
!git clone https://github.com/IBM/TabFormer.git && \
cd TabFormer && \
git lfs pull && \
tar xzf data/credit_card/transactions.tgz

Cloning into 'TabFormer'...
remote: Enumerating objects: 114, done.[K
remote: Counting objects: 100% (19/19), done.[K
remote: Compressing objects: 100% (10/10), done.[K
remote: Total 114 (delta 11), reused 9 (delta 9), pack-reused 95 (from 1)[K
Receiving objects: 100% (114/114), 452.37 KiB | 7.42 MiB/s, done.
Resolving deltas: 100% (47/47), done.


In [3]:
# Read the transactions CSV, keeping only the first 16.5M rows.
# nrows makes the parser stop early, so the rows beyond that limit are never
# loaded into memory just to be sliced away afterwards.
df = pd.read_csv('./TabFormer/card_transaction.v1.csv', nrows=16500000)

In [4]:
# Set sequence length for multivariate time series
seq_length = 7

# Force merchant names to str so the column holds a single type
df['Merchant Name'] = df['Merchant Name'].astype(str)
# Group every card's transactions together and renumber the rows 0..N-1, so
# that row labels and positions coincide (the index arithmetic below and in
# the batch generator relies on consecutive labels within a card).
df = df.sort_values(by=['User','Card']).reset_index(drop=True)
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() only echoed an extra "None".
df.info()

# Get first of each User-Card combination
first = df[['User','Card']].drop_duplicates()
f = np.array(first.index)

# Drop the first (seq_length - 1) transactions of every card: they do not have
# enough predecessors to end a full window of seq_length rows.
drop_list = (f[:, None] + np.arange(seq_length - 1)).ravel()
index_list = np.setdiff1d(df.index.values,drop_list)

# Split into 0.5 train, 0.3 validate, 0.2 test
tot_length = index_list.shape[0]
train_length = tot_length // 2
validate_length = (tot_length - train_length) * 3 // 5   # 3/5 of the remaining half
test_length = tot_length - train_length - validate_length
print (tot_length,train_length,validate_length, test_length)

# Generate list of indices for train, validate, test.
# The seed is fixed so the three index sets are reproducible; each later set is
# drawn from what the previous draw left over, so the sets are disjoint.
np.random.seed(1111)
train_indices = np.random.choice(index_list, train_length, replace=False)
tv_list = np.setdiff1d(index_list, train_indices)
validate_indices = np.random.choice(tv_list, validate_length, replace=False)
test_indices = np.setdiff1d(tv_list, validate_indices)
print(train_indices, validate_indices, test_indices)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16500000 entries, 0 to 16499999
Data columns (total 15 columns):
 #   Column          Dtype  
---  ------          -----  
 0   User            int64  
 1   Card            int64  
 2   Year            int64  
 3   Month           int64  
 4   Day             int64  
 5   Time            object 
 6   Amount          object 
 7   Use Chip        object 
 8   Merchant Name   object 
 9   Merchant City   object 
 10  Merchant State  object 
 11  Zip             float64
 12  MCC             int64  
 13  Errors?         object 
 14  Is Fraud?       object 
dtypes: float64(1), int64(6), object(8)
memory usage: 1.8+ GB
None
16475254 8237627 4942576 3295051
[10564891  4293011 12990080 ...  7946973  8363815  6839602] [ 5898230  4785713  4951019 ...  3062006  6616067 11640215] [       6        8       27 ... 16499980 16499986 16499993]


In [5]:
# ----- CUSTOM MAPPING FUNCTIONS -----
def timeEncoder(X):
    """Collapse the date and time columns into one integer timestamp column.

    X is indexed by the column names 'Year', 'Month', 'Day' and 'Time'.
    'Time' is split on ':' and its first two fields are used as hour and
    minute. The assembled datetimes are cast to int and returned as a
    single-column DataFrame.
    """
    clock = X['Time'].str.split(':', expand=True)
    parts = {
        'year': X['Year'],
        'month': X['Month'],
        'day': X['Day'],
        'hour': clock[0],
        'minute': clock[1],
    }
    stamps = pd.to_datetime(parts)
    return pd.DataFrame(stamps.astype(int))

def amtEncoder(X):
    """Turn amount strings into log-scaled floats.

    The first character of every entry is stripped before the remainder is
    parsed as a float. Values below 1 (including negatives) are raised to 1
    before the natural log is taken, so they all map to 0. Returns a
    single-column DataFrame.
    """
    without_prefix = X.map(lambda text: text[1:])
    values = without_prefix.astype(float)
    logged = values.map(lambda value: math.log(max(1, value)))
    return pd.DataFrame(logged)

def decimalEncoder(X,length=5):
    """Split each number in X into its base-10 digits, least significant first.

    Column i of the returned DataFrame holds digit i (X // 10**i mod 10), for
    i in 0..length-1. Digits beyond `length` are discarded.
    """
    digit_columns = {}
    quotient = X
    for place in range(length):
        digit_columns[place] = np.mod(quotient, 10)
        quotient = np.floor_divide(quotient, 10)
    return pd.DataFrame(digit_columns)

def fraudEncoder(X):
    """Map the fraud label to integers: 1 where the entry equals 'Yes', else 0."""
    is_fraud = np.asarray(X == 'Yes')
    return is_fraud.astype(int)

In [6]:
# Directory that receives every artifact this notebook writes (fitted mapper,
# checkpoint weights, saved model, sample CSV). Keep the trailing slash: later
# cells build file names by plain string concatenation onto this value.
artifact_dir = "/opt/artifacts/"
# Create the directory if needed; exist_ok makes re-running the cell harmless.
os.makedirs(artifact_dir, exist_ok=True)

In [7]:
import joblib

from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelBinarizer
from sklearn.impute import SimpleImputer

# Column-wise feature pipeline (sklearn-pandas DataFrameMapper):
#   * 'Is Fraud?'                   -> 0/1 label via fraudEncoder
#   * Merchant State/Name/City, MCC -> label-encode, split the code into 5
#     base-10 digits (decimalEncoder), one-hot encode every digit
#     (Merchant State is imputed first)
#   * Zip                           -> impute, split into 5 digits, one-hot
#   * Use Chip, Errors?             -> impute, then LabelBinarizer
#   * Year/Month/Day/Time           -> one integer timestamp, min-max scaled
#   * Amount                        -> log of max(1, amount), min-max scaled
# input_df/df_out keep DataFrames flowing in and out of the mapper.
# NOTE(review): the mapper is fitted on all of df, i.e. including the rows
# reserved for validation and test -- confirm that this is intended.
mapper = DataFrameMapper([('Is Fraud?', FunctionTransformer(fraudEncoder)),
                          (['Merchant State'], [SimpleImputer(strategy='constant'), FunctionTransformer(np.ravel),
                                               LabelEncoder(), FunctionTransformer(decimalEncoder), OneHotEncoder()]),
                          (['Zip'], [SimpleImputer(strategy='constant'), FunctionTransformer(np.ravel),
                                     FunctionTransformer(decimalEncoder), OneHotEncoder()]),
                          ('Merchant Name', [LabelEncoder(), FunctionTransformer(decimalEncoder), OneHotEncoder()]),
                          ('Merchant City', [LabelEncoder(), FunctionTransformer(decimalEncoder), OneHotEncoder()]),
                          ('MCC', [LabelEncoder(), FunctionTransformer(decimalEncoder), OneHotEncoder()]),
                          (['Use Chip'], [SimpleImputer(strategy='constant'), LabelBinarizer()]),
                          (['Errors?'], [SimpleImputer(strategy='constant'), LabelBinarizer()]),
                          (['Year','Month','Day','Time'], [FunctionTransformer(timeEncoder), MinMaxScaler()]),
                          ('Amount', [FunctionTransformer(amtEncoder), MinMaxScaler()])
                         ], input_df=True, df_out=True)
mapper.fit(df)
# Persist the fitted mapper. The with-block guarantees the file handle is
# flushed and closed once the dump finishes.
with open(os.path.join(artifact_dir, 'fitted_mapper.pkl'),'wb') as mapper_file:
    joblib.dump(mapper, mapper_file)



In [8]:
# Transform a small slice only to learn how many columns the mapper emits
# (the 'Is Fraud?' label column is part of that count).
mapped_sample = mapper.transform(df.iloc[:100])
mapped_size = mapped_sample.shape[1]
print(mapped_size)

220


In [9]:
def gen_training_batch(df, mapper, index_list, batch_size, seq_len=None):
    """Yield class-balanced, sequence-first training batches forever.

    Every pass keeps all fraud rows found in ``index_list``, pairs them with an
    equally sized random draw (without replacement) of non-fraud rows,
    shuffles the result, expands every row into the window of ``seq_len``
    consecutive rows that ends at it, and runs the windows through ``mapper``.
    Windows left over after the last full batch of a pass are dropped.

    Parameters
    ----------
    df : DataFrame
        Raw transactions. Windows are built by subtracting offsets from row
        labels, so the labels must be consecutive integers and every label in
        ``index_list`` needs ``seq_len - 1`` predecessors in ``df``.
    mapper : object
        Fitted transformer whose ``transform`` returns a DataFrame containing
        an 'Is Fraud?' column plus the feature columns.
    index_list : array-like
        Row labels that may serve as the last element of a window.
    batch_size : int
        Number of windows per yielded batch.
    seq_len : int, optional
        Window length. Defaults to the module-level ``seq_length``.

    Yields
    ------
    (data, targets) : tuple of ndarray
        Shapes ``(seq_len, batch_size, n_features)`` and
        ``(seq_len, batch_size, 1)``.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1, or if the balanced sample
        (twice the number of fraud rows) cannot fill a single batch. Without
        this check the generator would loop forever without yielding.
    """
    if seq_len is None:
        seq_len = seq_length  # module-level default window length
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1, got %r" % (batch_size,))

    train_df = df.loc[index_list]
    non_fraud_indices = train_df[train_df['Is Fraud?'] == 'No'].index.values
    fraud_indices = train_df[train_df['Is Fraud?'] == 'Yes'].index.values
    fsize = fraud_indices.shape[0]
    if 2 * fsize < batch_size:
        raise ValueError(
            "balanced sample has %d rows, fewer than batch_size=%d"
            % (2 * fsize, batch_size)
        )

    # Fixed seed so the sequence of sampled batches is reproducible.
    np.random.seed(98765)
    # Offsets -(seq_len-1)..0: added to a row label they give the labels of
    # the window that ends at that row.
    offsets = np.arange(seq_len) + 1 - seq_len
    while True:
        # Undersample the majority class to the size of the fraud class.
        sampled_non_fraud = np.random.choice(non_fraud_indices, fsize, replace=False)
        indices = np.concatenate((fraud_indices, sampled_non_fraud))
        np.random.shuffle(indices)
        rows = indices.shape[0]
        index_array = indices[:, None] + offsets
        full_df = mapper.transform(df.loc[index_array.flatten()])
        target_buffer = full_df['Is Fraud?'].to_numpy().reshape(rows, seq_len, 1)
        data_buffer = full_df.drop(['Is Fraud?'],axis=1).to_numpy().reshape(rows, seq_len, -1)

        batch_ptr = 0
        while (batch_ptr + batch_size) <= rows:
            data = data_buffer[batch_ptr:batch_ptr+batch_size]
            targets = target_buffer[batch_ptr:batch_ptr+batch_size]
            batch_ptr += batch_size
            # Swap the first two axes: (batch, seq, feat) -> (seq, batch, feat)
            data_t = np.transpose(data, axes=(1,0,2))
            targets_t = np.transpose(targets, axes=(1,0,2))
            yield data_t,targets_t

In [10]:
class TP(tf.keras.metrics.TruePositives):
    """True-positive counter restricted to index -1 of axis 0.

    gen_training_batch transposes its batches to sequence-first order, so
    that slice is the last transaction of every window.
    """
    def update_state(self, y_true, y_pred, sample_weight=None):
        last_true = y_true[-1,:,:]
        last_pred = y_pred[-1,:,:]
        super().update_state(last_true, last_pred, sample_weight)

class FP(tf.keras.metrics.FalsePositives):
    """False-positive counter restricted to index -1 of axis 0.

    gen_training_batch transposes its batches to sequence-first order, so
    that slice is the last transaction of every window.
    """
    def update_state(self, y_true, y_pred, sample_weight=None):
        last_true = y_true[-1,:,:]
        last_pred = y_pred[-1,:,:]
        super().update_state(last_true, last_pred, sample_weight)

class FN(tf.keras.metrics.FalseNegatives):
    """False-negative counter restricted to index -1 of axis 0.

    gen_training_batch transposes its batches to sequence-first order, so
    that slice is the last transaction of every window.
    """
    def update_state(self, y_true, y_pred, sample_weight=None):
        last_true = y_true[-1,:,:]
        last_pred = y_pred[-1,:,:]
        super().update_state(last_true, last_pred, sample_weight)

class TN(tf.keras.metrics.TrueNegatives):
    """True-negative counter restricted to index -1 of axis 0.

    gen_training_batch transposes its batches to sequence-first order, so
    that slice is the last transaction of every window.
    """
    def update_state(self, y_true, y_pred, sample_weight=None):
        last_true = y_true[-1,:,:]
        last_pred = y_pred[-1,:,:]
        super().update_state(last_true, last_pred, sample_weight)

In [11]:
units = [200,200]                 # hidden units of the two stacked LSTM layers
input_size = mapped_size - 1      # mapper output width minus the 'Is Fraud?' label column
output_size = 1                   # one sigmoid output per step

tf_input = ([seq_length, input_size])

# The input is declared with an explicit Input object instead of passing
# input_shape= to the first layer, which Keras warns against for Sequential
# models.
# NOTE(review): gen_training_batch yields arrays laid out as
# (seq_length, batch, features), while Keras recurrent layers treat axis 0 as
# the batch axis and axis 1 as time by default. As written the LSTMs would
# iterate over the batch entries rather than the time steps -- confirm the
# intended layout before relying on this model.
lstm_model = tf.keras.models.Sequential([
    tf.keras.Input(shape=tuple(tf_input)),
    tf.keras.layers.LSTM(units[0], return_sequences=True),
    tf.keras.layers.LSTM(units[1], return_sequences=True),
    tf.keras.layers.Dense(output_size, activation='sigmoid')
])

lstm_model.summary()

# Upper-case metrics (TP/FP/FN/TN) only count index -1 of axis 0 (see the
# custom classes); the lower-case ones count every element of the output.
metrics=['accuracy', 
    TP(name='TP'),
    FP(name='FP'),
    FN(name='FN'),
    TN(name='TN'),
    tf.keras.metrics.TruePositives(name='tp'),
    tf.keras.metrics.FalsePositives(name='fp'),
    tf.keras.metrics.FalseNegatives(name='fn'),
    tf.keras.metrics.TrueNegatives(name='tn')
   ]

lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=metrics)


  super().__init__(**kwargs)


In [12]:
# Training configuration
batch_size = 16            # windows per generated batch
steps_per_epoch = 10000    # batches pulled from the generator per epoch
# Checkpoint file: artifact_dir ends with '/', so this is '.weights.h5' inside it
filepath = f"{artifact_dir}.weights.h5"

In [13]:
import warnings
# Silence all Python warnings for the rest of the session
warnings.filterwarnings("ignore")

print ("Learning...")
# Endless generator of balanced batches drawn from the training indices
train_generate = gen_training_batch(df, mapper, train_indices, batch_size)
# Write the weights to `filepath` after every epoch
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=filepath,
    save_weights_only=True,
    verbose=1,
)
lstm_model.fit(
    train_generate,
    steps_per_epoch=steps_per_epoch,
    epochs=5,
    callbacks=[cp_callback],
    verbose=1,
)

Learning...
Epoch 1/5
[1m10000/10000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step - FN: 5662.1484 - FP: 1091.2725 - TN: 38903.0625 - TP: 34351.5156 - accuracy: 0.9437 - fn: 18776.6387 - fp: 7782.0859 - loss: 0.1569 - tn: 421590.7188 - tp: 111906.5625
Epoch 1: saving model to /opt/artifacts/.weights.h5
[1m10000/10000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m286s[0m 28ms/step - FN: 9148.0000 - FP: 2081.0000 - TN: 77923.0000 - TP: 70848.0000 - accuracy: 0.9599 - fn: 30356.0000 - fp: 14571.0000 - loss: 0.1073 - tn: 844198.0000 - tp: 230875.0000
Epoch 2/5
[1m10000/10000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step - FN: 2069.2429 - FP: 864.9874 - TN: 39127.1719 - TP: 37946.5977 - accuracy: 0.9758 - fn: 7306.2383 - fp: 5973.7446 - loss: 0.0574 - tn: 423383.0000 - tp: 123393.0078
Epoch 2: saving model to /opt/artifacts/.weights.h5
[1m10000/10000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m280s[0m 28ms/step - FN: 3976.0000 - FP: 1683.0000 - 

<keras.src.callbacks.history.History at 0x7ff760dfa5a0>

In [14]:
# Save the trained model as model.h5 in the artifact directory
lstm_model.save(os.path.join(artifact_dir, 'model.h5'))



In [15]:
def create_sample_data_to_insert(df, indices):
    """Write the rows covered by the windows ending at `indices` to a CSV file.

    For every entry of `indices` the seq_length consecutive row labels ending
    at it are collected; the distinct labels are looked up in `df` and written
    to 'data_to_insert.csv' in the artifact directory, with the row label as
    first field and no header row. `indices` is also printed.
    """
    print(indices)
    # Offsets -(seq_length-1)..0 turn each index into its full window of labels
    offsets = np.arange(1 - seq_length, 1)
    windows = (indices[:, None] + offsets).astype(np.int32)
    wanted_rows = np.unique(windows)
    out_path = artifact_dir + 'data_to_insert.csv'
    df.loc[wanted_rows].to_csv(out_path, index_label='Index', header=False)

# Export the transaction windows for the first 100000 validation indices
create_sample_data_to_insert(df, validate_indices[:100000])

[ 5898230  4785713  4951019 ... 13709314 11370319 13741356]
