In [17]:
# import required packages
import torch
from tqdm.notebook import tqdm

from transformers import BertTokenizer
from torch.utils.data import TensorDataset

from transformers import BertForSequenceClassification
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import f1_score
import random
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import re


In [18]:
# upload the dataset and load the data.
# this dataset is the original dataset 
# and does not contain the dates and times.
file = 'Transacation_outflows_3k.pqt'
data = pd.read_parquet(file, engine='auto')

In [19]:
# Check if a GPU is available, and use it if possible, otherwise use the CPU
# Define the device
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [20]:
# load the first tenth datas in the dataset.
data[:10]

Unnamed: 0,prism_consumer_id,prism_account_id,memo_clean,amount,category_description
0,0,acc_0,Kroger,20.98,GROCERIES
1,0,acc_0,CASH APP*FREE XXXXXXXXXX CA XX/XX,200.0,GENERAL_MERCHANDISE
2,0,acc_0,LOAN,293.49,LOAN
3,0,acc_1,SELF_TRANSFER,280.0,SELF_TRANSFER
4,0,acc_1,EXTERNAL_TRANSFER,47.83,EXTERNAL_TRANSFER
5,0,acc_0,SELF_TRANSFER,25.0,SELF_TRANSFER
6,0,acc_0,GIFTS_DONATIONS,10.0,GIFTS_DONATIONS
7,0,acc_0,Amazon.com*HXXXWXXQX Amzn.com/bill WA XX/XX,33.2,GENERAL_MERCHANDISE
8,0,acc_0,CREDIT_CARD_PAYMENT,25.0,CREDIT_CARD_PAYMENT
9,0,acc_0,Amazon,42.79,GENERAL_MERCHANDISE


In [21]:
# Filter the required categories and define a new dataset
# which only contains these categories.
categories_filter = ['GENERAL_MERCHANDISE', 'FOOD_AND_BEVERAGES', 'GROCERIES', 'TRAVEL', 'PETS', 'EDUCATION', 'OVERDRAFT', 'RENT', 'MORTGAGE']
data1 = data[data['category_description'].isin(categories_filter)]

In [22]:
# Only inlcude a subset of the dataset 
# to prevent running out of memory problem.
data2 = data1[:50000]
len(data2)

50000

In [28]:
# Data Cleanning Process Part


## Changing memo_clean column values to all lower case first.
data2['memo_clean'] = data2['memo_clean'].str.lower()


## Use regular expressions to remove text after ".com*" 
## and keep the preceding text from ".com"
def clean_text1(text):
    # Use regular expressions to remove text after ".com*" and keep the preceding text from ".com"
    cleaned_text = re.sub(r'\.com\*.*?(?=\s|$)', '', text)
    return cleaned_text


## Removing useless pattenrs
def remove_key_phrases(text):
    phrases = [
        'pos debit - visa check card xxxx - ',
        'purchase authorized on xx/xx',
        'pos purchase',
        'purchase',
        'pos',
        'web id',
        'terminal id',
        'id'
    ]
    for phrase in phrases:
        text = re.sub(phrase, '', text)
    return text


## Removing special characters.
def remove_special_char(text):
    return re.sub(r'[^a-zA-Z0-9\s]', ' ', text)


## Removing all the repeat 'x' patterns
def remove_xs(text):
    text = re.sub(r'(xx+)\b', ' ', text)
    text = re.sub(r'\b(x)\b', ' ', text)
    text = re.sub(r'\b(xx+)([a-zA-Z])', r'xx\2', text)
    return text


## Simplify repeating pattenrs for amazon and walmart
def standardize_phrase(text):
    text = re.sub(r'\b(amazon|amzn|amz)\b', 'amazon', text)
    text = re.sub(r'\b(wal\smart|wal|wm\ssupercenter|wm\ssuperc|wm)\b', 'walmart', text)
    return text


## Removing multiple spaces
def remove_multiple_spaces(text):
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data2['memo_clean'] = data2['memo_clean'].str.lower()


In [33]:
# Applying thoese cleaning functions to the subset of the dataset
# that we choose.

data2['memo_clean'] = data2['memo_clean'].apply(clean_text1)
data2['memo_clean'] = data2['memo_clean'].apply(remove_key_phrases)
data2['memo_clean'] = data2['memo_clean'].apply(remove_special_char)
data2['memo_clean'] = data2['memo_clean'].apply(remove_xs)
data2['memo_clean'] = data2['memo_clean'].apply(standardize_phrase)
data2['memo_clean'] = data2['memo_clean'].apply(remove_multiple_spaces)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data2['memo_clean'] = data2['memo_clean'].apply(clean_text1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data2['memo_clean'] = data2['memo_clean'].apply(remove_key_phrases)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data2['memo_clean'] = data2['memo_clean'].apply(remove_special_char)
A value

In [36]:
# Check dataset after cleanning.
data2[:10]

Unnamed: 0,prism_consumer_id,prism_account_id,memo_clean,amount,category_description
0,0,acc_0,kroger,20.98,GROCERIES
1,0,acc_0,cash app free ca,200.0,GENERAL_MERCHANDISE
7,0,acc_0,amazon amazon com bill wa,33.2,GENERAL_MERCHANDISE
9,0,acc_0,amazon,42.79,GENERAL_MERCHANDISE
10,0,acc_0,kroger,36.55,GROCERIES
11,0,acc_0,paragon food beverage marksville la,12.65,FOOD_AND_BEVERAGES
12,0,acc_0,instacart ca,22.83,FOOD_AND_BEVERAGES
14,0,acc_0,ihop,10.0,FOOD_AND_BEVERAGES
15,0,acc_0,amazon,12.69,GENERAL_MERCHANDISE
17,0,acc_0,habaneros fresh tex mex copperhill tn,10.34,FOOD_AND_BEVERAGES


In [38]:
# Check numbers of each categories.
data2['category_description'].value_counts()

category_description
FOOD_AND_BEVERAGES     19091
GENERAL_MERCHANDISE    18683
GROCERIES               8730
TRAVEL                  2251
PETS                     530
EDUCATION                254
RENT                     226
OVERDRAFT                148
MORTGAGE                  87
Name: count, dtype: int64

In [40]:
# Assign labels to each categories.

labels = data2.category_description.unique()

label_dict = {}
for index, label in enumerate(labels):
    label_dict[label] = index
label_dict

{'GROCERIES': 0,
 'GENERAL_MERCHANDISE': 1,
 'FOOD_AND_BEVERAGES': 2,
 'TRAVEL': 3,
 'PETS': 4,
 'OVERDRAFT': 5,
 'RENT': 6,
 'EDUCATION': 7,
 'MORTGAGE': 8}

In [41]:
# Creating a label column for the dataset.

data2['label'] = data2.category_description.replace(label_dict)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data2['label'] = data2.category_description.replace(label_dict)


In [42]:
# Check the current dataset
data2[:10]

Unnamed: 0,prism_consumer_id,prism_account_id,memo_clean,amount,category_description,label
0,0,acc_0,kroger,20.98,GROCERIES,0
1,0,acc_0,cash app free ca,200.0,GENERAL_MERCHANDISE,1
7,0,acc_0,amazon amazon com bill wa,33.2,GENERAL_MERCHANDISE,1
9,0,acc_0,amazon,42.79,GENERAL_MERCHANDISE,1
10,0,acc_0,kroger,36.55,GROCERIES,0
11,0,acc_0,paragon food beverage marksville la,12.65,FOOD_AND_BEVERAGES,2
12,0,acc_0,instacart ca,22.83,FOOD_AND_BEVERAGES,2
14,0,acc_0,ihop,10.0,FOOD_AND_BEVERAGES,2
15,0,acc_0,amazon,12.69,GENERAL_MERCHANDISE,1
17,0,acc_0,habaneros fresh tex mex copperhill tn,10.34,FOOD_AND_BEVERAGES,2


In [45]:
# split dataset into train, validation and test sets using stratify.
train_text, temp_text, train_labels, temp_labels = train_test_split(data2['memo_clean'], data2['label'], 
                                                                    random_state=2018, 
                                                                    test_size=0.3, 
                                                                    stratify=data2['label'])


val_text, test_text, val_labels, test_labels = train_test_split(temp_text, temp_labels, 
                                                                random_state=2018, 
                                                                test_size=0.5, 
                                                                stratify=temp_labels)


In [46]:
# Load the tokenizer from bert packages

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', 
                                          do_lower_case=True)

In [48]:
# Tokenize the text in all train, val and test datasets.
# Set the max_length to 256 for safe.

encoded_train = tokenizer.batch_encode_plus(
    train_text.tolist(),
    add_special_tokens=True, 
    return_attention_mask=True, 
    pad_to_max_length=True, 
    max_length=256, 
    return_tensors='pt'
)


encoded_val = tokenizer.batch_encode_plus(
    val_text.tolist(), 
    add_special_tokens=True, 
    return_attention_mask=True, 
    pad_to_max_length=True, 
    max_length=256, 
    return_tensors='pt'
)

encoded_test = tokenizer.batch_encode_plus(
    test_text.tolist(), 
    add_special_tokens=True, 
    return_attention_mask=True, 
    pad_to_max_length=True, 
    max_length=256, 
    return_tensors='pt'
)

In [49]:
# Convert the tokenized list to tensors

input_ids_train = encoded_train['input_ids']
attention_masks_train = encoded_train['attention_mask']
labels_train = torch.tensor(train_labels.tolist())

input_ids_val = encoded_val['input_ids']
attention_masks_val = encoded_val['attention_mask']
labels_val = torch.tensor(val_labels.tolist())

input_ids_test = encoded_test['input_ids']
attention_masks_test = encoded_test['attention_mask']
labels_test = torch.tensor(test_labels.tolist())


dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)
dataset_test = TensorDataset(input_ids_test, attention_masks_test, labels_test)

In [50]:
# Load the model and push to the device which we defined at the beginning.

model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)
model = model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [51]:
# Setting the batch size to three
# Using RandomSampler to randomly sample the training set.
# Using SequentialSampler for validation set to sequentially test the data.
# Using DataLoaer to improve efficient iteration and batching the data
# during training and validation.

batch_size = 3

dataloader_train = DataLoader(dataset_train, 
                              sampler=RandomSampler(dataset_train), 
                              batch_size=batch_size)

dataloader_validation = DataLoader(dataset_val, 
                                   sampler=SequentialSampler(dataset_val), 
                                   batch_size=batch_size)

dataloader_test = DataLoader(dataset_test, 
                                   sampler=SequentialSampler(dataset_test), 
                                   batch_size=batch_size)


In [52]:
# Define an optimizer
# Setting the epochs to be five
optimizer = AdamW(model.parameters(),
                  lr=1e-5, 
                  eps=1e-8)
                  
epochs = 5

scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=0,
                                            num_training_steps=len(dataloader_train)*epochs)




In [64]:
# Define the perfomance metrics through F1_Score and Accuracy Score

label_dict_inverse = {v: k for k, v in label_dict.items()}

## Calculate the F1 score for a multi-class classification task.
## Args: preds-Predicted labels,  labels-True labels
def f1_func(preds, labels):
    p = np.argmax(preds, axis=1).flatten()
    l = labels.flatten()
    f1 = f1_score(l, p, average='weighted')
    return f1


## Calculate and print accuracy for each class
## Calculate and print overal accuracy score
## Args: preds-Predicted labels,  labels-True labels, lab_dict_inverse-Inverse label dictionary
def accuracy_per_class(preds, labels, label_dict_inverse):
    p = np.argmax(preds, axis=1).flatten()
    l = labels.flatten()

    class_accuracies = {}
    for label in np.unique(l):
        mask = l == label
        y_preds = p[mask]
        y_true = l[mask]
        class_name = label_dict_inverse[label]
        class_accuracy = accuracy_score(y_true, y_preds)
        class_accuracies[class_name] = class_accuracy

    overall_accuracy = accuracy_score(l, p)

    # Print class accuracies
    for class_name, class_accuracy in class_accuracies.items():
        print(f'Class: {class_name}\nAccuracy: {class_accuracy:.2%}\n')

    # Print overall accuracy
    print(f'Overall Accuracy: {overall_accuracy:.2%}')

In [55]:
seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [58]:
# Define the evaluate function

def evaluate(dataloader_val):

    model.eval()
    
    loss_val_total = 0
    predictions, true_vals = [], []
    
    for batch in dataloader_val:
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():        
            outputs = model(**inputs)
            
        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
    loss_val_avg = loss_val_total/len(dataloader_val)

    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
            
    return loss_val_avg, predictions, true_vals

In [59]:
# Train the model

for epoch in tqdm(range(1, epochs+1)):
    
    model.train()
    
    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        model.zero_grad()
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }       

        outputs = model(**inputs)
        
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()
        
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
    torch.save(model.state_dict(), 'saved_weights.pt')
        
    tqdm.write(f'\nEpoch {epoch}')
    
    loss_train_avg = loss_train_total/len(dataloader_train)            
    tqdm.write(f'Training loss: {loss_train_avg}')
    
    val_loss, predictions, true_vals = evaluate(dataloader_validation)
    val_f1 = f1_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}')

  0%|          | 0/5 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/11667 [00:00<?, ?it/s]


Epoch 1
Training loss: 0.39854813014562046
Validation loss: 0.16096235417008284
F1 Score (Weighted): 0.9702427704940141


Epoch 2:   0%|          | 0/11667 [00:00<?, ?it/s]


Epoch 2
Training loss: 0.13115647523666576
Validation loss: 0.13700231841203786
F1 Score (Weighted): 0.9766724955401223


Epoch 3:   0%|          | 0/11667 [00:00<?, ?it/s]


Epoch 3
Training loss: 0.0636162499348864
Validation loss: 0.15992878766612048
F1 Score (Weighted): 0.9778683357732633


Epoch 4:   0%|          | 0/11667 [00:00<?, ?it/s]


Epoch 4
Training loss: 0.03264053829085513
Validation loss: 0.17659580529846824
F1 Score (Weighted): 0.9795875771106163


Epoch 5:   0%|          | 0/11667 [00:00<?, ?it/s]


Epoch 5
Training loss: 0.015520779647802779
Validation loss: 0.17451936516817224
F1 Score (Weighted): 0.9805299765911736


In [65]:
# Calculating the Accuracy per calss and the overall Acurracy Score
# Caculating the precision, recall, and f1-score
_, predictions, true_vals = evaluate(dataloader_test)
accuracy_per_class(predictions, true_vals, label_dict_inverse)

preds = np.argmax(predictions, axis = 1)
print(classification_report(labels_test, preds))

Class: GROCERIES
Accuracy: 99.24%

Class: GENERAL_MERCHANDISE
Accuracy: 98.18%

Class: FOOD_AND_BEVERAGES
Accuracy: 97.97%

Class: TRAVEL
Accuracy: 94.07%

Class: PETS
Accuracy: 97.47%

Class: OVERDRAFT
Accuracy: 100.00%

Class: RENT
Accuracy: 91.18%

Class: EDUCATION
Accuracy: 76.32%

Class: MORTGAGE
Accuracy: 92.31%

Overall Accuracy: 97.95%
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1309
           1       0.98      0.98      0.98      2803
           2       0.98      0.98      0.98      2864
           3       0.98      0.94      0.96       337
           4       0.97      0.97      0.97        79
           5       1.00      1.00      1.00        23
           6       1.00      0.91      0.95        34
           7       0.97      0.76      0.85        38
           8       1.00      0.92      0.96        13

    accuracy                           0.98      7500
   macro avg       0.98      0.94      0.96      7500
weig