In [1]:
import re
import torch
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch.nn as nn
from transformers import BertModel
from transformers import BertTokenizer
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [2]:
# ---------------------------------------------------------------------------
# Notebook configuration: everything a reader might want to tune lives here.
# ---------------------------------------------------------------------------

# Training hyperparameters
lr = 1e-3         # learning rate passed to the Adam optimizer
# NOTE(review): the logged train loss stays around 1.69-1.75 for all 10 epochs.
# 1e-3 is well above the 2e-5..5e-5 range usually used when fine-tuning BERT
# weights - confirm whether BertClassifier freezes the encoder before changing.
seq_len = 20      # max_length used for tokenizer padding / truncation
dropout = 0.5     # first constructor argument of BertClassifier
num_epochs = 10   # number of epochs passed to train()

# Column names in the complaints CSV
label_col = "Product"
text_col_name = "Consumer complaint narrative"

# File locations. Only `project_dir` needs to change when the project is moved;
# every path below is derived from it and resolves to the same string as before.
project_dir = r"C:\Users\conta\OneDrive\Desktop\Projects\NLP Project  using BERT Model\Modular_code\Modular_code"
output_dir = project_dir + r"\Output"
tokens_path = output_dir + r"\tokens.pkl"
labels_path = output_dir + r"\labels.pkl"
data_path = project_dir + r"\Input\complaints.csv"
model_path = output_dir + r"\bert_pre_trained.pth"
label_encoder_path = output_dir + r"\label_encoder.pkl"

# Maps the raw "Product" values onto a smaller set of class names.
# Several raw products intentionally collapse onto the same class.
product_map = {
    'Vehicle loan or lease': 'vehicle_loan',
    'Credit reporting, credit repair services, or other personal consumer reports': 'credit_report',
    'Credit card or prepaid card': 'card',
    'Money transfer, virtual currency, or money service': 'money_transfer',
    'virtual currency': 'money_transfer',
    'Mortgage': 'mortgage',
    'Payday loan, title loan, or personal loan': 'loan',
    'Debt collection': 'debt_collection',
    'Checking or savings account': 'savings_account',
    'Credit card': 'card',
    'Bank account or service': 'savings_account',
    'Credit reporting': 'credit_report',
    'Prepaid card': 'card',
    'Payday loan': 'loan',
    'Other financial service': 'others',
    'Virtual currency': 'money_transfer',
    'Student loan': 'loan',
    'Consumer Loan': 'loan',
    'Money transfers': 'money_transfer',
}

In [3]:
def save_file(name, obj):
    """
    Serialize `obj` with pickle and write it to the file at path `name`.

    The file is opened in binary write mode, so an existing file at that
    path is overwritten. Returns None.
    """
    with open(name, mode='wb') as handle:
        handle.write(pickle.dumps(obj))


def load_file(name):
    """
    Load and return the object stored in the pickle file at path `name`.

    The file is opened inside a `with` block so the handle is closed as soon
    as the object has been read (the previous version left it open until
    garbage collection).

    NOTE: unpickling can execute arbitrary code - only load files produced by
    this notebook or another trusted source.
    """
    with open(name, "rb") as f:
        return pickle.load(f)

## Process text data
---

In [4]:
# Read the raw complaints CSV and preview its first five rows.
data = pd.read_csv(filepath_or_buffer=data_path)
data.head(n=5)

Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID
0,2019-06-13,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,Information belongs to someone else,,,CAPITAL ONE FINANCIAL CORPORATION,PA,186XX,,Consent not provided,Web,2019-06-13,Closed with explanation,Yes,,3274605
1,2019-11-01,Vehicle loan or lease,Loan,Struggling to pay your loan,Denied request to lower payments,I contacted Ally on Friday XX/XX/XXXX after fa...,Company has responded to the consumer and the ...,ALLY FINANCIAL INC.,NJ,088XX,,Consent provided,Web,2019-11-01,Closed with explanation,Yes,,3425257
2,2019-04-01,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,Account status incorrect,,Company has responded to the consumer and the ...,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",PA,19067,,Consent not provided,Web,2019-04-01,Closed with explanation,Yes,,3198225
3,2021-11-01,"Credit reporting, credit repair services, or o...",Credit reporting,Problem with a credit reporting company's inve...,Was not notified of investigation status or re...,,,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",GA,31707,,,Web,2021-11-01,In progress,Yes,,4863965
4,2021-11-02,Debt collection,Medical debt,Took or threatened to take negative or legal a...,Threatened or suggested your credit would be d...,,,"Medical Data Systems, Inc.",VA,22033,,,Web,2021-11-02,In progress,Yes,,4866449


In [5]:
# Keep only the rows that actually contain a complaint narrative.
data = data.dropna(subset=[text_col_name])

In [6]:
# Collapse the raw product names in the label column onto the classes
# defined in product_map; values without a mapping are left unchanged.
data = data.replace({label_col: product_map})

### Encode labels

In [7]:
# Fit the encoder on the label column and convert it to integer class ids
# in a single step.
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(data[label_col])

In [8]:
# Persist the encoded labels and the fitted encoder for the training and
# prediction sections further down.
save_file(name=labels_path, obj=labels)
save_file(name=label_encoder_path, obj=label_encoder)

### Process the text column

In [9]:
# Pull the complaint narratives out of the frame as a plain Python list.
input_text = data[text_col_name].tolist()

In [10]:
# Only the first 8000 narratives are processed to keep the run time short.
sample_limit = 8000
input_text = input_text[:sample_limit]

In [11]:
# Sanity check: number of narratives kept after the truncation above.
len(input_text)

8000

### Convert text to lower case

In [12]:
# Lower-case every narrative; tqdm only adds a progress bar.
input_text = [narrative.lower() for narrative in tqdm(input_text)]

100%|██████████████████████████████████████████████████████████████████████████| 8000/8000 [00:00<00:00, 604268.62it/s]


### Remove punctuation except apostrophes

In [13]:
# Every run of characters that is not a word character, digit, apostrophe
# or whitespace is replaced by a single space.
non_text_pattern = re.compile(r"[^\w\d'\s]+")
input_text = [non_text_pattern.sub(" ", narrative)
              for narrative in tqdm(input_text)]

100%|███████████████████████████████████████████████████████████████████████████| 8000/8000 [00:00<00:00, 28330.68it/s]


### Remove digits

In [14]:
# Delete every run of digits. The pattern is a raw string: in a normal
# string literal "\d" is an invalid escape sequence, which newer Python
# versions report with a warning.
input_text = [re.sub(r"\d+", "", i) for i in tqdm(input_text)]

100%|███████████████████████████████████████████████████████████████████████████| 8000/8000 [00:00<00:00, 45749.58it/s]


### Remove runs of two or more consecutive 'x' characters

In [15]:
# Drop every run of two or more consecutive 'x' characters.
# NOTE(review): presumably these are the XXXX redaction masks seen in the
# raw narratives; the pattern also matches "xx" inside ordinary words.
x_run_pattern = re.compile(r'[x]{2,}')
input_text = [x_run_pattern.sub("", narrative)
              for narrative in tqdm(input_text)]

100%|███████████████████████████████████████████████████████████████████████████| 8000/8000 [00:00<00:00, 64059.75it/s]


### Replace multiple spaces with a single space

In [16]:
# Collapse each run of space characters into one space. Only the literal
# space is matched, so newlines in the narratives are kept as they are.
space_run_pattern = re.compile(' +')
input_text = [space_run_pattern.sub(' ', narrative)
              for narrative in tqdm(input_text)]

100%|███████████████████████████████████████████████████████████████████████████| 8000/8000 [00:00<00:00, 21290.61it/s]


### Tokenize the text

In [17]:
# Load the pretrained BERT tokenizer.
# NOTE(review): the narratives were lower-cased above, yet the *cased*
# checkpoint is used here - confirm this pairing is intended.
bert_checkpoint = "bert-base-cased"
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)

In [18]:
# Inspect the first cleaned narrative.
input_text[0]

'i contacted ally on friday after falling behind on payments due to being out of work for a short period of time due to an illness i chated with a representative after logging into my account regarding my opitions to ensure i protect my credit and bring my account current \n\nshe advised me that before an extenstion could be done i had to make a payment in the amount of i reviewed my finances as i am playing catch up on all my bills and made this payment on monday this rep advised me once this payment posts to my account to contact ally back for an extention or to have a payment deffered to the end of my loan \n\nwith this in mind i contacted ally again today and chatted with i explained all of the above and the information i was provided when i chatted with the rep last week she asked several questions and advised me that a one or two month extension deffered payment could be done however partial payment is needed what she advised me or there abouts would be due within days from me ac

In [19]:
# Tokenize a single narrative to see what the tokenizer returns: the text is
# padded / truncated to seq_len tokens and returned as PyTorch tensors.
first_narrative = input_text[0]
sample_tokens = tokenizer(
    first_narrative,
    max_length=seq_len,
    padding="max_length",
    truncation=True,
    return_tensors="pt",
)

In [20]:
# Display the full tokenizer output for the sample narrative.
sample_tokens

{'input_ids': tensor([[  101,   178, 12017, 11989,  1113,   175, 22977,  1183,  1170,  4058,
          1481,  1113, 10772,  1496,  1106,  1217,  1149,  1104,  1250,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [21]:
# Display only the "input_ids" tensor of the sample.
sample_tokens["input_ids"]

tensor([[  101,   178, 12017, 11989,  1113,   175, 22977,  1183,  1170,  4058,
          1481,  1113, 10772,  1496,  1106,  1217,  1149,  1104,  1250,   102]])

In [22]:
# Display only the "attention_mask" tensor of the sample.
sample_tokens["attention_mask"]

tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])

In [23]:
# Tokenize every narrative one at a time with the same settings as the
# sample above; the result is a list holding one tokenizer output per text.
encode_options = dict(padding="max_length", max_length=seq_len,
                      truncation=True, return_tensors="pt")
tokens = [tokenizer(narrative, **encode_options)
          for narrative in tqdm(input_text)]

100%|█████████████████████████████████████████████████████████████████████████████| 8000/8000 [00:27<00:00, 294.04it/s]


### Save the tokens

In [24]:
# Persist the tokenized narratives so training can start from this file.
save_file(name=tokens_path, obj=tokens)

## Create BERT model
---

In [25]:
class BertClassifier(nn.Module):

## Create PyTorch Dataset
---

In [26]:
class TextDataset(torch.utils.data.Dataset):

### Function to train the model

In [27]:
def train(train_loader, valid_loader, model, criterion, optimizer, 
          device, num_epochs, model_path):


### Function to test the model

In [28]:
def test(test_loader, model, criterion, device):
  

## Train BERT model
---

### Load the files

In [29]:
# Reload the artifacts written by the preprocessing section, in the same
# order as before: tokens, labels, label encoder.
tokens, labels, label_encoder = (
    load_file(artifact_path)
    for artifact_path in (tokens_path, labels_path, label_encoder_path)
)

# The classifier needs one output per class known to the encoder.
num_classes = len(label_encoder.classes_)

In [30]:
# The saved labels cover every row with a narrative, while only the first
# narratives were tokenized (for speed). Trim the labels to the number of
# tokenized texts instead of repeating the literal 8000, so the two stay
# aligned if the truncation used during preprocessing changes.
labels = labels[:len(tokens)]

### Split data into train, validation and test sets

In [31]:
# Hold out 20% of the data for testing, then take 25% of the remaining 80%
# (i.e. 20% of the total) for validation -> 60 / 20 / 20 split.
# A fixed random_state makes the split, and therefore the reported losses
# and accuracy, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(tokens, labels,
                                                    test_size=0.2,
                                                    random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(X_train,
                                                      y_train,
                                                      test_size=0.25,
                                                      random_state=42)

### Create PyTorch datasets

In [32]:
# Wrap each (tokens, labels) split in a TextDataset.
train_dataset, valid_dataset, test_dataset = (
    TextDataset(split_tokens, split_labels)
    for split_tokens, split_labels in (
        (X_train, y_train),
        (X_valid, y_valid),
        (X_test, y_test),
    )
)

### Create data loaders

In [33]:
# All three loaders use the same batch size. Only the training loader
# shuffles and drops the last incomplete batch.
batch_size = 16
make_loader = torch.utils.data.DataLoader

train_loader = make_loader(train_dataset, batch_size=batch_size,
                           shuffle=True, drop_last=True)
valid_loader = make_loader(valid_dataset, batch_size=batch_size)
test_loader = make_loader(test_dataset, batch_size=batch_size)

### Create model object

In [34]:
# Use the first CUDA device when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [35]:
# Build the classifier from the configured dropout value and the number of
# label classes (argument meaning assumed from the variable names - TODO
# confirm against BertClassifier.__init__).
model = BertClassifier(dropout, num_classes)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### Define loss function and optimizer

In [36]:
# Cross-entropy loss, and Adam over every model parameter with the
# configured learning rate.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=lr)

### Move the model to GPU if available

In [37]:
# When a GPU is present, move both the model and the loss module onto it.
gpu_available = torch.cuda.is_available()
if gpu_available:
    model, criterion = model.cuda(), criterion.cuda()

### Training loop

In [38]:
# Run the training loop; arguments are passed by name to match the
# signature of train() defined above.
train(
    train_loader=train_loader,
    valid_loader=valid_loader,
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    device=device,
    num_epochs=num_epochs,
    model_path=model_path,
)

Epoch 1 of 10


100%|████████████████████████████████████████████████████████████████████████████████| 300/300 [08:26<00:00,  1.69s/it]
100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:44<00:00,  2.27it/s]


Train Loss: 1.7030835258960724, Validation Loss: 1.6930737668275833
Best Validation Loss: 1.6930737668275833
Epoch 2 of 10


100%|████████████████████████████████████████████████████████████████████████████████| 300/300 [08:38<00:00,  1.73s/it]
100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:49<00:00,  2.00it/s]


Train Loss: 1.6930189522107442, Validation Loss: 1.6753383910655975
Best Validation Loss: 1.6753383910655975
Epoch 3 of 10


100%|████████████████████████████████████████████████████████████████████████████████| 300/300 [13:23<00:00,  2.68s/it]
100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:33<00:00,  3.01it/s]


Train Loss: 1.7490395891666413, Validation Loss: 1.682252868413925
Best Validation Loss: 1.6753383910655975
Epoch 4 of 10


100%|████████████████████████████████████████████████████████████████████████████████| 300/300 [09:35<00:00,  1.92s/it]
100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:51<00:00,  1.93it/s]


Train Loss: 1.7182658741871515, Validation Loss: 1.689323068857193
Best Validation Loss: 1.6753383910655975
Epoch 5 of 10


100%|██████████████████████████████████████████████████████████████████████████████| 300/300 [3:46:17<00:00, 45.26s/it]
100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:45<00:00,  2.22it/s]


Train Loss: 1.719649334748586, Validation Loss: 1.6949700462818145
Best Validation Loss: 1.6753383910655975
Epoch 6 of 10


100%|████████████████████████████████████████████████████████████████████████████| 300/300 [17:10:43<00:00, 206.15s/it]
100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:38<00:00,  2.58it/s]


Train Loss: 1.6981363727649053, Validation Loss: 1.6659053421020509
Best Validation Loss: 1.6659053421020509
Epoch 7 of 10


100%|████████████████████████████████████████████████████████████████████████████████| 300/300 [09:52<00:00,  1.98s/it]
100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:47<00:00,  2.12it/s]


Train Loss: 1.6861442536115647, Validation Loss: 1.7162499260902404
Best Validation Loss: 1.6659053421020509
Epoch 8 of 10


100%|████████████████████████████████████████████████████████████████████████████████| 300/300 [08:09<00:00,  1.63s/it]
100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:37<00:00,  2.65it/s]


Train Loss: 1.6848732642332713, Validation Loss: 1.6718441593647002
Best Validation Loss: 1.6659053421020509
Epoch 9 of 10


100%|████████████████████████████████████████████████████████████████████████████████| 300/300 [09:03<00:00,  1.81s/it]
100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:40<00:00,  2.48it/s]


Train Loss: 1.6878118932247161, Validation Loss: 1.6759009492397308
Best Validation Loss: 1.6659053421020509
Epoch 10 of 10


100%|████████████████████████████████████████████████████████████████████████████████| 300/300 [08:43<00:00,  1.74s/it]
100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:39<00:00,  2.52it/s]

Train Loss: 1.685974834561348, Validation Loss: 1.6787320041656495
Best Validation Loss: 1.6659053421020509





### Test the model

In [39]:
# Evaluate the model on the held-out test split.
test(
    test_loader=test_loader,
    model=model,
    criterion=criterion,
    device=device,
)

100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:40<00:00,  2.47it/s]

Test Loss: 1.6601996886730195, Test Accuracy: 0.453125





## Predict on new text
---

In [40]:
# Example complaint used to demonstrate prediction on unseen text.
# Note: this rebinds `input_text`, which earlier in the notebook held the
# list of training narratives, to a single string.
input_text = '''I am a victim of Identity Theft & currently have an Experian account that 
I can view my Experian Credit Report and getting notified when there is activity on 
my Experian Credit Report. For the past 3 days I've spent a total of approximately 9 
hours on the phone with Experian. Every time I call I get transferred repeatedly and 
then my last transfer and automated message states to press 1 and leave a message and 
someone would call me. Every time I press 1 I get an automatic message stating than you 
before I even leave a message and get disconnected. I call Experian again, explain what 
is happening and the process begins again with the same end result. I was trying to have 
this issue attended and resolved informally but I give up after 9 hours. There are hard 
hit inquiries on my Experian Credit Report that are fraud, I didn't authorize, or recall 
and I respectfully request that Experian remove the hard hit inquiries immediately just 
like they've done in the past when I was able to speak to a live Experian representative 
in the United States. The following are the hard hit inquiries : BK OF XXXX XX/XX/XXXX 
XXXX XXXX XXXX  XX/XX/XXXX XXXX  XXXX XXXX  XX/XX/XXXX XXXX  XX/XX/XXXX XXXX  XXXX 
XX/XX/XXXX'''

In [41]:
# Apply the same cleaning steps, in the same order, that were applied to the
# training narratives.
input_text = input_text.lower()
# Replace runs of anything but word characters, digits, apostrophes and
# whitespace with a single space.
input_text = re.sub(r"[^\w\d'\s]+", " ", input_text)
# Remove digits. Raw string: "\d" in a normal string literal is an invalid
# escape sequence that newer Python versions warn about.
input_text = re.sub(r"\d+", "", input_text)
# Remove runs of two or more 'x' characters.
input_text = re.sub(r'[x]{2,}', "", input_text)
# Collapse repeated spaces.
input_text = re.sub(' +', ' ', input_text)

In [42]:
# Load the same pretrained tokenizer checkpoint that was used when the
# training narratives were tokenized.
bert_checkpoint = "bert-base-cased"
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)

In [43]:
# Tokenize the cleaned complaint with the same padding / truncation
# settings as the training data.
tokens = tokenizer(
    input_text,
    max_length=seq_len,
    padding="max_length",
    truncation=True,
    return_tensors="pt",
)

In [44]:
# Pull out the two tensors that are later passed to the model.
input_ids, attention_mask = tokens["input_ids"], tokens["attention_mask"]

In [45]:
# Use the first CUDA device when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [46]:
# Put both input tensors on the selected device.
input_ids, attention_mask = (
    tensor.to(device) for tensor in (input_ids, attention_mask)
)

In [47]:
# Drop dimension 1 if it has size 1. For the (1, seq_len) tensor shown in
# the tokenizer output above this leaves the shape unchanged.
input_ids = input_ids.squeeze(1)

In [48]:
# Reload the fitted label encoder; its class list gives both the size of
# the output layer and the id -> name lookup used for the prediction.
label_encoder = load_file(name=label_encoder_path)
class_names = label_encoder.classes_
num_classes = len(class_names)

In [49]:
# Create model object
model = BertClassifier(dropout, num_classes)

# Load trained weights. map_location remaps the stored tensors onto the
# current device, so a checkpoint saved on a GPU also loads on a CPU-only
# machine. NOTE: torch.load unpickles the file - only load trusted checkpoints.
model.load_state_dict(torch.load(model_path, map_location=device))

# Move the model to the same device as the input tensors
model = model.to(device)

# Switch to evaluation mode: a freshly built module is in training mode, in
# which dropout layers randomly zero activations and make the prediction
# non-deterministic.
model.eval()

# Forward pass; no gradients are needed for inference
with torch.no_grad():
    out = torch.squeeze(model(input_ids, attention_mask))

# Find predicted class (convert the argmax tensor to a plain int before
# using it as an index into the array of class names)
predicted_index = torch.argmax(out).item()
prediction = label_encoder.classes_[predicted_index]
print(f"Predicted Class: {prediction}")

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Predicted Class: credit_report
