In [2]:
import pandas as pd
import numpy as np
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
import torch


In [3]:
file_path = 'updated_banking_faq_with_solutions.csv'
df = pd.read_csv(file_path)

# Clean the text data
def preprocess_text(text):
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    return text

df['cleaned_variation'] = df['Variation'].apply(preprocess_text)

# Encode the target variable
le = LabelEncoder()
df['CAT_A_encoded'] = le.fit_transform(df['CAT_A'])


In [4]:
X = df['cleaned_variation'].values
y = df['CAT_A_encoded'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [5]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

train_encodings = tokenizer(list(X_train), truncation=True, padding=True, max_length=128)
test_encodings = tokenizer(list(X_test), truncation=True, padding=True, max_length=128)


In [6]:
class QueryDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = QueryDataset(train_encodings, y_train)
test_dataset = QueryDataset(test_encodings, y_test)


In [7]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(le.classes_))


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy='epoch',
)


In [9]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)


In [10]:
trainer.train()




Epoch,Training Loss,Validation Loss
1,3.4251,3.314991
2,1.2389,1.074304
3,0.289,0.414369


TrainOutput(global_step=720, training_loss=2.1985940148433047, metrics={'train_runtime': 1212.5738, 'train_samples_per_second': 4.75, 'train_steps_per_second': 0.594, 'total_flos': 62185659863040.0, 'train_loss': 2.1985940148433047, 'epoch': 3.0})

In [11]:
trainer.evaluate()


{'eval_loss': 0.4143691062927246,
 'eval_runtime': 11.8193,
 'eval_samples_per_second': 40.611,
 'eval_steps_per_second': 5.076,
 'epoch': 3.0}

In [12]:
def predict_solution(new_input):
    # Preprocess and tokenize the input
    cleaned_input = preprocess_text(new_input)
    inputs = tokenizer(cleaned_input, return_tensors='pt', truncation=True, padding=True, max_length=128)

    # Make the prediction
    with torch.no_grad():
        logits = model(**inputs).logits
    predicted_class = torch.argmax(logits, dim=-1).item()

    # Convert the encoded label back to the original category
    predicted_cat_a = le.inverse_transform([predicted_class])[0]
    
    # Retrieve the corresponding solution from the DataFrame
    solution = df.loc[df['CAT_A'] == predicted_cat_a, 'solution'].values[0]
    return solution


In [13]:
# Example usage
new_query = "money transferred wrongly?"
predicted_solution = predict_solution(new_query)
print(f"The solution for '{new_query}' is: \n{predicted_solution}")


The solution for 'money transferred wrongly?' is: 
Please contact us immediately. We will attempt to recover the funds, but reversals depend on the other bank's policies.
