# Without Augmentation

In [None]:
!pip install transformers

In [None]:
import re
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
import random
from sklearn.metrics import accuracy_score
import numpy as np

In [None]:
# Set random seeds for reproducibility (Python, NumPy and PyTorch generators)
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

# Load dataset; every message is cast to str so the text cleaning below
# always receives strings
df = pd.read_csv("base_df.csv")
df['message'] = df["message"].astype(str)

# excluding newly added data (rows whose category is -1)
# .copy() detaches the filtered rows from the original frame, so assigning
# to df['message'] later does not raise pandas' SettingWithCopyWarning
df = df[df['category'] != -1].copy()
df

## Text Preprocessing

In [None]:
def clean_text(text):
    """
    Normalise one message for the classifier.

    text: a string

    return: the lowercased text with separator symbols turned into spaces,
            all other unwanted characters removed, and stopwords dropped
    """
    lowered = text.lower()

    # separator symbols become spaces first, so the words around them stay apart
    spaced = replace_symbols.sub(' ', lowered)

    # whatever is still outside the allowed character set is deleted
    stripped = bad_symbols.sub('', spaced)

    # keep only the tokens that are not stopwords
    kept = [token for token in stripped.split() if token not in stopwords]
    return ' '.join(kept)

# symbols replaced by a space in the text
# (raw string: the regex escapes \[ \] \| must not be read as Python string escapes)
replace_symbols = re.compile(r'[/(){}\[\]\|@,;]')

# symbols removed from the text: everything except digits, lowercase letters,
# space, '#', '+' and '_'
bad_symbols = re.compile('[^0-9a-z #+_]')

# stopwords removed from the text
# The corpus reader is imported under an alias because this cell rebinds the
# name `stopwords` to a plain set; without the alias, re-running the cell
# would call `.words` on that set and fail with AttributeError.
from nltk.corpus import stopwords as nltk_stopwords
stopwords = set(nltk_stopwords.words('english'))

# cleaning the data and writing it back to the same column
df['message'] = df['message'].apply(clean_text)
df

## BERT training

In [None]:

# 80/10/10 split: hold out 20% of the rows, then halve the hold-out into
# validation and test
train_texts, holdout_texts, train_labels, holdout_labels = train_test_split(
    df["message"],
    df["field"],
    test_size = 0.2,
    random_state = 42,
)
val_texts, test_texts, val_labels, test_labels = train_test_split(
    holdout_texts,
    holdout_labels,
    test_size = 0.5,
    random_state = 42,
)

# integer label encoding; the encoder is fitted on the training labels only
le = LabelEncoder()
le.fit(train_labels)
train_labels, val_labels, test_labels = (
    le.transform(split) for split in (train_labels, val_labels, test_labels)
)

# pretrained bert-base-uncased tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case = True)

In [None]:
# tokenize and encode the train, validation and test texts with identical
# settings (truncate long inputs, pad each split to a common length)
train_encodings, val_encodings, test_encodings = (
    tokenizer(split.tolist(), truncation = True, padding = True)
    for split in (train_texts, val_texts, test_texts)
)

# creating dataset class
class MyDataset(Dataset):
    """Dataset that pairs tokenizer encodings with one label per sample."""

    def __init__(self, encodings, labels):
        # encodings: mapping of field name -> per-sample values
        # labels: indexable collection holding one label per sample
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # the number of labels defines the dataset size
        return len(self.labels)

    def __getitem__(self, idx):
        # one tensor per encoding field for sample `idx`
        sample = {}
        for name, values in self.encodings.items():
            sample[name] = torch.tensor(values[idx])

        # the label is stored next to the features under the "labels" key
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

In [None]:
# wrap each split's encodings and labels in a Dataset
train_dataset = MyDataset(train_encodings, train_labels)
val_dataset = MyDataset(val_encodings, val_labels)
test_dataset = MyDataset(test_encodings, test_labels)

# hyperparameters
batch_size = 16
num_epochs = 3

# pretrained model with one output per label class
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels = len(le.classes_))

# optimizer
optimizer = AdamW(model.parameters(), lr = 5e-5)

# linear-decay scheduler without warmup
# The schedule has to cover the optimizer steps actually taken. A partial
# final batch still costs one step, so an epoch is
# ceil(len(train_dataset) / batch_size) steps; flooring the total would bring
# the learning rate to zero before the last steps.
steps_per_epoch = (len(train_dataset) + batch_size - 1) // batch_size
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0, num_training_steps = steps_per_epoch * num_epochs)

In [None]:
# Define function for computing accuracy
def compute_accuracy(preds, labels):
    """
    preds: predicted class ids (list, tuple or array)
    labels: true class ids, same length as preds

    return: fraction of positions where preds equals labels
    """
    # np.asarray makes the comparison element-wise for plain lists as well;
    # `list == list` yields a single bool, which has no .mean()
    preds = np.asarray(preds)
    labels = np.asarray(labels)
    return (preds == labels).mean()

# use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# move the model to that device and put it in training mode
model.to(device)
model.train()

# shuffled mini-batches over the training set
train_loader = DataLoader(train_dataset, shuffle = True, batch_size = batch_size)

In [None]:
def evaluate(model, val_texts, val_labels, tokenizer, device, batch_size = 16):
    """
    Compute the mean loss and the accuracy of `model` on the given texts.

    The data set is built from `val_texts` / `val_labels`, so the function
    scores whichever split the caller passes in (validation, test, ...).

    model: classification model whose output exposes `.loss` and `.logits`
    val_texts: texts to score (list or pandas Series of str)
    val_labels: integer-encoded labels, one per text
    tokenizer: tokenizer used to encode the texts
    device: torch device the model lives on
    batch_size: number of samples per evaluation batch

    return: (loss averaged over batches, fraction of correct predictions)
    raises: ValueError when no texts are given
    """
    texts = list(val_texts)
    if not texts:
        raise ValueError("evaluate() needs at least one text")

    # encode the texts that were actually passed in
    encodings = tokenizer(texts, truncation = True, padding = True, return_tensors = 'pt')
    targets = torch.as_tensor(np.asarray(val_labels), dtype = torch.long)
    dataset = torch.utils.data.TensorDataset(encodings['input_ids'], encodings['attention_mask'], targets)
    loader = DataLoader(dataset, batch_size = batch_size, shuffle = False)

    # remember the current mode so a training loop that calls this function
    # between epochs keeps training with dropout enabled afterwards
    was_training = model.training
    model.eval()

    total_loss = 0.0
    correct = 0

    try:
        with torch.no_grad():
            for input_ids, attention_mask, labels in loader:
                input_ids = input_ids.to(device)
                attention_mask = attention_mask.to(device)
                labels = labels.to(device)

                # forward pass; passing the labels makes the model return the loss
                outputs = model(input_ids, attention_mask = attention_mask, labels = labels)

                total_loss += outputs.loss.item()
                correct += (outputs.logits.argmax(dim = 1) == labels).sum().item()
    finally:
        model.train(was_training)

    # loss is averaged per batch, accuracy per sample
    return total_loss / len(loader), correct / len(dataset)

In [None]:
# starting to train
for epoch in range(num_epochs):
    # The evaluation at the end of the previous epoch can leave the model in
    # eval mode, so training mode is re-enabled at the start of every epoch.
    model.train()

    train_loss = 0
    epoch_preds = []
    # separate name: the global `train_labels` (encoded training labels) must
    # not be overwritten by the per-epoch bookkeeping
    epoch_labels = []

    # getting batches in train data
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # getting output and computing losses
        outputs = model(input_ids, attention_mask = attention_mask, labels = labels)
        loss = outputs.loss
        logits = outputs.logits

        # backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # advance the learning-rate schedule once per optimizer step;
        # without this call the scheduler has no effect
        scheduler.step()

        # train loss
        train_loss += loss.item()

        epoch_preds.extend(torch.argmax(logits, dim = 1).detach().cpu().tolist())
        epoch_labels.extend(labels.detach().cpu().tolist())

    # final accuracies on train and val set
    train_acc = accuracy_score(epoch_labels, epoch_preds)
    val_loss, val_acc = evaluate(model, val_texts, val_labels, tokenizer, device)

    print(f'Epoch {epoch + 1}, Train Loss: {train_loss / len(train_loader) : .3f}, Train Acc: {train_acc : .3f}, Val Acc: {val_acc : .3f}')


In [None]:
# put the model in inference mode before scoring
model.eval()

# evaluate() is called with the test texts and labels; it returns (loss, accuracy)
test_metrics = evaluate(model, test_texts, test_labels, tokenizer, device)
test_loss, test_acc = test_metrics
print(f"Test Loss: {test_loss : .4f}, Test Accuracy: {test_acc : .4f}")

### Testing on Holdout Set

In [None]:
# testing on a specific query
query = "Hello there! How are you?"

# encoding query
# The query goes through clean_text first, because the training messages
# were cleaned the same way; truncation matches the training encodings.
encoded_query = tokenizer.encode_plus(clean_text(query), add_special_tokens = True, truncation = True, return_tensors = 'pt')

# switching to device
input_ids = encoded_query['input_ids'].to(device)
attention_mask = encoded_query['attention_mask'].to(device)

# getting model output by a forward pass (inference mode, no gradients)
model.eval()
with torch.no_grad():
    output = model(input_ids, attention_mask = attention_mask)

# getting probability
probs = output.logits.softmax(dim = 1).detach().cpu().numpy()

# getting index of label
label_index = np.argmax(probs)

# getting label
label_name = le.inverse_transform([label_index])[0]

print(f"The predicted label for the query '{query}': {label_name}.")

In [None]:
def test_model(data_path = "new_data.csv"):
    """
    Predict a label for every message of a hold-out CSV with the global
    `model`, `tokenizer` and `le`, and compare it with the row's 'category'.

    data_path: CSV file providing a 'message' and a 'category' column

    return: dictionary mapping each row index to
            {"actual_label": ..., "predicted_label": ...}
    """
    # initial correct and wrong
    correct = 0
    wrong = 0
    result = {}

    holdout = pd.read_csv(data_path)
    # same str cast as the training data, so an empty cell (read as NaN)
    # does not break clean_text
    holdout['message'] = holdout['message'].astype(str)

    # inference only: switch to eval mode and restore the previous mode afterwards
    was_training = model.training
    model.eval()

    try:
        for index, row in holdout.iterrows():
            query = clean_text(row['message'])

            # encoding query; truncation keeps long messages within the
            # tokenizer's maximum length, as in training
            encoded_query = tokenizer.encode_plus(query, add_special_tokens = True, truncation = True, return_tensors = 'pt')

            # switching to device
            input_ids = encoded_query['input_ids'].to(device)
            attention_mask = encoded_query['attention_mask'].to(device)

            # getting model output by a forward pass
            with torch.no_grad():
                output = model(input_ids, attention_mask = attention_mask)

            # index of the highest-scoring class (softmax would not change the argmax)
            label_index = output.logits.argmax(dim = 1).item()

            # getting label
            predict_label = le.inverse_transform([label_index])[0]

            result[index] = {
                "actual_label" : row['category'],
                "predicted_label" : predict_label
            }

            if predict_label == row['category']:
                correct += 1
            else:
                wrong += 1
    finally:
        model.train(was_training)

    print(f"Right: {correct}\tWrong: {wrong}")

    return result

test_model()

In [None]:
# classification reports
from sklearn.metrics import confusion_matrix, classification_report

def report_for_1k(dct):
    """
    Print the confusion matrix and the classification report for hold-out
    predictions.

    dct: dictionary whose values each hold an "actual_label" and a
         'predicted_label' entry (the structure returned by test_model)
    """
    y_true = [value["actual_label"] for value in dct.values()]
    y_pred = [value['predicted_label'] for value in dct.values()]

    # getting confusion matrix
    # `labels` pins rows and columns to the encoder's classes. Without it the
    # matrix is sized by the labels that occur in y_true / y_pred, which need
    # not match le.classes_ and would make the DataFrame below fail.
    cm = confusion_matrix(y_true, y_pred, labels = le.classes_)
    print("Confusion Matrix:")
    display(pd.DataFrame(cm, columns = le.classes_, index = le.classes_))

    # getting classification report
    report = classification_report(y_true, y_pred)
    print("Classification Report:")
    print(report)

report_for_1k(test_model())

In [None]:
# saving model to disk
# model.save_pretrained("bert_pretrained_no_augment.pt")

In [None]:
# loading model from disk
# model = BertForSequenceClassification.from_pretrained("bert_pretrained_no_augment.pt")

# With Augmentation

In [None]:
# read the augmented data and cast the message column to str in one step
df = pd.read_csv("updated_final_df.csv").astype({"message": str})
df

In [None]:
# run every message through clean_text and store the result in the same column
df['message'] = df['message'].map(clean_text)
df

In [None]:
# 80/10/10 split: hold out 20% of the rows, then halve the hold-out into
# validation and test
train_texts, rest_texts, train_labels, rest_labels = train_test_split(
    df["message"], df["topic_field"], test_size = 0.2, random_state = 42
)
val_texts, test_texts, val_labels, test_labels = train_test_split(
    rest_texts, rest_labels, test_size = 0.5, random_state = 42
)

# integer label encoding; the encoder is fitted on the training labels only
le = LabelEncoder()
le.fit(train_labels)
train_labels = le.transform(train_labels)
val_labels = le.transform(val_labels)
test_labels = le.transform(test_labels)

# tokenize and encode each split with identical settings
train_encodings, val_encodings, test_encodings = (
    tokenizer(split.tolist(), truncation = True, padding = True)
    for split in (train_texts, val_texts, test_texts)
)

# wrap encodings and labels in Dataset objects
train_dataset = MyDataset(train_encodings, train_labels)
val_dataset = MyDataset(val_encodings, val_labels)
test_dataset = MyDataset(test_encodings, test_labels)

In [None]:
# model, optimizer, and scheduler initialization
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels = len(le.classes_))
optimizer = AdamW(model.parameters(), lr = 5e-5)

# The schedule has to cover the optimizer steps actually taken. A partial
# final batch still costs one step, so an epoch is
# ceil(len(train_dataset) / batch_size) steps; flooring the total would bring
# the learning rate to zero before the last steps.
steps_per_epoch = (len(train_dataset) + batch_size - 1) // batch_size
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0, num_training_steps = steps_per_epoch * num_epochs)

# switching to device
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
model.train()

In [None]:
# setting up for training
train_loader = DataLoader(train_dataset, batch_size = batch_size, shuffle = True)


for epoch in range(num_epochs):
    # The evaluation at the end of the previous epoch can leave the model in
    # eval mode, so training mode is re-enabled at the start of every epoch.
    model.train()

    train_loss = 0
    epoch_preds = []
    # separate name: the global `train_labels` (encoded training labels) must
    # not be overwritten by the per-epoch bookkeeping
    epoch_labels = []

    # getting batches in train data
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # getting output and computing losses
        outputs = model(input_ids, attention_mask = attention_mask, labels = labels)
        loss = outputs.loss
        logits = outputs.logits

        # backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # advance the learning-rate schedule once per optimizer step;
        # without this call the scheduler has no effect
        scheduler.step()

        # train loss
        train_loss += loss.item()

        epoch_preds.extend(torch.argmax(logits, dim = 1).detach().cpu().tolist())
        epoch_labels.extend(labels.detach().cpu().tolist())

    # final accuracies on train and val set
    train_acc = accuracy_score(epoch_labels, epoch_preds)
    val_loss, val_acc = evaluate(model, val_texts, val_labels, tokenizer, device)

    print(f'Epoch {epoch + 1}, Train Loss: {train_loss / len(train_loader) : .3f}, Train Acc: {train_acc : .3f}, Val Acc: {val_acc : .3f}')

In [None]:
# inference mode for scoring
model.eval()

# evaluate() receives the test texts and labels and returns (loss, accuracy)
test_loss, test_acc = evaluate(
    model,
    test_texts,
    test_labels,
    tokenizer,
    device,
)
print(f"Test Loss: {test_loss : .4f}, Test Accuracy: {test_acc : .4f}")

In [None]:
# testing on the holdout data
# test_model() reads the globals `model`, `tokenizer` and `le`, so this call
# scores whatever model is currently bound to `model`; as the last expression
# of the cell, the returned dictionary is also displayed
test_model()

In [None]:
# classification report
# test_model() is executed again here, so the whole holdout file is predicted
# a second time before the report is printed
report_for_1k(test_model())

In [None]:
# Save the model to disk
# model.save_pretrained("bert_pretrained_augment.pt")

# Model Summary

In [None]:
import torchsummary

# getting number of samples per batch
# (only the first batch is inspected; the loop exits immediately via `break`)
for batch in train_loader:
    num_s = len(batch['labels'])
    break

# Two inputs of shape (num_s, tokenizer.max_len_single_sentence) are requested.
# NOTE(review): presumably they stand in for input_ids and attention_mask.
# torchsummary appears to build float dummy tensors, which an embedding
# lookup may reject - TODO confirm that this call runs for BERT.
torchsummary.summary(model, input_size = [(num_s, tokenizer.max_len_single_sentence), (num_s, tokenizer.max_len_single_sentence)])

# TO-DOs

In [None]:
# TO DO
# - Confusion Matrix
# - Classification Report

# - Discord Integration
# - Online learning

# Online Learning (TO-DO)

In [None]:
# device is resolved first so the saved weights can be mapped onto it
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# loading the learned model
model = BertForSequenceClassification.from_pretrained("bert_pretrained_augment.pt")

# loading state dictionary from the saved model
# TO-DO: load the latest weight
# map_location lets weights saved on a GPU load on a CPU-only machine
model.load_state_dict(torch.load('updated_model_weights_N.pt', map_location = device))
model.to(device)

# setting optimizer and scheduler (learning rate is multiplied by 0.1 after every epoch)
optimizer = torch.optim.Adam(model.parameters(), lr = 2e-5)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size = 1, gamma = 0.1)

# new data to load
new_data = []
new_labels = []

num_epochs = 3

if not new_data:
    # nothing to tokenize or train on; the saved weights stay untouched
    print("No new data available - skipping the online-learning update.")
else:
    # tokenizing the new data
    # return_tensors = 'pt' is required: TensorDataset only accepts tensors,
    # while the tokenizer returns plain Python lists by default
    new_encodings = tokenizer(new_data, truncation = True, padding = True, return_tensors = 'pt')

    # label conversion
    # TO DO: label encoding is necessary
    new_labels = torch.tensor(new_labels, dtype = torch.long)

    # torch readable data
    new_dataset = torch.utils.data.TensorDataset(new_encodings['input_ids'], new_encodings['attention_mask'], new_labels)
    new_loader = torch.utils.data.DataLoader(new_dataset, batch_size = 16, shuffle = True)

    # training the model on new data
    for epoch in range(num_epochs):
        model.train()
        for batch in new_loader:
            batch = tuple(t.to(device) for t in batch)
            inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}
            optimizer.zero_grad()
            outputs = model(**inputs)
            # the loss is the first element of the output when labels are passed
            loss = outputs[0]
            loss.backward()
            optimizer.step()
        scheduler.step()

    # saving the updated model weight
    torch.save(model.state_dict(), 'updated_model_weights_N-1.pth')

# TO-DO: better handle of saved model weights so that device don't get full
# Idea: Keep only latest version and latest - 1 version

# TO-DO: better handle of saved model weights so that device don't get full
# Idea: Keep only latest version and latest - 1 version

# Application (Idea)

In [None]:
from joblib import load

# Defining an empty function which will simulate online learning
# Currently, not doing it because I don't want to ruin my model weights with test

def push_for_online_learning():
    """Placeholder for the online-learning step; it does nothing yet (TO-DO)."""
    return None

# Model, tokenizer and label encoder are loaded once and reused by
# predict_text_label(); reloading BERT from disk for every query is very slow.
_PREDICTION_ARTIFACTS = {}


def _get_prediction_artifacts():
    """Load the saved model, tokenizer and label encoder on first use and return them."""
    if not _PREDICTION_ARTIFACTS:
        model = BertForSequenceClassification.from_pretrained("bert_pretrained_augment.pt")
        model.to(device)
        model.eval()
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case = True)
        # NOTE: joblib.load unpickles the file - only load encoders you saved yourself
        le = load('bert_pretrained_augment_le.joblib')
        # filled in one step, so a failed load never leaves a half-populated cache
        _PREDICTION_ARTIFACTS.update(model = model, tokenizer = tokenizer, le = le)
    return _PREDICTION_ARTIFACTS


def predict_text_label(query):
    """
    Predict the label of one text with the saved, augmented model.

    The model, tokenizer and label encoder are read from disk on the first
    call only; later calls reuse them, so a model saved afterwards is not
    picked up until the cache (_PREDICTION_ARTIFACTS) is cleared.

    query: raw text; it is passed through clean_text before encoding

    return: the label name given by the label encoder for the highest-scoring class
    """
    artifacts = _get_prediction_artifacts()
    model = artifacts["model"]
    tokenizer = artifacts["tokenizer"]
    le = artifacts["le"]

    # truncation keeps long inputs within the tokenizer's maximum length
    encoded_query = tokenizer.encode_plus(clean_text(query), add_special_tokens = True, truncation = True, return_tensors = 'pt')
    input_ids = encoded_query['input_ids'].to(device)
    attention_mask = encoded_query['attention_mask'].to(device)

    with torch.no_grad():
        output = model(input_ids, attention_mask = attention_mask)

    probs = output.logits.softmax(dim = 1).detach().cpu().numpy()
    label_index = np.argmax(probs)
    label_name = le.inverse_transform([label_index])[0]

    return label_name

# Storing predicted classes and new inputs
new_inputs = []
predicted_classes = []
saved_chat = {}

valid_switch = False
invalid_switch = False
invalid_switch_counter = 0

# (text, class) of user 2's message from the previous round; assigned at the
# end of every round and read from the second round on
ltuple = None

# The loop condition is 0, so the body never runs as written; replace it with
# True to run the interactive two-user demo.
while(0):
    u1 = input("User 1:\t")
    lu1 = predict_text_label(u1)
    print(f"Text: {u1}\tClass:\t{lu1}")

    # Saving data for user 1 as (text, class, feedback status)
    saved_chat.setdefault("u1", []).append((u1, lu1, "none"))
    predicted_classes.append(lu1)


    u2 = input("User 2:\t")
    lu2 = predict_text_label(u2)
    print(f"Text: {u2}\tClass:\t{lu2}")

    # Saving data for user 2 as (text, class, feedback status)
    saved_chat.setdefault("u2", []).append((u2, lu2, "none"))
    # user 2's own class is recorded here (appending lu1 a second time would
    # hide user 2's topic from every check on predicted_classes below)
    predicted_classes.append(lu2)


    # Storing them for online learning
    # We will perform online learning if we give a user a wrongful warning
    # saying that they are getting off topic!

    # Checking if we have some previous data or not.
    # If the topic switch is from generic to some other topic, that is a valid switch.
    # If the switch is invalid, we ask the user for feedback on whether they
    # really changed the topic.

    # At the start of the conversation (first round: two entries so far)
    if len(predicted_classes) <= 2:
        # The classes differ and it is a transition away from the "generic" topic
        if lu1 != lu2 and lu1 == "generic":
            valid_switch = True

    # They had at least one round of conversation before this one
    else:
        # Class of user 2's message in the PREVIOUS round. predicted_classes[-1]
        # already holds the current round, so the saved tuple is used instead.
        u2_last_class = ltuple[1]

        # For user 2, the last topic class is lu1
        u1_last_class = lu1

        # Did user 1 switch the topic at the beginning of this round?
        if lu1 != u2_last_class:

            # A switch away from generic is a valid switch
            if u2_last_class == "generic":
                valid_switch = True
                invalid_switch_counter = 0

            # Otherwise it is an invalid switch of topic
            else:
                invalid_switch = True
                valid_switch = False
                invalid_switch_counter += 1

        # Both classes match: no topic switch
        else:
            valid_switch = True
            invalid_switch_counter = 0


        # After an invalid switch, ask user 1 whether they really switched the topic
        if invalid_switch_counter >= 1:
            print(80 * "=")
            print("\t\tFeedback Section!")
            print(80 * "-")
            print("\t\tIt seems like you have switched your conversation topic!")
            print(80 * "-")
            print("\t\tWe detected (last users chat as) - ")
            print(f"\t\tPartners' Text:\t{ltuple[0]}")
            print(f"\t\tPartners' Text Class:\t{ltuple[1]}")
            print(70 * "-")
            print(f"\t\tYour Text:\t{u1}")
            print(f"\t\tYour Text Class:\t{lu1}")
            print(70 * "-")
            u1_input = input("\t\tHave you switched your topic in the mentioned class? ('yes'/'no')")

            # If the user disagrees, relabel their message with the partner's
            # CLASS (ltuple[1]); ltuple[0] is the partner's text, not a label
            if u1_input == "no":
                saved_chat['u1'][-1] = (u1, ltuple[1], "feedbacked")
            print(80 * "-")
            print("\t\tThank you for the feedback!")
            print(80 * "=")


        # Did user 2 switch the topic at the end of this round?
        if lu2 != u1_last_class:

            # A switch to generic is a valid switch
            if lu2 == "generic":
                valid_switch = True
                invalid_switch_counter = 0

            # Otherwise it is an invalid switch!
            else:
                invalid_switch = True
                valid_switch = False
                invalid_switch_counter += 1

        # Both classes match: no topic switch
        else:
            valid_switch = True
            invalid_switch_counter = 0

        if invalid_switch_counter >= 1:
            print(80 * "=")
            print("\t\tFeedback Section!")
            print(80 * "-")
            print("\t\tIt seems like you have switched your conversation topic!")
            print(80 * "-")
            print("\t\tWe detected (last users chat as) - ")
            print(f"\t\tPartners' Text:\t{u1}")
            print(f"\t\tPartners' Text Class:\t{lu1}")
            print(70 * "-")
            print(f"\t\tYour Text:\t{u2}")
            print(f"\t\tYour Text Class:\t{lu2}")
            print(70 * "-")
            u1_input = input("\t\tHave you switched your topic in the mentioned class? ('yes'/'no')")

            # If the user disagrees, relabel their message with the partner's class
            if u1_input == "no":
                saved_chat['u2'][-1] = (u2, lu1, "feedbacked")
            print(80 * "-")
            print("\t\tThank you for the feedback!")
            print(80 * "=")

    # If the consecutive invalid-switch counter reaches a certain level, warn
    # the users so that they stick to the main topic.

    # This could be set to 2 (1 or 2, using trial and error)
    # Also, not forcing this warning by checking the length of predicted class (can be a fixed number by trial and error)
    topic_switch_level = 1
    if invalid_switch_counter >= topic_switch_level and len(predicted_classes):
        print("Warning! Please stick to a topic!")

    # If the last 4 classified labels are all of the generic class, the users
    # should avoid talking on general issues and stick to the main issue

    # This window of 4 could be tuned by trial and error
    if len(predicted_classes) >= 4:
        tolerance_lavel_on_general_topic = 4

        last_set = set(predicted_classes[-tolerance_lavel_on_general_topic:])
        if last_set == {"generic"}:
            print("Warning! Avoid conversation on generic topic!")

    # remember user 2's message for the checks of the next round
    ltuple = (u2, lu2)