In [None]:
# Fine-tuning BERT (bert-base-uncased) to classify Decision from the Issues text

In [None]:
# Mount Google Drive at /content/drive (Colab only); the input paths used by
# this notebook point below this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install pandas torch transformers datasets tokenizers scikit-learn openpyxl

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from transformers import BertTokenizerFast
from tokenizers import BertWordPieceTokenizer

In [None]:
# Locations of the pre-split Issues workbooks on the mounted Drive.
INPUT_EXCEL_FILE1 = "/content/drive/MyDrive/worker_comp_work/WC_Final/research_ready_Issues_train.xlsx"  # training split
INPUT_EXCEL_FILE2 = "/content/drive/MyDrive/worker_comp_work/WC_Final/research_ready_Issues_test.xlsx"  # test split

In [None]:
# Load the training and testing workbooks for Issues into DataFrames.
# NOTE(review): pd.read_excel reads the first sheet unless sheet_name is given —
# confirm the data lives on the first sheet of each workbook.
df_train_issues = pd.read_excel(INPUT_EXCEL_FILE1)
df_test_issues = pd.read_excel(INPUT_EXCEL_FILE2)

In [None]:
# Column names of the training frame; later cells read 'Preprocesses_Issues' and 'Decision'.
df_train_issues.columns

In [None]:
# Dimensions of the training frame as (rows, columns).
df_train_issues.shape
# should be 4882 (presumably the row count — confirm)

In [None]:
# Dimensions of the test frame as (rows, columns).
df_test_issues.shape
# should be 1221 (presumably the row count — confirm)

In [None]:
# Check the unique values in the 'Decision' column; the classifier below is
# configured for two classes, so exactly two values are expected here.
df_train_issues['Decision'].unique()

In [None]:
# Check the distribution of the Decision variable in the training set as
# proportions (normalize=True returns fractions that sum to 1).
df_train_issues['Decision'].value_counts(normalize=True)
# should be about 63.25% / 36.75%

In [None]:
# Same class-proportion check for the test set, to compare against the training split.
df_test_issues['Decision'].value_counts(normalize=True)

In [None]:
# Pull the model inputs (preprocessed issue text) and the targets (Decision)
# out of each frame as plain Python lists.
train_t, train_l = (
    df_train_issues[column].tolist() for column in ('Preprocesses_Issues', 'Decision')
)
test_text, test_labels = (
    df_test_issues[column].tolist() for column in ('Preprocesses_Issues', 'Decision')
)

In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np

In [None]:
# Configuration
MODEL_NAME = 'bert-base-uncased'
MAX_LENGTH = 256  # Max sequence length for BERT (longer inputs are truncated)
BATCH_SIZE = 16
EPOCHS = 16
LEARNING_RATE = 2e-5
SEED = 42  # One seed shared by the data split and the NumPy / PyTorch RNGs

# Seed the RNGs before any model weights are initialised or any DataLoader is
# shuffled, so a Restart & Run All draws the same random numbers each time.
# NOTE(review): some GPU kernels are still non-deterministic, so metrics may
# differ slightly between runs even with a fixed seed.
np.random.seed(SEED)
torch.manual_seed(SEED)

# Check for GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


# Split training data again into training and validation sets.
# stratify keeps the Decision class proportions the same in both parts.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_t, train_l, test_size=0.2, random_state=SEED, stratify=train_l
)

# 2. Tokenization
# Load the tokenizer that belongs to the pretrained checkpoint named by MODEL_NAME,
# so token ids match the vocabulary the model was trained with.
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

def tokenize_data(texts, labels):
    """Tokenize texts and package them, with their labels, as BERT inputs.

    Uses the module-level ``tokenizer`` and ``MAX_LENGTH``. Every text is
    truncated or padded to exactly ``MAX_LENGTH`` tokens, including the
    special '[CLS]' and '[SEP]' tokens.

    Args:
        texts: Iterable of strings to encode.
        labels: Sequence of numeric class labels, one per text.

    Returns:
        Tuple ``(input_ids, attention_masks, labels)`` of PyTorch tensors.
        The first two have one row per text and ``MAX_LENGTH`` columns.
    """
    # One batched call replaces the per-text encode_plus loop + torch.cat, and
    # padding='max_length' replaces the deprecated pad_to_max_length=True flag.
    encoded = tokenizer(
        list(texts),
        add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
        max_length=MAX_LENGTH,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',      # Return PyTorch tensors
    )
    return encoded['input_ids'], encoded['attention_mask'], torch.tensor(labels)

# Encode the training and validation splits once, up front; the resulting
# tensors are wrapped in Dataset objects further down.
train_input_ids, train_attention_masks, train_labels_tensor = tokenize_data(train_texts, train_labels)
val_input_ids, val_attention_masks, val_labels_tensor = tokenize_data(val_texts, val_labels)

# 3. Create PyTorch Datasets and DataLoaders
class IssueDataset(Dataset):
    """Map-style dataset over pre-tokenized BERT inputs.

    Holds three parallel, index-aligned containers (token ids, attention
    masks and labels) and serves one example at a time as a dict whose keys
    match the keyword arguments used when calling the model.
    """

    def __init__(self, input_ids, attention_masks, labels):
        self.input_ids, self.attention_masks, self.labels = (
            input_ids,
            attention_masks,
            labels,
        )

    def __len__(self):
        # The number of examples is defined by the label container.
        return len(self.labels)

    def __getitem__(self, idx):
        sample = dict(
            input_ids=self.input_ids[idx],
            attention_mask=self.attention_masks[idx],
            labels=self.labels[idx],
        )
        return sample

# Wrap the encoded tensors so DataLoader can batch them.
train_dataset = IssueDataset(train_input_ids, train_attention_masks, train_labels_tensor)
val_dataset = IssueDataset(val_input_ids, val_attention_masks, val_labels_tensor)

# Only the training loader shuffles; the validation loader keeps dataset order.
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

# 4. Define and Initialize BERT Model
# BertForSequenceClassification has a linear layer on top for classification.
# The pretrained weights come from the checkpoint named by MODEL_NAME.
model = BertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,  # Binary classification (0 or 1)
    output_attentions=False,  # attention weights are not used by this notebook
    output_hidden_states=False,  # hidden states are not used by this notebook
)
model.to(device) # Move model to GPU if available

# 5. Optimizer and Scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, eps=1e-8)
# The scheduler is stepped once per batch in train_epoch, so the total number of
# scheduler steps is (batches per epoch) * (number of epochs).
total_steps = len(train_dataloader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0, # No warmup phase
    num_training_steps=total_steps
)

# 6. Training Loop
def train_epoch(model, data_loader, optimizer, device, scheduler):
    """Train the model for one full pass over ``data_loader``.

    Args:
        model: Model called as ``model(input_ids, attention_mask=..., labels=...)``
            whose output exposes a scalar ``.loss`` tensor.
        data_loader: Sized iterable of batches; each batch is a dict with
            'input_ids', 'attention_mask' and 'labels' tensors.
        optimizer: Optimizer that updates the model's parameters.
        device: Device the batch tensors are moved to before the forward pass.
        scheduler: Learning-rate scheduler, stepped once per batch.

    Returns:
        The mean training loss over all batches, as a Python float.

    Raises:
        ValueError: If ``data_loader`` contains no batches.
    """
    # Fail early with a clear message instead of a ZeroDivisionError at the end.
    if len(data_loader) == 0:
        raise ValueError("data_loader is empty; cannot train on zero batches")

    model.train()
    total_loss = 0.0
    for batch in data_loader:
        # Move batch to device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        # Forward pass; passing labels makes the model return the loss as well.
        outputs = model(input_ids,
                        attention_mask=attention_mask,
                        labels=labels)
        loss = outputs.loss

        total_loss += loss.item()
        loss.backward()  # Backward pass to calculate gradients
        # Clip gradients to prevent exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()  # Update weights
        scheduler.step()  # Update learning rate

    return total_loss / len(data_loader)

# 7. Evaluation Function
def eval_model(model, data_loader, device):
    """Score the model on every batch of ``data_loader`` without updating it.

    Args:
        model: Model called with input ids, attention mask and labels whose
            output exposes ``.loss`` and ``.logits``.
        data_loader: Sized iterable of batch dicts with 'input_ids',
            'attention_mask' and 'labels' tensors.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(avg_val_loss, accuracy, precision, recall, f1)``. Precision,
        recall and F1 use ``average='binary'`` and evaluate to 0 where they
        would otherwise be undefined (``zero_division=0``).
    """
    model.eval()  # evaluation mode

    loss_sum = 0
    predicted, expected = [], []

    # Gradients are not needed for scoring.
    with torch.no_grad():
        for batch in data_loader:
            gold = batch['labels'].to(device)
            result = model(
                batch['input_ids'].to(device),
                token_type_ids=None,
                attention_mask=batch['attention_mask'].to(device),
                labels=gold,
            )
            loss_sum += result.loss.item()

            # sklearn works on CPU arrays; the predicted class is the arg-max logit.
            scores = result.logits.detach().cpu().numpy()
            predicted.extend(np.argmax(scores, axis=1).flatten())
            expected.extend(gold.to('cpu').numpy().flatten())

    avg_val_loss = loss_sum / len(data_loader)
    accuracy = accuracy_score(expected, predicted)
    precision, recall, f1, _ = precision_recall_fscore_support(
        expected, predicted, average='binary', zero_division=0
    )
    return avg_val_loss, accuracy, precision, recall, f1

# --- Training and Evaluation ---
# Runs EPOCHS passes over the training loader and scores the validation loader
# after each pass. No checkpoint is kept here, so `model` ends up holding the
# weights from the final epoch.
print("\nStarting training...")
for epoch in range(EPOCHS):
    print(f"\n--- Epoch {epoch + 1}/{EPOCHS} ---")

    # One optimisation pass over the training data.
    avg_train_loss = train_epoch(model, train_dataloader, optimizer, device, scheduler)
    print(f"Average Training Loss: {avg_train_loss:.4f}")

    # Validation metrics for this epoch.
    avg_val_loss, accuracy, precision, recall, f1 = eval_model(model, val_dataloader, device)
    print(f"Validation Loss: {avg_val_loss:.4f}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")

print("\nTraining complete.")

# --- (Optional) Save the model and tokenizer ---
# output_dir = './bert_decision_classifier/'
# import os
# if not os.path.exists(output_dir):
#     os.makedirs(output_dir)
# print(f"Saving model to {output_dir}")
# model.save_pretrained(output_dir)
# tokenizer.save_pretrained(output_dir)

# --- (Optional) How to make predictions on new data ---
def predict(texts_to_predict, model, tokenizer, device, max_length=MAX_LENGTH, batch_size=32):
    """Predict a class index for each text in ``texts_to_predict``.

    Texts are encoded and run through the model in chunks of ``batch_size``
    rather than one at a time. Each text is truncated or padded to exactly
    ``max_length`` tokens.

    Args:
        texts_to_predict: Iterable of strings to classify.
        model: Model called as ``model(input_ids, attention_mask=...)`` whose
            output exposes ``.logits`` with one row per input text.
        tokenizer: Callable tokenizer that returns 'input_ids' and
            'attention_mask' tensors when asked for ``return_tensors='pt'``.
        device: Device the encoded tensors are moved to.
        max_length: Token length every text is padded or truncated to.
        batch_size: Number of texts per forward pass; must be positive.

    Returns:
        List with one predicted class index (NumPy integer) per input text,
        in input order. Empty input yields an empty list.
    """
    model.eval()

    texts = list(texts_to_predict)
    all_predictions = []

    for start in range(0, len(texts), batch_size):
        # padding='max_length' replaces the deprecated pad_to_max_length=True flag.
        encoded = tokenizer(
            texts[start:start + batch_size],
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        input_ids = encoded['input_ids'].to(device)
        attention_mask = encoded['attention_mask'].to(device)

        with torch.no_grad():
            logits = model(input_ids, attention_mask=attention_mask).logits

        # The predicted class is the index of the largest logit in each row.
        all_predictions.extend(torch.argmax(logits, dim=1).cpu().numpy())

    return all_predictions

# Example of predicting new issues:
# new_issues = [
#     "The website is down again.",
#     "Everything seems to be working fine today.",
#     "Need help with password reset."
# ]
# predictions = predict(new_issues, model, tokenizer, device)
# for issue, pred in zip(new_issues, predictions):
#     print(f"Issue: '{issue}' --> Predicted Decision: {pred}")

In [None]:
# Predict a class for every held-out test text with the trained model.
predictions = predict(test_text, model, tokenizer, device)

In [None]:
# create a code for calculating model performance metrics and deliver it as one single data frame
def model_metrics(pred, targets):
  """Summarise classification performance as a one-row DataFrame.

  Args:
    pred: Predicted class labels.
    targets: True class labels, aligned with ``pred``.

  Returns:
    DataFrame indexed by 'Metrics' with the columns Accuracy, Recall,
    Specificity, Precision, F1 and AUC, in that order.

  Note: AUC is computed from the hard class predictions passed in here,
  not from predicted probabilities.
  """
  scores = {
      'Accuracy': accuracy_score(targets, pred),
      'Recall': recall_score(targets, pred),
      'Specificity': specificity_score(targets, pred, average='binary'),
      'Precision': precision_score(targets, pred),
      'F1': f1_score(targets, pred),
      'AUC': roc_auc_score(targets, pred),
  }
  return pd.DataFrame(scores, index=['Metrics'])

In [None]:
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Calculate and Display confusion matrix
from imblearn.metrics import specificity_score
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    recall_score,
    precision_score,
    f1_score,
    roc_auc_score,
    confusion_matrix,
    ConfusionMatrixDisplay,
)

def display_confusion_matrix(pred, targets):
  """Plot the confusion matrix of ``pred`` against ``targets``.

  Args:
    pred: Predicted class labels.
    targets: True class labels, aligned with ``pred``.

  The matrix is drawn with tick labels 0 and 1 and a blue colour map, then
  shown immediately; nothing is returned.
  """
  matrix = confusion_matrix(targets, pred)
  ConfusionMatrixDisplay(confusion_matrix=matrix, display_labels=[0, 1]).plot(cmap=plt.cm.Blues)
  plt.title("Confusion Matrix")
  plt.show()

In [None]:
# Test-set metrics table (argument order: predictions first, then true labels).
model_metrics(predictions, test_labels)

In [None]:
# Test-set confusion matrix (argument order: predictions first, then true labels).
display_confusion_matrix(predictions, test_labels)