In [52]:
import pandas as pd
import numpy as np
import joblib
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import accelerate

In [53]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [54]:
processed_df = joblib.load("../../data/processed/processed_github_issues_df.joblib")

processed_df.head()

Unnamed: 0,issue_title,body,clean_text,priority
1,long strings of input on the command-line fail,long strings of input on the command-line caus...,long string input commandline fail long string...,High
2,raster simulation rendering,! lw4_raster_display-simulate_quirks https://c...,raster simulation rendering lw4 raster display...,Low
4,first reading from adc without adc start adc i...,"temperature.cpp , version 1.1.x and 2.0 in fuc...",first reading adc without adc start adc temper...,Medium
7,retrieve any bookings for a specific customer id,"use case as a concierge, i want to retrieve an...",retrieve booking specific customer id use case...,High
10,does not recognise the bucket name,i am using django-s3-storage==0.11.2 and boto3...,recognise bucket name using djangos3storage011...,High


In [55]:
processed_df = processed_df.drop(columns=["clean_text"])
processed_df.head()

Unnamed: 0,issue_title,body,priority
1,long strings of input on the command-line fail,long strings of input on the command-line caus...,High
2,raster simulation rendering,! lw4_raster_display-simulate_quirks https://c...,Low
4,first reading from adc without adc start adc i...,"temperature.cpp , version 1.1.x and 2.0 in fuc...",Medium
7,retrieve any bookings for a specific customer id,"use case as a concierge, i want to retrieve an...",High
10,does not recognise the bucket name,i am using django-s3-storage==0.11.2 and boto3...,High


In [56]:
# Merge issue_title and body (BERT input)
processed_df['text'] = processed_df['issue_title'] + " " + processed_df['body']
processed_df = processed_df[['text', 'priority']]
processed_df.head()

Unnamed: 0,text,priority
1,long strings of input on the command-line fail...,High
2,raster simulation rendering ! lw4_raster_displ...,Low
4,first reading from adc without adc start adc i...,Medium
7,retrieve any bookings for a specific customer ...,High
10,does not recognise the bucket name i am using ...,High


In [57]:
processed_df.shape

(38166, 2)

In [58]:
processed_df['priority'].value_counts()

priority
High        17555
Low          9866
Critical     5552
Medium       5193
Name: count, dtype: int64

In [97]:
# Use LabelEncoder to encode target labels with values 
le = LabelEncoder()
le.fit(["Low", "Medium", "High", "Critical"])
processed_df['label'] = le.fit_transform(processed_df['priority'])
processed_df.head()

Unnamed: 0,text,priority,label
1,long strings of input on the command-line fail...,High,1
2,raster simulation rendering ! lw4_raster_displ...,Low,2
4,first reading from adc without adc start adc i...,Medium,3
7,retrieve any bookings for a specific customer ...,High,1
10,does not recognise the bucket name i am using ...,High,1


In [60]:
joblib.dump(le, "../../data/processed/label_encoder.joblib")

['../../data/processed/label_encoder.joblib']

In [61]:
total_samples = processed_df.shape[0]
num_classes = 4
class_counts = processed_df['priority'].value_counts()
class_weights = total_samples / (num_classes * class_counts)
weights_tensor = torch.tensor(class_weights, dtype=torch.float).to(device)

  weights_tensor = torch.tensor(class_weights, dtype=torch.float).to(device)


In [62]:
train_df, test_df = train_test_split(
  processed_df,
  test_size=0.2,
  stratify=processed_df['label'],
  random_state=42
)

print("Train shape: ", train_df.shape)
print("Test shape: ", test_df.shape)

Train shape:  (30532, 3)
Test shape:  (7634, 3)


In [63]:
# Tokenize text column 
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

train_inputs = tokenizer(
  train_df["text"].tolist(),
  padding=True,
  truncation=True,
  max_length=256,
  return_tensors="pt"
)

test_inputs = tokenizer(
  test_df["text"].tolist(),
  padding=True,
  truncation=True,
  max_length=256,
  return_tensors="pt"
)

In [64]:
class BugDataset(torch.utils.data.Dataset):
  """
  PyTorch Dataset for BERT inputs.
  
  Args:
    inputs (dict): Tokenized input dictionary.
    labels (tensor or list): Encoded target labels for classification.
    
  Returns:
    a dictionary for each sample.
  """
  def __init__(self, inputs, labels):
    self.inputs = inputs
    self.labels = labels
    
  def __len__(self):
    return len(self.labels)
  
  def __getitem__(self, index):
    return {
      "input_ids": self.inputs["input_ids"][index],
      "attention_mask": self.inputs["attention_mask"][index],
      "labels": self.labels[index]
    }

In [65]:
labels_tensor_train = torch.tensor(train_df['label'].values, dtype=torch.long)
labels_tensor_test = torch.tensor(test_df['label'].values, dtype=torch.long)

# Create PyTorch Dataset objects
train_dataset = BugDataset(train_inputs, labels_tensor_train)
test_dataset = BugDataset(test_inputs, labels_tensor_test)

In [68]:
class WeightedBert(BertForSequenceClassification):
  def __init__(self, config, class_weights=None, **kwargs):
    super().__init__(config)
    self.class_weights = class_weights
  
  def forward(self, input_ids=None, attention_mask=None, labels=None, **kwargs):
    outputs = super().forward(input_ids=input_ids, attention_mask=attention_mask)
    logits = outputs.logits
    loss = None
    
    if labels is not None:
      loss_fct = nn.CrossEntropyLoss(weight=self.class_weights)
      loss = loss_fct(logits, labels)
      return {"loss": loss, "logits": logits}
    return {"logits": logits}

In [69]:
num_labels = processed_df['label'].nunique()
model = WeightedBert.from_pretrained(
  'bert-base-uncased',
  num_labels=num_labels,
  class_weights=weights_tensor
)

Some weights of WeightedBert were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [71]:
processed_df["text"].apply(lambda x: len(x.split()))

1         47
2        223
4        144
7        133
10       139
        ... 
89979     67
89985     84
89988     84
89990     42
89995    154
Name: text, Length: 38166, dtype: int64

In [72]:
(processed_df["text"].apply(lambda x: len(x.split())) > 256).sum()

np.int64(807)

In [73]:
processed_df["text"].apply(lambda x: len(x.split())).mean()

np.float64(100.33689671435309)

In [74]:
train_dataset.__len__()

30532

In [75]:
train_dataset.__getitem__(0)

{'input_ids': tensor([  101, 10386,  7561,  7592,  1010,  1045,  2001,  5604,  1996,  2223,
         17928,  1998,  2363,  1037, 10386,  1999,  2026,  3898,  1012,  1045,
          7039,  2012, 11421,  9797,  1998,  1996, 17928,  2001,  2428,  2091,
          1012,  1045,  2699,  2000,  4608,  1996,  9413, 14544,  2007,  2223,
         13088,  9331,  1032,  3433,  1032,  8299, 12740,  2487,  2021,  2134,
          1005,  1056,  2147,  1012,  2151, 15690,  1029,   102,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,   

In [76]:
test_dataset.__len__()

7634

In [77]:
test_dataset.__getitem__(0)

{'input_ids': tensor([  101,  6187,  8450,  3674,  5746,  6764,  7632,  3071,  1010,  3603,
         12849,  2884,  2096,  2559,  2005,  1037,  4942, 18585,  4522,  1012,
          2293,  1996, 21318,  1998,  2031,  2209,  1037,  2978,  2007,  1996,
          4684,  8278,  2364,  2224,  1045,  1005,  2222,  2031,  1998,  2009,
          3849,  2008,  2069,  1015,  2299,  2003, 17053,  2094,  2013,  1996,
          2085,  2652,  2862,  2012,  1037,  2051,  1012,  2383,  1996,  2878,
          2862, 17053,  2094,  2007,  2005,  6013,  1037,  5310,  1011,  2275,
          5787,  2006, 17053,  2946,  2052,  2022,  2307,  2005,  2216,  2146,
          6005,  9109,  1024,  1012,  4283,  1037,  2843,   999,   102,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,   

In [78]:
# Define training arguments
args = TrainingArguments(
  output_dir="../models/",
  per_device_train_batch_size=8,
  per_device_eval_batch_size=8,
  learning_rate=3e-5,
  num_train_epochs=4.0,
  eval_strategy="epoch",
  save_strategy="epoch",
  logging_dir="../models/bert/logs/",
  logging_steps=50,
  weight_decay=0.01
)

In [79]:
def evaluate_metric(prediction_output):
  logits = prediction_output.predictions
  true_labels = prediction_output.label_ids
  
  predicted_labels = np.argmax(logits, axis=1)
  
  accuracy = accuracy_score(true_labels, predicted_labels)
  f1_weighted = f1_score(true_labels, predicted_labels, average="weighted")
  
  return {
    "accuracy": accuracy,
    "f1_weighted": f1_weighted
  }

In [80]:
trainer = Trainer(
  model=model,
  train_dataset=train_dataset,
  eval_dataset=test_dataset,
  args=args,
  compute_metrics=evaluate_metric
)

In [81]:
next(model.parameters()).device

device(type='cuda', index=0)

In [82]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1 Weighted
1,0.1696,0.169119,0.960178,0.960133
2,0.0987,0.188315,0.962012,0.961927
3,0.0356,0.192983,0.96306,0.962945
4,0.0576,0.211602,0.964239,0.964162


TrainOutput(global_step=15268, training_loss=0.15429682486620067, metrics={'train_runtime': 7325.8891, 'train_samples_per_second': 16.671, 'train_steps_per_second': 2.084, 'total_flos': 1.6066901995782144e+16, 'train_loss': 0.15429682486620067, 'epoch': 4.0})

In [None]:
trainer.evaluate()

{'eval_loss': 0.211602121591568,
 'eval_accuracy': 0.9642389310977207,
 'eval_f1_weighted': 0.9641618390039496,
 'eval_runtime': 123.4503,
 'eval_samples_per_second': 61.839,
 'eval_steps_per_second': 7.736,
 'epoch': 4.0}

max_iter=128<br>
learning_rate=5e-5<br>
num_train_epochs=4.0<br>

result:<br>
{'eval_loss': 0.5069653987884521,<br>
 'eval_accuracy': 0.8882089903506707,<br>
 'eval_f1_weighted': 0.887788724595662,<br>
 'eval_runtime': 32.0354,<br>
 'eval_samples_per_second': 132.634,<br>
 'eval_steps_per_second': 16.607,<br>
 'epoch': 4.0}<br>

In [84]:
trainer.save_model("../models/bert/trainer")
tokenizer.save_pretrained("../models/bert/tokenizer")

('../models/bert/tokenizer\\tokenizer_config.json',
 '../models/bert/tokenizer\\special_tokens_map.json',
 '../models/bert/tokenizer\\vocab.txt',
 '../models/bert/tokenizer\\added_tokens.json')

In [None]:
model = BertForSequenceClassification.from_pretrained("../models/bert/trainer")
tokenizer = BertTokenizer.from_pretrained("../models/bert/tokenizer")

In [113]:
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [141]:
# Test text
bug_title = "Payment processing fails for all transactions"
bug_description = "Users cannot complete any payments. Transactions are rejected with a server error 500 immediately after submitting payment info."
bug_text = [bug_title + " " + bug_description]

tokenized_input = tokenizer(
  bug_text,
  padding=True,
  truncation=True,
  max_length=128,
  return_tensors="pt"
)

logits = model(**tokenized_input).logits
probabilities = torch.nn.functional.softmax(logits, dim=-1)
predicted_index = probabilities.argmax(dim=-1).item()
predicted_priority = le.inverse_transform([predicted_index])

class_probs = {label: float(prob) for label, prob in zip(le.classes_, probabilities[0])}
class_probs

{'Critical': 0.998690664768219,
 'High': 0.0007220040424726903,
 'Low': 0.0003455271653365344,
 'Medium': 0.00024173948622774333}