# Classification of Bug and Enhancement Reports with RoBERTa

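This notebook demonstrates how to evaluate a saved RoBERTa model for bug and enhancement report classification, month by month, using the Hugging Face `transformers` framework, and how to measure corpus similarity and data drift over the same period.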

## Setup

In [None]:
%pip install pandas
%pip install numpy
%pip install matplotlib
%pip install torch torchvision
%pip install transformers
%pip install scikit-learn
%pip install accelerate
%pip install imbalanced-learn
%pip install sentence-transformers
%pip install alibi-detect

## Importing Libraries

Let's start by importing all the libraries needed for our project.

In [1]:
import os
import zipfile
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import torch
import yaml
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from sentence_transformers import SentenceTransformer, util
from alibi_detect.cd import KSDrift
from alibi_detect.cd.tensorflow import UAE
from imblearn.under_sampling import RandomUnderSampler
from dateutil.relativedelta import relativedelta

## Choice of Dataset and Parameters
Write the parameters in the config.yaml file:
- `dataset_path`: defines the path of the dataset in CSV format
- `start_year_train`: defines the start year of the train dataset
- `end_year_train`: defines the end year of the train dataset
- `undersampling_flag`: defines with a boolean whether to perform undersampling
- `start_year`: defines the start year for the test dataset
- `start_month`: defines the start month for the test dataset
- `end_year`: defines the end year for the test dataset
- `end_month`: defines the end month for the test dataset

*NB*: 
- `start_year_train` and `end_year_train` are the range of years that make up the train dataset
- `start_year`/`start_month` and `end_year`/`end_month` define the two dates that bound the period over which testing, similarity measurement and drift detection are run.
Each phase analyzes that period month by month.

In [None]:
# Location of the YAML file that holds the dataset path and the train/test date parameters
config_path = "./config.yaml"

In [None]:
# Read the run parameters from the YAML configuration file
with open(config_path, 'r', encoding='utf-8') as file:
    config = yaml.safe_load(file)

# Fail early with a readable message instead of a bare TypeError/KeyError further down:
# safe_load returns None for an empty file, and a typo in a key would otherwise
# only surface at the first lookup.
required_keys = [
    'dataset_path',
    'start_year_train',
    'end_year_train',
    'undersampling_flag',
    'start_year',
    'end_year',
    'start_month',
    'end_month',
]
if not isinstance(config, dict):
    raise ValueError(f"{config_path} must contain a mapping of parameters")
missing_keys = [key for key in required_keys if key not in config]
if missing_keys:
    raise KeyError(f"Missing parameters in {config_path}: {', '.join(missing_keys)}")

dataset_path = config['dataset_path']
start_year_train = config['start_year_train']
end_year_train = config['end_year_train']
undersampling_flag = config['undersampling_flag']
start_year = config['start_year']
end_year = config['end_year']
start_month = config['start_month']
end_month = config['end_month']

## Setup Directories

In [3]:
# Extracting the dataset name from the dataset_path (file name up to its first dot)
dataset_name = os.path.basename(dataset_path).split('.')[0]

# Defining the results and model save paths using the dataset name
results_path = f'./RESULTS/{dataset_name}'
model_save_path = os.path.join(results_path, "model")

# exist_ok=True replaces the check-then-create pattern (no race, safe to re-run);
# creating the nested model directory also creates results_path.
os.makedirs(results_path, exist_ok=True)
os.makedirs(model_save_path, exist_ok=True)

## Data Preparation

We load the data from the CSV file and prepare it for evaluation.

In [None]:
# Load the dataset
df_all = pd.read_csv(dataset_path)
# Dates that do not match the expected format become NaT and are excluded by the date filters below
df_all['date'] = pd.to_datetime(df_all['date'], errors='coerce', format='%Y-%m-%dT%H:%M:%S.%f+0000')
df_all = df_all[df_all['label'].isin(['Bug', 'Enhancement'])]

# Filter data for training and validation (start_year_train - end_year_train).
# .copy() makes this an independent frame, so adding columns below does not
# write into a slice of df_all (SettingWithCopyWarning).
issue_year = df_all['date'].dt.year
df_train_val = df_all[(issue_year >= start_year_train) & (issue_year <= end_year_train)].copy()

# A missing title or body would turn the whole concatenation into NaN
# (and later into the literal string "nan"), so missing parts become empty strings.
df_train_val['text'] = df_train_val['title'].fillna('') + " " + df_train_val['body'].fillna('')
# Binary target: 1 = Enhancement, 0 = Bug
df_train_val['labels'] = (df_train_val['label'] == 'Enhancement').astype('int64')

## Data Division in Train and Validation
We split the data into train (70%) and validation (30%) sets.

In [None]:
# 70/30 split, stratified on the label column and seeded so the split is reproducible
train_df, validation_df = train_test_split(
    df_train_val,
    test_size=0.3,
    random_state=42,
    stratify=df_train_val['labels'],
)

## Undersampling

When `undersampling_flag` is set, this code initializes a `RandomUnderSampler`, fits it to the training data, and replaces `train_df` with the undersampled data.
The rest of the notebook then uses the resampled `train_df`.

In [None]:
if undersampling_flag:
    # Seeded sampler so the resampled training set is reproducible
    rus = RandomUnderSampler(random_state=42)

    # fit_resample takes the features and the target separately
    x_train_resampled, y_train_resampled = rus.fit_resample(
        train_df.drop(columns='labels'),
        train_df['labels'],
    )

    # Put features and target back side by side as the new training frame
    train_df = pd.concat([x_train_resampled, y_train_resampled], axis=1)

## Creation of the Dataset

We define a `CustomDataset` class that tokenizes the texts and prepares them as inputs for the RoBERTa model.

In [None]:
class CustomDataset(Dataset):
    """Torch dataset that tokenizes one text per item for sequence classification.

    Args:
        texts: Indexable collection of raw texts; entries may be missing (NaN/None).
        labels: Indexable collection of integer class labels aligned with ``texts``.
        tokenizer: Tokenizer object exposing ``encode_plus``.
        max_token_len: Length every encoded sequence is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_token_len=512):
        self.tokenizer = tokenizer
        self.texts = texts
        self.labels = labels
        self.max_token_len = max_token_len

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the encoded text and its label as a dict of tensors.

        Returns:
            dict: ``input_ids`` and ``attention_mask`` (flattened to 1-D) and
            ``labels`` (a ``torch.long`` scalar tensor).
        """
        raw_text = self.texts[idx]
        # Test for a missing value BEFORE converting to str: str(nan) is the literal
        # "nan", which pd.isna no longer recognizes as missing.
        text = "" if pd.isna(raw_text) else str(raw_text)
        labels = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_token_len,
            return_token_type_ids=False,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            # return_tensors='pt' adds a leading batch axis; flatten() drops it
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(labels, dtype=torch.long)
        }

## Load Model and Tokenizer

In [None]:
# Restore the saved tokenizer and classifier from the model directory
tokenizer = AutoTokenizer.from_pretrained(model_save_path)
model = AutoModelForSequenceClassification.from_pretrained(model_save_path)

# Wrap the train and validation frames in tokenizing datasets (texts and labels as NumPy arrays)
train_dataset, validation_dataset = (
    CustomDataset(frame['text'].to_numpy(), frame['labels'].to_numpy(), tokenizer)
    for frame in (train_df, validation_df)
)

## Trainer

In [None]:
def compute_metrics(pred):
    """Build a scikit-learn classification report from a prediction object.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores; the arg-max over the last axis is taken as the
            predicted class).

    Returns:
        dict: The ``classification_report(..., output_dict=True)`` result.
    """
    true_labels = pred.label_ids
    predicted_labels = pred.predictions.argmax(-1)
    return classification_report(true_labels, predicted_labels, output_dict=True)
    
# Evaluation-only configuration: the Trainer is used for prediction, never for fitting
training_args = TrainingArguments(
    do_train=False,                  # No training is started from this notebook
    do_eval=True,                    # Evaluation is enabled
    per_device_train_batch_size=32,  # Training batch size per device
    per_device_eval_batch_size=32,   # Evaluation batch size per device
    output_dir=results_path,         # Directory the Trainer writes its outputs to
)

trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=validation_dataset,
)

## Date Preparation

In [None]:
# Test period starts on the first day of start_year/start_month
start_date = pd.Timestamp(year=start_year, month=start_month, day=1)

# ...and ends on the last day of end_year/end_month:
# jump to the first day of the following month, then step back one day
first_day_of_end_month = pd.Timestamp(year=end_year, month=end_month, day=1)
end_date = first_day_of_end_month + relativedelta(months=1) - relativedelta(days=1)

## Test Month by Month
To test the model on data from subsequent months, one at a time, we load the data for each month and evaluate the model on them.

In [None]:
def evaluate_model_for_month(model, tokenizer, year, month, df):
    """Evaluate the classifier on the issues dated within one calendar month.

    Args:
        model: Not used directly; prediction goes through the notebook-level
            ``trainer``. Kept so existing callers keep working.
        tokenizer: Tokenizer handed to ``CustomDataset``.
        year: Calendar year of the month to evaluate.
        month: Calendar month (1-12) to evaluate.
        df: DataFrame with ``date``, ``title``, ``body`` and ``label`` columns.
            It is not modified.

    Returns:
        dict | None: The ``compute_metrics`` report for that month, or ``None``
        when ``df`` has no rows in it.
    """
    month_start = pd.Timestamp(year=year, month=month, day=1)
    month_end = month_start + pd.DateOffset(months=1)  # exclusive upper bound

    in_month = (df['date'] >= month_start) & (df['date'] < month_end)
    # .copy() so the columns added below never write into a slice of the caller's frame
    test_df = df[in_month].copy()
    if test_df.empty:
        print(f"No data for {year}-{month}")
        return None

    # A missing title or body would make the whole text NaN (the title would be lost
    # and the model would see the string "nan"), so missing parts become empty strings.
    test_df['text'] = test_df['title'].fillna('') + " " + test_df['body'].fillna('')
    # Binary target: 1 = Enhancement, 0 = Bug
    test_df['labels'] = (test_df['label'] == 'Enhancement').astype('int64')

    test_dataset = CustomDataset(test_df['text'].to_numpy(), test_df['labels'].to_numpy(), tokenizer)
    predictions = trainer.predict(test_dataset)
    return compute_metrics(predictions)

# Reports keyed by "YYYY-MM"; months that return no metrics are left out
results_by_month = {}

# One period per calendar month from start_date's month to end_date's month, inclusive
for period in pd.period_range(start=start_date, end=end_date, freq='M'):
    year, month = period.year, period.month
    print(f"Testing for {year}-{month:02d}...")
    month_metrics = evaluate_model_for_month(model, tokenizer, year, month, df_all)
    if not month_metrics:
        continue
    results_by_month[f"{year}-{month:02d}"] = month_metrics

### Save Results in CSV

In [None]:
# Convert test results to a two-column DataFrame: the month and its classification report.
# (from_dict(orient='index') expands every report key into its own column, so renaming
# the columns to ['Month', 'Metrics'] raised a length-mismatch ValueError.)
test_results_df = pd.DataFrame(list(results_by_month.items()), columns=['Month', 'Metrics'])
test_results_file_path = f"{results_path}/test_results_monthly.csv"
test_results_df.to_csv(test_results_file_path, index=False)
print(f"Test results saved to {test_results_file_path}")

## Definition of methods for generating plots
We define a parametric method for generating plots with respect to the desired metrics

In [33]:
def plot_metrics(results, metrics, title, start_ylim=None, end_ylim=None):
    """Plot overall metrics over time, once with half-yearly and once with yearly ticks.

    Each figure is saved under ``results_path`` and then shown. The two figures
    get distinct file names (``..._every_6_months.png`` / ``..._every_12_months.png``)
    so the yearly plot no longer overwrites the half-yearly one.

    Args:
        results: Mapping ``"YYYY-MM"`` -> classification-report dict.
        metrics: Metric names to draw. ``'accuracy'`` is read from the top level
            of the report, any other name from its ``'macro avg'`` entry.
        title: Plot title; also used (lower-cased, spaces -> underscores) in the file name.
        start_ylim: Lower y-axis limit.
        end_ylim: Upper y-axis limit. Limits apply only when both are given and
            ``start_ylim < end_ylim``.
    """
    periods = sorted(results.keys())
    if not periods:
        print(f"No results to plot for '{title}'")
        return

    def metric_values(metric):
        # Accuracy is a top-level scalar in the report; other metrics are macro averages
        if metric == 'accuracy':
            return [results[period][metric] for period in periods]
        return [results[period]['macro avg'][metric] for period in periods]

    def plot_with_ticks(tick_labels, months_per_tick):
        # Same curves in both figures; only the labelled x ticks and the file name differ
        plt.figure(figsize=(10, 6))

        if start_ylim is not None and end_ylim is not None and start_ylim < end_ylim:
            plt.ylim(start_ylim, end_ylim)

        for metric in metrics:
            plt.plot(periods, metric_values(metric), label=metric.capitalize(), marker='o')

        plt.title(f"{title} (Every {months_per_tick} months)")
        plt.xlabel('Year-Month')
        plt.ylabel('Score')
        plt.legend()
        plt.grid(True)
        plt.xticks(tick_labels, rotation=90)
        plt.tight_layout()

        file_name = f"{results_path}/{title.replace(' ', '_').lower()}_every_{months_per_tick}_months.png"
        plt.savefig(file_name)
        print(f"Plot saved: {file_name}")

        plt.show()

    # Every Six Months
    plot_with_ticks(periods[::6], 6)
    # Every Year (ticks on each January)
    plot_with_ticks([period for period in periods if period.endswith('-01')], 12)
    
def plot_class_metrics(results, classes, metrics, title_prefix, start_ylim=None, end_ylim=None):
    """Plot per-class metrics over time, once with half-yearly and once with yearly ticks.

    One figure is produced per class and per tick cadence; each is saved under
    ``results_path`` and then shown.

    Args:
        results: Mapping ``"YYYY-MM"`` -> classification-report dict, where class
            ``i`` of ``classes`` is stored under the key ``str(i)``.
        classes: Class display names, in label-index order.
        metrics: Metric names to read from each class entry.
        title_prefix: Leading part of the plot title; also used in the file name.
        start_ylim: Lower y-axis limit.
        end_ylim: Upper y-axis limit. Limits apply only when both are given.
    """
    periods = sorted(results.keys())
    if not periods:
        print(f"No results to plot for '{title_prefix}'")
        return

    def plot_with_ticks(tick_labels, title_suffix, file_suffix):
        for index, class_name in enumerate(classes):
            class_key = str(index)
            plt.figure(figsize=(10, 6))

            for metric in metrics:
                # Keep one value per period so x and y always have the same length;
                # months whose report lacks this class show up as gaps (NaN).
                metric_values = [
                    results[period][class_key][metric] if class_key in results[period] else float('nan')
                    for period in periods
                ]
                plt.plot(periods, metric_values, label=f'{metric} ({class_name})', marker='o')

            plt.title(f'{title_prefix} for {class_name} {title_suffix}')
            plt.xlabel('Year-Month')
            plt.ylabel('Score')
            plt.legend()
            plt.grid(True)
            plt.xticks(tick_labels, rotation=90)
            if start_ylim is not None and end_ylim is not None:
                plt.ylim(start_ylim, end_ylim)
            plt.tight_layout()

            # Save the plot to file; the cadence suffix keeps the yearly figure
            # from overwriting the half-yearly one
            file_name = f"{results_path}/{title_prefix.replace(' ', '_').lower()}_{class_name.lower()}_{file_suffix}.png"
            plt.savefig(file_name)
            print(f"Plot saved: {file_name}")

            plt.show()

    # Every Six Months
    plot_with_ticks(periods[::6], 'Every 6 Months', 'every_6_months')
    # Every Year (ticks on each January)
    plot_with_ticks([period for period in periods if period.endswith('-01')], 'Every Year', 'every_12_months')

## Plot Printing [1]
Precision, Recall, F1-Score, Accuracy

In [None]:
# Print Plot: Precision and Recall by Month (ylim: 0-1)
plot_metrics(results_by_month, ['precision', 'recall'], 'Precision and Recall by Month', 0, 1)
# Print Plot: Precision and Recall by Month (ylim: None-None)
# The file name is derived from the title, so the autoscaled version gets its own
# title; otherwise it would overwrite the PNGs saved by the call above.
plot_metrics(results_by_month, ['precision', 'recall'], 'Precision and Recall by Month Autoscaled')

In [None]:
# Print Plot: F1 Score and Accuracy by Month (ylim: 0-1)
plot_metrics(results_by_month, ['f1-score', 'accuracy'], 'F1 Score and Accuracy by Month', 0, 1)
# Print Plot: F1 Score and Accuracy by Month (ylim: None-None)
# The file name is derived from the title, so the autoscaled version gets its own
# title; otherwise it would overwrite the PNGs saved by the call above.
plot_metrics(results_by_month, ['f1-score', 'accuracy'], 'F1 Score and Accuracy by Month Autoscaled')

## Plot Printing [2]
Metrics by class

In [None]:
# Class names in label-index order (0 = Bug, 1 = Enhancement)
classes = ['Bug', 'Enhancement']
metrics = ['precision', 'recall']
# Print Plot: Class Metrics for each class (ylim: 0-1)
plot_class_metrics(results_by_month, classes, metrics, 'Class Metrics', 0, 1)
# Print Plot: Class Metrics for each class (ylim: None-None)
# The file name is derived from the title prefix, so the autoscaled version gets its
# own prefix; otherwise it would overwrite the PNGs saved by the call above.
plot_class_metrics(results_by_month, classes, metrics, 'Class Metrics Autoscaled')

## Load Sentence-Transformer Model

In [None]:
# Sentence-embedding model used to encode issue texts in the similarity and drift sections
model_l6 = SentenceTransformer("all-MiniLM-L6-v2")

## Measure of Corpus Variability
It measures how similar the embeddings (vector representations of text) of two data sets (for example, the training set and the test set for a given month) are to each other. A higher value indicates greater similarity, which may suggest that the two datasets have a similar linguistic distribution or cover related topics.

In [None]:
# model_l6 is already loaded in the "Load Sentence-Transformer Model" cell above

# Average cosine similarity per month, plus the matching "YYYY-MM" labels.
# Both lists are filled in the same loop so they always stay aligned.
monthly_similarity = []
monthly_periods = []

# The training embeddings do not depend on the month: encode them once, outside the loop.
# Handle NaN with empty strings (without mutating train_df)
train_texts = train_df['title'].fillna('') + " " + train_df['body'].fillna('')
train_embeddings = model_l6.encode(train_texts.tolist(), convert_to_tensor=True, show_progress_bar=True)

current_date = start_date
while current_date <= end_date:
    year = current_date.year
    month = current_date.month
    current_date += relativedelta(months=1)
    monthly_periods.append(f"{year}-{month:02d}")

    test_df = df_all[(df_all['date'].dt.year == year) & (df_all['date'].dt.month == month)]
    if test_df.empty:
        print(f"No data for {year}-{month}")
        # Placeholder keeps the list aligned with monthly_periods
        monthly_similarity.append(None)
        continue

    # Prepare the texts; fillna returns new Series, so the slice of df_all is never written to
    test_texts = test_df['title'].fillna('') + " " + test_df['body'].fillna('')

    # Compute embeddings
    test_embeddings = model_l6.encode(test_texts.tolist(), convert_to_tensor=True, show_progress_bar=True)

    # Calculate cosine similarity between every train/test pair and average it
    similarity_matrix = util.pytorch_cos_sim(train_embeddings, test_embeddings)
    monthly_similarity.append(similarity_matrix.mean().item())

### Save Results in CSV

In [None]:
# Pair each month label with its average similarity and write the table to CSV
cosine_similarity_df = pd.DataFrame(
    data={'Month': monthly_periods, 'Cosine Similarity': monthly_similarity},
)
cosine_similarity_file_path = f"{results_path}/cosine_similarity_monthly.csv"
cosine_similarity_df.to_csv(cosine_similarity_file_path, index=False)
print(f"Cosine similarity data saved to {cosine_similarity_file_path}")

### Plot Cosine Similarity Labeled Every 6 Months

In [None]:
# Plot the average cosine similarity by month, labelling every sixth month on the x axis
plt.figure(figsize=(10, 6))
plt.plot(monthly_periods, monthly_similarity, marker='o')
plt.title('Average Cosine Similarity per Month')
plt.xlabel('Month')
plt.ylabel('Average Cosine Similarity')
plt.grid(True)
plt.xticks(monthly_periods[::6], rotation=90)
# Keep the rotated tick labels inside the saved image, as the metric plots do
plt.tight_layout()
# Save the plot to a file
plot_file_name = f"{results_path}/cosine_similarity_month.png"
plt.savefig(plot_file_name)
print(f"Plot saved: {plot_file_name}")
plt.show()

### Plot Cosine Similarity Labeled Every Year

In [None]:
# Plot the average cosine similarity by month, labelling only each January on the x axis
plt.figure(figsize=(10, 6))
plt.plot(monthly_periods, monthly_similarity, marker='o')
plt.title('Average Cosine Similarity per Month')
plt.xlabel('Month')
plt.ylabel('Average Cosine Similarity')
plt.grid(True)
annual_ticks = [period for period in monthly_periods if period.endswith('-01')]
plt.xticks(annual_ticks, rotation=90)
# Keep the rotated tick labels inside the saved image, as the metric plots do
plt.tight_layout()
# Save the plot to its own file so it does not overwrite the six-month-tick plot
plot_file_name = f"{results_path}/cosine_similarity_year.png"
plt.savefig(plot_file_name)
print(f"Plot saved: {plot_file_name}")
plt.show()

## Drift Detection
We'll use the alibi-detect library (`KSDrift`) to detect drift between the sentence-transformer embeddings of the training texts and those of each test month.

In [None]:
def calculate_embeddings(texts):
    """Encode texts with the notebook-level sentence-transformer ``model_l6``.

    Args:
        texts: Texts to encode, passed straight to ``model_l6.encode``.

    Returns:
        The embeddings returned by ``model_l6.encode`` (batch size 32, progress bar shown).
    """
    embeddings = model_l6.encode(texts, batch_size=32, show_progress_bar=True)
    return embeddings

# Embeddings for the training data (Handle NaN with empty strings)
train_texts = train_df['title'].fillna('') + " " + train_df['body'].fillna('')
train_embeddings = calculate_embeddings(train_texts.tolist())

# Initialize KSDrift detector with the training embeddings as reference data.
# The detector is applied to the raw embeddings; no preprocessing step is configured.
ks_drift = KSDrift(train_embeddings, p_val=0.05)

# Dictionary to store drift detection results, keyed by "YYYY-MM"
drift_results = {}

current_date = start_date
while current_date <= end_date:
    year = current_date.year
    month = current_date.month
    test_df = df_all[(df_all['date'].dt.year == year) & (df_all['date'].dt.month == month)]

    # Advance exactly once per iteration, before any `continue`
    # (advancing again for empty months would skip the month that follows them)
    current_date += relativedelta(months=1)

    if test_df.empty:
        print(f"No data for {year}-{month}")
        continue

    test_texts = test_df['title'].fillna('') + " " + test_df['body'].fillna('')
    test_embeddings = calculate_embeddings(test_texts.tolist())

    preds = ks_drift.predict(test_embeddings)
    # NOTE(review): 'p_val' and 'distance' look like per-feature arrays rather than
    # scalars; confirm this is the shape wanted in the CSV.
    drift_results[f"{year}-{month:02d}"] = {
        'data_drift': preds['data']['is_drift'],
        'p_value': preds['data']['p_val'],
        'd_statistic': preds['data']['distance']
    }

# Convert the drift results dictionary to a DataFrame, one row per period.
# Year and Month are split here (as strings) rather than with .str.split(expand=True),
# which fails when there are no results at all.
drift_rows = [
    {
        'Period': period,
        'data_drift': result['data_drift'],
        'p_value': result['p_value'],
        'd_statistic': result['d_statistic'],
        'Year': period.split('-')[0],
        'Month': period.split('-')[1],
    }
    for period, result in drift_results.items()
]
drift_results_df = pd.DataFrame(
    drift_rows,
    columns=['Period', 'data_drift', 'p_value', 'd_statistic', 'Year', 'Month'],
)

# Sort the DataFrame by year and month (zero-padded strings sort chronologically)
drift_results_df.sort_values(by=['Year', 'Month'], inplace=True)
drift_results_df.reset_index(drop=True, inplace=True)

# Print the drift detection results
print(drift_results_df[['Year', 'Month', 'data_drift', 'p_value', 'd_statistic']])

# Save the drift detection results to a CSV file
drift_results_csv_path = f"{results_path}/drift_detection_results_monthly.csv"
drift_results_df[['Year', 'Month', 'data_drift', 'p_value', 'd_statistic']].to_csv(drift_results_csv_path, index=False)
print(f"Drift detection results saved to {drift_results_csv_path}")

## Archiving of Results

We compress and save the results, excluding checkpoints.

In [None]:
def zip_results(results_dir=results_path, zip_name='results.zip'):
    """Write every non-checkpoint file under ``results_dir`` into a zip archive.

    Args:
        results_dir: Directory tree to archive. The default is the value of
            ``results_path`` at the time this function is defined.
        zip_name: Path of the zip file to create (overwritten if it exists).

    Archive entries are stored relative to the parent of ``results_dir``, so
    they keep the results directory name as their first path component.
    """
    archive_root = os.path.join(results_dir, '..')
    with zipfile.ZipFile(zip_name, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for current_dir, subdirs, filenames in os.walk(results_dir):
            # Prune in place so os.walk never descends into checkpoint directories
            subdirs[:] = [name for name in subdirs if 'checkpoint' not in name]
            # Also skip files when the directory path itself mentions a checkpoint
            if 'checkpoint' in current_dir:
                continue
            for filename in filenames:
                source_path = os.path.join(current_dir, filename)
                zipf.write(source_path, os.path.relpath(source_path, start=archive_root))
    print(f"Results archived in {zip_name}")

# Build the archive with the default results directory and file name
zip_results()