# Classification of Bug and Enhancement Reports with BERT

This notebook demonstrates how to train a BERT model for bug and enhancement report classification using the Hugging Face `transformers` framework.

## Setup

In [None]:
%pip install pandas
%pip install numpy
%pip install matplotlib
%pip install torch torchvision torchaudio
%pip install transformers
%pip install scikit-learn

## Importing Libraries

Let's start by importing all the libraries needed for our project.

In [2]:
import os
import zipfile
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from transformers import BertForSequenceClassification, BertTokenizerFast, Trainer, TrainingArguments

## Setup Directory

In [3]:
# Directory that receives training checkpoints, plots and other run artifacts.
results_dir = './RESULTS'

# exist_ok=True keeps the cell idempotent on re-runs and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(results_dir, exist_ok=True)

## Data Preparation

We load the data from the CSV file and prepare it for training and evaluation. 
We keep only the reports dated between 2000 and 2007 and encode their label as Bug (0) or Enhancement (1).

In [None]:
# Fetch the complete dataset
df_path = "../CSV/STANDARD/MAPPED/Apache.csv"
df_all = pd.read_csv(df_path)

# Timestamps that do not match the expected format become NaT (errors='coerce');
# such rows never satisfy any of the year filters applied later.
df_all['date'] = pd.to_datetime(df_all['date'], errors='coerce', format='%Y-%m-%dT%H:%M:%S.%f+0000')

# Keep only the two classes of interest. .copy() yields an independent frame so the
# column assignments below do not write into a slice (SettingWithCopyWarning).
df_all = df_all[df_all['label'].isin(['Bug', 'Enhancement'])].copy()

# Build the model inputs once on the full frame so that every later slice
# (train/validation and the per-year test sets) carries the same columns.
# Missing titles/bodies are replaced by '' so 'text' is always a string.
df_all['text'] = df_all['title'].fillna('') + " " + df_all['body'].fillna('')
df_all['labels'] = (df_all['label'] == 'Enhancement').astype(int)  # Bug -> 0, Enhancement -> 1

# Filter data for training and validation (2000-2007, both bounds inclusive)
df_train_val = df_all[df_all['date'].dt.year.between(2000, 2007)].copy()

## Train/Validation Split
We split the data into training (70%) and validation (30%) sets.

In [None]:
# Hold out 30% of the 2000-2007 reports for validation; the fixed random_state
# makes the shuffle, and therefore the split, repeatable across runs.
train_df, validation_df = train_test_split(
    df_train_val,
    test_size=0.3,
    random_state=42,
)

## Creation of the Dataset

We define a `CustomDataset` class to prepare the data for training with BERT.

In [None]:
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for sequence classification.

    Args:
        texts: Sequence of input strings (list, NumPy array or pandas Series).
        labels: Sequence of integer class ids, aligned position-by-position with ``texts``.
        tokenizer: Callable tokenizer; it is invoked with the keyword arguments used in
            ``__getitem__`` and must return a mapping with ``input_ids`` and
            ``attention_mask`` tensors.
        max_token_len: Length every example is padded or truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_token_len=512):
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, got {len(texts)} and {len(labels)}"
            )
        self.tokenizer = tokenizer
        # Materialise as plain lists so that __getitem__ always indexes by position.
        # Indexing a pandas Series with an int is label-based and would fail (or pick
        # the wrong row) on the shuffled index produced by a train/test split.
        self.texts = list(texts)
        self.labels = list(labels)
        self.max_token_len = max_token_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return the tensors for one training item.

        Returns:
            dict with 1-D ``input_ids`` and ``attention_mask`` tensors (flattened from the
            tokenizer output) and a scalar ``labels`` tensor of dtype ``torch.long``.
        """
        text = self.texts[idx]
        labels = self.labels[idx]

        # Calling the tokenizer directly is the supported entry point; `encode_plus`
        # is the legacy spelling of the same single-example encoding.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_token_len,
            return_token_type_ids=False,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            # return_tensors='pt' adds a leading batch axis; flatten() removes it.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(labels, dtype=torch.long)
        }

## Tokenization

We use the BERT tokenizer to convert text into tokens that the model can understand.

In [None]:
# Pretrained WordPiece tokenizer matching the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# Wrap each split; columns are converted to NumPy arrays so the dataset
# indexes them by position rather than by DataFrame index label.
train_dataset = CustomDataset(
    texts=train_df['text'].to_numpy(),
    labels=train_df['labels'].to_numpy(),
    tokenizer=tokenizer,
)
validation_dataset = CustomDataset(
    texts=validation_df['text'].to_numpy(),
    labels=validation_df['labels'].to_numpy(),
    tokenizer=tokenizer,
)

## Model Training

We configure and train the BERT model for classification.

In [None]:
def compute_metrics(pred):
    """Compute accuracy, F1, precision and recall for a batch of predictions.

    Args:
        pred: Object exposing ``label_ids`` (true class ids) and ``predictions``
            (per-class scores; the predicted class is the argmax over the last axis).

    Returns:
        dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
        Precision/recall/F1 use ``average='binary'``, i.e. they are reported for
        the positive class (label 1).
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # zero_division=0 returns the same 0.0 scikit-learn would fall back to, but without
    # an UndefinedMetricWarning when a set has no predicted (or no true) positive samples.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='binary', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }
    
# Seed torch before loading the model: the classification head is not part of the
# pretrained checkpoint and is initialised randomly, so without a seed every
# Restart & Run All would start from different weights.
torch.manual_seed(42)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers releases.
# Pick whichever name the installed version declares so the cell runs on both.
eval_strategy_arg = (
    'eval_strategy'
    if 'eval_strategy' in getattr(TrainingArguments, '__dataclass_fields__', {})
    else 'evaluation_strategy'
)

training_args = TrainingArguments(
    output_dir=results_dir,          # Directory where to save the trained models
    num_train_epochs=3,              # Total number of training epochs
    per_device_train_batch_size=16,  # Batch size for training
    per_device_eval_batch_size=64,   # Batch size for evaluation
    warmup_steps=500,                # Number of warmup steps
    weight_decay=0.01,               # Weight decay if applicable
    logging_dir='./logs',            # Directory where to save logs
    # Evaluate once at the end of every epoch. `eval_steps` is intentionally not set:
    # it only applies to the "steps" strategy and was ignored here.
    **{eval_strategy_arg: "epoch"},
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

## Test Year by Year
To test the model on data from subsequent years, one at a time, we load the data for each year after 2007 and evaluate the model on them.

In [None]:
# Maps year -> metrics dict for every year that has at least one report.
results_by_year = {}

def evaluate_model_for_year(model, tokenizer, year, df_all):
    """Evaluate the trained classifier on the reports of a single year.

    Args:
        model: Kept for interface compatibility; it is not used directly because
            prediction goes through the notebook-level ``trainer``.
        tokenizer: Tokenizer handed to ``CustomDataset``.
        year: Calendar year to select via ``df_all['date'].dt.year``.
        df_all: DataFrame with ``date``, ``title``, ``body`` and ``label`` columns,
            where ``label`` is either 'Bug' or 'Enhancement'.

    Returns:
        The dict produced by ``compute_metrics``, or ``None`` when the year has no rows.
    """
    test_df = df_all[df_all['date'].dt.year == year]
    if test_df.empty:
        print(f"No data for year {year}")
        return None

    # Derive inputs and targets locally instead of reading/writing columns on the slice:
    # the numeric 'labels' column is not guaranteed to exist on df_all, and assigning a
    # new column to a filtered view triggers SettingWithCopyWarning.
    texts = (test_df['title'].fillna('') + " " + test_df['body'].fillna('')).to_numpy()
    labels = (test_df['label'] == 'Enhancement').astype(int).to_numpy()  # Bug -> 0, Enhancement -> 1

    test_dataset = CustomDataset(texts, labels, tokenizer)
    predictions = trainer.predict(test_dataset)
    return compute_metrics(predictions)

# Years after the 2000-2007 training window; range() excludes 2023.
for year in range(2008, 2023):
    metrics = evaluate_model_for_year(model, tokenizer, year, df_all)
    if metrics is not None:
        results_by_year[year] = metrics

## Plot of Results


In [None]:
def plot_results(results_by_year, output_path='./RESULTS/model_performance_by_year.png'):
    """Plot accuracy, F1, precision and recall per year, save the figure and show it.

    Args:
        results_by_year: Mapping of year -> dict with the keys ``accuracy``, ``f1``,
            ``precision`` and ``recall``.
        output_path: File the figure is written to. Its parent directory is created
            if it does not exist.
    """
    years = sorted(results_by_year)
    metric_series = [
        ('accuracy', 'Accuracy'),
        ('f1', 'F1 Score'),
        ('precision', 'Precision'),
        ('recall', 'Recall'),
    ]

    fig, ax = plt.subplots(figsize=(10, 6))
    for key, label in metric_series:
        values = [results_by_year[year][key] for year in years]
        ax.plot(years, values, label=label, marker='o')
    ax.set_title('Model Performance by Year')
    ax.set_xlabel('Year')
    ax.set_ylabel('Score')
    ax.legend()
    ax.grid(True)
    ax.set_xticks(years)
    ax.tick_params(axis='x', labelrotation=45)
    fig.tight_layout()

    # Save the figure BEFORE showing it: on notebook backends plt.show() finalises
    # the current figure, so a savefig() issued afterwards writes an empty image.
    os.makedirs(os.path.dirname(output_path) or '.', exist_ok=True)
    fig.savefig(output_path)

    # Display the figure
    plt.show()

# Display and save the results
plot_results(results_by_year)

## Archiving of Results

We compress and save the training results.

In [None]:
def zip_results(results_dir=results_dir, zip_name='results.zip'):
    """Pack every file under ``results_dir`` into a compressed zip archive.

    Args:
        results_dir: Directory whose files are archived (walked recursively).
        zip_name: Path of the archive to create; an existing file is overwritten.

    Entries are stored relative to the parent of ``results_dir``, so the archive
    contains the results folder itself as its top-level directory.
    """
    archive_root = os.path.join(results_dir, '..')
    archive_abspath = os.path.abspath(zip_name)

    # ZIP_DEFLATED actually compresses; the ZipFile default (ZIP_STORED) only stores.
    with zipfile.ZipFile(zip_name, 'w', compression=zipfile.ZIP_DEFLATED) as zipf:
        for root, dirs, files in os.walk(results_dir):
            for file in files:
                file_path = os.path.join(root, file)
                # Skip the archive itself in case zip_name points inside results_dir.
                if os.path.abspath(file_path) == archive_abspath:
                    continue
                zipf.write(file_path, os.path.relpath(file_path, archive_root))

zip_results()