In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainerCallback, TrainingArguments, EarlyStoppingCallback
from datasets import Dataset, load_dataset
from copy import deepcopy

# Disable Weights & Biases reporting and tokenizer parallelism for this run.
os.environ["WANDB_DISABLED"] = "true"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

RANDOM_STATE = 42

# Seed Python, NumPy and PyTorch before any model is created so that the
# randomly initialised classification head and the `.sample()` previews
# below are reproducible across "Restart & Run All".
from transformers import set_seed

set_seed(RANDOM_STATE)

In [None]:
# Class-index -> class-name table; the reverse table is derived from it so the
# two mappings cannot drift apart.
id2label = {0: "CORRECT", 1: "BUGGY"}
label2id = {name: index for index, name in id2label.items()}

In [None]:
# NOTE(review): the tokenizer is loaded from a CodeT5 checkpoint while the
# weights are DistilBERT's, so token ids presumably do not line up with the
# pretrained embedding rows — confirm that this pairing is intentional.
tokenizer = AutoTokenizer.from_pretrained('dipudl/codet5-base')
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased',
                                                           num_labels=2,
                                                           id2label=id2label,
                                                           label2id=label2id)

# The embedding table needs one row per id the tokenizer can emit; otherwise
# the first out-of-range id fails inside the embedding lookup.
if len(tokenizer) > model.get_input_embeddings().num_embeddings:
    model.resize_token_embeddings(len(tokenizer))

In [None]:
!pip install huggingface_hub
!python -c "from huggingface_hub.hf_api import HfFolder; HfFolder.save_token('hf_...')"

-----
## Load Dataset
-----

In [None]:
# Tab-separated files for the three pre-made splits, in train / validation / test order.
split_paths = {
    "train": "/kaggle/input/c-operator-precedence-bug-dataset-split-shuffle/train__operator_precedence_bug_full_dataset_preprocessed.tsv",
    "validation": "/kaggle/input/c-operator-precedence-bug-dataset-split-shuffle/validation__operator_precedence_bug_full_dataset_preprocessed.tsv",
    "test": "/kaggle/input/c-operator-precedence-bug-dataset-split-shuffle/test__operator_precedence_bug_full_dataset_preprocessed.tsv",
}
train_df, validation_df, test_df = (
    pd.read_csv(path, sep="\t") for path in split_paths.values()
)

In [None]:
# Preview the training split.
train_df

In [None]:
# Preview the validation split.
validation_df

In [None]:
# Preview the test split.
test_df

In [None]:
# Count missing values per column in the training split.
train_df.isna().sum()

In [None]:
# Count missing values per column in the validation split.
validation_df.isna().sum()

In [None]:
# Count missing values per column in the test split.
test_df.isna().sum()

In [None]:
# (rows, columns) of the train, validation and test splits.
train_df.shape, validation_df.shape, test_df.shape

In [None]:
# Remove the 'file_path' and 'method' columns from every split.
unused_columns = ['file_path', 'method']
train_df = train_df.drop(columns=unused_columns)
validation_df = validation_df.drop(columns=unused_columns)
test_df = test_df.drop(columns=unused_columns)

In [None]:
# Inspect five random rows of the training split.
train_df.sample(5)

In [None]:
# Inspect five random rows of the validation split.
validation_df.sample(5)

In [None]:
# Inspect five random rows of the test split.
test_df.sample(5)

In [None]:
# Sanity-check the tokenizer on one training expression: print the raw text,
# its token strings, and the fully encoded output, using the same settings
# that are applied to the datasets.
exp_for_checking = train_df.iloc[10].operator_expression
tokenizer_kwargs = dict(truncation=True, max_length=128, padding=True)
separator = "-" * 80

print(exp_for_checking)
print(separator)
print(tokenizer.tokenize(exp_for_checking, **tokenizer_kwargs))
print(separator)
print(tokenizer(exp_for_checking, **tokenizer_kwargs))

In [None]:
def tokenize_text(examples):
    """Tokenize the ``operator_expression`` field of a batch of examples.

    The text is passed to the module-level ``tokenizer`` with truncation
    enabled, a maximum length of 128 and ``padding=True``.

    Args:
        examples: Mapping that provides the ``"operator_expression"`` entry.

    Returns:
        The tokenizer's output for that entry.
    """
    expressions = examples["operator_expression"]
    encoded = tokenizer(expressions, truncation=True, max_length=128, padding=True)
    return encoded

In [None]:
# Wrap the training DataFrame in a Hugging Face `Dataset` and show its summary.
train_dataset = Dataset.from_pandas(train_df)
train_dataset

In [None]:
# Wrap the validation DataFrame in a Hugging Face `Dataset` and show its summary.
validation_dataset = Dataset.from_pandas(validation_df)
validation_dataset

In [None]:
# Wrap the test DataFrame in a Hugging Face `Dataset` and show its summary.
test_dataset = Dataset.from_pandas(test_df)
test_dataset

-----
## Tokenization
-----

In [None]:
# Tokenize the training split in batches and drop the raw text column.
# (An earlier variant of this call also removed "__index_level_0__"; that is not done here.)
train_dataset = train_dataset.map(tokenize_text, batched=True, remove_columns=["operator_expression"])
train_dataset

In [None]:
# Tokenize the validation split in batches and drop the raw text column.
# (An earlier variant of this call also removed "__index_level_0__"; that is not done here.)
validation_dataset = validation_dataset.map(tokenize_text, batched=True, remove_columns=["operator_expression"])
validation_dataset

In [None]:
# Tokenize the test split in batches and drop the raw text column.
# (An earlier variant of this call also removed "__index_level_0__"; that is not done here.)
test_dataset = test_dataset.map(tokenize_text, batched=True, remove_columns=["operator_expression"])
test_dataset

In [None]:
def softmax(x):
    """Convert logits to probabilities with a numerically stable softmax.

    Args:
        x: Array-like of logits, normally 2-D with shape
            ``(n_samples, n_classes)``. Normalisation runs over the last axis.

    Returns:
        np.ndarray: Floating-point array with the same shape as ``x`` whose
        entries along the last axis are non-negative and sum to 1. Floating
        inputs keep their dtype; any other dtype is promoted to ``float64``.
    """
    logits = np.asarray(x)
    # Integer logits must be promoted: writing probabilities into an integer
    # array would truncate every value to 0.
    if not np.issubdtype(logits.dtype, np.floating):
        logits = logits.astype(np.float64)
    if logits.size == 0:
        return logits.copy()

    # Subtracting the per-row maximum leaves the softmax unchanged but keeps
    # np.exp from overflowing on large logits.
    shifted = logits - logits.max(axis=-1, keepdims=True)
    exp_shifted = np.exp(shifted)
    return exp_shifted / exp_shifted.sum(axis=-1, keepdims=True)

In [None]:
def compute_metrics(pred):
    """Score a prediction object with accuracy, precision, recall and F1.

    Args:
        pred: Object exposing ``label_ids`` (reference labels) and
            ``predictions``; the predicted class is the argmax of
            ``predictions`` over its last axis.

    Returns:
        dict: Scores under the keys ``"Accuracy"``, ``"Precision"``,
        ``"Recall"`` and ``"F1 Score"``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    scorers = {
        "Accuracy": accuracy_score,
        "Precision": precision_score,
        "Recall": recall_score,
        "F1 Score": f1_score,
    }
    return {name: scorer(y_true, y_pred) for name, scorer in scorers.items()}

-----
## Model Training
-----

In [None]:
# ! rm -rd /kaggle/working/codeT5-DistilBERT-operator-precedence-bug-model

batch_size = 64
# Log about once per pass over the training data. Clamp to 1 so a dataset
# smaller than one batch cannot produce logging_steps == 0.
logging_steps = max(1, len(train_dataset) // batch_size)
output_dir = "codeT5-DistilBERT-operator-precedence-bug-model"

# Evaluate and checkpoint once per epoch, and reload the checkpoint with the
# lowest validation loss when training ends.
# NOTE(review): `evaluation_strategy` has been renamed in recent transformers
# releases — confirm against the installed version before upgrading.
training_args = TrainingArguments(output_dir,
                                  num_train_epochs=1,
                                  learning_rate=2e-5,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  weight_decay = 0.01,
                                  evaluation_strategy="epoch",
                                  logging_steps=logging_steps,
                                  save_strategy="epoch",
                                  # save_steps=10000,
                                  # fp16=True,
                                  load_best_model_at_end=True,
                                  metric_for_best_model='eval_loss',
                                  greater_is_better=False,
                                  push_to_hub=False,
                                  # Make the training seed explicit instead of relying on the default.
                                  seed=RANDOM_STATE
                                )

In [None]:
class CustomCallback(TrainerCallback):
    """Report train / eval / test metrics whenever an epoch ends with an evaluation due.

    The metrics are printed, appended to ``log.txt`` in the working directory,
    and the train and test sets are additionally passed through
    ``Trainer.evaluate`` under the ``train`` and ``test`` prefixes.

    Args:
        trainer: The ``Trainer`` this callback is attached to; its
            ``train_dataset`` and ``eval_dataset`` are scored.
        test_dataset: Held-out dataset scored alongside them.
    """

    def __init__(self, trainer, test_dataset) -> None:
        super().__init__()
        self._trainer = trainer
        self.test_dataset = test_dataset

    def on_epoch_end(self, args, state, control, **kwargs):
        """Score all three datasets and return the control flags as they were on entry."""
        if control.should_evaluate:
            # Snapshot the flags before the nested predict()/evaluate() calls,
            # presumably because those calls can alter `control`; returning the
            # copy hands the original flags back to the training loop.
            control_copy = deepcopy(control)
            train_metrics = self._trainer.predict(self._trainer.train_dataset, metric_key_prefix="train").metrics
            eval_metrics = self._trainer.predict(self._trainer.eval_dataset, metric_key_prefix="eval").metrics
            test_metrics = self._trainer.predict(self.test_dataset, metric_key_prefix="test").metrics

            print("Epoch:", state.epoch)
            print(train_metrics)
            print(eval_metrics)
            print(test_metrics)

            with open("log.txt", "a") as file:
                file.write(f"Epoch: {state.epoch}\n")
                file.write(f"Train metrics: {train_metrics}\n")
                file.write(f"Eval metrics: {eval_metrics}\n")
                file.write(f"Test metrics: {test_metrics}\n")

            self._trainer.evaluate(eval_dataset=self._trainer.train_dataset, metric_key_prefix="train")
            # Use the dataset given to this callback, not whatever global
            # happens to be named `test_dataset`.
            self._trainer.evaluate(eval_dataset=self.test_dataset, metric_key_prefix="test")
            return control_copy

In [None]:
# Early-stopping callback handed to the trainers below.
# NOTE(review): the threshold is presumably compared as an absolute difference
# in the monitored metric rather than a relative one — confirm against the
# transformers EarlyStoppingCallback documentation.
early_stopping = EarlyStoppingCallback(
    early_stopping_patience=2,      # number of evaluations without sufficient improvement to tolerate before stopping
    early_stopping_threshold=0.01,  # minimum improvement in the monitored metric that counts as progress
)

-----
## Hyperparameter Search
-----

In [None]:
learning_rates = [0.000002, 0.00002, 0.0002, 0.002]
batch_sizes = [64, 32]

# Snapshot the weights as they are before any training. Every configuration
# must start from this state; otherwise each run would continue from the
# weights the previous run left behind and the comparison would be meaningless.
# (Run this cell before any other training so the snapshot is untrained.)
initial_weights = deepcopy(model.state_dict())
search_results = []

# Grid search over learning rate x batch size
for lr in learning_rates:
    for bs in batch_sizes:
        # Reset the model to its initial state for this configuration
        model.load_state_dict(initial_weights)

        # Update the hyperparameters in the shared TrainingArguments
        training_args.learning_rate = lr
        training_args.per_device_train_batch_size = bs
        training_args.per_device_eval_batch_size = bs

        # Create a new Trainer with the updated TrainingArguments.
        # A new EarlyStoppingCallback is built per run so that no patience
        # state is carried over from an earlier configuration.
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=validation_dataset,
            tokenizer=tokenizer,
            compute_metrics=compute_metrics,
            callbacks=[EarlyStoppingCallback(early_stopping_patience=2,
                                             early_stopping_threshold=0.01)]
        )

        # Train the model and evaluate it on the validation set
        trainer.train()
        eval_metrics = trainer.evaluate()
        print(f"learning_rate={lr}, batch_size={bs}")
        print(eval_metrics)
        search_results.append({"learning_rate": lr, "batch_size": bs, **eval_metrics})

# One row per configuration, best (lowest) validation loss first.
pd.DataFrame(search_results).sort_values("eval_loss")

-----
## Choosing Best Hyperparameters and Training Final Model
-----

In [None]:
BEST_LEARNING_RATE = 2e-5
BEST_BATCH_SIZE = 32
training_args.learning_rate = BEST_LEARNING_RATE
training_args.push_to_hub = False
training_args.per_device_train_batch_size = BEST_BATCH_SIZE
training_args.per_device_eval_batch_size = BEST_BATCH_SIZE

print("\n\n\nStarting training...\n")
print(f"Learning rate: {BEST_LEARNING_RATE}\n")
print(f"Batch size: {BEST_BATCH_SIZE}\n")
print("*" * 50 + "\n")

# Start the final run from the pretrained checkpoint again: at this point
# `model` still holds the weights left by the last hyperparameter-search run.
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased',
                                                           num_labels=2,
                                                           id2label=id2label,
                                                           label2id=label2id)
# The embedding table needs one row per id the tokenizer can emit.
if len(tokenizer) > model.get_input_embeddings().num_embeddings:
    model.resize_token_embeddings(len(tokenizer))

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    # New callback instance so no patience state from earlier runs is reused.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2,
                                     early_stopping_threshold=0.01)]
)

# Also report train / eval / test metrics at the end of each evaluated epoch.
trainer.add_callback(CustomCallback(trainer, test_dataset))

In [None]:
# Run the final training with the selected hyperparameters.
trainer.train()

In [None]:
# Save the model to a directory whose name records the learning rate and batch size used.
trainer.save_model(f"codeT5-DistilBERT-operator-precedence-bug-model-{BEST_LEARNING_RATE}-{BEST_BATCH_SIZE}")

-----
## Model Testing
-----

In [None]:
# Run the trainer's model over the test dataset.
prediction = trainer.predict(test_dataset)

In [None]:
# Show the prediction output object.
prediction

In [None]:
# Reference labels carried by the prediction output.
labels = prediction.label_ids
labels

In [None]:
# Predicted class = index of the largest score along the last axis.
preds = prediction.predictions.argmax(-1)
preds

In [None]:
# F1 score of the predicted classes against the reference labels.
print(f"f1_score: {f1_score(labels, preds)}")

In [None]:
# Turn the raw scores into per-class probabilities and keep column 1
# (label id 1, i.e. "BUGGY" in id2label).
preds_probability = softmax(prediction.predictions)
positive_preds_probability = preds_probability[:, 1]

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

# Confusion matrix over the two class ids, printed and then plotted.
class_ids = [0, 1]
cm = confusion_matrix(labels, preds, labels=class_ids)
print(f"Confusion matrix: {cm}")
ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_ids).plot()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import sklearn.metrics as metrics

# ROC curve for the positive-class probability, with the area under it in the legend.
fpr, tpr, threshold = metrics.roc_curve(labels, positive_preds_probability)
roc_auc = metrics.auc(fpr, tpr)

ax = plt.gca()
ax.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
ax.legend(loc='lower right')
# Dashed diagonal for visual reference.
ax.plot([0, 1], [0, 1], 'r--')
ax.set_xlim([0, 1])
ax.set_ylim([0, 1])
ax.set_ylabel('True Positive Rate')
ax.set_xlabel('False Positive Rate')
plt.show()

In [None]:
# Precision-recall curve for the positive-class probability.
precision, recall, threshold = metrics.precision_recall_curve(labels, positive_preds_probability)

ax = plt.gca()
ax.plot(recall, precision, 'b')
ax.set_ylabel('Precision')
ax.set_xlabel('Recall')
ax.set_xlim([0, 1])
ax.set_ylim([0, 1])

plt.show()

In [None]:
# Test split before the prediction columns are attached.
test_df

In [None]:
# Attach the predicted class ids to the test DataFrame.
test_df["predictions"] = preds

In [None]:
# Test split with the "predictions" column added.
test_df

In [None]:
# Probability assigned to label id 1 for every test row (same computation as
# `positive_preds_probability` above).
positive_probabilities = softmax(prediction.predictions)[:, 1]
positive_probabilities

In [None]:
# Attach the positive-class probability to the test DataFrame and show the result.
test_df["positive_probabilities"] = positive_probabilities
test_df

In [None]:
# Write the annotated test split to disk. Note that the file is tab-separated
# even though its name ends in ".csv".
test_df.to_csv("operator_precedence_bug_test_dataset_predictions.csv", sep="\t", index=False)