In [2]:
import random
import os

import evaluate
import torch
import numpy as np
import pandas as pd

from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig, Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader, random_split
from torch import nn
from transformers import DataCollatorWithPadding

NUM_CLASSES = 3

# Class id -> class name. The id of a class is also the position of its file
# in the list of input files handed to the dataset.
IDS2LABELS = dict(enumerate(('safe', 'sensitive', 'harmful')))

# Class name -> class id (exact inverse of IDS2LABELS).
LABELS2IDS = {name: class_id for class_id, name in IDS2LABELS.items()}

# Seed every random number generator used by the notebook for reproducibility.
seed = 1035
random.seed(seed)        # Python's built-in RNG
np.random.seed(seed)     # NumPy's legacy global RNG
torch.manual_seed(seed)  # PyTorch

# Metrics
# Accuracy metric object loaded through the `evaluate` library; it is the
# object `compute_metrics` calls `.compute(...)` on.
metric = evaluate.load("accuracy")


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

## Data

In [2]:
class QueryDataset(Dataset):
    """Text-classification dataset built from one text file per class.

    Every line of the i-th file becomes one sample labelled ``i`` (the position
    of the file in ``file_paths``). Files are read as plain text, line by line:
    no CSV parsing is done and blank lines are kept as empty samples.
    Samples are tokenized on access and returned unpadded, so batches have to
    be padded by the collator.
    """

    def __init__(self, file_paths: list[str], backbone_name: str, max_len: int = 128) -> None:
        """
        Args:
            file_paths (list): One file path per class; the index of a path in
                this list is the label given to all of its lines.
            backbone_name: Model name or path handed to
                ``AutoTokenizer.from_pretrained`` to load the tokenizer.
            max_len: The maximum length of tokenized sentences (longer inputs
                are truncated).
        """
        self.tokenizer = AutoTokenizer.from_pretrained(backbone_name)
        self.max_len = max_len
        self.texts, self.labels = self._read_labeled_lines(file_paths)

    @staticmethod
    def _read_labeled_lines(file_paths):
        """Return (texts, labels): every stripped line paired with its file index."""
        texts, labels = [], []
        for label, file_path in enumerate(file_paths):
            # Explicit encoding: without it `open` uses the platform default,
            # which can differ between machines.
            with open(file_path, 'r', encoding='utf-8') as file:
                for line in file:
                    texts.append(line.strip())
                    labels.append(label)
        return texts, labels

    def __len__(self):
        """Number of samples (lines read across all files)."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample ``idx``.

        Returns:
            dict with 1-D tensors ``input_ids`` and ``attention_mask`` (unpadded,
            at most ``max_len`` tokens) and a scalar ``torch.long`` ``labels``.
        """
        text = self.texts[idx]
        label = self.labels[idx]

        # Calling the tokenizer directly is the supported entry point
        # (`encode_plus` is deprecated in favour of it).
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
            max_length=self.max_len,
            truncation=True,
            return_tensors='pt',  # Return as PyTorch tensors
        )

        # The tokenizer returns tensors with a leading batch dimension of 1.
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(label, dtype=torch.long)
        }

## Model

In [None]:
# Backbone checkpoint used for both the tokenizer and the classifier.
backbone_name = 'answerdotai/ModernBERT-large'

# Start from the backbone's own config and attach the 3-class label mapping.
config = AutoConfig.from_pretrained(backbone_name)
config.id2label = IDS2LABELS
config.label2id = LABELS2IDS
# This has to be an assignment (a `==` comparison is evaluated and discarded).
# Each sample carries exactly one integer class id, so the task is
# single-label; the multi-label setting would switch the head to a loss that
# expects multi-hot float targets.
config.problem_type = 'single_label_classification'
config.num_labels = NUM_CLASSES
# NOTE(review): ModernBERT-specific flag, presumably disabling the compiled
# code path of the model — confirm against the model documentation.
config.reference_compile = False

# Pretrained backbone with a classification head sized by the config above.
model = AutoModelForSequenceClassification.from_pretrained(backbone_name, config=config)

## Training

In [None]:
# Restrict this process to the GPUs with indices 0, 1, 2 and 4.
# NOTE(review): this variable is presumably only honoured if it is set before
# CUDA is first initialised in the process (ideally before importing torch) —
# confirm it takes effect at this point of the notebook.
# NOTE(review): index 3 is skipped — confirm this is intentional.
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,4"

def compute_metrics(eval_pred):
    """Score a batch of model outputs with the notebook-level ``metric``.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``; the class
            scores are expected on the last axis of ``logits``.

    Returns:
        Whatever ``metric.compute`` returns for the arg-max predictions
        compared against ``labels``.
    """
    scores, references = eval_pred
    # Predicted class = index of the highest score on the last axis.
    predicted_ids = np.argmax(scores, axis=-1)
    return metric.compute(references=references, predictions=predicted_ids)


# Fine-tuning configuration: 2 epochs, cosine learning-rate schedule, mixed
# precision, evaluation every 100 steps and logging every 10 steps.
training_args = TrainingArguments(
    per_device_train_batch_size=64,
    # `eval_strategy` is the current name of this option; the older
    # `evaluation_strategy` spelling is deprecated.
    eval_strategy='steps',
    do_eval=True,
    num_train_epochs=2,
    lr_scheduler_type='cosine',
    fp16=True,
    logging_steps=10,
    eval_steps=100,
    learning_rate=5e-5,
    # The dataset already yields exactly the keys the model consumes, so the
    # Trainer must not try to prune columns.
    remove_unused_columns=False,
    push_to_hub=False,
    logging_dir='./logs',
    output_dir='./results'
)

# One input file per class; the position in this list is the class id
# (0 = safe, 1 = sensitive, 2 = harmful).
fp = ['../queries/gemini-2.0-flash-experimental/normal/normal.txt',
      '../queries/gemini-2.0-flash-experimental/sensitive/sensitive_cleaned_v2.csv',
      '../queries/gemini-2.0-flash-experimental/harmful/harmful_cleaned_v2.csv']
ds = QueryDataset(fp, backbone_name)
# Reuse the tokenizer the dataset already loaded for the same backbone
# instead of loading it a second time.
tokenizer = ds.tokenizer

# 80 / 10 / 10 split; the test split receives whatever remains after the
# integer rounding of the train and eval sizes.
train_ratio = 0.8
eval_ratio = 0.1
test_ratio = 0.1
dataset_size = len(ds)
train_size = int(train_ratio * dataset_size)
eval_size = int(eval_ratio * dataset_size)
test_size = dataset_size - train_size - eval_size

# A dedicated, seeded generator makes the split independent of how much of
# the global RNG stream earlier cells consumed, so the same samples land in
# the same split on every run.
split_generator = torch.Generator().manual_seed(seed)
ds_train, ds_eval, ds_test = random_split(
    ds, [train_size, eval_size, test_size], generator=split_generator
)

# Pads each batch to the length of its longest sample.
data_collator = DataCollatorWithPadding(tokenizer)

# Trainer wiring the model, the train/eval splits, the padding collator and
# the accuracy callback together.
trainer = Trainer(
    model=model,
    train_dataset=ds_train,
    eval_dataset=ds_eval,
    args=training_args,
    # `processing_class` is the current name of the deprecated `tokenizer`
    # argument.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
)

In [None]:
# Run fine-tuning; the returned object exposes the training metrics as `.metrics`.
train_results = trainer.train()
# Persist the model held by the trainer.
trainer.save_model()
# Report the training metrics and write them to disk under the "train" split name.
trainer.log_metrics("train", train_results.metrics)
trainer.save_metrics("train", train_results.metrics)

In [None]:
# Score the model on the held-out test split, which is passed to the Trainer
# neither as training nor as evaluation data.
trainer.evaluate(ds_test)

- Hyperparameter runs (learning rate, epochs, accuracy, model variant):
- lr=1e-4, ep=5, acc=0.96200, base
- lr=5e-5, ep=4, acc=0.97733, base
- lr=2e-5, ep=4, acc=0.97079, base
- lr=5e-5, ep=2, acc=0.98388, large

In [None]:
# Export the model and the tokenizer side by side into ./moralBERT/ so the
# directory can later be loaded with `from_pretrained`.
model.save_pretrained("./moralBERT/")
tokenizer.save_pretrained("./moralBERT/")

## Inference examples

In [None]:
from datasets import Dataset as DSV2

# Hand-written example queries used to sanity-check the classifier.
# NOTE(review): presumably one example per class (safe / sensitive / harmful) — confirm.
queries = [
    'Gift for my wife, who likes art',
    'Trump for president poster',
    'Something to quietly kill my neighbor',
]

# One-column dataframe; the column name 'query' is what the tokenization step reads.
df_queries = pd.DataFrame({'query': queries})


# Load your fine-tuned ModernBERT model and tokenizer
# NOTE(review): the checkpoint directory is hard-coded to one specific training
# step; it only exists if a run produced a checkpoint at that step — confirm.
# These assignments rebind the notebook-level `tokenizer` and `model` names.
model_name = "./results/checkpoint-288"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Example pandas dataframe
df = df_queries

# Tokenize the query column
def preprocess_function(examples):
    """Tokenize the ``query`` entry of a batch with the notebook-level tokenizer.

    Args:
        examples: Mapping with a ``"query"`` key holding the text(s) to encode.

    Returns:
        The tokenizer output, truncated to at most 128 tokens per query and
        left unpadded.
    """
    queries_batch = examples["query"]
    return tokenizer(
        queries_batch,
        add_special_tokens=True,
        max_length=128,
        truncation=True,
    )

# Wrap the dataframe in a Hugging Face Dataset, tokenize it in batches and
# drop the raw 'query' text in the same pass, so that only the tokenizer
# outputs remain as columns.
dataset = DSV2.from_pandas(df)
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=["query"],
)

# Hand rows back as PyTorch tensors (if needed for Trainer)
tokenized_dataset.set_format("torch")

# Trainer around the reloaded checkpoint, used here only for `predict`.
# This rebinds the notebook-level `trainer` name.
# NOTE(review): the train/eval datasets are not needed by `predict`; they are
# kept because `training_args` enables evaluation, which presumably requires an
# eval dataset when the Trainer is constructed — confirm.
trainer = Trainer(
    model=model,
    train_dataset=ds_train,
    eval_dataset=ds_eval,
    args=training_args,
    # `processing_class` is the current name of the deprecated `tokenizer`
    # argument.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
)


# `predictions.predictions` holds the raw logits, one row per query.
predictions = trainer.predict(tokenized_dataset)
# Softmax over the class axis turns each row of logits into probabilities.
pred_probs = torch.softmax(torch.tensor(predictions.predictions), dim=1)
# Output predictions
#print(predictions.predictions)  # Raw logits
#print(torch.argmax(torch.tensor(predictions.predictions), axis=1))  # Predicted classes

In [None]:
# Predicted class id per query (arg-max over the class axis of the logits),
# followed by the per-class probabilities.
print(torch.tensor(predictions.predictions).argmax(dim=1))
print(pred_probs)

## Evaluation

### BM25

In [20]:
import bm25s
import Stemmer

from rank_bm25 import BM25Okapi
from tqdm import tqdm
from sklearn.metrics import accuracy_score

In [None]:
# Materialise the train split as plain lists ordered by ascending dataset
# index. Walking the sorted subset indices is linear in the split size, whereas
# testing `i in ds_train.indices` for every dataset row scans a list each time.
corpus = [ds_train.dataset.texts[i] for i in sorted(ds_train.indices)]
train_labels = [ds_train.dataset.labels[i] for i in sorted(ds_train.indices)]

# NOTE(review): the stemmer is created but not used by the whitespace
# tokenisation below.
stemmer = Stemmer.Stemmer("english")
# BM25 index over whitespace-split training queries.
corpus_tokens = [doc.split(" ") for doc in corpus]
retriever = BM25Okapi(corpus_tokens)

# Test split in the same ascending-index order, texts and labels aligned.
test_queries = [ds_test.dataset.texts[i] for i in sorted(ds_test.indices)]
test_labels = [ds_test.dataset.labels[i] for i in sorted(ds_test.indices)]

In [None]:
# 1-nearest-neighbour classification with BM25: each test query receives the
# label of the training query with the highest BM25 score.
predictions = [
    train_labels[retriever.get_scores(test_query.split(" ")).argmax()]
    for test_query in tqdm(test_queries)
]

In [None]:
# Sanity check: number of predictions produced (one per test query is expected).
len(predictions)

In [None]:
# Number of ground-truth test labels, to compare with the prediction count.
len(test_labels)

In [None]:
# accuracy_score takes the ground truth first and the predictions second.
accuracy = accuracy_score(test_labels, predictions)
# Displayed with 5 decimals; the built-in round() replaces the direct dunder call.
round(accuracy, 5)

### Sentence-Transformer embeddings (FAISS nearest neighbour)

In [33]:
import faiss

In [59]:
# Pre-computed query embeddings, one array per class.
qn_emb = np.load('../embeddings/queries/safe.npy')
qs_emb = np.load('../embeddings/queries/sensitive.npy')
qh_emb = np.load('../embeddings/queries/harmful.npy')

# sorted() builds new lists; an in-place .sort() would reorder the index lists
# owned by the split objects themselves.
train_indices = sorted(ds_train.indices)
test_indices = sorted(ds_test.indices)

# Stack the classes in label order (safe, sensitive, harmful) so that row i is
# meant to be the embedding of dataset sample i.
emb_data = np.r_[qn_emb, qs_emb, qh_emb]
assert len(emb_data) == len(ds_train.dataset), (
    "embedding rows do not match the number of dataset samples; "
    "index-based alignment would be wrong"
)
emb_train = emb_data[train_indices]
emb_test = emb_data[test_indices]

# Take the dimensionality from the data instead of hard-coding it.
d = emb_train.shape[1]
index = faiss.IndexFlatIP(d)
# With L2-normalised vectors on both sides the inner product is the cosine
# similarity. Rescaling a query does not change which neighbour ranks first.
faiss.normalize_L2(emb_train)
faiss.normalize_L2(emb_test)
index.add(emb_train)

In [None]:
k = 1

# One batched search over all test embeddings instead of one call per row;
# `indices` has one row per test embedding and k columns.
distances, indices = index.search(emb_test, k)

# `train_labels` is ordered by ascending dataset index, the same order in which
# the training embeddings were added to the index, so a returned position
# can be used directly to look up the neighbour's label.
predictions = [train_labels[neighbours[0]] for neighbours in indices]

In [None]:
# accuracy_score takes the ground truth first and the predictions second.
accuracy = accuracy_score(test_labels, predictions)
# Displayed with 5 decimals; the built-in round() replaces the direct dunder call.
round(accuracy, 5)