In [None]:
# Classification head: binary dialect identification (Standard Lithuanian vs. Samogitian)
# TODO: if you are not using the previously fine-tuned model, change the model path used in the last code cell

In [None]:
# Load necessary packages
import numpy as np
import torch
from datasets import Dataset, load_from_disk, DatasetDict, concatenate_datasets
from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
import gc
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Prepare data for model
# Read the preprocessed corpora from disk and derive a seeded 90/10 train/test split for each

samogitian_dataset = load_from_disk("processed_corpus/samogitian_dataset")
lithuanian_dataset = load_from_disk("processed_corpus/lithuanian_dataset")


def _train_test_dict(corpus):
    """Split `corpus` 90/10 with a fixed seed and return both parts as a DatasetDict."""
    parts = corpus.train_test_split(test_size=0.1, seed=42)
    return DatasetDict({"train": parts["train"], "test": parts["test"]})


# raw splits used by both tokenizers
smg_splits = _train_test_dict(samogitian_dataset)
lt_splits = _train_test_dict(lithuanian_dataset)

# Attach the language ID to every example: 0 = Samogitian, 1 = Lithuanian
# (used by the classification head, and by the MLM step to balance the dataset)
smg_train = smg_splits["train"].map(lambda _row: {"labels": 0})
lt_train = lt_splits["train"].map(lambda _row: {"labels": 1})
smg_eval = smg_splits["test"].map(lambda _row: {"labels": 0})
lt_eval = lt_splits["test"].map(lambda _row: {"labels": 1})

print("Datatsets with train/test split loaded.")

# Load LitLat BERT from the Hugging Face hub: the tokenizer, plus the masked-LM model
# (the model used to embed Samogitian in MLM)
LITLAT_CHECKPOINT = "EMBEDDIA/litlat-bert"
tokenizer = AutoTokenizer.from_pretrained(LITLAT_CHECKPOINT)
model = AutoModelForMaskedLM.from_pretrained(LITLAT_CHECKPOINT)
print(f"Successfully loaded model with {model.num_parameters()} parameters")

In [None]:
# Tokenize dataset for classification

# Settings shared by the tokenization helper and the cells below
SEED = 42
max_length = 256  # less context needed
columns = ["input_ids", "attention_mask", "labels"]  # exposed as torch tensors; "labels" is the language id
remove_columns = ["text", "source", "id"]  # raw columns dropped once the text is tokenized

# Release memory left over from earlier cells before the tokenization pass
gc.collect()
torch.cuda.empty_cache()

def tokenize_ds(dataset, tokenizer):
    """Tokenize the ``text`` column of ``dataset`` and expose the result as torch tensors.

    Reads the module-level settings ``max_length`` (pad/truncate length),
    ``remove_columns`` (columns dropped by the map) and ``columns`` (columns
    kept in the torch format).

    Args:
        dataset: Dataset providing a ``text`` column and a batched ``map``.
        tokenizer: Callable tokenizer applied to each batch of texts.

    Returns:
        The mapped dataset with its format set to ``"torch"`` for ``columns``.
    """
    encode_kwargs = dict(
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_special_tokens_mask=False,
    )

    def _encode_batch(batch):
        # Each batch is a dict of column name -> list of values
        return tokenizer(batch["text"], **encode_kwargs)

    encoded = dataset.map(
        _encode_batch,
        batched=True,
        batch_size=64,
        num_proc=4,
        remove_columns=remove_columns,
        desc="Tokenizing dataset",
    )
    return encoded.with_format("torch", columns=columns)

# NOTE(review): nothing in this cell uses `model` on `device`; confirm whether this
# transfer is still needed before classifier training.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Downsampling is sufficient for this task. Runs faster with less data, and less is required.
# It must happen BEFORE tokenization: previously `lt_train` was reduced only after
# `lt_train_cls` had been built, so the full Lithuanian split was still tokenized,
# saved and trained on.
print("Downsampling of Lithuanian dataset:")
n_lt_full = len(lt_train)
n_lt_keep = min(len(smg_train), n_lt_full)  # never ask select() for more rows than exist
lt_train = lt_train.shuffle(seed=SEED).select(range(n_lt_keep))
print(f"  kept {n_lt_keep} of {n_lt_full} Lithuanian training examples")

smg_train_cls = tokenize_ds(smg_train, tokenizer)
lt_train_cls = tokenize_ds(lt_train, tokenizer)
smg_eval_cls = tokenize_ds(smg_eval, tokenizer)
# Use the held-out Lithuanian split here; tokenizing `lt_train` again would leak
# training examples into the evaluation set.
lt_eval_cls = tokenize_ds(lt_eval, tokenizer)

# Persist the per-language tokenized splits
smg_train_cls.save_to_disk("tokenized_smg_train_cls")
lt_train_cls.save_to_disk("tokenized_lt_train_cls")
smg_eval_cls.save_to_disk("tokenized_smg_eval_cls")
lt_eval_cls.save_to_disk("tokenized_lt_eval_cls")

# Merge both languages, shuffle with the shared seed, and persist the combined sets
train_ds_cls = concatenate_datasets([smg_train_cls, lt_train_cls]).shuffle(seed=SEED)
eval_ds_cls  = concatenate_datasets([smg_eval_cls, lt_eval_cls]).shuffle(seed=SEED)
train_ds_cls.save_to_disk("train_ds_cls")
eval_ds_cls.save_to_disk("eval_ds_cls")

In [None]:
# Classifier head for distinguishing Standard Lithuanian and Samogitian using the fine-tuned bert from previous step
# Reload the combined train/eval sets from the directories they were saved to
train_ds_cls, eval_ds_cls = (
    load_from_disk(saved_dir) for saved_dir in ("train_ds_cls", "eval_ds_cls")
)

class LanguageClassifier:
    """Fine-tune a two-label sequence-classification head with the Hugging Face ``Trainer``.

    The starting checkpoint is read from ``base_model_path``; the trained model
    and the tokenizer are written to ``output_dir``.

    Args:
        tokenizer: Tokenizer to save next to the trained model. It is not used
            to tokenize anything here; the datasets passed to ``train`` are
            handed to the ``Trainer`` as they are.
        output_dir: Directory for checkpoints and the final model.
        batch_size: Per-device training batch size; evaluation uses twice this.
        learning_rate: Learning rate passed to ``TrainingArguments``.
        epochs: Number of training epochs.
        base_model_path: Checkpoint directory or model id to start from.
            Defaults to ``"./samogitian_litlat_bert2"``, the path that used to
            be hard-coded in ``train``.
    """

    def __init__(self, tokenizer, output_dir="./dialect_classifier_model",
                 batch_size=16, learning_rate=3e-5, epochs=1,
                 base_model_path="./samogitian_litlat_bert2"):
        self.tokenizer = tokenizer
        self.output_dir = output_dir
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.base_model_path = base_model_path
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    @staticmethod
    def _compute_metrics(pred):
        """Convert a Trainer prediction object into accuracy, F1, precision and recall."""
        labels = pred.label_ids
        preds = np.argmax(pred.predictions, axis=1)
        # zero_division=0 reports 0 instead of warning when a class is never predicted
        return {
            "accuracy": accuracy_score(labels, preds),
            "f1": f1_score(labels, preds, zero_division=0),
            "precision": precision_score(labels, preds, zero_division=0),
            "recall": recall_score(labels, preds, zero_division=0),
        }

    def _build_training_args(self):
        """Assemble the ``TrainingArguments`` for this run."""
        # fp16 mixed precision and pinned host memory only make sense with a GPU;
        # requesting them unconditionally breaks or warns on the CPU fallback.
        use_cuda = self.device.type == "cuda"
        return TrainingArguments(
            output_dir=self.output_dir,
            num_train_epochs=self.epochs,
            per_device_train_batch_size=self.batch_size,
            per_device_eval_batch_size=self.batch_size * 2,
            gradient_accumulation_steps=2,
            learning_rate=self.learning_rate,
            warmup_ratio=0.1,
            weight_decay=0.01,
            eval_strategy="steps",
            eval_steps=1000,
            save_strategy="steps",
            save_steps=1000,
            save_total_limit=2,
            load_best_model_at_end=True,
            metric_for_best_model="eval_accuracy",
            push_to_hub=False,
            report_to="none",
            fp16=use_cuda,
            fp16_opt_level="O1",
            max_grad_norm=1.0,
            dataloader_num_workers=2,
            dataloader_pin_memory=use_cuda,
            logging_steps=100,
            logging_first_step=True,
            optim="adamw_torch",
            gradient_checkpointing=True,
            auto_find_batch_size=True
        )

    def train(self, train_dataset, eval_dataset):
        """Train, evaluate and save the classifier.

        Args:
            train_dataset: Dataset passed to ``Trainer`` as ``train_dataset``.
            eval_dataset: Dataset passed to ``Trainer`` as ``eval_dataset``.

        Returns:
            tuple: ``(metrics, model)`` where ``metrics`` is the dict returned
            by ``trainer.evaluate()`` and ``model`` is the trained model object.
        """
        use_cuda = self.device.type == "cuda"

        # Free memory left over from earlier work before loading the model
        gc.collect()
        if use_cuda:
            torch.cuda.empty_cache()

        model = AutoModelForSequenceClassification.from_pretrained(
            self.base_model_path,
            num_labels=2,
            problem_type="single_label_classification",
            low_cpu_mem_usage=True
        )
        model = model.to(self.device)
        model.gradient_checkpointing_enable()

        if use_cuda:
            print(f"GPU Memory after model load: {torch.cuda.memory_allocated()/1024**2:.2f} MB")

        # Training phase
        trainer = Trainer(
            model=model,
            args=self._build_training_args(),
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            compute_metrics=self._compute_metrics,
            callbacks=[
                # Stop after 3 evaluations without an improvement of at least 0.01
                # in the monitored metric
                EarlyStoppingCallback(
                    early_stopping_patience=3,
                    early_stopping_threshold=0.01
                )
            ]
        )

        print("Starting classifier training...")
        train_result = trainer.train()
        print(f"Training completed in {train_result.metrics['train_runtime']:.2f} seconds")

        print("Evaluating classifier...")
        metrics = trainer.evaluate()
        print(f"Classification metrics: {metrics}")

        # Persist the model together with the tokenizer so the directory is self-contained
        trainer.save_model(self.output_dir)
        self.tokenizer.save_pretrained(self.output_dir)

        if use_cuda:
            torch.cuda.empty_cache()

        return metrics, model

# Hyperparameters and output location for this classifier run
classifier_config = dict(
    output_dir="./samogitian_litlat_bert_classifier",
    batch_size=16,
    learning_rate=5e-5,
    epochs=3,
)
classifier = LanguageClassifier(tokenizer=tokenizer, **classifier_config)

# Train on the combined sets; yields the evaluation metrics and the trained model
metrics, model = classifier.train(
    train_dataset=train_ds_cls,
    eval_dataset=eval_ds_cls,
)