In [None]:


import pandas as pd
import numpy as np
from datasets import Dataset, ClassLabel
from transformers import (XLMRobertaTokenizerFast, XLMRobertaForTokenClassification,
                          DistilBertTokenizerFast, DistilBertForTokenClassification,
                          BertTokenizerFast, BertForTokenClassification,
                          Trainer, TrainingArguments)
from sklearn.metrics import precision_recall_fscore_support

# Step 2: Load the dataset in CoNLL format
def load_conll_dataset(file_path):
    """Read a CoNLL-style file into parallel lists of tokens and tags.

    Every non-blank line is split on whitespace; the first column is used as
    the token and the last column as its tag, so plain ``token tag`` files as
    well as wider layouts (extra columns between token and tag) are accepted.
    One or more blank lines end the current sentence.

    Args:
        file_path: Path of the file to read.

    Returns:
        A ``(sentences, labels)`` tuple. ``sentences`` is a list of token
        lists and ``labels`` is the list of tag lists in the same order;
        ``sentences[i]`` and ``labels[i]`` always have the same length.

    Raises:
        ValueError: If a non-blank line has fewer than two columns. The
            message names the file and the 1-based line number.
    """
    sentences, labels = [], []
    sentence, label = [], []

    # 'utf-8-sig' reads plain UTF-8 unchanged and drops a leading byte-order
    # mark, which would otherwise become part of the first token.
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        for line_no, line in enumerate(f, start=1):
            columns = line.split()

            if not columns:
                # Blank line: close the sentence collected so far, if any.
                if sentence:
                    sentences.append(sentence)
                    labels.append(label)
                    sentence, label = [], []
                continue

            if len(columns) < 2:
                raise ValueError(
                    f"{file_path}: line {line_no}: expected at least a token "
                    f"and a tag, got {line.strip()!r}"
                )

            sentence.append(columns[0])
            label.append(columns[-1])

    # The file may end without a trailing blank line.
    if sentence:
        sentences.append(sentence)
        labels.append(label)
    return sentences, labels

# Load your CoNLL dataset
file_path = r'/content/drive/MyDrive/labele_data.conll'
sentences, labels = load_conll_dataset(file_path)

# One row per sentence: the list of tokens and the parallel list of tag strings.
data = {'tokens': sentences, 'ner_tags': labels}
df = pd.DataFrame(data)

# Collect every distinct tag, then sort the names before building the
# ClassLabel so each tag gets the same integer id on every run. Taking the
# order straight from a set of strings can differ between interpreter
# sessions, which would silently change the label mapping.
unique_labels = set(tag for label in labels for tag in label)
class_labels = ClassLabel(names=sorted(unique_labels))

dataset = Dataset.from_pandas(df)


def tokenize_and_align_labels(examples, tokenizer):
    """Tokenize a batch of pre-split sentences and attach per-token label ids.

    Args:
        examples: Batch with a ``'tokens'`` entry (one word list per sentence)
            and a ``'ner_tags'`` entry (one tag-string list per sentence).
        tokenizer: Callable tokenizer whose result exposes
            ``word_ids(batch_index=...)``.

    Returns:
        The tokenizer output with an added ``'labels'`` entry. For every
        sentence it holds one value per produced token: the id that the
        module-level ``class_labels`` assigns to the tag of the token's source
        word, or ``-100`` where the tokenizer reports no source word.
    """
    encoded = tokenizer(examples['tokens'], truncation=True, is_split_into_words=True)

    aligned = []
    for row, word_tags in enumerate(examples['ner_tags']):
        row_labels = []
        for word_index in encoded.word_ids(batch_index=row):
            if word_index is None:
                # No source word for this position; -100 is the value the
                # scoring code in this file filters out.
                row_labels.append(-100)
            else:
                # Every token produced from a word carries that word's tag.
                row_labels.append(class_labels.str2int(word_tags[word_index]))
        aligned.append(row_labels)

    encoded['labels'] = aligned
    return encoded

def train_and_evaluate_model(model_name, model_class, tokenizer_class):
    """Fine-tune one checkpoint for token classification and score it.

    Uses the module-level ``dataset`` and ``class_labels``. The dataset is
    tokenized with ``tokenize_and_align_labels``, the model is trained with
    ``Trainer``, and token-level weighted precision/recall/F1 are computed
    over every position whose label is not ``-100``.

    NOTE(review): the same data is used for training, per-epoch evaluation
    and the final scores, so the reported numbers are measured on data the
    model was trained on. Consider a held-out split before comparing models.

    Args:
        model_name: Checkpoint name passed to both ``from_pretrained`` calls;
            also used as the sub-directory of ``./results``.
        model_class: Model class providing ``from_pretrained``.
        tokenizer_class: Tokenizer class providing ``from_pretrained``.

    Returns:
        Dict with keys ``'model'``, ``'precision'``, ``'recall'`` and ``'f1'``.
    """
    # Local import: the collator is not part of this file's import list.
    from transformers import DataCollatorForTokenClassification

    tokenizer = tokenizer_class.from_pretrained(model_name)

    # ClassLabel exposes the label count as ``num_classes``. Passing the name
    # mappings as well stores the tag strings in the model config.
    label_names = class_labels.names
    model = model_class.from_pretrained(
        model_name,
        num_labels=class_labels.num_classes,
        id2label=dict(enumerate(label_names)),
        label2id={name: idx for idx, name in enumerate(label_names)},
    )

    # Tokenize the dataset
    tokenized_dataset = dataset.map(lambda x: tokenize_and_align_labels(x, tokenizer), batched=True)

    # Set up training arguments
    # NOTE(review): recent transformers releases appear to rename
    # `evaluation_strategy` to `eval_strategy` - confirm against the
    # installed version.
    training_args = TrainingArguments(
        output_dir=f'./results/{model_name}',
        evaluation_strategy='epoch',
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=3,
        weight_decay=0.01,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
        eval_dataset=tokenized_dataset,
        tokenizer=tokenizer,
        # Sentences are tokenized without padding, so each batch has to be
        # padded when it is assembled. This collator pads the `labels` column
        # together with the inputs; the default padding collator leaves
        # `labels` ragged, which cannot be turned into a tensor.
        data_collator=DataCollatorForTokenClassification(tokenizer),
    )

    trainer.train()

    predictions, label_ids, _ = trainer.predict(tokenized_dataset)
    predicted_ids = np.argmax(predictions, axis=2)

    # Boolean-mask indexing flattens both arrays to 1-D and drops every
    # position labelled -100, giving the metric function the flat,
    # equal-length sequences it expects. Weighted scores do not depend on
    # whether classes are named by id or by tag string, so the integer ids
    # are scored directly.
    scored = label_ids != -100
    true_ids = label_ids[scored]
    pred_ids = predicted_ids[scored]

    # zero_division=0 keeps the library's default value of 0 for classes with
    # no predictions while silencing the per-call warning.
    precision, recall, f1, _ = precision_recall_fscore_support(
        true_ids, pred_ids, average='weighted', zero_division=0
    )

    return {
        'model': model_name,
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }


# Checkpoints to compare, each paired with its model class and tokenizer class.
models = [
    ('xlm-roberta-base', XLMRobertaForTokenClassification, XLMRobertaTokenizerFast),
    ('distilbert-base-multilingual-cased', DistilBertForTokenClassification, DistilBertTokenizerFast),
    ('bert-base-multilingual-cased', BertForTokenClassification, BertTokenizerFast),
]

# Train and score every checkpoint in turn, keeping one metrics dict per model.
results = []
for model_name, model_class, tokenizer_class in models:
    print(f"Training and evaluating {model_name}...")
    result = train_and_evaluate_model(model_name, model_class, tokenizer_class)
    results.append(result)

# One row per model with columns 'model', 'precision', 'recall' and 'f1'.
results_df = pd.DataFrame(results)
print(results_df)

# Select the row with the highest F1 score and report it.
best_row_index = results_df['f1'].idxmax()
best_model = results_df.loc[best_row_index]
print(f"Best performing model: {best_model['model']} with F1 Score: {best_model['f1']}")