<a href="https://colab.research.google.com/github/eliaslimmer/PraktikumMushroom/blob/main/data_creation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Requirements Installation



In [None]:
!pip install transformers datasets evaluate accelerate seqeval scipy
!pip install --upgrade datasets

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Get helper functions from GitHub


In [None]:
# Clone the GitHub repository
!git clone https://github.com/eliaslimmer/PraktikumMushroom.git

# Change directory to the cloned repository
%cd PraktikumMushroom

import src.baseline_model as baseline_model
import src.data_preparation as data_preparation
import src.helpers as helpers
import src.metrics as metrics
!ls

# Log in to Hugging Face

In [5]:
# Authenticate this session with the Hugging Face Hub.
# NOTE(review): presumably required to download the meta-llama checkpoint that
# is loaded later in this notebook — confirm.
from huggingface_hub import notebook_login

# Renders an interactive login widget in the cell output.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# 1. Download the MKQA datasets



In [None]:
# Language code used throughout the notebook: 'de' for German, 'fr' for French
lang = 'fr'
language_data = data_preparation.prepare_dataset(lang)

# Sanity check: report the dataset size together with the first query/answer pair
dataset_size = len(language_data['queries'])
first_query = language_data['queries'][0]
first_answer = ', '.join(language_data['answers'][0])
summary_parts = [
    f"Length of {lang} dataset: {dataset_size}",
    f"sample {lang} query: {first_query}",
    f"sample {lang} answer: {first_answer}",
]
print(", ".join(summary_parts))

# 2. Generate Answers and probabilities

Define input and output paths

In [None]:
# Common root (relative to the working directory) of every file used below
DATA_DIR = 'data'

# Per-language files handed to the pipeline steps of this notebook
answers_file_path = f'{DATA_DIR}/generated_answers/output_answers_{lang}.jsonl'
span_file_path = f'{DATA_DIR}/outputs_partial_span_{lang}.jsonl'
train_data_path = f'{DATA_DIR}/trainings_set/train_dataset_{lang}.jsonl'
val_data_path = f'{DATA_DIR}/validation_set/validation_dataset_{lang}.jsonl'

# Provided split files, taken from https://github.com/Helsinki-NLP/mu-shroom/tree/main/splits
source_val_path = f'{DATA_DIR}/source_data/mushroom.{lang}-val.v2.extra.jsonl'
source_test_path = f'{DATA_DIR}/source_data/mushroom.{lang}-tst.v1.extra.jsonl'


Load the model and tokenizer from Hugging Face


In [None]:
# Hub identifier of the checkpoint used to generate the answers
model_id = "meta-llama/Llama-3.1-8B-Instruct"
# Load the weights in half precision (float16).
# NOTE(review): no device is selected here — presumably generate_answers takes
# care of device placement; confirm in src/data_preparation.
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16)
# Pad on the left side of each sequence.
# NOTE(review): presumably chosen because generate_answers batches prompts for
# a decoder-only model — confirm.
tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side='left')
# Use the end-of-sequence token as the padding token
tokenizer.pad_token = tokenizer.eos_token

Generate the answers using the model

In [None]:
# Run the loaded model/tokenizer on every query of the selected language.
# NOTE(review): presumably writes its results to answers_file_path (that path is
# the input of the span-mapping step) and is slow on the full dataset — confirm.
data_preparation.generate_answers(model, tokenizer, language_data['queries'], lang, answers_file_path)

# 3. Span mapping

In [None]:
# Map the generated answers (answers_file_path) to spans with the help of the tokenizer.
# NOTE(review): presumably writes the result to span_file_path — confirm.
data_preparation.span_mapping(answers_file_path, span_file_path, tokenizer)

# 4. Convert soft labels to hard labels

In [None]:
# Turn the span file into hard labels; train_data_path is later used as the
# "train" split of the model training step.
# NOTE(review): presumably reads span_file_path and writes train_data_path — confirm.
data_preparation.generate_hard_labels(span_file_path, train_data_path)

# 5. Validation dataset reformat

In [None]:
# Reformat the provided validation split; val_data_path is later used as the
# "validation" split of the model training step.
# NOTE(review): presumably reads source_val_path and writes val_data_path — confirm.
data_preparation.validation_reformat(source_val_path, val_data_path)

# 6. Model Training

In [None]:
# Base checkpoint to fine-tune and the tag set handed to the training helper
MODEL_NAME = 'FacebookAI/xlm-roberta-base'
LABEL_LIST = ['O', 'B']

# Split name -> JSONL file, using the files defined in the paths cell
data_files = dict(train=train_data_path, validation=val_data_path)

# Training results go to a per-language folder
output_dir = f"./results_xmr_{lang}"
baseline_model.train_model(MODEL_NAME, LABEL_LIST, data_files, output_dir=output_dir)

# 7. Model Testing

In [None]:
import glob
import os

# Folder the training step was told to write to (its output_dir)
results_dir = f'results_xmr_{lang}'
# Preferred checkpoint; kept as the default so existing runs behave exactly as before
model_path = f'{results_dir}/checkpoint-15000'

if not os.path.isdir(model_path):
    # The number of training steps depends on the dataset size and training
    # settings, so checkpoint-15000 may not exist. Fall back to the checkpoint
    # with the highest step number instead of failing inside test_model.
    checkpoints = [
        path for path in glob.glob(f'{results_dir}/checkpoint-*')
        if os.path.isdir(path) and path.rsplit('-', 1)[-1].isdigit()
    ]
    if not checkpoints:
        raise FileNotFoundError(
            f"No checkpoint found in '{results_dir}'. Run the training step first."
        )
    model_path = max(checkpoints, key=lambda path: int(path.rsplit('-', 1)[-1]))
    print(f"checkpoint-15000 not found, using {model_path} instead")

# Evaluate the fine-tuned checkpoint on the provided test split of the selected language
baseline_model.test_model(
    MODEL_NAME,
    test_lang=lang,
    model_path=model_path,
    data_path=source_test_path
)

# 8. Evaluation

In [None]:
# Gold annotations from the provided test split, separated into hard and soft references
reference = helpers.load_jsonl(source_test_path)
hard_references, soft_references = helpers.convert_reference_to_dict(reference)

# Predictions file for the selected language
# NOTE(review): presumably written by the model testing step — confirm.
predictions_path = f"{lang}-pred.jsonl"
predictions = helpers.load_jsonl(predictions_path)

results = metrics.evaluate_predictions(predictions, hard_references, soft_references)

# Report both metrics with four decimal places
print("📊 Evaluation Results:")
for metric_label, metric_key in (("Mean IoU", "mean_iou"), ("Mean Spearman", "mean_spearman")):
    print(f"{metric_label}: {results[metric_key]:.4f}")

📊 Evaluation Results:
Mean IoU: 0.4789
Mean Spearman: 0.2181
