<a href="https://colab.research.google.com/github/ever-oli/MLby22/blob/main/MultiLingualNLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
import torch
import numpy as np
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
import evaluate

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Executing on device: {device}\n")

# 1. Loading Multilingual Data
print("Loading papluca/language-identification dataset...")
train_dataset = load_dataset("papluca/language-identification", split="train").shuffle(seed=42).select(range(1000))
val_dataset = load_dataset("papluca/language-identification", split="validation").shuffle(seed=42).select(range(400))

# Extract unique labels to create a dynamic integer mapping
unique_labels = sorted(train_dataset.unique("labels"))
label2id = {label: i for i, label in enumerate(unique_labels)}
id2label = {i: label for label, i in label2id.items()}

def adjust_labels(example):
    # Convert the string label (e.g., 'es') into its corresponding integer
    return {'label': label2id[example['labels']]}

train_dataset = train_dataset.map(adjust_labels)
test_dataset = val_dataset.map(adjust_labels)

train_dataset = train_dataset.remove_columns(["labels"])
test_dataset = test_dataset.remove_columns(["labels"])

# 2. Tokenization with XLM-RoBERTa
model_checkpoint = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, max_length=128)

print("Tokenizing multilingual text...")
tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_test = test_dataset.map(tokenize_function, batched=True)

columns_to_keep = ['input_ids', 'attention_mask', 'label']
tokenized_train = tokenized_train.remove_columns([col for col in tokenized_train.column_names if col not in columns_to_keep])
tokenized_test = tokenized_test.remove_columns([col for col in tokenized_test.column_names if col not in columns_to_keep])

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# 3. Multilingual Model Initialization
# Pass the dynamic mappings directly into the configuration
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(unique_labels),
    id2label=id2label,
    label2id=label2id
)
model.to(device)

# 4. Training Setup
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

training_args = TrainingArguments(
    output_dir="./xlm-roberta-multilingual-langid",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# 5. Training and Multilingual Inference
print("\nTraining for Language Identification...")
trainer.train()

print("\nTesting Multilingual Inference for Language Identification:")
samples = [
    "Hello, how are you?",
    "Hola, ¿cómo estás?",
    "Bonjour, comment allez-vous?",
    "Guten Tag, wie geht es Ihnen?",
    "こんにちは、お元気ですか？"
]

model.eval()
with torch.no_grad():
    for text in samples:
        inputs = tokenizer(text, return_tensors="pt", truncation=True).to(device)
        outputs = model(**inputs)
        prediction = torch.argmax(outputs.logits, dim=-1).item()
        # Retrieve the string code using the model's internal id2label dictionary
        print(f"Input: {text} --> Predicted Language Code: {model.config.id2label[prediction]}")

Executing on device: cpu

Loading papluca/language-identification dataset...


Flattening the indices:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Tokenizing multilingual text...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

[1mXLMRobertaForSequenceClassification LOAD REPORT[0m from: xlm-roberta-base
Key                         | Status     | 
----------------------------+------------+-
lm_head.layer_norm.bias     | UNEXPECTED | 
lm_head.dense.bias          | UNEXPECTED | 
lm_head.dense.weight        | UNEXPECTED | 
roberta.pooler.dense.weight | UNEXPECTED | 
lm_head.layer_norm.weight   | UNEXPECTED | 
lm_head.bias                | UNEXPECTED | 
roberta.pooler.dense.bias   | UNEXPECTED | 
classifier.dense.bias       | MISSING    | 
classifier.dense.weight     | MISSING    | 
classifier.out_proj.bias    | MISSING    | 
classifier.out_proj.weight  | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m



Training for Language Identification...


  super().__init__(loader)


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,2.626044,0.4725
2,No log,2.092243,0.9075


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

  super().__init__(loader)


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

There were missing keys in the checkpoint model loaded: ['roberta.embeddings.LayerNorm.weight', 'roberta.embeddings.LayerNorm.bias', 'roberta.encoder.layer.0.attention.output.LayerNorm.weight', 'roberta.encoder.layer.0.attention.output.LayerNorm.bias', 'roberta.encoder.layer.0.output.LayerNorm.weight', 'roberta.encoder.layer.0.output.LayerNorm.bias', 'roberta.encoder.layer.1.attention.output.LayerNorm.weight', 'roberta.encoder.layer.1.attention.output.LayerNorm.bias', 'roberta.encoder.layer.1.output.LayerNorm.weight', 'roberta.encoder.layer.1.output.LayerNorm.bias', 'roberta.encoder.layer.2.attention.output.LayerNorm.weight', 'roberta.encoder.layer.2.attention.output.LayerNorm.bias', 'roberta.encoder.layer.2.output.LayerNorm.weight', 'roberta.encoder.layer.2.output.LayerNorm.bias', 'roberta.encoder.layer.3.attention.output.LayerNorm.weight', 'roberta.encoder.layer.3.attention.output.LayerNorm.bias', 'roberta.encoder.layer.3.output.LayerNorm.weight', 'roberta.encoder.layer.3.output.Laye


Testing Multilingual Inference for Language Identification:
Input: Hello, how are you? --> Predicted Language Code: en
Input: Hola, ¿cómo estás? --> Predicted Language Code: es
Input: Bonjour, comment allez-vous? --> Predicted Language Code: fr
Input: Guten Tag, wie geht es Ihnen? --> Predicted Language Code: de
Input: こんにちは、お元気ですか？ --> Predicted Language Code: ja
