In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer
from sklearn.preprocessing import LabelEncoder
import numpy as np
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

### Load the dataset

In [2]:
dataset = load_dataset("papluca/language-identification")

Downloading readme:   0%|          | 0.00/4.99k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/12.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.71M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.69M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

### Check the data

In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['labels', 'text'],
        num_rows: 70000
    })
    validation: Dataset({
        features: ['labels', 'text'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['labels', 'text'],
        num_rows: 10000
    })
})

In [4]:
dataset["train"][100]

{'labels': 'zh', 'text': '如果自己家用，建议买7mm的，这个11mm的出胶太快太多了'}

In [5]:
dataset["validation"][100]

{'labels': 'nl', 'text': 'Een man speelt gitaar.'}

In [6]:
dataset["test"][100]

{'labels': 'nl',
 'text': 'Voormalig Pakistaans president Pervez Musharraf weer gearresteerd...'}

### Initialize the tokenizer

In [33]:
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, TrainingArguments, Trainer

tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-multilingual-cased")

Downloading tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

#### Tokenize the dataset

In [34]:
def preprocess_data(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True)

encoded_dataset = dataset.map(preprocess_data, batched=True)

Map:   0%|          | 0/70000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Beacuse we found that the labels are text labels (language codes like 'zh' for Chinese) rather than integer labels. We'll need to convert these text labels into integer labels for the model to work correctly.

### Convert text labels to integer labels

In [35]:
label_encoder = LabelEncoder()

#### Fit the label encoder on the training set labels

In [36]:
label_encoder.fit(dataset['train']['labels'])

### # Apply the label encoder to the datasets

In [37]:
def encode_labels(examples):
    examples['labels'] = label_encoder.transform(examples['labels'])
    return examples

encoded_dataset = encoded_dataset.map(encode_labels, batched=True)

Map:   0%|          | 0/70000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

### Prepare the data collator

In [38]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

### Split the dataset into training and validation sets

In [39]:
train_dataset = encoded_dataset["train"]
val_dataset = encoded_dataset["validation"]

### Check the split dataset and encoded labels 

In [40]:
train_dataset

Dataset({
    features: ['labels', 'text', 'input_ids', 'attention_mask'],
    num_rows: 70000
})

In [41]:
train_dataset['labels']

[12,
 1,
 19,
 15,
 13,
 11,
 17,
 14,
 16,
 17,
 12,
 5,
 0,
 8,
 7,
 2,
 11,
 2,
 3,
 13,
 12,
 12,
 14,
 19,
 10,
 5,
 5,
 1,
 15,
 13,
 6,
 12,
 2,
 12,
 6,
 6,
 15,
 18,
 5,
 4,
 3,
 5,
 0,
 9,
 13,
 11,
 15,
 10,
 2,
 4,
 2,
 18,
 19,
 8,
 7,
 10,
 2,
 0,
 17,
 15,
 8,
 19,
 12,
 18,
 5,
 1,
 6,
 18,
 14,
 16,
 11,
 13,
 15,
 14,
 14,
 3,
 10,
 1,
 9,
 6,
 19,
 17,
 4,
 15,
 10,
 8,
 19,
 7,
 19,
 19,
 16,
 6,
 7,
 1,
 15,
 18,
 13,
 2,
 17,
 2,
 19,
 5,
 9,
 19,
 18,
 3,
 13,
 18,
 6,
 16,
 9,
 3,
 9,
 9,
 12,
 4,
 7,
 8,
 2,
 13,
 9,
 11,
 9,
 8,
 13,
 1,
 0,
 17,
 4,
 15,
 7,
 11,
 8,
 13,
 16,
 4,
 4,
 9,
 7,
 8,
 11,
 2,
 13,
 8,
 1,
 2,
 4,
 17,
 5,
 19,
 13,
 6,
 15,
 1,
 4,
 12,
 18,
 6,
 1,
 3,
 16,
 10,
 7,
 14,
 8,
 15,
 1,
 3,
 8,
 19,
 6,
 12,
 16,
 13,
 9,
 10,
 9,
 11,
 18,
 17,
 16,
 9,
 13,
 12,
 5,
 9,
 10,
 0,
 12,
 11,
 2,
 14,
 10,
 18,
 7,
 14,
 13,
 7,
 19,
 19,
 3,
 4,
 13,
 7,
 6,
 9,
 7,
 7,
 2,
 9,
 4,
 9,
 8,
 9,
 13,
 10,
 4,
 16,
 6,
 3,
 11,
 10,
 1

In [42]:
val_dataset

Dataset({
    features: ['labels', 'text', 'input_ids', 'attention_mask'],
    num_rows: 10000
})

Seems we successfully encode the label and split the data!

### Get the number of unique labels

In [43]:
num_labels = len(label_encoder.classes_)
num_labels

20

### Fine-tune the model

In [44]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

### Load the pre-trained model

In [45]:
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-multilingual-cased", num_labels=num_labels)

Downloading model.safetensors:   0%|          | 0.00/542M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'pre_classifier.weight', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Define the training arguments

In [48]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

### Define the compute_metrics function

In [29]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [47]:
import torch
# Move model to GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


### Initialize a Trainer with compute_metrics

In [55]:
trainer_with_metrics = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

### Train the model

In [56]:
trainer_with_metrics.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.0239,0.057572,0.9927,0.992727,0.993024,0.9927
2,0.0025,0.066791,0.9935,0.993528,0.99381,0.9935
3,0.0044,0.072872,0.9935,0.993531,0.993822,0.9935


TrainOutput(global_step=26250, training_loss=0.010143891422663417, metrics={'train_runtime': 11794.8467, 'train_samples_per_second': 17.804, 'train_steps_per_second': 2.226, 'total_flos': 2.78270834688e+16, 'train_loss': 0.010143891422663417, 'epoch': 3.0})

### Evaluate the model

In [57]:
eval_results = trainer_with_metrics.evaluate()

In [58]:
print(f"Evaluation results: {eval_results}")

Evaluation results: {'eval_loss': 0.07287164777517319, 'eval_accuracy': 0.9935, 'eval_f1': 0.9935310587705737, 'eval_precision': 0.9938219583210134, 'eval_recall': 0.9935, 'eval_runtime': 163.8783, 'eval_samples_per_second': 61.021, 'eval_steps_per_second': 7.628, 'epoch': 3.0}


In [59]:
print(f"Validation Loss: {eval_results['eval_loss']}")
print(f"Validation Accuracy: {eval_results['eval_accuracy']}")
print(f"Validation Precision: {eval_results['eval_precision']}")
print(f"Validation Recall: {eval_results['eval_recall']}")
print(f"Validation F1: {eval_results['eval_f1']}")

Validation Loss: 0.07287164777517319
Validation Accuracy: 0.9935
Validation Precision: 0.9938219583210134
Validation Recall: 0.9935
Validation F1: 0.9935310587705737


### Save the fine-tuned model and tokenizer

In [60]:
model_dir = "./fine_tuned_model"
model.save_pretrained(model_dir)
tokenizer.save_pretrained(model_dir)

('./fine_tuned_model/tokenizer_config.json',
 './fine_tuned_model/special_tokens_map.json',
 './fine_tuned_model/vocab.txt',
 './fine_tuned_model/added_tokens.json',
 './fine_tuned_model/tokenizer.json')

### Load the saved model and tokenizer

In [61]:
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
import torch

# Load the saved model and tokenizer
loaded_model_q1 = DistilBertForSequenceClassification.from_pretrained(model_dir)
loaded_tokenizer_q1 = DistilBertTokenizerFast.from_pretrained(model_dir)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


### Example inference

In [63]:
def predict_language_q1(text):
    inputs = loaded_tokenizer_q1(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        logits = loaded_model_q1(**inputs).logits
    predicted_class_id = torch.argmax(logits, dim=1).item()
    predicted_label = label_encoder.inverse_transform([predicted_class_id])
    return predicted_label[0]

# Example text for inference
example_text_q1 = "早安，你吃早餐了嗎？"
predicted_language_q1 = predict_language_q1(example_text_q1)
print(f"Predicted language for Q1: {predicted_language_q1}")

Predicted language for Q1: zh


### Quesiton 2

In [64]:
import torch
import torch.nn as nn
from transformers import AutoModel, TrainingArguments, Trainer, DataCollatorWithPadding, DistilBertTokenizerFast, DistilBertForSequenceClassification
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import numpy as np
from datasets import load_dataset

In [132]:
# Load dataset
dataset = load_dataset("papluca/language-identification")

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-multilingual-cased")

# Preprocess the data
def preprocess_data(examples):
    return tokenizer(examples['text'], padding=True, truncation=True)

encoded_dataset = dataset.map(preprocess_data, batched=True)

# Encode labels
label_encoder = LabelEncoder()
label_encoder.fit(dataset['train']['labels'])

def encode_labels(examples):
    examples['labels'] = label_encoder.transform(examples['labels'])
    return examples

encoded_dataset = encoded_dataset.map(encode_labels, batched=True)

# Prepare data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Split the dataset
train_dataset = encoded_dataset["train"]
val_dataset = encoded_dataset["validation"]
num_labels = len(label_encoder.classes_)

### Custom model for using the [SEP] Token

In [133]:
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-multilingual-cased", num_labels=num_labels)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'pre_classifier.weight', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Initialize the custom model

In [125]:
sep_token_model = SEPTokenModel("distilbert-base-multilingual-cased", num_labels=num_labels)

### Training arguments (same as before)

In [134]:
training_args = TrainingArguments(
    output_dir="./results_sep_token",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy"
)

### Metrics calculation

In [129]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

### Training setup

In [135]:
trainer_with_sep_token = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

### Train the model

In [136]:
trainer_with_sep_token.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.0243,0.056515,0.9934,0.993432,0.993732,0.9934
2,0.0088,0.064906,0.994,0.994018,0.994264,0.994
3,0.0007,0.064295,0.9938,0.993818,0.994055,0.9938


TrainOutput(global_step=26250, training_loss=0.020685963184989633, metrics={'train_runtime': 11177.0799, 'train_samples_per_second': 18.788, 'train_steps_per_second': 2.349, 'total_flos': 2.764987008329568e+16, 'train_loss': 0.020685963184989633, 'epoch': 3.0})

### Evaluate the model

In [137]:
eval_results_sep_token = trainer_with_sep_token.evaluate()

In [138]:
print(f"Evaluation results with [SEP] token: {eval_results_sep_token}")

Evaluation results with [SEP] token: {'eval_loss': 0.06490599364042282, 'eval_accuracy': 0.994, 'eval_f1': 0.9940175466934492, 'eval_precision': 0.9942640919760731, 'eval_recall': 0.994, 'eval_runtime': 122.769, 'eval_samples_per_second': 81.454, 'eval_steps_per_second': 10.182, 'epoch': 3.0}


In [139]:
print(f"Validation Loss: {eval_results_sep_token['eval_loss']}")
print(f"Validation Accuracy: {eval_results_sep_token['eval_accuracy']}")
print(f"Validation Precision: {eval_results_sep_token['eval_precision']}")
print(f"Validation Recall: {eval_results_sep_token['eval_recall']}")
print(f"Validation F1: {eval_results_sep_token['eval_f1']}")

Validation Loss: 0.06490599364042282
Validation Accuracy: 0.994
Validation Precision: 0.9942640919760731
Validation Recall: 0.994
Validation F1: 0.9940175466934492


### Save the fine-tuned model and tokenizer

In [140]:
model_dir_sep = "./fine_tuned_sep_token_model"
model.save_pretrained(model_dir_sep)
tokenizer.save_pretrained(model_dir_sep)

('./fine_tuned_sep_token_model/tokenizer_config.json',
 './fine_tuned_sep_token_model/special_tokens_map.json',
 './fine_tuned_sep_token_model/vocab.txt',
 './fine_tuned_sep_token_model/added_tokens.json',
 './fine_tuned_sep_token_model/tokenizer.json')

### Load the saved model and tokenizer

In [160]:
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
import torch

# Load the saved model and tokenizer
loaded_model_q2 = AutoModelForSequenceClassification.from_pretrained(model_dir_sep, num_labels=num_labels)
loaded_tokenizer_q2 = AutoTokenizer.from_pretrained(model_dir_sep)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


### Example inference

In [169]:
def predict_language_q2_simple(text):
    inputs = loaded_tokenizer_q2(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs = loaded_model_q2(**inputs)
        logits = outputs.logits
        predicted_class_id = torch.argmax(logits, dim=1).item()
        predicted_label = label_encoder.inverse_transform([predicted_class_id])
        return predicted_label[0]

In [172]:
# Example text for inference
example_text_q2 = "Bonjour, comment allez-vous?"
predicted_language_q2_simple = predict_language_q2_simple(example_text_q2)
print(f"Predicted language for Q2: {predicted_language_q2_simple}")

Predicted language for Q2: fr


In [164]:
print("Comparison of Evaluation Metrics:")
print("=================================")

print("Metrics with [CLS] token:")
print(f"Accuracy: {eval_results['eval_accuracy']:.4f}")
print(f"F1-score: {eval_results['eval_f1']:.4f}")
print(f"Precision: {eval_results['eval_precision']:.4f}")
print(f"Recall: {eval_results['eval_recall']:.4f}")

print("\nMetrics with [SEP] token:")
print(f"Accuracy: {eval_results_sep_token['eval_accuracy']:.4f}")
print(f"F1-score: {eval_results_sep_token['eval_f1']:.4f}")
print(f"Precision: {eval_results_sep_token['eval_precision']:.4f}")
print(f"Recall: {eval_results_sep_token['eval_recall']:.4f}")


Comparison of Evaluation Metrics:
Metrics with [CLS] token:
Accuracy: 0.9935
F1-score: 0.9935
Precision: 0.9938
Recall: 0.9935

Metrics with [SEP] token:
Accuracy: 0.9940
F1-score: 0.9940
Precision: 0.9943
Recall: 0.9940


* **Accuracy:** The model achieved an accuracy of 0.9940 with the [SEP] token, which is marginally higher than the 0.9935 accuracy obtained with the [CLS] token.
* **F1-score:** Similarly, the F1-score with the [SEP] token was 0.9940, compared to 0.9935 with the [CLS] token.
* **Precision and Recall:** Both precision and recall were higher with the [SEP] token, at 0.9943 and 0.9940 respectively, compared to 0.9938 and 0.9935 with the [CLS] token.

Additionally, the model evaluated with the [SEP] token exhibited a lower loss (0.0649) compared to the [CLS] token (0.0729), indicating better overall performance. The evaluation runtime was also shorter for the [SEP] token, processing samples and steps at a faster rate.
The slight improvements in accuracy, F1-score, precision, and recall, along with the lower evaluation loss and faster runtime, demonstrate that utilizing the [SEP] token for embedding yields better performance for the language identification task. This suggests that the [SEP] token captures relevant information more effectively for this particular model and dataset.