In [12]:
!pip install sentencepiece



In [13]:
!pip install datasets



In [14]:
# Import necessary libraries
import torch
import sentencepiece
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments
from datasets import load_dataset, load_metric

In [15]:
# Ensuring reproducibility: seed PyTorch's random number generators with one
# shared constant.
# NOTE(review): only torch is seeded in this cell; Python's `random` and NumPy
# are not — confirm nothing downstream relies on them.
RANDOM_SEED = 42

torch.manual_seed(RANDOM_SEED)
# Seed every CUDA device as well (relevant when running on GPU)
torch.cuda.manual_seed_all(RANDOM_SEED)

In [16]:
# Select the compute device: CUDA when a GPU is available, otherwise the CPU.
# The model and the input tensors are moved to this device in later cells.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [17]:
# Step 1: Load the Pre-trained Model
# NOTE(review): the checkpoint name reads as a Vietnamese->English OPUS-MT model,
# not a Bahnar model; presumably it is the starting point that is fine-tuned on
# Bahnar->English data below — confirm.
model_name = "Helsinki-NLP/opus-mt-vi-en"  # Pre-trained checkpoint used as the starting point
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Load the seq2seq weights and move them to the selected device
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)



In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset

In [19]:
# Step 2: Load and Prepare Bahnar to English Dataset

DATA_DIR = '/kaggle/input/ba-vi-en-dataset'


def load_parallel_split(split):
    """Read one split of the parallel corpus into a two-column DataFrame.

    Args:
        split: File-name suffix of the split to read ('train' or 'eval' here).

    Returns:
        A DataFrame with one sentence pair per row in the columns
        'Bahnar' and 'English'.

    Raises:
        AssertionError: If the two files do not have the same number of lines.
    """
    ba_path = f'{DATA_DIR}/all.vi-en-ba.bdq-filtered.bdq.{split}'
    en_path = f'{DATA_DIR}/all.vi-en-ba.eng-filtered.eng.{split}'

    # Iterating a file keeps the line terminator on every line; drop it so the
    # sentences stay clean when they are printed, pickled or split into tokens.
    with open(ba_path, 'r', encoding='utf-8') as file_ba:
        ba_lines = [line.rstrip('\n') for line in file_ba]

    with open(en_path, 'r', encoding='utf-8') as file_en:
        en_lines = [line.rstrip('\n') for line in file_en]

    # The two files are aligned line by line, so their lengths must match
    assert len(ba_lines) == len(en_lines), "The files don't have the same number of lines."

    return pd.DataFrame({'Bahnar': ba_lines, 'English': en_lines})


train_df = load_parallel_split('train')
print(len(train_df))

val_df = load_parallel_split('eval')

# Convert the dataframes to Hugging Face Datasets
ba_en_train_dataset = Dataset.from_pandas(train_df)
ba_en_val_dataset = Dataset.from_pandas(val_df)



30030


In [20]:
# Read the test samples from files.
# Iterating a file keeps the line terminator on every line; drop it so the
# sentences stay clean when they are printed, pickled or used as references.
with open('/kaggle/input/ba-vi-en-dataset/all.vi-en-ba.bdq-filtered.bdq.test', 'r', encoding='utf-8') as file_ba:
    ba_data = [line.rstrip('\n') for line in file_ba]

with open('/kaggle/input/ba-vi-en-dataset/all.vi-en-ba.eng-filtered.eng.test', 'r', encoding='utf-8') as file_en:
    en_data = [line.rstrip('\n') for line in file_en]

# The two files are aligned line by line, so their lengths must match
assert len(ba_data) == len(en_data), "The files don't have the same number of lines."

# One sentence pair per row
test_df = pd.DataFrame({'Bahnar': ba_data, 'English': en_data})

# Convert the dataframe to a Hugging Face Dataset
ba_en_test_dataset = Dataset.from_pandas(test_df)
print(len(ba_en_train_dataset), len(ba_en_val_dataset), len(ba_en_test_dataset))

30030 2000 2000


In [21]:

MAX_LENGTH = 228  # maximum number of tokens kept per sentence (source and target)


def preprocess_function(examples):
    """Tokenize a batch of Bahnar/English pairs into model inputs and labels.

    Args:
        examples: A batch from the dataset (called with ``batched=True``), read
            through its 'Bahnar' and 'English' columns.

    Returns:
        The tokenizer output for the Bahnar sentences with an extra "labels"
        entry holding the token ids of the English sentences. Both sides are
        truncated and padded to ``MAX_LENGTH``; padding positions in the labels
        are set to -100.
    """
    inputs = examples['Bahnar']
    targets = examples['English']
    model_inputs = tokenizer(inputs, max_length=MAX_LENGTH, padding='max_length', truncation=True)

    # Set up the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=MAX_LENGTH, padding='max_length', truncation=True)

    # The labels are padded here rather than by the data collator, so the pad
    # ids have to be masked by hand: -100 is the index the loss ignores.
    # Without this the model is also trained (and validated) on padding.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in label_ids]
        for label_ids in labels["input_ids"]
    ]
    return model_inputs

tokenized_ba_en = ba_en_train_dataset.map(preprocess_function, batched=True)
tokenized_ba_en_val = ba_en_val_dataset.map(preprocess_function, batched=True)


  0%|          | 0/31 [00:00<?, ?ba/s]



  0%|          | 0/2 [00:00<?, ?ba/s]

In [None]:
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer, EarlyStoppingCallback
# NOTE(review): EarlyStoppingCallback is imported but never passed to the trainer below.

# Sanity check: number of tokenized training examples
print((len(tokenized_ba_en)))
# Use data collator: batches the tokenized examples for the seq2seq model
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Step 3: Fine-Tuning the Model on Bahnar to English
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",  # checkpoints are written here
    evaluation_strategy="epoch",  # evaluate on eval_dataset once per epoch
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=10,
    weight_decay=0.015,  # weight-decay coefficient passed to the optimizer
    save_total_limit=3,  # cap on the number of checkpoints kept in output_dir
)


trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ba_en,
    eval_dataset=tokenized_ba_en_val,
    data_collator=data_collator,
)

# Fine-tunes `model` in place; the later cells use this same object
trainer.train()

# Step 4: Evaluate the Model
# Load a metric to evaluate the model, for example BLEU
# NOTE(review): `load_metric` is already imported in the first import cell, and
# the metric is loaded again in the BLEU cells further down.
from datasets import load_metric
bleu_metric = load_metric("bleu")


30030


Epoch,Training Loss,Validation Loss


In [None]:
from tqdm.auto import tqdm

def evaluate_model(model, tokenizer, dataset, device):
    """Translate every example in ``dataset`` and collect outputs and references.

    Args:
        model: Seq2seq model exposing ``eval()`` and ``generate()``.
        tokenizer: Tokenizer exposing ``encode()`` and ``decode()``.
        dataset: Iterable of mappings with the keys 'Bahnar' (source sentence)
            and 'English' (reference translation).
        device: Device the encoded input is moved to before generation.

    Returns:
        A ``(predictions, references)`` tuple. Both are lists with one entry per
        example, and every entry is a one-element list holding the sentence.
        References are returned without surrounding whitespace.
    """
    model.eval()

    predictions, references = [], []

    # Add tqdm for progress tracking
    for example in tqdm(dataset, desc="Translating"):
        # Truncate to the tokenizer's maximum length so that one over-long
        # sentence cannot exceed what the model accepts, then move the ids
        # to the correct device.
        input_ids = tokenizer.encode(
            example['Bahnar'], return_tensors='pt', truncation=True
        ).to(device)

        # Generate translation using model (a single sequence per call)
        output_ids = model.generate(input_ids)[0]

        # Decode the output and add to predictions
        pred = tokenizer.decode(output_ids, skip_special_tokens=True)
        predictions.append([pred])

        # Strip the reference: a line terminator left over from reading the
        # corpus would otherwise end up glued to its last token.
        references.append([example['English'].strip()])

    return predictions, references

In [None]:
# Materialise the test split as plain dictionaries holding only the two text columns
test_examples = [
    dict(Bahnar=row['Bahnar'], English=row['English'])
    for row in ba_en_test_dataset
]

# Translate the whole test split with the fine-tuned model
predictions, references = evaluate_model(model, tokenizer, test_examples, device)

### Saving the model

In [None]:
# Step 7: Save the Model
# Write the model weights and the tokenizer files to two separate folders,
# relative to the current working directory.
# NOTE(review): later cells read them back from /kaggle/working/model and
# /kaggle/working/tokenizer — this assumes the notebook runs with
# /kaggle/working as its working directory.
model.save_pretrained("./model")
tokenizer.save_pretrained("./tokenizer")

In [None]:
# Peek at the first prediction/reference pair as a quick sanity check
predictions[0], references[0]

In [None]:
# Whitespace-tokenize each hypothesis (every entry wraps its sentence in a one-element list)
tokenized_predictions = [wrapped_hyp[0].split() for wrapped_hyp in predictions]
# Each example gets a list of references, here with exactly one tokenized reference
tokenized_references = [[wrapped_ref[0].split()] for wrapped_ref in references]
tokenized_predictions[0], tokenized_references[0]

In [None]:
# Deliberate failure: `prt` is not defined in the visible notebook, so this call
# raises NameError and halts a "Run All" at this point.
# NOTE(review): consider an explicit `raise` with a message, or remove this cell
# before sharing the notebook.
prt() # Intentionally wrong so that the test run stops here

### Calculate BLEU score

In [None]:
# Calculate BLEU score
# Corpus-level BLEU over the tokenized predictions and references built above.
# NOTE(review): `load_metric` is deprecated in recent `datasets` releases in
# favour of the separate `evaluate` package — confirm the installed version
# still provides it.
bleu_metric = load_metric('bleu')
bleu_score = bleu_metric.compute(predictions=tokenized_predictions, references=tokenized_references)
print(f"BLEU score: {bleu_score['bleu']}")

In [None]:
# Display the first 5 prediction-reference pairs
# (each entry is a one-element list, hence the [0])
for i in range(5):
    print(f"Prediction {i+1}: {predictions[i][0]}")
    print(f"Reference {i+1}: {references[i][0]}")



In [None]:
# Function to calculate BLEU score for each pair
def calculate_bleu_score(predictions, references):
    """Compute a sentence-level BLEU score for every prediction/reference pair.

    Args:
        predictions: Sequence of one-element lists, each holding a hypothesis.
        references: Sequence of one-element lists, each holding the reference
            sentence for the hypothesis at the same position.

    Returns:
        A list of floats, one BLEU score per pair, in input order. A pair whose
        hypothesis or reference has no tokens scores 0.0.
    """
    # Give the progress bar a length when the input has one
    total = len(predictions) if hasattr(predictions, '__len__') else None

    scores = []
    for pred, ref in tqdm(zip(predictions, references), total=total):
        # split() without an argument matches the corpus-level tokenization used
        # elsewhere in the notebook and discards newlines and repeated spaces,
        # which split(' ') would leave attached to the tokens.
        pred_tokens = pred[0].split()
        ref_tokens = ref[0].split()

        # With no tokens on one side there is nothing to match; score the pair
        # as 0.0 instead of handing a zero length to the metric.
        if not pred_tokens or not ref_tokens:
            scores.append(0.0)
            continue

        score = bleu_metric.compute(predictions=[pred_tokens], references=[[ref_tokens]])
        scores.append(score['bleu'])

    return scores

# Score every prediction/reference pair individually
bleu_scores = calculate_bleu_score(predictions, references)

# Bundle each prediction with its reference and its score
combined = [triple for triple in zip(predictions, references, bleu_scores)]

# Order the triples by their BLEU score (third element), best first
sorted_pairs = sorted(combined, key=lambda triple: triple[2], reverse=True)

In [None]:
# Select the top 20 pairs. `sorted_pairs` is ordered by BLEU score, best first,
# so the first 20 entries are the 20 best-scoring pairs (the previous slice
# [20:220] skipped exactly those and returned the 200 entries after them).
top_20_pairs = sorted_pairs[:20]

# Display the top 20 pairs along with their BLEU scores
for i, (pred, ref, score) in enumerate(top_20_pairs):
    print(f"Pair {i+1} - BLEU Score: {score}")
    print(f"Prediction: {pred[0]}")
    print(f"Reference: {ref[0]}")
    print()  # For better readability

### Saving the test_samples to pickle format

In [None]:
import pickle

# Save test_samples with Pickle so the exact same examples can be reloaded
# later in the notebook for a second evaluation
with open('/kaggle/working/test_samples.pkl', 'wb') as file:
    pickle.dump(test_examples, file)


# Clean up the output

In [None]:
import os
import shutil

def clear_directory_except_subdir(directory, subdir):
    """Delete everything directly inside ``directory`` except the listed paths.

    Args:
        directory: Directory whose direct children are removed.
        subdir: Path, or iterable of paths, to keep. Each path is compared with
            ``os.path.join(directory, name)`` after normalisation, so a trailing
            separator does not matter.

    Only direct children are compared with ``subdir``; a kept directory is left
    untouched together with its contents.
    """
    # A lone string must be treated as one path. Using `in` on it directly
    # would be a substring test and would keep unrelated entries.
    if isinstance(subdir, (str, os.PathLike)):
        subdir = [subdir]
    keep = {os.path.normpath(os.fspath(path)) for path in subdir}

    for item in os.listdir(directory):
        item_path = os.path.join(directory, item)
        if os.path.normpath(item_path) in keep:
            continue
        if os.path.isdir(item_path) and not os.path.islink(item_path):
            shutil.rmtree(item_path)  # Remove directories
        elif os.path.isfile(item_path) or os.path.islink(item_path):
            # Regular files and symlinks; rmtree refuses to work on a symlink
            os.remove(item_path)

# Usage: wipe the working directory but keep the checkpoints, the saved model
# and tokenizer, and the pickled test samples.
working_dir = '/kaggle/working'
# Paths to keep; each must match os.path.join(working_dir, <entry name>)
results_dir = ['/kaggle/working/results', '/kaggle/working/model', '/kaggle/working/tokenizer', '/kaggle/working/test_samples.pkl']
clear_directory_except_subdir(working_dir, results_dir)


# Zipping for downloads

In [None]:
!zip -r model.zip /kaggle/working/model

In [None]:
!zip -r tokenizer.zip /kaggle/working/tokenizer

In [None]:
!zip -r checkpoints.zip /kaggle/working/results

# Load the model and test again

In [None]:

# Locations the model and tokenizer are read back from
model_path = '/kaggle/working/model'
tokenizer_path = '/kaggle/working/tokenizer'

# Load the model and tokenizer from disk under new names, so the reloaded copy
# can be evaluated independently of the in-memory `model` / `tokenizer`
loaded_model = AutoModelForSeq2SeqLM.from_pretrained(model_path).to(device)
loaded_tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)


In [None]:
import pickle
# Load test_samples with Pickle.
# The file is the one written earlier by this notebook; never unpickle data
# from an untrusted source, since loading a pickle can execute arbitrary code.
with open('/kaggle/working/test_samples.pkl', 'rb') as file:
    loaded_test_examples = pickle.load(file)

# Show the size and a small sample instead of dumping every example into the
# notebook output
print(len(loaded_test_examples))
loaded_test_examples[:5]

In [None]:

# Evaluate the reloaded model on the reloaded test examples.
# This overwrites `predictions` and `references` from the first evaluation.
predictions, references = evaluate_model(loaded_model, loaded_tokenizer, loaded_test_examples, device)

In [None]:
# Whitespace-tokenize each hypothesis (every entry wraps its sentence in a one-element list)
tokenized_predictions = [wrapped_hyp[0].split() for wrapped_hyp in predictions]
# Each example gets a list of references, here with exactly one tokenized reference
tokenized_references = [[wrapped_ref[0].split()] for wrapped_ref in references]
tokenized_predictions[:2], tokenized_references[:2]

In [None]:
# Calculate BLEU score for the reloaded model (corpus-level, over the
# tokenized predictions and references built in the previous cell)
bleu_metric = load_metric('bleu')
bleu_score = bleu_metric.compute(predictions=tokenized_predictions, references=tokenized_references)
print(f"BLEU score: {bleu_score['bleu']}")

### Evaluate with BLEURT (Google's learned metric)

In [None]:
# references = []

# # Add tqdm for progress tracking
# for example in tqdm(loaded_test_examples, desc="Translating"):

#     # Add the actual sentence to references
#     references.append([example['English']])

In [None]:
!pip install git+https://github.com/lucadiliello/bleurt-pytorch.git


In [None]:
import torch
from bleurt_pytorch import BleurtConfig, BleurtForSequenceClassification, BleurtTokenizer

# Load the BLEURT-20 checkpoint: configuration, scoring model and its tokenizer
# NOTE(review): `config` is not used by any visible cell — confirm it is needed.
config = BleurtConfig.from_pretrained('lucadiliello/BLEURT-20')
bleurt = BleurtForSequenceClassification.from_pretrained('lucadiliello/BLEURT-20')
bleurt_tokenizer = BleurtTokenizer.from_pretrained('lucadiliello/BLEURT-20')

In [None]:
# `predictions` / `references` hold one-element lists, but the BLEURT tokenizer
# takes flat lists of sentences (reference first, candidate second). Unwrap the
# entries so every sentence is scored as text; nested lists would not be read
# as plain sentences by the tokenizer.
bleurt_refs = [(ref[0] if isinstance(ref, (list, tuple)) else ref).strip() for ref in references]
bleurt_cands = [(pred[0] if isinstance(pred, (list, tuple)) else pred).strip() for pred in predictions]

# Score in small batches: pushing the whole test set through the model in one
# forward pass needs far more memory than necessary.
BLEURT_BATCH_SIZE = 32

bleurt.eval()
batch_scores = []
with torch.no_grad():
    for start in range(0, len(bleurt_refs), BLEURT_BATCH_SIZE):
        inputs = bleurt_tokenizer(
            bleurt_refs[start:start + BLEURT_BATCH_SIZE],
            bleurt_cands[start:start + BLEURT_BATCH_SIZE],
            padding='longest',
            return_tensors='pt',
        )
        # One score per (reference, candidate) pair
        batch_scores.append(bleurt(**inputs).logits.flatten())

# Mean BLEURT score over all pairs (kept as a tensor, as before)
res = torch.cat(batch_scores).mean()
print(res)

In [None]:
# `preds` / `refs` were never defined in this notebook, so the cell failed with
# NameError on a fresh kernel. Derive them from the evaluation results: flat
# lists of sentences, unwrapped from the one-element lists.
preds = [pred[0] if isinstance(pred, (list, tuple)) else pred for pred in predictions]
refs = [ref[0] if isinstance(ref, (list, tuple)) else ref for ref in references]

tokenized_predictions = [pred.split() for pred in preds]  # Tokenize predictions
tokenized_references = [[ref.split()] for ref in refs]    # One reference per prediction

# Corpus-level BLEU over the tokenized sentences
bleu_metric = load_metric('bleu')
bleu_score = bleu_metric.compute(predictions=tokenized_predictions, references=tokenized_references)
print(f"BLEU score: {bleu_score['bleu']}")