In [1]:
!pip install transformers \
bitsandbytes \
sentencepiece \
transformers[sentencepiece] \
accelerate \
datasets \
trl \
seacrowd



In [2]:
!pip install -U peft



In [3]:
import torch
import transformers
import argparse
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from peft import (
        get_peft_model,
        prepare_model_for_kbit_training,
        LoraConfig
    )
from trl import SFTTrainer

2024-12-16 05:18:18.258439: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-12-16 05:18:18.641681: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-12-16 05:18:18.641764: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-16 05:18:18.712203: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-16 05:18:18.850640: I tensorflow/core/platform/cpu_feature_guar

In [4]:
data = load_dataset("SEACrowd/indo_general_mt_en_id", trust_remote_code=True)
data_train, data_test, data_val = data["train"], data["test"], data["validation"]
# Randomly select 10k indices
import random
random_indices = random.sample(range(len(data_train)), 10000)

# Select the 10k rows
data_train = data_train.select(random_indices)
print(data_train, data_test, data_val)

# example
data_train[0]

Dataset({
    features: ['id', 'src', 'tgt'],
    num_rows: 10000
}) Dataset({
    features: ['id', 'src', 'tgt'],
    num_rows: 2000
}) Dataset({
    features: ['id', 'src', 'tgt'],
    num_rows: 2000
})


{'id': '1341140',
 'src': '"It was developed by Visual Concepts and published by 2K Sports, a subsidiary of Take-Two Interactive."',
 'tgt': '"Civilization VI dikembangkan oleh Firaxis Games, diterbitkan oleh 2K Games, dan didistribusikan oleh Take-Two Interactive."'}

In [5]:
model_name = "microsoft/Phi-3.5-mini-instruct"
model = AutoModelForCausalLM.from_pretrained(model_name,
                                            load_in_8bit=True,
                                             torch_dtype=torch.float16,
                                            device_map="auto"
                                            )
tokenizer = AutoTokenizer.from_pretrained(model_name)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
# Define LoRA configuration
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["qkv_proj", "o_proj"],  # Adjust based on your model architecture
    lora_dropout=0.1,
    bias="none",
    task_type="SEQ_2_SEQ_LM",
    # bnb_8bit=True  # Enable bitsandbytes 8-bit quantization
)

In [7]:
# this should be set for finutning and batched inference
tokenizer.add_special_tokens({"pad_token": "<PAD>"})
model.resize_token_embeddings(len(tokenizer))
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)

In [8]:
from peft import PeftModel

peft_model_id = "checkpoint-620"
peft_model = PeftModel.from_pretrained(model, peft_model_id, torch_dtype=torch.float16, offload_folder="lora_results/lora_7/temp")



In [9]:
def generate_prompt(dialogue, output=None, eos_token="</s>"):
  instruction = """
Translate the following sentences from English to Indonesian:

English: I am going to the market. What do you want to buy?.
Indonesian: Saya akan pergi ke pasar. Apa yang ingin kamu beli?.

English: """
  input = f"{dialogue}\n"
  output = f"""Indonesian:
#answer: 
{output + ' ' + eos_token if output else ''} """
  prompt = (" ").join([instruction, input, output])
  return prompt

In [10]:

input_prompt = generate_prompt(data_test[37]["src"])
input_tokens = tokenizer(input_prompt, return_tensors="pt")["input_ids"].to("cuda")
with torch.amp.autocast('cuda'):
    generation_output = peft_model.generate(
        input_ids=input_tokens,
        max_new_tokens=100,
        do_sample=True,
        top_k=10,
        top_p=0.9,
        temperature=0.3,
        repetition_penalty=1.15,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
    )
output_text = tokenizer.decode(generation_output[0], skip_special_tokens=True)
translated_text = ""
if "#answer" in output_text:
    # Split by lines and find the text after '#answer'
    lines = output_text.split("\n")
    try:
        index = next((i for i, line in enumerate(lines) if line.startswith("#answer")), -1) +1# Get the line after '#answer'
        translated_text = lines[index].strip()  # Clean up any extra spaces
        translated_text = translated_text.replace("Indonesian:", "").strip()
    except IndexError:
        translated_text = "Translation not found."
else:
    translated_text = "No '#answer' marker in output."
print(output_text)
print(translated_text)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



Translate the following sentences from English to Indonesian:

English: I am going to the market. What do you want to buy?.
Indonesian: Saya akan pergi ke pasar. Apa yang ingin kamu beli?.

English:  Now a damane.
 Indonesian:
#answer: 
  Sekarang ada daerahan.
Sekarang ada daerahan.


In [None]:
import torch
import csv
import numpy as np
import os
from nltk.translate.bleu_score import sentence_bleu
# Check if CUDA (GPU) is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# List to store translations
translations = []
bleu_scores = []
# Process each sentence in the test dataset
start = 229
for key, (input_text, output_text_real) in enumerate(zip(data_test['src'][start:], data_test['tgt'][start:]), start=229):    # Create the prompt for the current sentence
    
    # Tokenize the prompt
    inputs = tokenizer(generate_prompt(input_text), return_tensors="pt").to("cuda")

    # Generate the translation
    # generate_ids = model.generate(inputs['input_ids'], max_length=500)
    # with torch.amp.autocast('cuda'):
    #     generation_output = peft_model.generate(
    #         input_ids=inputs,
    #         max_new_tokens=300,
    #         do_sample=True,
    #         top_k=10,
    #         top_p=0.9,
    #         temperature=0.3,
    #         repetition_penalty=1.15,
    #         num_return_sequences=1,
    #         eos_token_id=tokenizer.eos_token_id,
    #     )
    with torch.no_grad():
        with torch.amp.autocast(device_type="cuda", dtype=torch.float16):
            generation_output = peft_model.generate(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_new_tokens=150,  # Shortened for efficiency
                do_sample=True,
                top_k=50,
                top_p=0.95,
                temperature=0.8,
                repetition_penalty=1.15,
                num_return_sequences=1,
                eos_token_id=tokenizer.eos_token_id,
            )
    
    # Decode the output and extract the translation
    output_text = tokenizer.decode(generation_output[0], skip_special_tokens=True)
    
    # Extract the sentence one line below `#answer`
    translated_text = ""
    if "#answer" in output_text:
        # Split by lines and find the text after '#answer'
        lines = output_text.split("\n")
        try:
            # index = lines.index("#answer") + 1  # Get the line after '#answer'
            index = next((i for i, line in enumerate(lines) if line.startswith("#answer")), -1) +1
            translated_text = lines[index].strip()  # Clean up any extra spaces
            translated_text = translated_text.replace("Indonesian:", "").strip()
        except IndexError:
            translated_text = "Translation not found."
    else:
        translated_text = "No '#answer' marker in output."
    # Calculate BLEU score for this sentence
    bleu_score = sentence_bleu([output_text_real], translated_text)
    with open('predictions_phi35_finetune_one_shot_test.csv', mode='a', newline='') as file:
        if not os.path.exists('predictions_phi35_finetune_one_shot_test.csv'):
            writer.writerow(['Index', 'Input', 'Predicted Output', 'Correct Output', 'BLEU Score'])
        writer = csv.writer(file)
        writer.writerow([key + 1, input_text, translated_text, output_text_real, bleu_score])
    # Print the results
    print(f"Key: {key}")
    print(f"Input: {input_text}")
    print(f"Predicted: {translated_text}")
    print(f"Reference: {output_text_real}")
    print(f"BLEU Score: {bleu_score:.4f}")
    print("-" * 50)
    bleu_scores.append(bleu_score)
print(np.mean(bleu_scores))



Key: 229
Input: He also ordained Richard Whatcoat and Thomas Vasey as presbyters.
Predicted: Dia juga memperintah Richard Whatcote dan Thomasasuspresbiterannya.
Reference: Ia juga menahbiskan Richard Whatcoat dan Thomas Vasey sebagai presbiter.
BLEU Score: 0.5884
--------------------------------------------------
Key: 230
Input: "On the way back to Thessaly, Medea prophesied that Euphemus, the helmsman of Jason's ship, the Argo, would one day rule over all Libya."
Predicted: Belum diindikasikan dalam benda-benda kembali Ke Tesiselan, Medea coba tercipta prekaryanya bahwa Eufemeiah, pil dan kapitan laut Pergubun jari Janyasa Prambawa sampai tinggal menjabat osewasi setiap Kabupaten Lifuri pada masa edfesi akhirnya dulu ya?
Reference: "Dalam perjalananya, Medeia meramalkan bahwa Eufemos, jurumudi kapal Argo, suatu hari akan berkuasa di Libya."
BLEU Score: 0.2016
--------------------------------------------------
Key: 231
Input: "In 2014, the attendance is 4,000 persons."
Predicted: Belum

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Key: 236
Input: """Theme park plan still on, timing unclear: GEN Malaysia""."
Predicted: Pengaturan taman hiburan masih berjalannya, waktunya ketidakklarifikasional dalam RAN MALAYSIHA.:
Reference: """Venue Menembak Asian Para Games Belum Beres, Adaptasi Indonesia Minim"". detiksport."
BLEU Score: 0.0000
--------------------------------------------------
Key: 237
Input: Rows in blue are not present in ISO 3166-2.
Predicted: Rowan unggul di biru tidak ada dalam ISOTO3166-2.
Reference: Kode-kode ini tidak terdaftar dalam ISO 3166-2.
BLEU Score: 0.4547
--------------------------------------------------
Key: 238
Predicted: India Raja Bintang Sarjana (Sebelum) - Perdana Menteripnanti Teknik dan Pengetahuan Telatan Indon Sergio Kandhalib diberitakan dua hari seteret dikerjakannya melawan pemutuh kegiatankamunnyih mengatakatlah sebuyanya berboikot terhadap warna-warni Nikmat Waktur Terakhir Suroju Januarsendulika dengan menjelaskarkan dirimorlan adalah bocok tadi ("babuk").
Reference: "Tiga h

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Key: 293
Input: "These scholars fear that users will be unable to "" beyond narrow self-interest"" as filter bubbles create personalized social feeds, isolating them from diverse points of view and their surrounding communities."
Predicted: هذا الفقهاء خوف أن تكون المستخدمين غير بعيدة جدًا عن "المصلحة الشخصية"، כثير الزبائن الإضافية؛ حيث تظهر أحزمة الفضاء الآلي DNA الشخصي النشط للتغذية الاجتماعية، وهذه الحزم التفاعل
Reference: "Para sarjana ini khawatir bahwa pengguna tidak akan dapat "" di luar kepentingan pribadi yang sempit"" karena gelembung filter membuat umpan sosial yang dipersonalisasi, mengisolasi mereka dari beragam sudut pandang dan komunitas sekitar mereka."
BLEU Score: 0.0000
--------------------------------------------------
Key: 294
Input: Nick Raider's stories are set in New York City.
Predicted: Stori Nikk raidir terjebos di Kota Jalan Besar NYCnya.
Reference: Konflik Nick Jago terjadi di Skotlandia.
BLEU Score: 0.2153
--------------------------------------------------

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Key: 434
Input: "As the audience is waiting for her to perform, scenes of Rihanna approaching the stage are shown."
Predicted: ภายในหลังประชากรที่พวกเข้าอยู่จำแม็กส์ด้วยความตื่นตาบ้าง，โดมไดเธอร์สิทธิ์ของไรห์ต่อศูน์สุดคือภาพที่สาด看ได้ .
Reference: "Untuk mencegah terjadinya kelebihan penonton, maka Stadion Rajamangala mengadaakn perluasan."
BLEU Score: 0.0000
--------------------------------------------------
Key: 435
Input: "It was, at that point, the first song the group put out after having split with longtime manager and producer Maurice Starr."
Predicted: ,""Sepertinya itulah pertama lagu tim tersebut setelah bersjalinkan dengan pengurus dan produsen lebisy-lebisikan maurice starr panjang sejak awalnya""
Reference: "Setelah merilis album berikutnya pada tahun 1990, Step By Step, mereka berpisah dari manajer dan produser mereka Maurice Starr dan tidak lama kemudian membubarkan diri."
BLEU Score: 0.3336
--------------------------------------------------
Key: 436
Input: "Maximum negat

In [None]:
# import torch
# import csv
# import numpy as np
# import os
# from nltk.translate.bleu_score import sentence_bleu
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# batch_size = 4
# translations = []
# bleu_scores = []

# # Open file once
# key=0
#     # Process in batches
# for i in range(0, len(data_test['src']), batch_size):
#     batch_inputs = data_test['src'][i:i + batch_size]
#     batch_references = data_test['tgt'][i:i + batch_size]

#     # Create prompts
#     prompt_template = """
# Translate the following sentences from English to Indonesian:

# English: {input_text}
# Indonesian:
# #answer
# """
#     batch_prompts = [prompt_template.format(input_text=input_text) for input_text in batch_inputs]

#     # Tokenize
#     inputs = tokenizer(batch_prompts, return_tensors="pt", padding=True, truncation=True, return_attention_mask=True).to("cuda")

#     # Generate translations
#     # generate_ids = model.generate(inputs['input_ids'], max_length=300)
#     with torch.amp.autocast('cuda'):
#         generate_ids = peft_model.generate(
#             input_ids=inputs["input_ids"],
#             max_new_tokens=300,
#             do_sample=True,
#             top_k=10,
#             top_p=0.9,
#             temperature=0.2,
#             repetition_penalty=1.15,
#             num_return_sequences=1,
#             eos_token_id=tokenizer.eos_token_id,
#         )
#     batch_outputs = tokenizer.batch_decode(generate_ids, skip_special_tokens=True)

#     # Extract translations and calculate BLEU
#     for idx, (input_text, output_text_real, output_text) in enumerate(zip(batch_inputs, batch_references, batch_outputs)):

#         # Extract the sentence one line below `#answer`
#         translated_text = ""
#         if "#answer" in output_text:
#             # Split by lines and find the text after '#answer'
#             lines = output_text.split("\n")
#             try:
#                 # index = lines.index("#answer") + 1  # Get the line after '#answer'
#                 index = next((i for i, line in enumerate(lines) if line.startswith("#answer")), -1) +1
#                 translated_text = lines[index].strip()  # Clean up any extra spaces
#                 translated_text = translated_text.replace("Indonesian:", "").strip()
#             except IndexError:
#                 translated_text = "Translation not found."
#         else:
#             translated_text = "No '#answer' marker in output."
#         # Calculate BLEU score for this sentence
#         bleu_score = sentence_bleu([output_text_real], translated_text)
#         with open('predictions_phi35_finetune_test.csv', mode='a', newline='') as file:
#             writer = csv.writer(file)
#             if not os.path.exists('predictions_phi35_finetune_test.csv'):
#                 writer.writerow(['Index', 'Input', 'Predicted Output', 'Correct Output', 'BLEU Score'])
#             writer.writerow([key + 1, input_text, translated_text, output_text_real, bleu_score])
#         # Print the results
#         print(f"Input: {input_text}")
#         print(f"Predicted: {translated_text}")
#         print(f"Reference: {output_text_real}")
#         print(f"BLEU Score: {bleu_score:.4f}")
#         print("-" * 50)
#         bleu_scores.append(bleu_score)
#         key +=1

# # Print average BLEU score
# print(f"Average BLEU Score: {np.mean(bleu_scores):.4f}")