# 1. Load the JSON file and classify documents as small or big

In [25]:
import json

# Read the raw corpus from raw.json (UTF-8) into data_dict.
with open('raw.json', mode='r', encoding='utf-8') as raw_file:
    data_dict = json.load(raw_file)

In [32]:
import nltk
from nltk.tokenize import word_tokenize

def check_size(doc: dict, threshold: int = 1000, tokenize=None) -> bool:
    """Return True when a document is "big" and False when it is "small".

    A document is big when its English text contains more than ``threshold``
    tokens.

    Args:
        doc: Document record. ``doc['english_sentences']`` holds the English
            text, either as a list/tuple of sentences or as a single value
            that is converted with ``str()``.
        threshold: Token count above which the document counts as big.
        tokenize: Callable mapping a string to a sequence of tokens. When
            omitted, NLTK's ``word_tokenize`` is used.

    Returns:
        True for a big document, False for a small one.

    Raises:
        KeyError: If ``doc`` has no ``'english_sentences'`` entry.
    """
    # If NLTK reports missing tokenizer data, run nltk.download('punkt') and
    # nltk.download('punkt_tab') once before calling this function.
    if tokenize is None:
        tokenize = word_tokenize

    sentences = doc['english_sentences']
    if isinstance(sentences, (list, tuple)):
        # Join the sentences rather than calling str() on the list: the list
        # repr adds brackets, quotes and commas that would be counted as
        # tokens and inflate the document size.
        text = " ".join(str(sentence) for sentence in sentences)
    else:
        text = str(sentences)

    return len(tokenize(text)) > threshold
    

In [35]:
# Sanity check: number of documents loaded into data_dict.
len(data_dict)

445

In [40]:
# Partition the corpus: check_size(doc) is True for big documents and False
# for small ones; each document is routed to the matching list.
small_docs, big_docs = [], []

for doc in data_dict:
    bucket = big_docs if check_size(doc) else small_docs
    bucket.append(doc)

Preprocess the small docs

In [44]:
# Build one {'en': ..., 'vi': ...} record per small document. Each side is the
# title followed by the document's sentences, one per line.
small_preprocessed = []

for doc in small_docs:
    en_body = "\n".join(doc['english_sentences'])
    en_full = doc['title_english'] + '\n' + en_body
    vi_body = "\n".join(doc['vietnamese_sentences'])
    vi_full = doc['title_vietnamese'] + '\n' + vi_body
    small_preprocessed.append({'en': en_full, 'vi': vi_full})

In [45]:
# Inspect the first preprocessed small document.
small_preprocessed[0]

{'en': 'THE BUM ...\nA bum approaches a well dressed gentleman on the street. "Hey, Buddy, can you spare two dollars?" The well-dressed gentleman responds, "You are not going to spend in on liquor are you?" "No, sir, I don t drink," retorts the bum. "You are not going to throw it away in some crap game, are you?" asks the gentleman. "No way, I don t gamble," answers the bum. "You wouldn t waste the money at a golf course for greens fees, would you?" asks the man. "Never," says the bum, "I don t play golf." The man asks the bum if he would like to come home with him for a home cooked meal.\nThe bum accepts eagerly.\nWhile they are heading for the man s house, the bum s curiosity gets the better of him. "Isn t your wife going to be angry when she sees a guy like me at your table?" "Probably," says the man, "but it will be worth it.\nI want her to see what happens to a guy who doesn t drink, gamble or play golf."',
 'vi': 'KẺ ĂN MÀY ...\nMột gã ăn mày tiến gần một quý ông ăn mặt sang trọn

In [62]:
import json

# Persist the preprocessed small documents as pretty-printed UTF-8 JSON
# (ensure_ascii=False keeps the Vietnamese characters unescaped).
with open('small.json', mode='w', encoding='utf-8') as out_file:
    json.dump(small_preprocessed, out_file, indent=2, ensure_ascii=False)

In [63]:
# Drop the in-memory list; later cells reload it from small.json.
del small_preprocessed

Preprocess the big docs

In [55]:
import re

def extract_bilingual_blocks(text: str):
    """Extract paired English/Vietnamese segments from tagged text.

    The text is scanned for ``<enN>...</enN>`` and ``<viN>...</viN>`` blocks,
    where ``N`` is a number. Blocks that share the same number form a pair.

    Args:
        text: Text containing the tagged blocks; blocks may span several lines.

    Returns:
        A list of ``{'en': ..., 'vi': ...}`` dicts ordered by block number.
        Segment text is stripped of surrounding whitespace. Numbers that have
        only one of the two languages are dropped.
    """
    pattern = r"<(en\d+)>(.*?)</\1>|<(vi\d+)>(.*?)</\3>"

    # Block number -> {'en': text, 'vi': text}, filled in as tags are found.
    segments = {}
    for match in re.finditer(pattern, text, flags=re.DOTALL):
        en_tag, en_text, vi_tag, vi_text = match.groups(default="")
        for lang, tag, body in (('en', en_tag, en_text), ('vi', vi_tag, vi_text)):
            if tag:
                # The block number follows the two-letter language prefix.
                number = int(tag[2:])
                segments.setdefault(number, {})[lang] = body.strip()

    # Emit complete pairs only, in ascending block order.
    return [
        {'en': segments[number]['en'], 'vi': segments[number]['vi']}
        for number in sorted(segments)
        if 'en' in segments[number] and 'vi' in segments[number]
    ]


In [64]:
import json
# Save the unprocessed big documents as pretty-printed UTF-8 JSON.
with open('big.json', mode='w', encoding='utf-8') as out_file:
    json.dump(big_docs, out_file, indent=2, ensure_ascii=False)

In [None]:
import os

import google.generativeai as genai

# SECURITY: API keys must never be hardcoded in a notebook. They are read from
# the GEMINI_API_KEYS environment variable as a comma-separated list. Any keys
# that were previously committed in this cell should be treated as leaked and
# revoked.
api_key = [key.strip() for key in os.environ.get('GEMINI_API_KEYS', '').split(',') if key.strip()]
if not api_key:
    raise RuntimeError(
        "Set the GEMINI_API_KEYS environment variable to a comma-separated "
        "list of Gemini API keys before running this cell."
    )

# One entry per big document: the list of {'en', 'vi'} segment pairs that
# extract_bilingual_blocks recovers from the model's answer.
big_preprocessed = []

for i, doc in enumerate(big_docs):
    # Full text of each side: title followed by the sentences, one per line.
    en_text = "\n".join(doc['english_sentences'])
    en_text = doc['title_english'] + '\n' + en_text
    vi_text = "\n".join(doc['vietnamese_sentences'])
    vi_text = doc['title_vietnamese'] + '\n' + vi_text

    # Rotate through the configured keys, one key per document. With three
    # keys this is the same rotation as the previous hard-coded `% 3`.
    genai.configure(api_key=api_key[(i + 2) % len(api_key)])

    # Model used to split the document into aligned segments.
    model = genai.GenerativeModel("gemini-2.5-flash")

    # The prompt asks for aligned <enN>/<viN> blocks. The worked example was
    # corrected: its second header now says Vietnamese, and the stray "đ" in
    # the <vi1> answer was removed.
    prompt = f"""##Ngữ cảnh:##\n
                Có 2 văn bản tiếng anh và tiếng việt là 2 bản dịch của nhau nhưng chúng quá dài
                ##Nhiệm vụ:##\n
                tách các văn bản trên thành các đoạn tương ứng với độ dài khoảng 200 từ trên 1 đoạn
                #Văn bản gốc tiếng anh:#\n
                {en_text}
                #Văn bản gốc tiếng việt:#\n
                {vi_text}
                ##Yêu cầu ouput có định dạng như sau:##\n
                <en1>đoạn tiếng anh 1</en1>
                <vi1>đoạn tiếng việt 1</vi1>
                <en2>đoạn tiếng anh 2</en2>
                <vi2>đoạn tiếng việt 2</vi2>
                ......
                ##Ví dụ:##
                #Input:             
                #Văn bản gốc tiếng anh:#\n
                Hi i am Gemini. i am very handsome
                #Văn bản gốc tiếng việt:#\n
                Tôi là Gemini. tôi đẹp trai
                #Ouput:
                <en1>Hi i am Gemini.</en1>
                <vi1>Tôi là Gemini.</vi1>
                <en2>i am very handsome</en2>
                <vi2>tôi đẹp trai</vi2>
                ##Lưu ý:##
                ở mỗi văn bản con tối thiểu 200 từ.
            """

    # Call the API, echo the raw answer, and keep the parsed segment pairs.
    response = model.generate_content(prompt)
    response_text = response.text
    print(response_text)
    big_preprocessed.append(extract_bilingual_blocks(response_text))




<en1>Due to the war between humans and the Dalki, every individual was required to go to military school for the duration of two years. Although humans and the Dalki were currently embracing a peaceful period, nobody believed it would last forever. The Dalki were not to be trusted. Their taste and hunger for power and their desire to control creatures they considered to be beneath them, could hardly be kept at bay. Every single day there would be news regarding one race antagonising the other, even on the verge of waging war. The citizens had this foreboding that an altercation could break out at any moment.
Quinn quickly went to the toilet before heading off with Sergeant Griff. His bladder felt like it was going to burst any second now because of the sheer amount of water that he had drunk. Once he stepped out of his room, welcomed by the outside world, something strange happened. A new notification screen appeared in front of his eyes.
[Your body is being hit by direct sunlight] [Yo

# 2. Merge the big and small parts, then split into 3 parts: train, validation and test

In [1]:
import json
# Load the three preprocessed parts from disk.
# NOTE(review): no visible cell writes big1.json or big2.json - confirm how
# they are produced from big_preprocessed.
parts = []
for path in ('big1.json', 'big2.json', 'small.json'):
    with open(path, 'r', encoding='utf-8') as f:
        parts.append(json.load(f))
big1, big2, small = parts

In [7]:
# Flatten both big parts (each is a list of per-document segment lists) and
# append the small documents, keeping the order big1, big2, small.
merge = [doc for docs in big1 for doc in docs]
merge.extend(doc for docs in big2 for doc in docs)
merge.extend(small)


In [12]:
# Sanity check: total number of records after merging.
len(merge)

746

In [16]:
import random

# Shuffle in place with a fixed seed so that the train/valid/test split is
# reproducible after Restart Kernel -> Run All. An unseeded shuffle produces a
# different split on every run.
random.Random(42).shuffle(merge)

In [18]:
# Fixed-size split: the first 600 records go to train, the next 80 to
# validation, and everything after that to test.
n_train, n_valid = 600, 80
valid_end = n_train + n_valid

train = merge[:n_train]
valid = merge[n_train:valid_end]
test = merge[valid_end:]

In [19]:
# Write each split to its own pretty-printed UTF-8 JSON file.
for out_path, split in (('train.json', train), ('valid.json', valid), ('test.json', test)):
    with open(out_path, 'w', encoding='utf-8') as f:
        json.dump(split, f, indent=2, ensure_ascii=False)