In [1]:
import os
import math
import random
from collections import Counter

POSSIBLE_PATHS = [
    "/kaggle/input/final-data/processed", 
    "/kaggle/input/final-data", 
    "/kaggle/input/processed"
]
DATA_DIR = None
for p in POSSIBLE_PATHS:
    if os.path.exists(p) and "train_unpc.en" in os.listdir(p):
        DATA_DIR = p
        break

if not DATA_DIR:
    print("Data path not found. Please manually modify the DATA_DIR variable!")
else:
    print(f"Data Directory: {DATA_DIR}")

def analyze_corpus(name, src_filename, tgt_filename):
    src_path = os.path.join(DATA_DIR, src_filename)
    tgt_path = os.path.join(DATA_DIR, tgt_filename)
    
    if not os.path.exists(src_path) or not os.path.exists(tgt_path):
        print(f"⚠️ Skipping {name}: File not found")
        return

    print(f"\n======== Analyzing: {name} ========")
    
    # Statistical variables
    src_lines, tgt_lines = [], []
    src_lens_char, tgt_lens_char = [], []
    src_lens_word, tgt_lens_word = [], []
    src_vocab, tgt_vocab = set(), set()
    
    # Read Source
    with open(src_path, 'r', encoding='utf-8') as f:
        for line in f:
            l = line.strip()
            src_lines.append(l)
            src_lens_char.append(len(l))
            words = l.split() # Simple whitespace tokenization
            src_lens_word.append(len(words))
            src_vocab.update(words)

    # Read Target
    with open(tgt_path, 'r', encoding='utf-8') as f:
        for line in f:
            l = line.strip()
            tgt_lines.append(l)
            tgt_lens_char.append(len(l))
            words = l.split()
            tgt_lens_word.append(len(words))
            tgt_vocab.update(words)

    count = len(src_lines)
    print(f"1. Line Count (Sentence Pairs): {count}")
    
    avg_src_w = sum(src_lens_word) / count
    avg_tgt_w = sum(tgt_lens_word) / count
    avg_src_c = sum(src_lens_char) / count
    avg_tgt_c = sum(tgt_lens_char) / count
    
    print(f"2. Avg Length (Source EN): {avg_src_w:.2f} words, {avg_src_c:.2f} chars")
    print(f"   Avg Length (Target ZH): {avg_tgt_w:.2f} words, {avg_tgt_c:.2f} chars")

    print(f"3. Vocabulary Size (Unique Tokens): Source={len(src_vocab)}, Target={len(tgt_vocab)}")

    ratios = []
    for sl, tl in zip(src_lens_char, tgt_lens_char):
        if tl == 0: continue
        ratios.append(sl / tl)
    if ratios:
        avg_ratio = sum(ratios) / len(ratios)
        print(f"4. Avg Length Ratio (Src/Tgt Char Ratio): {avg_ratio:.2f}")

    def get_entropy(text_list):
        full_text = "".join(text_list)
        if not full_text: return 0
        counts = Counter(full_text)
        total = len(full_text)
        ent = 0
        for cnt in counts.values():
            p = cnt / total
            ent -= p * math.log2(p)
        return ent

    print("5. Calculating Character Entropy...")
    print(f"   Source Entropy: {get_entropy(src_lines):.4f}")
    print(f"   Target Entropy: {get_entropy(tgt_lines):.4f}")

    print("\n----- Random Sample (Qualitative Check) -----")
    indices = random.sample(range(count), min(20, count))
    for idx in indices:
        print(f"[{idx}] EN: {src_lines[idx]}")
        print(f"      ZH: {tgt_lines[idx]}")

if DATA_DIR:
    # Analyze UNPC Cleaned
    analyze_corpus("UNPC Cleaned (Train)", "train_unpc.en", "train_unpc.zh")
    
    # Analyze TED Cleaned
    analyze_corpus("TED Cleaned (Train)", "train_ted.en", "train_ted.zh")

Data Directory: /kaggle/input/final-data/processed

1. Line Count (Sentence Pairs): 462483
2. Avg Length (Source EN): 19.45 words, 123.11 chars
   Avg Length (Target ZH): 2.81 words, 49.63 chars
3. Vocabulary Size (Unique Tokens): Source=372300, Target=669711
4. Avg Length Ratio (Src/Tgt Char Ratio): 2.42
5. Calculating Character Entropy...
   Source Entropy: 4.8487
   Target Entropy: 8.5990

----- Random Sample (Qualitative Check) -----
[39781] EN: GPS works in very logical steps; each of the orbiting satellites beams a continuous radio signal to Earth, which is received by a GPS receiver in order to derive distances by measuring the travel time of the radio signals.
      ZH: 13. 全球定位系统工作步骤非常富有逻辑性；每一在轨卫星向地球发射连续无线电信号，该信号被全球定位系统接收机接收，以便通过量测无线电信号的传播时间得出距离。
[96625] EN: 56. At the invitation of the Chairman, Ms. Mardach Miguel (Women for Peace and Justice for Vieques) took a seat at the petitioners &apos; table.
      ZH: 56. 在主席的邀请下，Mardach Miguel女士（妇女促进别克斯和平与正义）在请愿者桌前就座。
[52380] EN: Ann

In [2]:
!pip install -q transformers[torch] datasets sacrebleu evaluate sentencepiece

import torch

print(f"GPU Available: {torch.cuda.is_available()}")

if torch.cuda.is_available():
    print(f"Current Device: {torch.cuda.get_device_name(0)}")
else:
    print("Warning: You are using CPU!")

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hGPU Available: True
Current Device: Tesla T4


In [4]:
# Tokenizer Analysis & Data Path Verification
import os
import numpy as np
from transformers import AutoTokenizer

POSSIBLE_PATHS = [
    "/kaggle/input/final-data/processed", 
    "/kaggle/input/final-data",          
    "/kaggle/input/processed"            
]

DATA_DIR = None
for p in POSSIBLE_PATHS:
    if os.path.exists(p) and "train_unpc.en" in os.listdir(p):
        DATA_DIR = p
        break

if not DATA_DIR:
    print("Data not found! Please check the Data panel on the right, copy the 'processed' folder path, and update the paths above.")
else:
    print(f"Data path confirmed: {DATA_DIR}")

MODEL_CHECKPOINT = "Helsinki-NLP/opus-mt-en-mul"

# Load Tokenizer
print(f"Loading model vocabulary: {MODEL_CHECKPOINT}...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

def analyze_file(filename, label):
    filepath = os.path.join(DATA_DIR, filename)
    print(f"\n Analyzing: {label} ({filename})")
    
    lengths = []
    with open(filepath, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            if i >= 10000: break 
            tokens = tokenizer.tokenize(line.strip())
            lengths.append(len(tokens))
            
            if i < 2:
                print(f"  [Sample {i+1}] Text: {line.strip()[:50]}...")
                print(f"  [Sample {i+1}] Tokens: {tokens}")

    print(f" {label} Average Length: {np.mean(lengths):.2f} tokens")

# Run Analysis
if DATA_DIR:
    analyze_file("train_unpc.en", "English Train Set")
    analyze_file("train_unpc.zh", "Chinese Train Set")

Data path confirmed: /kaggle/input/final-data/processed
Loading model vocabulary: Helsinki-NLP/opus-mt-en-mul...


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

source.spm:   0%|          | 0.00/790k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/707k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]




 Analyzing: English Train Set (train_unpc.en)
  [Sample 1] Text: a 1999 data are provisional....
  [Sample 1] Tokens: ['▁a', '▁1999', '▁data', '▁are', '▁provisional', '.']
  [Sample 2] Text: Recalling its resolution 49/251 of 20 July 1995 on...
  [Sample 2] Tokens: ['▁Recalling', '▁its', '▁resolution', '▁49', '/25', '1', '▁of', '▁20', '▁July', '▁1995', '▁on', '▁the', '▁financing', '▁of', '▁the', '▁Tribunal', '▁and', '▁its', '▁subsequent', '▁resolutions', '▁thereon', ',', '▁the', '▁latest', '▁of', '▁which', '▁was', '▁resolution', '▁53', '/21', '3', '▁of', '▁18', '▁December', '▁1998,']
 English Train Set Average Length: 27.84 tokens

 Analyzing: Chinese Train Set (train_unpc.zh)
  [Sample 1] Text: a 1999年数据为暂定数据。...
  [Sample 1] Tokens: ['▁a', '▁1999', '年', '数', '据', '为', '暂', '定', '数', '据', '。']
  [Sample 2] Text: 回顾其1995年7月20日关于该法庭经费筹措的第49/251号决议及其后各项有关决议,最近的一项是1...
  [Sample 2] Tokens: ['▁', '回', '顾', '其', '1995', '年', '7', '月', '20', '日', '关', '于', '该', '法', '庭', '经', '费', '筹措', '的'

In [6]:
# Training
from datasets import Dataset
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
import evaluate
import numpy as np 

# Core Configuration 
SOURCE_PREFIX = ">>zho<< " 
BATCH_SIZE = 16         
LEARNING_RATE = 2e-5
NUM_EPOCHS = 3       
OUTPUT_DIR = "/kaggle/working/unpc_model"

def load_dataset_from_text(src_path, tgt_path):
    with open(src_path, "r", encoding="utf-8") as fs, open(tgt_path, "r", encoding="utf-8") as ft:
        return Dataset.from_dict({"source": [l.strip() for l in fs], "target": [l.strip() for l in ft]})

print("Loading training data...")
train_ds = load_dataset_from_text(os.path.join(DATA_DIR, "train_unpc.en"), os.path.join(DATA_DIR, "train_unpc.zh"))
dev_ds = load_dataset_from_text(os.path.join(DATA_DIR, "dev_unpc.en"), os.path.join(DATA_DIR, "dev_unpc.zh"))

# Data Preprocessing
def preprocess_function(examples):
    inputs = [SOURCE_PREFIX + ex for ex in examples["source"]]
    targets = examples["target"]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=128, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

print("Preprocessing data (Tokenizing)...")
tokenized_train = train_ds.map(preprocess_function, batched=True)
tokenized_dev = dev_ds.map(preprocess_function, batched=True)

# Training Settings
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)
metric = evaluate.load("sacrebleu")

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple): preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]
    
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["score"]}

args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR,
    eval_strategy="epoch",    
    # eval_steps=500,             
    save_strategy="epoch",
    # save_steps=1000,            
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    weight_decay=0.01,
    save_total_limit=2,         
    num_train_epochs=NUM_EPOCHS,
    predict_with_generate=True, 
    fp16=True,                  
    logging_steps=100,
    report_to="none"
)

trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_dev,
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Start Training
print("Starting training UNPC...")
#trainer.train()
trainer.train(resume_from_checkpoint=True)

# Save final model
final_path = "/kaggle/working/final_unpc_model"
trainer.save_model(final_path)
print(f"Training complete! Model saved to: {final_path}")

Loading training data...
Preprocessing data (Tokenizing)...


Map:   0%|          | 0/462483 [00:00<?, ? examples/s]



Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

  trainer = Seq2SeqTrainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.
There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.encoder.embed_positions.weight', 'model.decoder.embed_tokens.weight', 'model.decoder.embed_positions.weight', 'lm_head.weight'].


Starting training UNPC...


	save_steps: 500 (from args) != 1000 (from trainer_state.json)


Epoch,Training Loss,Validation Loss,Bleu
1,1.2315,1.116636,45.534853
2,1.1242,1.048062,46.40193
3,1.0792,1.028858,46.380594




Training complete! Model saved to: /kaggle/working/final_unpc_model


In [8]:
import os
import shutil


model_path = "/kaggle/working/final_unpc_model"
output_filename = "/kaggle/working/unpc_model_backup"

print("Compressing model folder, please wait...")
shutil.make_archive(output_filename, 'zip', model_path)
print(f"Compression complete! File created: {output_filename}.zip")

Compressing model folder, please wait...
Compression complete! File created: /kaggle/working/unpc_model_backup.zip
