In [1]:
import os
import sys
from pathlib import Path

# Make the project root importable so that the `src.*` imports used later resolve.
# When the notebook runs from a directory named "datasets", the root is taken to be
# two levels up; otherwise the current working directory itself is used as the root.
current_dir = Path.cwd()
project_root = (
    current_dir.parent.parent if current_dir.name == "datasets" else current_dir
)

# Keep this cell idempotent: drop any earlier copy of the entry before prepending,
# so re-running the cell does not pile up duplicates on sys.path while the project
# root still ends up first in the search order.
project_root_entry = str(project_root)
sys.path[:] = [entry for entry in sys.path if entry != project_root_entry]
sys.path.insert(0, project_root_entry)

print(f"Added to Python path: {project_root}")
print(f"Current working directory: {current_dir}")

# Sanity check: report whether a `src` directory sits directly under the chosen root.
if (project_root / "src").exists():
    print("✅ 'src' directory found - imports should work now")
else:
    print("❌ 'src' directory not found - check your project structure")

Added to Python path: /Users/shaneryan_1/Downloads/binary_align_zh
Current working directory: /Users/shaneryan_1/Downloads/binary_align_zh/src/datasets
✅ 'src' directory found - imports should work now


In [2]:
# Show the kernel's working directory; the relative data paths used later depend on it.
working_dir = Path.cwd()
print(working_dir)

/Users/shaneryan_1/Downloads/binary_align_zh/src/datasets


In [3]:
# import torch
from transformers import XLMRobertaTokenizer, AutoTokenizer

# Load the pretrained tokenizer for the XLM-RoBERTa base checkpoint; the following
# cells use it to inspect how individual strings are split into subword pieces.
xlmr_checkpoint = "FacebookAI/xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(xlmr_checkpoint)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Subword split of a single long English word.
english_word = "Constantinople"
tokenizer.tokenize(english_word)

['▁Constantin', 'o', 'ple']

In [5]:
# Subword split of a Chinese sentence whose words are already separated by spaces.
segmented_sentence = "晓美焰 来到 北京立方庭 参观 自然语义科技公司"
tokenizer.tokenize(segmented_sentence)

['▁',
 '晓',
 '美',
 '焰',
 '▁',
 '来到',
 '▁',
 '北京',
 '立',
 '方',
 '庭',
 '▁',
 '参观',
 '▁',
 '自然',
 '语',
 '义',
 '科技',
 '公司']

In [6]:
# Same sentence, this time passed as a list of pre-split words rather than one string.
# The pieces displayed for this cell are identical to those shown for the
# space-joined string.
presplit_words = ["晓美焰", "来到", "北京立方庭", "参观", "自然语义科技公司"]
tokenizer.tokenize(presplit_words, is_split_into_words=True)

['▁',
 '晓',
 '美',
 '焰',
 '▁',
 '来到',
 '▁',
 '北京',
 '立',
 '方',
 '庭',
 '▁',
 '参观',
 '▁',
 '自然',
 '语',
 '义',
 '科技',
 '公司']

In [7]:
# Disabled experiment, kept for reference: construct an AlignmentDatasetGold from the
# raw English/Chinese line files and their alignment file, using an XLM-RoBERTa
# tokenizer, with `one_indexed=True`, `save=False` and `limit=21999`.
# NOTE(review): AlignmentDatasetGold is not imported in any of the visible cells, so
# re-enabling this cell needs the matching import first.
# a = AlignmentDatasetGold(
#     tokenizer=XLMRobertaTokenizer.from_pretrained("xlm-roberta-base"),
#     source_lines_path="../../data/raw_data/english.txt",
#     target_lines_path="../../data/raw_data/chinese.txt",
#     alignments_path="../../data/raw_data/alignment.txt",
#     one_indexed=True,
#     save=False,
#     limit=21999,
# )

In [8]:
# a.data[0].keys()

In [9]:
# import pandas as pd

In [10]:
# data = pd.read_csv("../../data/cleaned_data/aa_df.csv")

In [11]:
# data.head()

In [12]:
# first_ten = data.head(1000000)

In [13]:
# Disabled data prep: split every entry of the "final" column on " ||| " and store
# the part before the separator as "source" and the part after it as "target".
# NOTE(review): `first_ten` is created with `data.head(1000000)`, so the name is
# misleading, and assigning new columns on that slice may trigger pandas'
# SettingWithCopyWarning if this cell is re-enabled — consider an explicit `.copy()`.
# first_ten["source"] = first_ten["final"].apply(lambda x: x.split(" ||| ")[0])
# first_ten["target"] = first_ten["final"].apply(lambda x: x.split(" ||| ")[1])

In [14]:
# first_ten.head()

In [15]:
# Disabled data prep: write the source side to disk as a train/dev split —
# the first 800000 rows go to train.src and the last 200000 rows to dev.src,
# tab-separated with neither header nor index.
# first_ten["source"].head(800000).to_csv(
#     path_or_buf="../../data/cleaned_data/train.src", sep="\t", header=False, index=False
# )
# first_ten["source"].tail(200000).to_csv(
#     path_or_buf="../../data/cleaned_data/dev.src", sep="\t", header=False, index=False
# )

In [16]:
# Disabled data prep: write the target side to disk with the same split as the
# source side — the first 800000 rows go to train.tgt and the last 200000 rows to
# dev.tgt, tab-separated with neither header nor index.
# first_ten["target"].head(800000).to_csv(
#     path_or_buf="../../data/cleaned_data/train.tgt", sep="\t", header=False, index=False
# )
# first_ten["target"].tail(200000).to_csv(
#     path_or_buf="../../data/cleaned_data/dev.tgt", sep="\t", header=False, index=False
# )

In [17]:
# Disabled data prep: read awesome_alignments.txt as headerless, tab-separated data
# and keep only the first 1000000 rows.
# sample_alignments = pd.read_csv(
#     "../../data/cleaned_data/awesome_alignments.txt", sep="\t", header=None
# ).head(1000000)

In [18]:
# sample_alignments.head()

In [19]:
# Disabled data prep: write the alignments to disk as a train/dev split —
# the first 800000 rows go to train.talp and the last 200000 rows to dev.talp,
# tab-separated with neither header nor index.
# sample_alignments.head(800000).to_csv(
#     path_or_buf="../../data/cleaned_data/train.talp",
#     sep="\t",
#     header=False,
#     index=False,
# )
# sample_alignments.tail(200000).to_csv(
#     path_or_buf="../../data/cleaned_data/dev.talp",
#     sep="\t",
#     header=False,
#     index=False,
# )

In [20]:
from src.configs.dataset_config import DataLoaderConfig, DatasetConfig
from src.configs.model_config import ModelConfig
from src.configs.train_config import TrainConfig
from transformers import AutoTokenizer
from src.models.binary_align_trainer import BinaryAlignTrainer
from src.datasets.datasets_silver import AlignmentDatasetSilver
from src.utils.helpers import collate_fn_span

In [21]:
# Smoke test of BinaryAlignTrainer on a small, limited slice of the train/dev files.

# The captured output of this cell shows the Hugging Face `tokenizers` fork warning
# ("The current process just got forked, after parallelism has already been used").
# Setting the variable explicitly before any tokenizer work silences that warning;
# a value already present in the environment is left untouched.
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")

# All split files live in one directory, given relative to the working directory.
CLEANED_DATA_DIR = "../../data/cleaned_data"

# NOTE(review): an earlier cell explores the "FacebookAI/xlm-roberta-base" tokenizer,
# while this run loads "FacebookAI/roberta-base" — confirm which checkpoint is
# intended for this data before drawing conclusions from the results.
model_config = ModelConfig(model_name_or_path="FacebookAI/roberta-base")
train_config = TrainConfig(experiment_name="trainer-test", mixed_precision="no")

train_dataset_config = DatasetConfig(
    source_lines_path=f"{CLEANED_DATA_DIR}/train.src",
    target_lines_path=f"{CLEANED_DATA_DIR}/train.tgt",
    alignments_path=f"{CLEANED_DATA_DIR}/train.talp",
    limit=25,
)
# The evaluation examples inspected further down carry an extra `bpe2wordmap` key
# compared with the training examples (presumably because of `do_inference=True`
# — confirm against the dataset code).
eval_dataset_config = DatasetConfig(
    source_lines_path=f"{CLEANED_DATA_DIR}/dev.src",
    target_lines_path=f"{CLEANED_DATA_DIR}/dev.tgt",
    alignments_path=f"{CLEANED_DATA_DIR}/dev.talp",
    limit=5,
    do_inference=True,
)
dataloader_config = DataLoaderConfig(collate_fn=collate_fn_span)

# A single tokenizer, loaded from the model checkpoint, is shared by both datasets
# and by the trainer.
tok = AutoTokenizer.from_pretrained(model_config.model_name_or_path)

# Each config's attributes are forwarded to the dataset as keyword arguments.
train_data = AlignmentDatasetSilver(tokenizer=tok, **vars(train_dataset_config))
eval_data = AlignmentDatasetSilver(tokenizer=tok, **vars(eval_dataset_config))

trainer = BinaryAlignTrainer(
    tokenizer=tok,
    model_config=model_config,
    train_config=train_config,
    dataset_config=train_dataset_config,
    dataloader_config=dataloader_config,
    train_data=train_data,
    eval_data=eval_data,
    seed_num=1,
)
trainer.run()

[32m2025-07-17 15:49:35.893[0m | [1mINFO    [0m | [36msrc.datasets.base_dataset[0m:[36m__post_init__[0m:[36m41[0m - [1mPreparing dataset...[0m
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  0%|          | 0/25 [00:00<?, ?it/s]You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████| 25/25 [00:00<00:00, 171.48it/s]
[32m2025-07-17 15:49:36.064[0m | [1mINFO    [0m | [36msrc.utils.decorators[0m:[36mwrapper[0m:[36m16[0m - [1mFunction executed in: 0 hours, 0 minutes, 0.170 seconds[0m
100%|██████████| 25/25 [00:00<00:00, 194.00it/s]
[32m2025-07-17 15

 Num examples =  144
 Num Epochs =  5
 Batch Size per device =  32
 Total batches per epoch =  9
 Total optimization steps =  45


100%|██████████| 45/45 [00:15<00:00,  2.85it/s]
[32m2025-07-17 15:49:55.488[0m | [32m[1mSUCCESS [0m | [36msrc.models.binary_align_trainer[0m:[36mrun[0m:[36m238[0m - [32m[1mTraining completed successfully.[0m
[32m2025-07-17 15:49:55.639[0m | [1mINFO    [0m | [36msrc.utils.decorators[0m:[36mwrapper[0m:[36m16[0m - [1mFunction executed in: 0 hours, 0 minutes, 16.630 seconds[0m


In [22]:
# List the fields stored for the first prepared training example.
first_train_example = train_data.data[0]
first_train_example.keys()

dict_keys(['input_ids', 'attention_mask', 'labels'])

In [23]:
# List the fields stored for the first prepared evaluation example.
first_eval_example = eval_data.data[0]
first_eval_example.keys()

dict_keys(['input_ids', 'attention_mask', 'labels', 'bpe2wordmap'])

In [37]:
# Inspect the `bpe2wordmap` tensor of the first evaluation example.
# In the output shown for this cell, a leading run of -1 values is followed by
# non-decreasing values from 0 to 19 and a single trailing -1.
# NOTE(review): presumably -1 marks positions that do not belong to a target word and
# the other values are word indices — confirm against the dataset code.
eval_example = eval_data.data[0]
eval_example["bpe2wordmap"]

tensor([-1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
        -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
        -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
        -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
        -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,  0.,  0.,  1.,  1.,
         2.,  2.,  2.,  2.,  3.,  3.,  3.,  3.,  3.,  4.,  4.,  4.,  4.,  5.,
         5.,  5.,  6.,  6.,  7.,  7.,  7.,  8.,  9.,  9.,  9., 10., 10., 11.,
        11., 11., 11., 11., 12., 12., 13., 13., 13., 13., 13., 14., 15., 15.,
        15., 15., 15., 16., 16., 16., 16., 16., 16., 16., 16., 16., 16., 16.,
        16., 16., 16., 16., 16., 16., 16., 16., 16., 16., 16., 16., 16., 16.,
        16., 16., 16., 16., 17., 17., 18., 18., 18., 18., 19., -1.])

In [25]:
from torch.utils.data import DataLoader