In [1]:
import sys
from pathlib import Path

# Add project root to Python path
current_dir = Path.cwd()
project_root = (
    current_dir.parent.parent if current_dir.name == "datasets" else current_dir
)
sys.path.insert(0, str(project_root))

print(f"Added to Python path: {project_root}")
print(f"Current working directory: {current_dir}")

# Verify the fix worked
if (project_root / "src").exists():
    print("✅ 'src' directory found - imports should work now")
else:
    print("❌ 'src' directory not found - check your project structure")

Added to Python path: /Users/shaneryan_1/Downloads/binary_align_zh
Current working directory: /Users/shaneryan_1/Downloads/binary_align_zh/src/datasets
✅ 'src' directory found - imports should work now


In [2]:
print(Path.cwd())

/Users/shaneryan_1/Downloads/binary_align_zh/src/datasets


In [3]:
# import torch
from src.datasets.datasets_gold import AlignmentDatasetGold
from transformers import XLMRobertaTokenizer, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
tokenizer.tokenize("Constantinople")

['▁Constantin', 'o', 'ple']

In [5]:
tokenizer.tokenize("晓美焰 来到 北京立方庭 参观 自然语义科技公司")

['▁',
 '晓',
 '美',
 '焰',
 '▁',
 '来到',
 '▁',
 '北京',
 '立',
 '方',
 '庭',
 '▁',
 '参观',
 '▁',
 '自然',
 '语',
 '义',
 '科技',
 '公司']

In [8]:
tokenizer.tokenize(["晓美焰", "来到",  "北京立方庭", "参观", "自然语义科技公司"], is_split_into_words=True)

['▁',
 '晓',
 '美',
 '焰',
 '▁',
 '来到',
 '▁',
 '北京',
 '立',
 '方',
 '庭',
 '▁',
 '参观',
 '▁',
 '自然',
 '语',
 '义',
 '科技',
 '公司']

In [None]:
a = AlignmentDatasetGold(
    tokenizer=XLMRobertaTokenizer.from_pretrained("xlm-roberta-base"),
    source_lines_path="../../data/raw_data/english.txt",
    target_lines_path="../../data/raw_data/chinese.txt",
    alignments_path="../../data/raw_data/alignment.txt",
    one_indexed=True,
    save=False,
    limit=21999,
)

100%|██████████| 21999/21999 [11:48<00:00, 31.04it/s]  ogger_config[0m:[36msetup_logger[0m:[36m58[0m - [32mLogger initialized. Logs will be saved to logs/logs[0m[32m 10:22:04[0m | [37mINFO    [0m | [36msrc.datasets.base_dataset[0m:[36m__post_init__[0m:[36m45[0m - [37mPreparing dataset...[0m
100%|██████████| 21999/21999 [50:59<00:00,  7.19it/s]  .decorators[0m:[36mwrapper[0m:[36m16[0m - [32mFunction executed in: 0 hours, 11 minutes, 48.682 seconds[0m
[32m 11:24:53[0m | [32mSUCCESS [0m | [36msrc.utils.decorators[0m:[36mwrapper[0m:[36m16[0m - [32mFunction executed in: 0 hours, 50 minutes, 59.743 seconds[0m[32m 11:24:53[0m | [32mSUCCESS [0m | [36msrc.datasets.datasets_gold[0m:[36m__post_init__[0m:[36m17[0m - [32mAlignmentDatasetGold initialized successfully[0m

In [9]:
a.data[0].keys()

dict_keys(['input_ids', 'attention_mask', 'labels'])

In [48]:
import pandas as pd

In [59]:
data = pd.read_csv("../../data/cleaned_data/aa_df.csv")

In [60]:
data.head()

Unnamed: 0,final
0,欢迎 来到 联合国 ||| Welcome to the United Nations
1,跳 转 到 相关 资源 ||| Skip to resources
2,"欢迎 来到 联合国 , 您 的 世界 ! ||| Welcome to the United..."
3,联合国 搜索 ||| UN Search
4,"联合国 : 我 联合国 人民 , 团结 起来 追求 更 美好 的 世界 ! ||| Unit..."


In [61]:
first_ten = data.head(1000000)

In [62]:
first_ten["source"] = first_ten["final"].apply(lambda x: x.split(" ||| ")[0])
first_ten["target"] = first_ten["final"].apply(lambda x: x.split(" ||| ")[1])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  first_ten["source"] = first_ten["final"].apply(lambda x: x.split(" ||| ")[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  first_ten["target"] = first_ten["final"].apply(lambda x: x.split(" ||| ")[1])


In [63]:
first_ten.head()

Unnamed: 0,final,source,target
0,欢迎 来到 联合国 ||| Welcome to the United Nations,欢迎 来到 联合国,Welcome to the United Nations
1,跳 转 到 相关 资源 ||| Skip to resources,跳 转 到 相关 资源,Skip to resources
2,"欢迎 来到 联合国 , 您 的 世界 ! ||| Welcome to the United...","欢迎 来到 联合国 , 您 的 世界 !",Welcome to the United Nations. It's your world.
3,联合国 搜索 ||| UN Search,联合国 搜索,UN Search
4,"联合国 : 我 联合国 人民 , 团结 起来 追求 更 美好 的 世界 ! ||| Unit...","联合国 : 我 联合国 人民 , 团结 起来 追求 更 美好 的 世界 !",United Nations: We the peoples... A stronger U...


In [64]:
first_ten["source"].head(800000).to_csv(
    path_or_buf="../../data/cleaned_data/train.src", sep="\t", header=False, index=False
)
first_ten["source"].tail(200000).to_csv(
    path_or_buf="../../data/cleaned_data/dev.src", sep="\t", header=False, index=False
)

In [65]:
first_ten["target"].head(800000).to_csv(
    path_or_buf="../../data/cleaned_data/train.tgt", sep="\t", header=False, index=False
)
first_ten["target"].tail(200000).to_csv(
    path_or_buf="../../data/cleaned_data/dev.tgt", sep="\t", header=False, index=False
)

In [66]:
sample_alignments = pd.read_csv(
    "../../data/cleaned_data/awesome_alignments.txt", sep="\t", header=None
).head(1000000)

In [67]:
sample_alignments.head()

Unnamed: 0,0
0,2-3 1-1 2-4 0-0
1,1-0 2-1 4-2 0-0
2,0-7 2-4 3-4 7-7 0-0 1-1 4-6 2-3 6-7
3,1-1 0-0
4,0-1 4-4 8-8 9-10 0-0 1-1 11-9 2-2 12-11 13-11 ...


In [68]:
sample_alignments.head(800000).to_csv(
    path_or_buf="../../data/cleaned_data/train.talp",
    sep="\t",
    header=False,
    index=False,
)
sample_alignments.tail(200000).to_csv(
    path_or_buf="../../data/cleaned_data/dev.talp",
    sep="\t",
    header=False,
    index=False,
)