In [1]:
import sys
from pathlib import Path

# Make the project root importable so that `from src....` imports resolve.
# When the kernel runs from a directory named "awesomealign" the root is taken
# to be two levels up; otherwise the current working directory is used as-is.
current_dir = Path.cwd()
project_root = (
    current_dir.parent.parent if current_dir.name == "awesomealign" else current_dir
)

# Keep this cell idempotent: re-running it must not stack duplicate entries on
# sys.path. Drop any existing copies first, then put the root at the front so
# it still takes priority over other entries.
root_entry = str(project_root)
sys.path[:] = [entry for entry in sys.path if entry != root_entry]
sys.path.insert(0, root_entry)

print(f"Added to Python path: {project_root}")
print(f"Current working directory: {current_dir}")

# Sanity check: the chosen root is expected to contain a `src` directory.
if (project_root / "src").exists():
    print("✅ 'src' directory found - imports should work now")
else:
    print("❌ 'src' directory not found - check your project structure")

Added to Python path: /Users/shaneryan_1/Downloads/binary_align_zh
Current working directory: /Users/shaneryan_1/Downloads/binary_align_zh/src/awesomealign
✅ 'src' directory found - imports should work now


In [20]:
import hanlp
import pandas as pd
from transformers import AutoTokenizer

from src.awesomealign.awesome_align_data import AwesomeAlignDataset

In [3]:
# Load the tokenizer for the target side from the "bert-large-cased"
# checkpoint, then look at how it splits a multi-part personal name.
target_checkpoint = "bert-large-cased"
target_tok = AutoTokenizer.from_pretrained(target_checkpoint)
target_tok.tokenize("Shingamu Sai Ajay")

['Shin', '##gam', '##u', 'Sai', 'A', '##jay']

In [4]:
# Second sample name: check the sub-word split produced by target_tok.
second_name = "Christian Asher Widjaja"
target_tok.tokenize(second_name)

['Christian', 'Asher', 'W', '##id', '##ja', '##ja']

In [5]:
# Third sample name: check the sub-word split produced by target_tok.
third_name = "Raghav Narayanswamy"
target_tok.tokenize(third_name)

['Ra', '##gh', '##av', 'Narayan', '##s', '##wamy']

In [None]:
# Load the HanLP pretrained tokenizer registered as COARSE_ELECTRA_SMALL_ZH
# and try it on one Chinese sentence; it returns the sentence as a token list.
zh_tok_id = hanlp.pretrained.tok.COARSE_ELECTRA_SMALL_ZH  # type: ignore
source_tok = hanlp.load(zh_tok_id)  # type: ignore
source_tok("康体通—网上租订康体设施及康体活动报名服务")

                                   

['康体通', '—', '网上', '租订', '康体', '设施', '及', '康体', '活动', '报名', '服务']

In [7]:
# Segment a small batch in one call: a list of sentences goes in and one
# token sequence per sentence comes back.
zh_sentences = [
    "康体通—网上租订康体设施及康体活动报名服务",
    "毕加索珍品展最后阶段门票发售",
]
res = source_tok(zh_sentences)

In [8]:
# Collapse each token list into a single space-delimited string.
# `res` is overwritten with its own transformation, so without the isinstance
# guard a second run of this cell would call " ".join on the already-joined
# strings and insert a space between every character.
res = [
    tokens if isinstance(tokens, str) else " ".join(tokens)
    for tokens in res
]
res

['康体通 — 网上 租订 康体 设施 及 康体 活动 报名 服务', '毕加索 珍品 展 最后 阶段 门票 发售']

In [9]:
# Display the entry of `res` for the first input sentence.
res[0]

'康体通 — 网上 租订 康体 设施 及 康体 活动 报名 服务'

In [10]:
# Display the entry of `res` for the second input sentence.
res[1]

'毕加索 珍品 展 最后 阶段 门票 发售'

In [None]:
# Build the dataset object for the UN corpus from its raw text file.
# save=True together with save_path asks AwesomeAlignDataset to write its
# output to the given CSV path.
un_txt_path = "../../data/raw_data/UN.txt"
un_csv_path = "../../data/raw_data/un.csv"
un = AwesomeAlignDataset(data_path=un_txt_path, save=True, save_path=un_csv_path)

[32m2025-07-09 14:57:32.411[0m | [1mINFO    [0m | [36msrc.awesomealign.awesome_align_data[0m:[36m__post_init__[0m:[36m23[0m - [1mLoading Chinese tokenizer...[0m
[32m2025-07-09 14:57:32.727[0m | [1mINFO    [0m | [36msrc.awesomealign.awesome_align_data[0m:[36mread_data[0m:[36m45[0m - [1mReading data from ../../data/raw_data/UN.txt...[0m
[32m2025-07-09 14:58:52.841[0m | [32m[1mSUCCESS [0m | [36msrc.awesomealign.awesome_align_data[0m:[36mread_data[0m:[36m76[0m - [32m[1mRead data from ../../data/raw_data/UN.txt.[0m
[32m2025-07-09 14:58:52.845[0m | [32m[1mSUCCESS [0m | [36msrc.utils.decorators[0m:[36mwrapper[0m:[36m16[0m - [32m[1mFunction executed in: 0 hours, 1 minutes, 20.121 seconds[0m
[32m2025-07-09 14:58:53.046[0m | [32m[1mSUCCESS [0m | [36msrc.awesomealign.awesome_align_data[0m:[36m__post_init__[0m:[36m32[0m - [32m[1mAwesomeAlignDataset initialised.[0m


In [None]:
# Build the dataset object for the HK corpus from its raw text file.
# NOTE: this is the slow cell; the logged run of read_data took about
# 26 minutes for HK.txt.
hk_txt_path = "../../data/raw_data/HK.txt"
hk_csv_path = "../../data/raw_data/hk.csv"
hk = AwesomeAlignDataset(data_path=hk_txt_path, save=True, save_path=hk_csv_path)

[32m2025-07-09 14:23:08.027[0m | [1mINFO    [0m | [36msrc.awesomealign.awesome_align_data[0m:[36m__post_init__[0m:[36m23[0m - [1mLoading Chinese tokenizer...[0m
[32m2025-07-09 14:23:08.317[0m | [1mINFO    [0m | [36msrc.awesomealign.awesome_align_data[0m:[36mread_data[0m:[36m45[0m - [1mReading data from ../../data/raw_data/HK.txt...[0m
[32m2025-07-09 14:49:18.569[0m | [32m[1mSUCCESS [0m | [36msrc.awesomealign.awesome_align_data[0m:[36mread_data[0m:[36m76[0m - [32m[1mRead data from ../../data/raw_data/HK.txt.[0m
[32m2025-07-09 14:49:18.646[0m | [32m[1mSUCCESS [0m | [36msrc.utils.decorators[0m:[36mwrapper[0m:[36m16[0m - [32m[1mFunction executed in: 0 hours, 26 minutes, 10.381 seconds[0m
[32m2025-07-09 14:49:21.418[0m | [32m[1mSUCCESS [0m | [36msrc.awesomealign.awesome_align_data[0m:[36m__post_init__[0m:[36m32[0m - [32m[1mAwesomeAlignDataset initialised.[0m


In [None]:
# Combine the HK dataset into the UN dataset, then save to the cleaned-data CSV.
# NOTE(review): `un` (not `new_df`) is what gets saved. Presumably
# override=True makes combine_data replace un's own data with the combined
# set; confirm against AwesomeAlignDataset.combine_data.
combined_csv_path = "../../data/cleaned_data/aadf_2.csv"
new_df = un.combine_data([hk], override=True)
un.save_data(save_path=combined_csv_path)

[32m2025-07-09 15:00:41.993[0m | [1mINFO    [0m | [36msrc.awesomealign.awesome_align_data[0m:[36mcombine_data[0m:[36m81[0m - [1mCombining AwesomeAlignDataset(s) now, please ensure you are joining the correct datasets to avoid duplicate data...[0m


### Read AADF ###

In [24]:
# Read the combined CSV back from disk and report how many rows it holds.
aadf_csv_path = "../../data/cleaned_data/aadf_2.csv"
d = pd.read_csv(aadf_csv_path)
d.shape[0]

1348258

In [26]:
# Preview the first five rows of the loaded frame.
d.head(n=5)

Unnamed: 0,final
0,欢迎 来到 联合国 ||| Welcome to the United Nations
1,跳 转 到 相关 资源 ||| Skip to resources
2,"欢迎 来到 联合国 , 您 的 世界 ! ||| Welcome to the United..."
3,联合国 搜索 ||| UN Search
4,"联合国 : 我 联合国 人民 , 团结 起来 追求 更 美好 的 世界 ! ||| Unit..."
