In [1]:
# Mount Google Drive at /content/drive so the cells below can read and write
# files under that path.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Corpus files on the mounted Drive.
# Located in `src/data` in the GitHub repository.
path1 = "/content/drive/MyDrive/东乡语料文字部分1.txt"
path2 = "/content/drive/MyDrive/东乡语料文字部分2.txt"

# Each file is decoded as GBK and read fully into memory as a single string.
with open(path1, mode="r", encoding="gbk") as f1:
    text1 = f1.read()
with open(path2, mode="r", encoding="gbk") as f2:
    text2 = f2.read()

These two text files were provided by local authorities. They are considered high-value data because linguists compiled and collected all of the material locally in order to create bilingual (Mandarin/Dongxiang) textbooks for primary schools.

In [4]:
import re
import pandas as pd

def split_lines(s: str):
    """Split raw text into cleaned, non-empty lines.

    If ``s`` contains literal backslash-n sequences but no real newline
    characters, it is split on that literal two-character sequence;
    otherwise it is split with ``str.splitlines``. Every line then has
    surrounding whitespace and single quotes removed.

    Args:
        s: Raw text to split.

    Returns:
        A list of cleaned lines. Lines that are empty *after* cleaning are
        dropped.
    """
    if "\\n" in s and "\n" not in s:
        lines = s.split("\\n")
    else:
        lines = s.splitlines()
    # Filter on the cleaned value rather than the raw one: a line made up only
    # of quotes and whitespace would otherwise survive as "" and throw off any
    # caller that relies on lines alternating in a fixed order.
    cleaned = (ln.strip().strip("'").strip() for ln in lines)
    return [ln for ln in cleaned if ln]

def clean_dxg(s: str) -> str:
    """Normalize a Dongxiang (Latin-script) line.

    Steps:
      1. Replace every character that is not an ASCII letter, whitespace,
         comma, period, or question mark with a space.
      2. Collapse whitespace runs to a single space and trim both ends.
      3. Remove trailing commas/periods/question marks together with any
         whitespace mixed in with them.

    Args:
        s: Raw line.

    Returns:
        The cleaned line, with no leading or trailing whitespace. May be an
        empty string.
    """
    s = re.sub(r"[^A-Za-z\s,\.?]", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    # Whitespace is part of the trailing class on purpose: removing only the
    # punctuation from "abc ." would leave "abc " with a dangling space.
    s = re.sub(r"[\s,.?]+$", "", s)
    return s

def clean_zh(s: str) -> str:
    """Normalize a Chinese line.

    Keeps only characters in the range U+4E00..U+9FFF plus the full-width
    comma, full stop, and question mark; everything else (including
    whitespace) is deleted. Any run of those three punctuation marks at the
    end of the result is then removed.

    Args:
        s: Raw line.

    Returns:
        The cleaned line. May be an empty string.
    """
    hanzi_and_marks = re.sub(r"[^\u4e00-\u9fff，。？]", "", s)
    return re.sub(r"[，。？]+$", "", hanzi_and_marks)

def make_pairs(raw: str) -> pd.DataFrame:
    """Turn raw alternating-line text into a two-column DataFrame.

    The text is split with ``split_lines``. Lines at even positions are
    passed through ``clean_dxg`` and the line that follows each one through
    ``clean_zh``. A trailing line without a partner is ignored, and a pair is
    kept only if at least one side is non-empty after cleaning.

    Args:
        raw: Raw corpus text.

    Returns:
        A DataFrame with columns ``Dongxiang`` and ``Chinese``, one row per
        retained pair.
    """
    lines = split_lines(raw)
    records = []
    # zip stops at the shorter slice, so an unpaired final line is skipped.
    for dxg_line, zh_line in zip(lines[0::2], lines[1::2]):
        dxg = clean_dxg(dxg_line)
        zh = clean_zh(zh_line)
        if not (dxg or zh):
            continue
        records.append({"Dongxiang": dxg, "Chinese": zh})
    return pd.DataFrame(records, columns=["Dongxiang", "Chinese"])


In [5]:
# Build one pair table per corpus file, then stack them into a single frame.
df_pairs = make_pairs(text1)
df_pairs2 = make_pairs(text2)
# ignore_index=True renumbers the rows 0..n-1. Without it each input keeps its
# own 0-based labels, so the combined index contains duplicates and
# label-based lookups such as df.loc[0] return more than one row.
df = pd.concat([df_pairs, df_pairs2], ignore_index=True)

In [7]:
# Write the combined pairs to Drive as CSV, without the index column and
# using the 'utf-8-sig' encoding.
df.to_csv(
    '/content/drive/MyDrive/df_interior.csv',
    index=False,
    encoding='utf-8-sig',
)

# Check that each column contains only the expected characters

In [8]:
import re

# Character sets accepted for a whole cell. Both patterns require at least one
# character, so an empty string is reported as invalid.
dxg_allowed = r"[A-Za-z\s.,!?;:'\"\-()]+"
zh_allowed = r"[\u4e00-\u9fff\s。，、！？；：“”‘’（）《》【】—…,.!?;:'\"\-()]+"

# Add one boolean column per language: True when the entire cell (converted
# to str) matches the corresponding pattern.
df["Dongxiang_is_valid"] = df["Dongxiang"].apply(
    lambda cell: re.fullmatch(dxg_allowed, str(cell)) is not None
)
df["Chinese_is_valid"] = df["Chinese"].apply(
    lambda cell: re.fullmatch(zh_allowed, str(cell)) is not None
)


In [9]:
# Report, per language, whether every row passed its character-set check.
for label, flag_column in (
    ("Only English Character", "Dongxiang_is_valid"),
    ("Only Chinese Character", "Chinese_is_valid"),
):
    print(label, df[flag_column].all())

Only English Character True
Only Chinese Character True
