In [1]:
import json
import os
import pandas as pd

fp_train = r"data/train.csv"
fp_test = r"data/test.txt"
fp_valid = r"data/valid.csv"
fp_dict = r"data/dict.txt"

data_train = pd.read_csv(fp_train, names=["谜底", "谜面"]).to_dict(orient="records")
data_valid = pd.read_csv(fp_valid, names=["谜底", "谜面"]).to_dict(orient="records")

data_dict = open(fp_dict, "r").read().strip().split("\n")
data_test = open(fp_test, "r").read().strip().split("\n")

In [2]:
import json
import os
import pandas as pd

fp_zi_dataset = r"resource/zi-dataset.tsv"
fp_stroke = r"resource/stroke.csv"
fp_xinhua_word = r"resource/word.json"

data_zi_dataset = pd.read_csv(fp_zi_dataset, sep="\t").to_dict(orient="records")
data_zi_dataset = {item["zi"]:item for item in data_zi_dataset}
# print(len(data_zi_dataset)) 无重复

data_xinhua_word = pd.read_json(fp_xinhua_word).to_dict(orient="records")
# data_xinhua_word = {item["word"]:item for item in data_xinhua_word}
# print(len(data_xinhua_word))


In [3]:
characters_train = list(set([item["谜底"] for item in data_train]))
print(len(characters_train), len(data_train))
characters_valid = list(set([item["谜底"] for item in data_valid]))
print(len(characters_valid), len(data_valid))
characters_test = list(set(data_dict))
print(len(characters_test))
characters_all = list(set(characters_train+characters_valid+characters_test))
print(len(characters_all))

4371 16631
1457 5480
1458
7286


In [4]:
from pypinyin import pinyin as py

# 从zi-dataset里获取基本信息
character_info = dict()
for character in characters_all:
    if character in data_zi_dataset:
        item = data_zi_dataset[character]
        stroke_count = item["stroke_count"].replace("画", "")
        if not isinstance(item["mandarin_pinyin"], float):
            pinyin = item["mandarin_pinyin"].replace(", ", "、")
        else:
            pinyin = py(character)[0][0]
        radical = item["radical"].replace("/", "")
        radical_count = item["radical_stroke_count"]
        radical_pinyin = item["radical_pinyin"].replace(", ", "、")
        components = item["leaf_component"].replace("/", "") if not isinstance(item["leaf_component"], float) else character
    else:
        print(character)
        continue

    character_info[character] = {
        "汉字": character,
        "笔画数": stroke_count,
        "拼音": pinyin,
        "部首": radical,
        "部首的笔画数": radical_count,
        "部首的拼音": radical_pinyin,
        "组件": components,
    }

print(len(character_info))

# 下面三个字只出现在了新华字典中，没有出现在zi-dataset中
character_info["范"] = {
    "汉字": "范",
    "笔画数": "8",
    "拼音": "fàn",
    "部首": "艹",
    "部首的笔画数": "3",
    "部首的拼音": "cǎo",
    "组件": "艹氾",
}
character_info["肀"] = {
    "汉字": "肀",
    "笔画数": "4",
    "拼音": "yù",
    "部首": "肀",
    "部首的笔画数": "4",
    "部首的拼音": "肀",
    "组件": "肀",
}
character_info["王"] = {
    "汉字": "王",
    "笔画数": "4",
    "拼音": "wáng",
    "部首": "王",
    "部首的笔画数": "4",
    "部首的拼音": "wáng",
    "组件": "王",
}

肀
王
《
_
`
(
<
范
♂
﻿有
7276


In [5]:
# 从新华字典里获取释义信息
for character in character_info:
    explanations = [i["explanation"] for i in data_xinhua_word if i["word"] == character]
    if len(explanations) == 0:
        character_info[character]["释义"] = "一个罕见的字"
    else:
        character_info[character]["释义"] = " ".join([i.replace("\n", "") for i in explanations])
    text = "[SEP]".join([f"{k}是{v}" for k, v in character_info[character].items() if k != "描述"])
    character_info[character]["描述"] = text
    # print(text)

In [6]:
import json

fp_character_information = r"resource/character_information.json"
json.dump(character_info, open(fp_character_information, "w"), indent=2, ensure_ascii=False)

In [16]:
st_train = list()
st_valid = list()


for item in data_train:
    if item["谜底"] in character_info:
        item["谜底_描述"] = character_info[item["谜底"]]["描述"]
        item["label"] = 1
        st_train.append(item)
for item in data_valid:
    if item["谜底"] in character_info:
        item["谜底_描述"] = character_info[item["谜底"]]["描述"]
        item["label"] = 1
        st_valid.append(item)
        

pd.DataFrame(st_train).to_csv("data/st_train.csv", index=False)
pd.DataFrame(st_valid).to_csv("data/st_valid.csv", index=False)

In [14]:
import random
import copy
import pandas as pd

st_train_enhancement = list()

st_train = pd.read_csv(r"data/st_train.csv").to_dict(orient="records")

for item in st_train:
    candidate1 = random.sample(st_train, 1)
    while candidate1[0]["谜底"] == item["谜底"]:
        candidate1 = random.sample(st_train, 1)
    candidate1 = copy.deepcopy(candidate1[0])
    candidate1["谜底"] = item["谜底"]
    candidate1["谜底_描述"] = item["谜底_描述"]
    candidate1["label"] = 0
    
    candidate2 = random.sample(st_train, 1)
    while candidate2[0]["谜面"] == item["谜面"]:
        candidate2 = random.sample(st_train, 1)
    candidate2 = copy.deepcopy(candidate2[0])
    candidate2["谜面"] = item["谜面"]
    candidate2["label"] = 0

    # print(item)
    # print(candidate1)
    # print(candidate2)
    st_train_enhancement.append(item)
    st_train_enhancement.append(candidate1)
    st_train_enhancement.append(candidate2)

pd.DataFrame(st_train_enhancement).to_csv("data/st_train_enhancement.csv", index=False)

In [11]:
print(len(data_train), len(st_train), len(data_valid), len(st_valid))

16631 16626 5480 5480


In [1]:
fp_test = r"data/test.txt"
fp_preds_riddle = r"/Users/nascent/Documents/Course_22spring/万小军语义计算与知识检索/hw3/output/hfl-chinese-roberta-wwm-ext_train_enhancement_epochs-3_batchsize-16_2022-06-06_00-24-02/test_predictions_with_riddle.txt"
fp_preds = r""


1
