<a href="https://colab.research.google.com/github/dnrb/word2vec_char_level/blob/master/Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import gzip
import json
import re
from collections import Counter

from tqdm import tqdm
from opencc import OpenCC
import joblib
import numpy as np

DATAPATH = "zhwiki-multistream1.json.gz"
TMPPATH = "temp.txt"

CC = OpenCC('t2s')
MIN_FREQ = 500
VOC_SIZE = 10000
PAD = 0
UNK = 1

SECTION_BLACKLIST = [
    "相关条目", "外部链接", "参看",
    "注释", "参考文献", "参考书目",
    "扩展阅读", "延伸阅读", "外部连结",
    "相关著作", "分类", "图片", "扩-{展}-阅读",
    "参考来源"
]


def clean_text(text):
    text = " ".join([x for x in text.split("\n") if len(x) > 50])
    text = CC.convert(text)
    text = re.sub(r"'''?", "", text)
    text = re.sub(r"（.*）", "", text)
    text = re.sub(r"\(.*\)", "", text)
    text = re.sub(r"\u200B", "", text)
    text = re.sub(r"\-\{.*\}\-", "", text)
    text = re.sub(r"《》", "", text)
    # text = re.sub(r"\-{2,}", "", text)
    text = re.sub(r"link=\w+\s", " ", text)
    text = re.sub(r"File:.+\|", " ", text)
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"(?<=[^a-zA-Z0-9/\-]) (?=[^a-zA-Z0-9/\-])", "", text)
    text = re.sub(r"(?<=[a-zA-Z0-9/\-]) (?=[^a-zA-Z0-9/\-])", "", text)
    text = re.sub(r"(?<=[^a-zA-Z0-9/\-]) (?=[a-zA-Z0-9/\-])", "", text)
    text = re.sub(r"\*", " ", text)
    text = re.sub(r" $", "", text)
    text = re.sub(r"^ ", "", text)
    # Remove english
    text = re.sub(
        r"(?<=[^a-zA-Z0-9]\s)[a-zA-Z\s\.\,\(\)0-9\-\;]+(?=[\s。][^a-zA-Z0-9])", "", text)
    text = re.sub(
        r"(?<=[^a-zA-Z0-9]\s)[a-zA-Z\s\.\,\(\)0-9\-\;]$", "", text)
    return text


def filter_texts(texts):
    ascii_cnt = len([1 for x in texts if ord(x) < 256])
    if ascii_cnt / len(texts) > 0.5:
        return True
    return False


def main():
    cnt = Counter()
    with gzip.open(DATAPATH) as f:
        with open(TMPPATH, "w") as fw:
            for i, line in tqdm(enumerate(f.readlines())):
                article = json.loads(line)
                if "年表" in article["title"] or "列表" in article["title"]:
                    continue
                for title, section in zip(article["section_titles"], article["section_texts"]):
                    title = CC.convert(title)
                    if title in SECTION_BLACKLIST:
                        continue
                    section = clean_text(section)
                    if len(section) < 200 or filter_texts(section):
                        continue
                    # print(article["title"])
                    # print(section[:100])
                    # print(article["section_texts"][0][:100].replace("\n", " "))
                    # if i == 1000:
                    #     return
                    cnt.update(section)
                    # fw.write(title + "===\n")
                    fw.write(section + "\n")
    print(cnt.most_common(100))
    joblib.dump(cnt, "data/freq.pkl")
    mapping = {
        char: token + 2 for token, (char, freq) in enumerate(cnt.most_common(VOC_SIZE))
        if freq > MIN_FREQ
    }
    print("Vocab:", len(mapping))
    joblib.dump(mapping, "data/mapping.pkl")
    texts = []
    with open(TMPPATH) as f:
        for i, section in tqdm(enumerate(f.readlines())):
            texts.append(
                np.array(list(map(lambda x: mapping.get(x, UNK), section))))
            # if i == 10000:
            #     break
    joblib.dump(np.array(texts), "data/tokens.pkl")


if __name__ == "__main__":
    main()

34148it [04:15, 133.46it/s]

[('，', 1097077), ('的', 754273), ('。', 586586), ('、', 289893), ('1', 280332), ('在', 222646), ('0', 222558), ('年', 219864), ('国', 217013), ('中', 210171), ('一', 209474), ('为', 209060), ('是', 181384), ('人', 172171), ('有', 168336), ('2', 167365), ('大', 160951), ('9', 151525), ('和', 135552), ('于', 121311), ('以', 121280), ('学', 116482), ('了', 112297), ('不', 108330), ('后', 105364), ('时', 103667), ('会', 101983), ('地', 98187), ('成', 97927), ('上', 90280), ('之', 89145), ('日', 88991), ('个', 85093), ('5', 84742), ('3', 82735), ('月', 81765), ('8', 80372), ('他', 79801), ('出', 79239), ('与', 77617), ('主', 77567), ('发', 77458), ('4', 76186), ('生', 75077), ('行', 74893), ('民', 73512), ('其', 73408), ('而', 73099), ('6', 70812), ('作', 69776), ('到', 69262), ('7', 69128), ('用', 69114), ('家', 66732), ('部', 66510), ('对', 65993), ('法', 65883), ('分', 64404), ('这', 64156), ('多', 64008), ('政', 63971), ('》', 63200), ('《', 63155), ('来', 63023), ('方', 62910), ('公', 60461), ('区', 59569), ('文', 59304), ('及', 59154), ('并',




FileNotFoundError: [Errno 2] No such file or directory: 'data/freq.pkl'

In [0]:
!ls

[1m[36mApplications[m[m
[1m[36mApplications (Parallels)[m[m
[1m[36mCalibre Library[m[m
[1m[36mCreative Cloud Files[m[m
[1m[36mDesktop[m[m
[1m[36mDocuments[m[m
[1m[36mDownloads[m[m
[1m[36mDropbox[m[m
[1m[36mGoogle Drive[m[m
[1m[36mLibrary[m[m
[1m[36mMEGAsync[m[m
[1m[36mMEGAsync Downloads[m[m
[1m[36mMovies[m[m
[1m[36mMusic[m[m
[1m[36mParallels[m[m
[1m[36mPictures[m[m
[1m[36mPublic[m[m
Untitled-Copy1.ipynb
Untitled.ipynb
[1m[36mVirtualBox VMs[m[m
[1m[36manaconda3[m[m
[1m[36miCloud Drive (Archive)[m[m
[1m[36mnltk_data[m[m
[1m[36mrepo_yaoxuhon(old)[m[m
[31mssdtPRGen.sh[m[m
temp.txt
[1m[36mwebdriver-sh[m[m
[1m[36myaoxuhon[m[m
zhwiki-20190401-pages-articles-multistream.xml.bz2
zhwiki-20190401-pages-articles-multistream1.xml
zhwiki-20190401-pages-articles-multistream1.xml-p1p162886.bz2
zhwiki-latest.json.gz
zhwiki-latest.txt
zhwiki-latest.txt.zip
zhwiki-multistre