In [1]:
import plyvel
from typing import List, Optional
from pydantic import BaseModel
import json
from bs4 import BeautifulSoup
from tqdm import tqdm
import pydantic


class NewsItem(pydantic.BaseModel):
    # "id": "26829137",
    # "title": "ﺋﯘﻛﺮﺍﺋﯩﻨﺎﻟﯩﻘﻼﺭ ﺯﯦﻤﯩﻦ ﺑﻪﺩﯨﻠﯩﮕﻪ ﺗﯩﻨﭽﻠﯩﻘﻘﺎ ﺋﯧﺮﯨﺸﯩﺸﻨﻰ ﺧﺎﻻﻳﺪﯨﻐﺎﻥ ﺑﻮﻟﯘﭖ ﻗﺎﻟﻤﺎﻗﺘﺎ",
    # "thumb": "https://cdnf.nur.cn/uploadfile/2024/0724/072419254719500000010094.jpgthumb_210_150.jpg",
    # "date_txt": "21 ﺳﺎﺋﻪﺕ ﺋﯩﻠﮕﯩﺮﻯ",
    # "bahanum": "0",
    # "copyfrom": "تېڭشۈن تورى",
    # "type": "news",
    # "url": "/news/2024/07/26829137.shtml"

    id: str
    title: str
    thumb: list | None = None
    date_txt: str | None = None
    bahanum: str | None = None
    copyfrom: str | None = None
    type: str | None = None
    url: str
    crawled: bool

class NewsContentRelate(pydantic.BaseModel):
    title: str
    url: str
    html_content: str

class NewsContent(pydantic.BaseModel):
    title: str
    url: str
    full_page_content:  str
    comes_from: str | None
    published_time: str
    comment_count: int
    tags: list[str]
    related: list[NewsContentRelate]


def html_to_paragraphs(html: str):
    soup = BeautifulSoup(html, 'html.parser')
    paragraphs = [p.get_text(strip=True) for p in soup.find_all('p') if p.get_text(strip=True)]
    return paragraphs

def save_paragraphs(paragraphs: list[str], count: int):
    with open(f'../../extracted/{str(count).zfill(4)}.json', 'w+', encoding='utf-8') as f:
        json.dump(paragraphs, f, ensure_ascii=False)


# Read the news list

In [2]:
with plyvel.DB("../../data/news_list.lvdb") as db:
    news_lists = [(k.decode(), v.decode()) for k, v in db.iterator()]

print(f"{len(news_lists)}")
print(news_lists[0:100])

250221
[('/news/2017/01/351865.shtml', '{"id":"351865","title":"ﺋﻪﻳﺪﯨﺰ ﻛﯧﺴﯩﻠﯩﻨﯩﯔ ﻛﯩﭽﯩﻚ ﺑﺎﻟﯩﻼﺭﺩﺍ ﻛﯚﺭﯛﻟﯩﺪﯨﻐﺎﻥ ﺩﻩﺳﻠﻪﭘﻜﻰ ﺋﺎﻻﻣﻪﺗﻠﯩﺮﻯ","thumb":["https://cdnf.nur.cn/uploadfile/2017/0102/20170102120556716.jpgthumb_210_150.jpg"],"date_txt":"7 ﻳﯩﻞ ﺋﯩﻠﮕﯩﺮﻯ","bahanum":"0","copyfrom":"تور دۇنياسى","type":"news","url":"/news/2017/01/351865.shtml","crawled":true}'), ('/news/2017/01/351875.shtml', '{"id":"351875","title":"ﺋﻪﻳﺪﯨﺰ ﻛﯧﺴﯩﻠﯩﻨﻰ ﺩﺍﯞﺍﻻﺷﺘﯩﻜﻰ ﻳﻪﺗﺘﻪ ﭼﻮﯓ ﻳﺎﻟﻐﺎﻥ ﮔﻪﭖ","thumb":["https://cdnf.nur.cn/uploadfile/2017/0102/20170102135601160.jpgthumb_210_150.jpg"],"date_txt":"7 ﻳﯩﻞ ﺋﯩﻠﮕﯩﺮﻯ","bahanum":"0","copyfrom":"تور دۇنياسى","type":"news","url":"/news/2017/01/351875.shtml","crawled":true}'), ('/news/2017/01/351965.shtml', '{"id":"351965","title":"ﻛﻪﭘﺘﻪﺭ ﮔﯚﺷﯩﻨﯩﯔ ﺋﻮﺯﯗﻗﻠﯘﻕ ﻗﯩﻤﻤﯩﺘﯩﻨﻰ ﺑﯩﻠﯩﯟﯦﻠﯩﯔ","thumb":["https://cdnf.nur.cn/uploadfile/2017/0102/20170102222739805.jpgthumb_210_150.jpg"],"date_txt":"7 ﻳﯩﻞ ﺋﯩﻠﮕﯩﺮﻯ","bahanum":"0","copyfrom":"نۇر تورى","type":"news","url":"/news/2017/01/351965.shtml","crawled"

## Read the database

In [3]:
import re
import text_cleaner

ignored_suffix_punctuations = set(text_cleaner.uyghur_symbols["تىنىش_بەلگىلىرى"])

news: list[list[str]] = []
news_objects: list[NewsContent] = []
with plyvel.DB('../../data/news_content.lvdb') as db:
    duplicate_storage: dict[str, bool] = {}
    
    for k, v in tqdm(db.iterator(), desc="preparing..."):
        news_item = NewsContent.model_validate_json(str(v, encoding='utf-8'))
        news_objects.append(news_item)
        html = news_item.full_page_content
        paragraphs = html_to_paragraphs(html)
        temp: list[str] = []
        for p in paragraphs:
            ps = [i for i in re.split(r"(?<=[^0-9\.])\.{1}(?=[ \n])+", p) if i.replace("\n", "").replace("\t", "").strip() != ""]
            ps: list[str]
            ps = [i if i[-1] in ignored_suffix_punctuations else f"{i}." for i in ps]
            for p in ps:
                if p in duplicate_storage:
                    continue
                duplicate_storage[p] = True
                temp.append(p)
            
        news.append(temp)
    print(f"Count of individual sentences: {len(duplicate_storage)}")
    del duplicate_storage

preparing...: 249806it [03:08, 1322.54it/s]


Count of individual sentences: 3885815


In [4]:

import text_cleaner

news = [[text_cleaner.clean_text(i) for i in n] for n in tqdm(news, desc="cleaning text")]
news = [[i for i in n if i != ""] for n in news]

cleaning text: 100%|██████████| 249806/249806 [01:57<00:00, 2119.71it/s]


In [5]:
count = 0
all_paragraphs: list[list[str]] = []
for p in tqdm(news):
    all_paragraphs.append(p)
    count += 1
    if count % 1000 == 0:
        save_paragraphs(all_paragraphs, count)
        all_paragraphs = []
#save the last part of the paragraphs to json file.
save_paragraphs(all_paragraphs, count)

print("done.")

100%|██████████| 249806/249806 [00:07<00:00, 33394.49it/s]

done.





# Clean the text

In [6]:
extracted_symbols = set("")
for p in tqdm(news[:]):
    for s in p:
        extracted_symbols.update(s)

print(extracted_symbols)
len(extracted_symbols)

100%|██████████| 249806/249806 [00:13<00:00, 18338.82it/s]

{'洼', '纯', '艮', '子', '浅', '框', '沛', '跛', '忠', '换', 'к', '苾', '版', '彤', '冷', '咕', '荫', '∶', '矿', '推', '薛', 'M', '膦', '审', '肿', '蚺', '珺', '0', '瞥', '芪', '钙', '🇩', '乍', '浠', '齿', '什', '桓', '็', '酷', '纬', '趣', '苒', '趾', '沽', '橫', '填', '食', '泡', 'を', '蝾', '诈', '嘛', '滚', '◈', '】', '溜', 'Đ', '秤', '漠', 'л', '而', 'X', '狈', '鳌', '恩', '骠', '우', 'θ', '熄', '潭', '蜓', '菪', '骼', '捧', '恒', '👆', '㥠', '椋', '牵', '疔', '肼', '址', '山', '放', '煤', '翼', 'が', '骷', '岈', '佐', '姚', '汗', '错', '都', '鞘', '纭', '蓖', '斜', '琎', '^', '盗', '又', '旗', '蔻', '健', '锚', '蝎', '捻', '.', '进', '등', '誉', '椿', '让', 'Ⅸ', '磨', '皋', '楞', '封', '察', '好', '杉', '荃', '祎', '刮', '炯', '6', '強', '迟', '涓', '株', '夕', '辫', '黏', '雳', '灿', '咯', '铨', '店', 'þ', '被', '湘', '钯', 'ว', '５', '侈', '官', '弱', '對', 'ศ', '甪', '婺', 'y', '德', '之', 'ř', '焉', '其', '诱', '宛', 'δ', '阳', '∆', 'ー', '呋', '加', '躇', '铋', '腿', '逸', '腐', '昇', '樽', '可', '烙', '鸭', '啸', '戚', 'か', '卓', '彬', '叡', '开', '霏', '识', '拷', '署', '醚', '镍', '砍', '麦', '沙', '明', '勃', '比', '浦', '术', '蜥', '髎', '瘁',




5314

# Save all symbols to json file as tokens.

In [7]:
tokens = [s for s in [text_cleaner.symbols_list]]
with open('tokens.json', 'w+', encoding='utf-8') as f:
    json.dump(tokens, f, ensure_ascii=False)


### Save all the news title into json file.

In [8]:
import csv
from text_cleaner import collapse_spaces, clean_extended_uyghur_characters, clean_http_links, collapse_spaces, clean_rare_symbols
with open("../../extracted/by_sentences/titles.tsv", "w") as f:
    writer = csv.DictWriter(f, delimiter="\t", fieldnames=["id", "title_ug"])
    writer.writeheader()
    rows = [{"id": f"{i.url}|0", "title_ug": clean_extended_uyghur_characters(collapse_spaces(clean_rare_symbols(clean_http_links(i.title))))} for i in tqdm(news_objects)]
    writer.writerows(rows)
print("[OK] of save all the titles.")

100%|██████████| 249806/249806 [00:04<00:00, 51753.64it/s]


[OK] of save all the titles.


### first sentences of all the news content

In [10]:
import csv
from text_cleaner import collapse_spaces, clean_extended_uyghur_characters, clean_http_links, collapse_spaces, clean_rare_symbols

max_paragraph_count = max([len(ss) for ss in news])

for i in tqdm(range(max_paragraph_count), ncols=200, desc="extracting sub sentences"):
    sentence_index = i
    with open(f"../../extracted/by_sentences/sub_sentences_{sentence_index + 1}.tsv", "w") as f:
        writer = csv.DictWriter(f, delimiter="\t", fieldnames=["id", "title_ug"])
        writer.writeheader()
        first_sentences = [(news_index_with_paragraphs[0], news_index_with_paragraphs[1][sentence_index]) for news_index_with_paragraphs in [(news_index, paragraphs) for news_index, paragraphs in enumerate(news) if len(paragraphs) > sentence_index]]
        rows = [{"id": f"{news_objects[i[0]].url}|{sentence_index + 1}", "title_ug": clean_extended_uyghur_characters(collapse_spaces(clean_rare_symbols(clean_http_links(i[1]))))} for i in tqdm(first_sentences, desc=f"cleaning item {sentence_index}...")]
        writer.writerows(rows)
print("[OK] of save all the titles.")

cleaning item 0...: 100%|██████████| 243344/243344 [00:09<00:00, 24553.88it/s]                                                                                                  | 0/436 [00:00<?, ?it/s]
cleaning item 1...: 100%|██████████| 231253/231253 [00:09<00:00, 25170.46it/s]                                                                                        | 1/436 [00:11<1:23:02, 11.45s/it]
cleaning item 2...: 100%|██████████| 220406/220406 [00:08<00:00, 26830.67it/s]                                                                                        | 2/436 [00:21<1:18:53, 10.91s/it]
cleaning item 3...: 100%|██████████| 204308/204308 [00:07<00:00, 27300.43it/s]                                                                                        | 3/436 [00:31<1:14:04, 10.26s/it]
cleaning item 4...: 100%|██████████| 188773/188773 [00:06<00:00, 28814.59it/s]                                                                                        | 4/436 [00:40<1:09:07,  9.60s

[OK] of save all the titles.



