In [None]:
import json
from pathlib import Path
from typing import List, Optional

import plyvel
import pydantic
from bs4 import BeautifulSoup
from pydantic import BaseModel
from tqdm import tqdm


class NewsItem(BaseModel):
    """One entry of the news list.

    The field names match the JSON records printed from ``news_list.lvdb``
    in this notebook. Shape of a record (Uyghur text values elided)::

        {
            "id": "26829137",
            "title": "<Uyghur headline>",
            "thumb": ["https://cdnf.nur.cn/uploadfile/...thumb_210_150.jpg"],
            "date_txt": "<relative date text>",
            "bahanum": "0",
            "copyfrom": "<source name>",
            "type": "news",
            "url": "/news/2024/07/26829137.shtml",
            "crawled": true
        }
    """

    # Required identifier and headline.
    id: str
    title: str
    # Thumbnail URLs; a list in the stored records shown in this notebook.
    thumb: Optional[list] = None
    # Human-readable relative date text.
    date_txt: Optional[str] = None
    # NOTE(review): presumably a comment counter ("0" in the samples) - confirm.
    bahanum: Optional[str] = None
    # Name of the site the article was copied from.
    copyfrom: Optional[str] = None
    type: Optional[str] = None
    # Site-relative article path; also the LevelDB key in the printed sample.
    url: str
    # NOTE(review): presumably set once the article body has been fetched - confirm.
    crawled: bool

class NewsContentRelate(BaseModel):
    """A related-article entry nested inside ``NewsContent.related``."""

    # Headline of the related article.
    title: str
    # Link to the related article.
    url: str
    # HTML associated with the related article.
    html_content: str

class NewsContent(BaseModel):
    """A crawled article, validated from the JSON values in ``news_content.lvdb``."""

    title: str
    url: str
    # Article HTML; later split into paragraphs with ``html_to_paragraphs``.
    full_page_content: str
    # Required key, but its value may be null.
    comes_from: Optional[str]
    published_time: str
    comment_count: int
    tags: list[str]
    related: list[NewsContentRelate]


def html_to_paragraphs(html: str) -> list[str]:
    """Return the non-empty text of every ``<p>`` element in ``html``.

    Args:
        html: HTML markup, parsed with BeautifulSoup's ``html.parser``.

    Returns:
        The ``get_text(strip=True)`` text of each ``<p>`` tag, in the order
        ``find_all`` yields them, skipping tags whose text is empty.
    """
    soup = BeautifulSoup(html, 'html.parser')
    # Extract each tag's text exactly once, then drop the empty results.
    stripped_texts = (p.get_text(strip=True) for p in soup.find_all('p'))
    return [text for text in stripped_texts if text]

def save_paragraphs(paragraphs: list[str], count: int, out_dir: str = '../../extracted') -> None:
    """Write ``paragraphs`` as a JSON array to ``<out_dir>/<count>.json``.

    Args:
        paragraphs: Paragraph strings to serialize.
        count: Number used for the file name, zero-padded to at least four
            digits (``7`` -> ``0007.json``; longer numbers are not truncated).
        out_dir: Directory that receives the file. Defaults to the
            notebook's ``../../extracted`` folder and is created if missing.
    """
    target_dir = Path(out_dir)
    # Create the output folder up front so a fresh checkout does not fail here.
    target_dir.mkdir(parents=True, exist_ok=True)
    out_path = target_dir / f'{str(count).zfill(4)}.json'
    # ensure_ascii=False keeps non-ASCII text readable instead of \uXXXX escapes.
    with open(out_path, 'w', encoding='utf-8') as f:
        json.dump(paragraphs, f, ensure_ascii=False)


# Read the news list

In [12]:
# Load every (key, value) record of the news-list LevelDB as decoded strings.
with plyvel.DB("../../data/news_list.lvdb") as db:
    news_lists = [(k.decode(), v.decode()) for k, v in db.iterator()]

print(f"{len(news_lists)}")
# Preview only a few records; dumping 100 full JSON strings floods the output.
PREVIEW_COUNT = 3
print(news_lists[:PREVIEW_COUNT])

313160
[('/news/2017/01/351865.shtml', '{"id":"351865","title":"ﺋﻪﻳﺪﯨﺰ ﻛﯧﺴﯩﻠﯩﻨﯩﯔ ﻛﯩﭽﯩﻚ ﺑﺎﻟﯩﻼﺭﺩﺍ ﻛﯚﺭﯛﻟﯩﺪﯨﻐﺎﻥ ﺩﻩﺳﻠﻪﭘﻜﻰ ﺋﺎﻻﻣﻪﺗﻠﯩﺮﻯ","thumb":["https://cdnf.nur.cn/uploadfile/2017/0102/20170102120556716.jpgthumb_210_150.jpg"],"date_txt":"7 ﻳﯩﻞ ﺋﯩﻠﮕﯩﺮﻯ","bahanum":"0","copyfrom":"تور دۇنياسى","type":"news","url":"/news/2017/01/351865.shtml","crawled":true}'), ('/news/2017/01/351875.shtml', '{"id":"351875","title":"ﺋﻪﻳﺪﯨﺰ ﻛﯧﺴﯩﻠﯩﻨﻰ ﺩﺍﯞﺍﻻﺷﺘﯩﻜﻰ ﻳﻪﺗﺘﻪ ﭼﻮﯓ ﻳﺎﻟﻐﺎﻥ ﮔﻪﭖ","thumb":["https://cdnf.nur.cn/uploadfile/2017/0102/20170102135601160.jpgthumb_210_150.jpg"],"date_txt":"7 ﻳﯩﻞ ﺋﯩﻠﮕﯩﺮﻯ","bahanum":"0","copyfrom":"تور دۇنياسى","type":"news","url":"/news/2017/01/351875.shtml","crawled":true}'), ('/news/2017/01/351965.shtml', '{"id":"351965","title":"ﻛﻪﭘﺘﻪﺭ ﮔﯚﺷﯩﻨﯩﯔ ﺋﻮﺯﯗﻗﻠﯘﻕ ﻗﯩﻤﻤﯩﺘﯩﻨﻰ ﺑﯩﻠﯩﯟﯦﻠﯩﯔ","thumb":["https://cdnf.nur.cn/uploadfile/2017/0102/20170102222739805.jpgthumb_210_150.jpg"],"date_txt":"7 ﻳﯩﻞ ﺋﯩﻠﮕﯩﺮﻯ","bahanum":"0","copyfrom":"نۇر تورى","type":"news","url":"/news/2017/01/351965.shtml","crawled"

## Read the database

In [None]:

# Validate every stored article and split its HTML into paragraphs.
# ``news[i]`` holds the paragraph texts of ``news_objects[i]``.
news: list[list[str]] = []
news_objects: list[NewsContent] = []
with plyvel.DB('../../data/news_content.lvdb') as db:
    for _key, raw_value in tqdm(db.iterator(), desc="preparing..."):
        # Values are UTF-8 encoded JSON documents.
        article = NewsContent.model_validate_json(raw_value.decode('utf-8'))
        news_objects.append(article)
        news.append(html_to_paragraphs(article.full_page_content))

preparing...: 231093it [02:57, 1305.54it/s]


In [3]:

import text_cleaner
import re

def clean_http_links(text: str) -> str:
    """Return ``text`` with every ``http://`` / ``https://`` URL removed.

    A URL is matched from its scheme up to (not including) the next
    whitespace character, so punctuation glued to the link is removed too.

    NOTE(review): no visible cell calls this function, and a later cell
    rebinds the name via ``from text_cleaner import clean_http_links``.
    """
    url_pattern = re.compile(r'https?://[^\s]+')
    return url_pattern.sub('', text)

# Run text_cleaner.clean_text over every paragraph, keeping the per-article nesting.
# NOTE(review): this rebinds `news`, so re-running the cell cleans already-cleaned
# text, and the progress label says "links" although the full cleaner is applied.
news = [
    list(map(text_cleaner.clean_text, article))
    for article in tqdm(news, desc="cleaning links")
]

cleaning links: 100%|██████████| 231093/231093 [02:28<00:00, 1552.77it/s]


In [4]:
# Number of articles whose paragraphs go into one output file.
CHUNK_SIZE = 1000

count = 0
all_paragraphs: list[str] = []
for p in tqdm(news):
    all_paragraphs.extend(p)
    count += 1
    if count % CHUNK_SIZE == 0:
        # Flush a full chunk; the file is named after the running article count.
        save_paragraphs(all_paragraphs, count)
        all_paragraphs = []
# Save the trailing partial chunk. Skip it when the last article completed a
# chunk (or there were no articles): that file name was just written, and saving
# again would overwrite it with an empty list.
if count % CHUNK_SIZE != 0:
    save_paragraphs(all_paragraphs, count)

print("done.")

100%|██████████| 231093/231093 [00:06<00:00, 33603.69it/s]

done.





# Clean the text

In [5]:
# Collect every distinct character that occurs in any cleaned paragraph.
extracted_symbols: set[str] = set()
for paragraphs in tqdm(news):
    # set.update accepts several iterables; each paragraph string adds its characters.
    extracted_symbols.update(*paragraphs)

print(extracted_symbols)
len(extracted_symbols)

100%|██████████| 231093/231093 [00:12<00:00, 17879.73it/s]

{'ش', '>', '%', '<', '€', 'گ', '¥', '.', 's', 'p', '|', '-', 'j', 'ى', 'u', 'چ', 'k', 'د', '_', 'ق', 'y', '}', '8', '"', 'l', 'ۋ', 'غ', 'ج', 'v', 'c', '^', '#', '*', 'ە', 'ۆ', 'h', ']', 'ت', '(', '£', 'o', '‹', 'س', '«', 'f', 'خ', '›', ')', 'b', 'w', 'ر', '7', '6', '@', '?', 'r', 'ۇ', '»', 'ا', '0', 'ئ', 'e', 'ژ', '+', 'z', 'i', 'ې', 'ك', 'ز', 'ب', ':', '1', '=', 'ف', '3', 'n', 'ۈ', '5', 'پ', '[', 'ل', '{', '2', 'م', '9', 'q', '4', 'g', 'ي', ' ', 'و', '/', 'a', '،', 'ن', 't', 'ڭ', 'm', '؛', '؟', "'", '$', 'x', '!', 'd', 'ھ'}





106

# Save all symbols to a JSON file as tokens.

In [6]:
# Dump text_cleaner.symbols_list to tokens.json.
# NOTE(review): `tokens` is a one-element list wrapping symbols_list, so the JSON
# has one extra level of nesting. If a flat token list was intended this should
# be list(text_cleaner.symbols_list) - confirm against whatever reads tokens.json.
tokens = [text_cleaner.symbols_list]
with open('tokens.json', 'w+', encoding='utf-8') as token_file:
    token_file.write(json.dumps(tokens, ensure_ascii=False))


### Save all the news titles to a TSV file.

In [7]:
import csv
from text_cleaner import collapse_spaces, clean_extended_uyghur_characters, clean_http_links, collapse_spaces, clean_rare_symbols
# Write one (url, cleaned title) row per article to a tab-separated file.
# encoding="utf-8" is explicit so Uyghur titles do not depend on the platform
# default encoding; newline="" lets the csv module control line endings itself.
with open("../../extracted/titles.tsv", "w", encoding="utf-8", newline="") as f:
    writer = csv.DictWriter(f, delimiter="\t", fieldnames=["url", "title_ug"])
    writer.writeheader()
    # Cleaning order: strip links, strip rare symbols, collapse spaces, then
    # apply clean_extended_uyghur_characters.
    rows = [
        {
            "url": i.url,
            "title_ug": clean_extended_uyghur_characters(
                collapse_spaces(clean_rare_symbols(clean_http_links(i.title)))
            ),
        }
        for i in tqdm(news_objects)
    ]
    writer.writerows(rows)
print("[OK] of save all the titles.")

100%|██████████| 231093/231093 [00:05<00:00, 43752.35it/s]


[OK] of save all the titles.
