In [4]:
import json
from pathlib import Path
from typing import List, Optional

import plyvel
from bs4 import BeautifulSoup
from pydantic import BaseModel
from tqdm import tqdm


class RelatedItem(BaseModel):
    """A related-article entry nested inside a ``NewsItem`` (its ``related`` list).

    All three fields are plain strings; nothing here parses or validates the
    URL or the HTML.
    """
    # Headline of the related article.
    title: str
    # Link to the related article. The sample record printed at the end of this
    # notebook shows site-relative paths such as '/news/2024/07/....shtml'.
    url: str
    # Raw HTML snippet for the entry. In the sample record it is an <a> element
    # wrapping an <img> and an <h4> that repeats the title.
    html_content: str

class NewsItem(BaseModel):
    """One news record as stored in the LevelDB database read by this notebook.

    Instances are built with ``NewsItem.model_validate_json`` from the UTF-8
    JSON values returned by the database iterator.
    """
    # Headline of the article.
    title: str
    # Link to the article (format not validated here).
    url: str
    # HTML of the article page; passed to html_to_paragraphs to extract text.
    full_page_content: str
    # presumably the source/publisher of the article — TODO confirm
    comes_from: str
    # Kept as a plain string; the timestamp format is not validated here.
    published_time: str
    # presumably the number of reader comments — TODO confirm
    comment_count: int
    # Tag strings attached to the article.
    tags: List[str]
    # Related-article entries attached to this article.
    related: List[RelatedItem]


def html_to_paragraphs(html: str) -> list[str]:
    """Extract the non-empty text of every ``<p>`` element in an HTML string.

    Args:
        html: HTML markup to parse (parsed with the built-in 'html.parser').

    Returns:
        The whitespace-stripped text of each ``<p>`` element, in document
        order. Paragraphs whose stripped text is empty are dropped.
    """
    soup = BeautifulSoup(html, 'html.parser')
    # Extract each paragraph's text once; it is needed both to filter out
    # empty paragraphs and as the value that is kept.
    texts = (p.get_text(strip=True) for p in soup.find_all('p'))
    return [text for text in texts if text]

def save_paragraphs(paragraphs: list[str], count: int, out_dir: str = '../../extracted') -> None:
    """Write a list of paragraphs to ``<out_dir>/<count>.json`` as a JSON array.

    Args:
        paragraphs: Paragraph strings to serialise.
        count: Number used for the file name, zero-padded to at least four
            digits (longer numbers are written unpadded, e.g. ``12345.json``).
        out_dir: Directory to write into. It is created, including parents,
            if it does not exist yet. Defaults to the original hard-coded
            location.

    The file is UTF-8 encoded and non-ASCII characters are written as-is
    (``ensure_ascii=False``). An existing file of the same name is overwritten.
    """
    target_dir = Path(out_dir)
    # A missing output directory would otherwise fail on the first save.
    target_dir.mkdir(parents=True, exist_ok=True)
    target = target_dir / f'{str(count).zfill(4)}.json'
    # Plain write mode is enough: the file is never read back through this handle.
    with open(target, 'w', encoding='utf-8') as f:
        json.dump(paragraphs, f, ensure_ascii=False)


## Read the database

In [None]:

# Decode every stored record into a NewsItem and extract its paragraph texts.
# The two lists stay aligned: news[i] holds the paragraphs of news_objects[i].
news: list[list[str]] = []
news_objects: list[NewsItem] = []
with plyvel.DB('../../data/news_content.lvdb') as db:
    # Only the values are used; each one is a UTF-8 encoded JSON document.
    for _key, raw_value in tqdm(db.iterator(), desc="preparing..."):
        news_item = NewsItem.model_validate_json(raw_value.decode('utf-8'))
        news_objects.append(news_item)
        news.append(html_to_paragraphs(news_item.full_page_content))

preparing...: 164703it [01:58, 1388.50it/s]


In [3]:

import text_cleaner
import re

def clean_http_links(text: str) -> str:
    """Return ``text`` with every http:// or https:// URL removed.

    A URL is matched from its scheme up to the next whitespace character and
    replaced with the empty string; surrounding whitespace is left untouched.
    Note: no call to this helper is visible in this part of the notebook.
    """
    url_pattern = re.compile(r'https?://[^\s]+')
    return url_pattern.sub('', text)

# Run every paragraph of every article through text_cleaner.clean_text and
# rebind `news` to the result (same nesting: one list of strings per article).
# The progress label says "links", but the call made is clean_text.
news = [
    list(map(text_cleaner.clean_text, article))
    for article in tqdm(news, desc="cleaning links")
]

cleaning links:   0%|          | 0/164703 [00:00<?, ?it/s]

cleaning links: 100%|██████████| 164703/164703 [01:41<00:00, 1627.19it/s]


In [6]:
# Write the paragraphs to disk in chunks: every ARTICLES_PER_FILE articles the
# accumulated paragraphs are flushed to a file named after the running count.
ARTICLES_PER_FILE = 1000
count = 0
all_paragraphs: list[str] = []
for p in tqdm(news):
    all_paragraphs.extend(p)
    count += 1
    if count % ARTICLES_PER_FILE == 0:
        save_paragraphs(all_paragraphs, count)
        all_paragraphs = []
# Save the trailing partial chunk. Skip it when the article count is an exact
# multiple of the chunk size: the last chunk was already written inside the
# loop, and saving again under the same count would overwrite that file with
# an empty list.
if count % ARTICLES_PER_FILE != 0:
    save_paragraphs(all_paragraphs, count)

print("done.")

100%|██████████| 164703/164703 [00:04<00:00, 34625.23it/s]


done.


# Clean the text

In [8]:
# Collect the set of distinct characters that occur anywhere in the paragraphs.
extracted_symbols: set[str] = set()
for p in tqdm(news):
    # Each paragraph is a string, so unpacking feeds its characters to update().
    extracted_symbols.update(*p)

print(extracted_symbols)
len(extracted_symbols)

100%|██████████| 164703/164703 [00:09<00:00, 16671.72it/s]

{']', 'ژ', 'ى', 'd', 'ل', 'z', 'ۆ', 'g', 'پ', 'r', 'ي', '$', 'ق', 'k', '6', 'p', 'ش', '/', 'ز', '.', 'u', '"', 'f', '(', '8', 'س', 'e', '؟', 'i', 't', 'l', 'م', '>', '9', '7', '«', ':', 'ئ', 'ۈ', '1', 'ۇ', "'", '5', '<', '[', 'ە', ' ', 'چ', '{', 'w', 'ۋ', '-', 'ب', 'ت', 'ې', '#', 'ن', '4', 'm', 'خ', 'گ', '^', 'ك', '|', 'c', 'غ', '0', '}', 's', '2', '›', '،', '€', 'ا', '+', 'ف', 'n', '؛', '*', 'ھ', '3', 'j', 'o', 'ڭ', 'a', '=', '!', 'h', 'y', 'x', 'و', 'ج', 'q', 'b', 'د', '%', '»', '‹', '¥', ')', '?', '@', 'v', '£', 'ر', '_'}





106

# Save all symbols to a JSON file as tokens.

In [None]:
# Dump text_cleaner.symbols_list to tokens.json (UTF-8, non-ASCII kept as-is).
# NOTE(review): the value is wrapped in an outer one-element list, so the file
# contains [symbols_list] rather than symbols_list itself — confirm whether a
# flat list of symbols was intended.
tokens = [text_cleaner.symbols_list]
with open('tokens.json', 'w+', encoding='utf-8') as token_file:
    token_file.write(json.dumps(tokens, ensure_ascii=False))


### Save all the news titles into a JSON file.

In [10]:
# Inspect the related-article entries attached to the first news item.
first_related = news_objects[0].related
print(first_related)

[RelatedItem(title='يەر شارى بويىچە ھەر مىنۇتتا بىر ئادەم ئەيدىز كېسىلى بىلەن ئۆلىۋاتىدۇ', url='/news/2024/07/26828077.shtml', html_content='<a href="/news/2024/07/26828077.shtml">\n<img alt="يەر شارى بويىچە ھەر مىنۇتتا بىر ئادەم ئەيدىز كېسىلى بىلەن ئۆلىۋاتىدۇ" src="https://cdnf.nur.cn/uploadfile/2024/0724/072413590533400000023179.jpg"/>\n<h4>يەر شارى بويىچە ھەر مىنۇتتا بىر ئادەم ئەيدىز كېسىلى بىلەن ئۆلىۋاتىدۇ</h4>\n</a>'), RelatedItem(title='ئەيدىز كېسىلىنىڭ جىددىي خاراكتېرلىك مەزگىلىدە قانداق ئىپادىلەر كۆرۈلىدۇ؟', url='/news/2024/07/26791201.shtml', html_content='<a href="/news/2024/07/26791201.shtml">\n<img alt="ئەيدىز كېسىلىنىڭ جىددىي خاراكتېرلىك مەزگىلىدە قانداق ئىپادىلەر كۆرۈلىدۇ؟" src="https://cdnf.nur.cn/uploadfile/2024/0715/2024071568b7a2b906d4930ec52d22806c95d9d100017234601.jpg"/>\n<h4>ئەيدىز كېسىلىنىڭ جىددىي خاراكتېرلىك مەزگىلىدە قانداق ئىپادىلەر كۆرۈلىدۇ؟</h4>\n</a>'), RelatedItem(title='\u200bئەيدىز ۋىرۇسى بىلەن يېڭى يۇقۇملانغاندا قانداق ئالامەتلەر بولىدۇ؟', url='/news/202