In [7]:
import plyvel
from typing import List, Optional
from pydantic import BaseModel
import json
from bs4 import BeautifulSoup
from tqdm import tqdm


class RelatedItem(BaseModel):
    """A related-article entry, as listed in ``NewsItem.related``."""
    title: str
    url: str
    # Presumably the raw HTML of the related entry — TODO confirm with the producer.
    html_content: str

class NewsItem(BaseModel):
    """Schema of one stored news record.

    Each database value read by this notebook is validated against this
    model with ``NewsItem.model_validate_json``.
    """
    title: str
    url: str
    # HTML of the article page; the notebook feeds it to html_to_paragraphs.
    full_page_content: str
    # Presumably the publishing source of the article — TODO confirm.
    comes_from: str
    # Kept as a plain string; this model does no date parsing.
    published_time: str
    comment_count: int
    tags: List[str]
    related: List[RelatedItem]


def html_to_paragraphs(html: str) -> list[str]:
    """Extract the non-empty text of every ``<p>`` element in ``html``.

    Args:
        html: HTML markup to parse with the ``html.parser`` backend.

    Returns:
        One string per ``<p>`` tag, obtained with ``get_text(strip=True)``,
        omitting paragraphs whose extracted text is empty.
    """
    soup = BeautifulSoup(html, 'html.parser')
    # Extract each paragraph's text exactly once, then drop the empty ones
    # (the text extraction walks the whole subtree, so doing it twice per
    # paragraph doubles the work for no benefit).
    texts = (p.get_text(strip=True) for p in soup.find_all('p'))
    return [text for text in texts if text]

def save_paragraphs(paragraphs: list[str], count: int) -> None:
    """Write ``paragraphs`` as a JSON array to ``extracted/<count>.json``.

    The file name is ``count`` zero-padded to at least four digits. The
    ``extracted`` directory (relative to the current working directory) is
    created if it does not exist yet, so the first call on a fresh checkout
    does not fail with ``FileNotFoundError``.

    Args:
        paragraphs: Strings to serialise.
        count: Number used to build the output file name.
    """
    import os  # local import so this cell works on its own

    os.makedirs('extracted', exist_ok=True)
    path = os.path.join('extracted', f'{str(count).zfill(4)}.json')
    # Plain 'w' is sufficient: the file is only written, never read back here.
    with open(path, 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps non-ASCII characters unescaped in the file.
        json.dump(paragraphs, f, ensure_ascii=False)


## Read the database

In [None]:

# Load every record from the LevelDB store, keeping both the validated
# NewsItem and the paragraph texts extracted from its page HTML.
news: list[list[str]] = []
news_objects: list[NewsItem] = []
with plyvel.DB('../../data/news_content.lvdb') as db:
    for _key, raw_value in tqdm(db.iterator(), desc="preparing..."):
        # Values are stored as UTF-8 encoded JSON documents.
        record = NewsItem.model_validate_json(raw_value.decode('utf-8'))
        news_objects.append(record)
        news.append(html_to_paragraphs(record.full_page_content))

preparing...: 164703it [01:51, 1477.68it/s]


preparing...: 57102it [00:40, 1542.52it/s]

In [None]:

import text_cleaner
import re

# Matches "http://" or "https://" followed by a run of non-whitespace characters.
_HTTP_LINK_PATTERN = re.compile(r'https?://[^\s]+')


def clean_http_links(text: str) -> str:
    """Return ``text`` with every http/https URL removed.

    A URL is taken to extend up to the next whitespace character, so any
    punctuation or other text attached directly to it is removed as well.
    """
    return _HTTP_LINK_PATTERN.sub('', text)

# Run every paragraph of every article through text_cleaner.clean_text.
# `news` is rebound only after the whole pass has finished, so an interrupted
# run leaves the previous value untouched.
news = [
    list(map(text_cleaner.clean_text, article))
    for article in tqdm(news, desc="cleaning links")
]

NameError: name 'tqdm' is not defined

cleaning links:   9%|▉         | 15116/164703 [00:10<01:39, 1508.25it/s]


KeyboardInterrupt: 

In [None]:
# Write the paragraphs out in batches of BATCH_SIZE articles. Each file is
# named after the running article count at the moment it is written.
BATCH_SIZE = 1000
count = 0
all_paragraphs: list[str] = []
for p in news:
    all_paragraphs.extend(p)
    count += 1
    if count % BATCH_SIZE == 0:
        save_paragraphs(all_paragraphs, count)
        all_paragraphs = []
# Save the trailing partial batch. Skip it when the article count is an exact
# multiple of BATCH_SIZE (including zero): the file for `count` was already
# written inside the loop, and saving again would overwrite it with an empty
# list and lose that batch.
if count % BATCH_SIZE != 0:
    save_paragraphs(all_paragraphs, count)

print("done.")

# Clean the text

In [None]:
# Collect every distinct character that occurs in any paragraph of any article.
extracted_symbols = {
    char
    for article in news
    for paragraph in article
    for char in paragraph
}

print(extracted_symbols)
len(extracted_symbols)

# Save all symbols to a JSON file as tokens.

In [None]:
# NOTE(review): this wraps text_cleaner.symbols_list in an outer one-element
# list, so tokens.json holds a nested structure rather than the symbols
# themselves at the top level. If a flat list was intended, iterate over
# text_cleaner.symbols_list directly — confirm against whatever reads the file.
tokens = [text_cleaner.symbols_list]
with open('tokens.json', 'w+', encoding='utf-8') as f:
    # ensure_ascii=False keeps non-ASCII symbols unescaped in the file.
    f.write(json.dumps(tokens, ensure_ascii=False))
