In [None]:
import time
import json
import requests
from typing import List
from tqdm import tqdm
from datasets import Dataset
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET

In [None]:
def get_rss_feed(url: str, timeout: float = 30.0):
    """Download an RSS feed and return the titles and links of its items.

    Args:
        url: Address of the RSS XML document.
        timeout: Seconds passed to ``requests.get`` as its ``timeout`` so a
            stalled connection cannot block the notebook forever.

    Returns:
        A ``(titles, links)`` tuple of two equally long lists of
        whitespace-stripped strings; position ``i`` in both lists comes from
        the same ``<item>``. Items without a usable ``<link>`` are skipped;
        items without a ``<title>`` get an empty title.

    Raises:
        RuntimeError: If the server answers with a status other than 200.
        xml.etree.ElementTree.ParseError: If the response body is not
            well-formed XML.
    """
    rss_xml = requests.get(url, timeout=timeout)

    if rss_xml.status_code != 200:
        print('Not able to query RSS feed page')
        raise RuntimeError('Not able to query RSS feed page')

    root = ET.fromstring(rss_xml.content)

    titles, links = [], []
    for item in root.findall('.//item'):
        # findtext yields None for a missing element and '' for an empty one;
        # normalise both to '' instead of crashing on `.text.strip()`.
        link = (item.findtext('./link') or '').strip()
        if not link:
            # Nothing to download for this item, so keep it out of both lists
            # (appending to only one would misalign titles and links).
            continue
        titles.append((item.findtext('./title') or '').strip())
        links.append(link)

    print(f'{len(links)} found')

    return titles, links

def get_individual_article(url: str, stop_phrases: List[str], timeout: float = 30.0):
    """Fetch an article page and return its leading paragraphs as plain text.

    The text of every ``<p>`` element is collected in document order.
    Collection stops at the first paragraph that contains any of
    ``stop_phrases``; that paragraph and everything after it are dropped.

    Args:
        url: Address of the article page.
        stop_phrases: Substrings that mark the end of the article body.
            An empty list keeps every paragraph.
        timeout: Seconds passed to ``requests.get`` as its ``timeout`` so a
            stalled connection cannot block the notebook forever.

    Returns:
        The kept paragraphs joined with a blank line (``'\\n\\n'``); an empty
        string if no paragraph was kept.

    Raises:
        RuntimeError: If the server answers with a status other than 200.
    """
    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        print(f'Not able to query the article: {url}')
        raise RuntimeError(f'Not able to query the article: {url}')

    soup = BeautifulSoup(response.text, 'html.parser')

    article_content = []
    for p_tag in soup.find_all('p'):
        para = p_tag.get_text()
        # The break must leave the paragraph loop itself. Breaking only out of
        # a nested loop over the phrases would still append the paragraph and
        # make the stop phrases a no-op.
        if any(sp in para for sp in stop_phrases):
            break
        article_content.append(para)

    return '\n\n'.join(article_content)

In [None]:
# Seconds to pause between consecutive article downloads.
sleep_time = 1

# Paragraph markers handed to the article parser as its stop phrases:
# Hindi "read the full story" / "also read this story" style teasers plus the
# copyright footer.
phrases = [
    "पूरी खबर यहां पढ़ें...",
    "ये खबर भी पढ़ें...",
    "Copyright",
    "पढ़ें पूरी खबर...",
    "पूरी खबर पढ़ें...",
]

# bhaskar.com RSS category feeds whose items will be scraped.
rss_urls = [
    "https://www.bhaskar.com/rss-v1--category-7140.xml",
    "https://www.bhaskar.com/rss-v1--category-11215.xml",
    "https://www.bhaskar.com/rss-v1--category-7911.xml",
    "https://www.bhaskar.com/rss-v1--category-1051.xml",
    "https://www.bhaskar.com/rss-v1--category-11616.xml",
]

In [None]:
# Scrape every article linked from every configured RSS feed into `dataset`,
# a list of {'link', 'title', 'content'} records. The list is reset here so
# re-running the cell does not duplicate records.
dataset = []

for rss_url in rss_urls:

    print(f'Processing for {rss_url}')
    rss_titles, rss_feed = get_rss_feed(url=rss_url)

    for title, link in tqdm(zip(rss_titles, rss_feed), total=len(rss_feed)):
        try:
            article = get_individual_article(url=link, stop_phrases=phrases)
        except Exception as err:
            # Best effort: one bad article must not abort the crawl, but say
            # which one failed and why. tqdm.write keeps the progress bar intact.
            tqdm.write(f'Skipping {link}: {err!r}')
        else:
            dataset.append({
                'link': link,
                'title': title,
                'content': article
            })
        finally:
            # Pause after failures as well, so a run of errors does not turn
            # into back-to-back requests against the server.
            time.sleep(sleep_time)

In [None]:
# Turn the list of scraped article records into a `datasets.Dataset`.
ds = Dataset.from_list(dataset)

In [None]:
# Display the configured stop phrases for reference while inspecting scraped text.
phrases

In [None]:
# Spot check on one scraped article (record 4): show every paragraph of its
# content that contains a stop phrase. `any` prints such a paragraph once,
# even when it matches several phrases.
for elem in ds[4]['content'].split('\n\n'):
    if any(p in elem for p in phrases):
        print(elem)

In [None]:
# Show record 4's content with one paragraph per output line.
print(*ds[4]['content'].split('\n\n'), sep='\n')

In [None]:
# Persist the dataset to a directory relative to the notebook's working directory.
# NOTE(review): 'retreival' looks like a misspelling of 'retrieval'; the path is
# left unchanged because other code may already read from it — confirm before renaming.
# NOTE(review): the trailing folder name looks like a hand-written timestamp — confirm
# whether it should be updated or generated for each run.
ds.save_to_disk('../data/retreival/20231228-1604/')