In [None]:
import time
import json
import requests
from typing import List
from tqdm import tqdm
from datasets import Dataset
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET

In [None]:
def get_rss_feed(url: str, timeout: float = 30.0):
    """Download an RSS feed and return the titles and links of its items.

    Args:
        url (str): Address of the RSS XML document.
        timeout (float): Seconds to wait for the server before `requests`
            gives up and raises. Defaults to 30.

    Returns:
        A `(titles, links)` tuple of two equally long lists of strings, where
        `titles[k]` belongs to `links[k]`. Items without a usable `<link>`
        are left out; items without a `<title>` get an empty title.

    Raises:
        RuntimeError: If the server does not answer with HTTP 200.
    """
    rss_xml = requests.get(url, timeout=timeout)

    if rss_xml.status_code != 200:
        print('Not able to query RSS feed page')
        raise RuntimeError('Not able to query RSS feed page')

    root = ET.fromstring(rss_xml.content)

    titles, links = [], []
    for item in root.findall('.//item'):
        # findtext() yields the default for a missing element and '' for an
        # empty one, so a malformed item can no longer raise AttributeError.
        link = item.findtext('./link', default='').strip()
        if not link:
            # Without a link there is nothing to download for this item.
            continue
        titles.append(item.findtext('./title', default='').strip())
        links.append(link)

    print(f'{len(links)} found')

    return titles, links

def get_individual_article(url: str, stop_phrases: List[str], timeout: float = 30.0):
    """Download an article page and return its paragraph text.

    The text of every `<p>` element is collected. A paragraph that contains
    any of the stop phrases is left out; all other paragraphs are kept in
    page order.

    Args:
        url (str): Address of the article page.
        stop_phrases (List[str]): Substrings that mark a paragraph as
            boilerplate that must not end up in the article text.
        timeout (float): Seconds to wait for the server before `requests`
            gives up and raises. Defaults to 30.

    Returns:
        str: The kept paragraphs joined by a blank line (two newlines).

    Raises:
        RuntimeError: If the server does not answer with HTTP 200.
    """
    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        print(f'Not able to query the article: {url}')
        raise RuntimeError(f'Not able to query the article: {url}')

    soup = BeautifulSoup(response.text, 'html.parser')
    all_paras = [p.get_text() for p in soup.find_all('p')]

    # The previous inner-loop `break` only left the phrase loop, so every
    # paragraph was appended regardless; filter explicitly instead.
    article_content = [
        para for para in all_paras
        if not any(sp in para for sp in stop_phrases)
    ]

    return '\n\n'.join(article_content)

In [None]:
# RSS category feeds on bhaskar.com that are scraped for articles
rss_urls = [
    'https://www.bhaskar.com/rss-v1--category-7140.xml',
    'https://www.bhaskar.com/rss-v1--category-11215.xml',
    'https://www.bhaskar.com/rss-v1--category-7911.xml',
    'https://www.bhaskar.com/rss-v1--category-1051.xml',
    'https://www.bhaskar.com/rss-v1--category-11616.xml',
]

# Stop phrases handed to get_individual_article for every scraped page
phrases = [
    'पूरी खबर यहां पढ़ें...',
    'ये खबर भी पढ़ें...',
    'Copyright',
    'पढ़ें पूरी खबर...',
    'पूरी खबर पढ़ें...',
    'This website follows the DNPA Code of Ethics',
]

# Pause between two article requests, in seconds (passed to time.sleep)
sleep_time = 1

In [None]:
# Scrape every article listed in every RSS feed into a list of
# {'link', 'title', 'content'} records.
dataset = []

for rss_url in rss_urls:

    print(f'Processing for {rss_url}')
    rss_titles, rss_feed = get_rss_feed(url=rss_url)

    for title, link in tqdm(zip(rss_titles, rss_feed), total=len(rss_feed)):
        try:
            article = get_individual_article(url=link, stop_phrases=phrases)
        except Exception as err:
            # Best effort: one broken article must not abort the whole crawl,
            # but report it so missing rows can be explained afterwards.
            # tqdm.write keeps the progress bar intact.
            tqdm.write(f'Skipping {link}: {err!r}')
        else:
            dataset.append({
                'link': link,
                'title': title,
                'content': article
            })
        finally:
            # Throttle after failures too; `continue` used to skip the pause
            # and hit the server again immediately.
            time.sleep(sleep_time)

In [None]:
# Turn the list of scraped record dicts into a `datasets.Dataset`
ds = Dataset.from_list(dataset)

In [None]:
# Display the stop phrases currently in use
phrases

In [None]:
# Spot check on the record at index 4: list the paragraphs that contain a
# stop phrase. A paragraph appears once for every phrase it matches.
sample_paras = ds[4]['content'].split('\n\n')
matches = [para for para in sample_paras for phrase in phrases if phrase in para]
for para in matches:
    print(para)

In [None]:
# Print the record at index 4 with one paragraph per output line
print('\n'.join(ds[4]['content'].split('\n\n')))

In [None]:
# Persist the raw scraped dataset. The directory must match the path that the
# cleaning section reads with load_from_disk ('retrieval', not 'retreival').
ds.save_to_disk('../data/retrieval/20231228-1604/')

Verifying and cleaning dataset

In [None]:
import os
import json
from typing import List
from datasets import load_from_disk, Dataset

In [None]:
# Reload the raw scraped articles from disk
ds = load_from_disk('../data/retrieval/20231228-1604/')

# Paragraphs containing any of these phrases are dropped by the cleaning loop
phrases = [
    'पूरी खबर यहां पढ़ें...',
    'ये खबर भी पढ़ें...',
    'Copyright',
    'पढ़ें पूरी खबर...',
    'पूरी खबर पढ़ें...',
    'This website follows the DNPA Code of Ethics',
]

In [None]:
# Build the cleaned records: strip stop-phrase paragraphs, drop articles that
# end up too short, and prepend the title to the remaining text.
cleaned_dataset = []

for record in ds:
    # Keep only the paragraphs that contain none of the stop phrases
    kept_paras = [
        para
        for para in record['content'].split('\n\n')
        if not any(phrase in para for phrase in phrases)
    ]

    # Articles left with fewer than 5 paragraphs are discarded
    if len(kept_paras) >= 5:
        cleaned_dataset.append({
            'link': record['link'],
            'content': '\n\n'.join([record['title'], *kept_paras]),
        })

print(len(cleaned_dataset))

In [None]:
# Inspect one cleaned record (index 1) to eyeball the result
cleaned_dataset[1]

In [None]:
# Wrap the cleaned record dicts in a `datasets.Dataset` so they can be filtered
cleaned_ds = Dataset.from_list(cleaned_dataset)

In [None]:
# Look up one specific article in the cleaned dataset by its link
a = cleaned_ds.filter(lambda x: x['link'] == 'https://www.bhaskar.com/business/news/business-events-today-share-market-petrol-diesel-gold-silver-air-india-a350-132332950.html')

In [None]:
# Look up one specific article in the raw (uncleaned) dataset by its link;
# note that this rebinds `a`
a = ds.filter(lambda x: x['link'] == 'https://www.bhaskar.com/business/news/business-events-today-share-market-petrol-diesel-gold-silver-air-india-a350-132332950.html')

In [None]:
# Print the content of the first row of the filtered result held in `a`
print(a['content'][0])

In [None]:
def synth_save_to_disk(base_path: str, generated_dataset: List):
    """
    Appends the rows of a list to `<base_path>/dataset.jsonl` and drains the list

    Every row is written as one JSON document per line, in list order. Rows
    that were written are removed from `generated_dataset`, so the caller's
    list is empty after a successful call. If a row cannot be serialized or
    written, the error propagates and that row plus all rows after it stay
    in the list, so nothing is lost and nothing is written twice on a retry.

    Args:
        base_path (str): Directory that holds dataset.jsonl; created if missing
        generated_dataset (List): Each element should be a valid Dict
    """

    print(f'Number of rows: {len(generated_dataset)}')

    # open(..., 'a') creates the file but not its parent directories
    if base_path:
        os.makedirs(base_path, exist_ok=True)

    written = 0
    try:
        with open(os.path.join(base_path, 'dataset.jsonl'), 'a', encoding='utf-8') as fp:
            for row in generated_dataset:
                # Serialize before counting the row as written, so a failing
                # row is never dropped from the caller's list.
                fp.write(json.dumps(row, ensure_ascii=False) + '\n')
                written += 1
    finally:
        # Flush exactly the rows that reached the file out of the list
        del generated_dataset[:written]

In [None]:
# Write the cleaned records to disk. synth_save_to_disk removes the rows from
# the list as it writes them, so the second count shows how many are left.
print(len(cleaned_dataset))
synth_save_to_disk(base_path='../data/retrieval/cleaned_dataset/', generated_dataset=cleaned_dataset)
print(len(cleaned_dataset))

In [None]:
# Read the exported JSONL file back into a list of dicts to verify it.
# The file is written with encoding='utf-8', so decode it as UTF-8 explicitly
# instead of relying on the platform default encoding.
with open('../data/retrieval/cleaned_dataset/dataset.jsonl', 'r', encoding='utf-8') as fp:
    # One JSON document per line; blank lines are skipped
    d = [json.loads(ln) for ln in fp if ln.strip()]

Verifying GPT-3.5 generated data

In [None]:
import json
from datasets import Dataset

In [None]:
# Load the generated retrieval questions (one JSON document per line).
# Decode explicitly as UTF-8 rather than with the platform default encoding;
# NOTE(review): assumes the file was written as UTF-8 — confirm with its producer.
with open('../data/synthetic_data/20231229-1514/retrieval_questions/dataset.jsonl', 'r', encoding='utf-8') as fp:
    # Blank lines are skipped
    data = [json.loads(ln) for ln in fp if ln.strip()]

In [None]:
# Inspect one loaded record (index 3)
data[3]