# **INSHORTS** (https://inshorts.me/)

In [1]:
from tqdm import tqdm
import requests


def get_data(url):
    response = requests.get(url)

    if response.status_code == 200:
        data = response.json()

    else:
        print("Failed to fetch data. Status code:", response.status_code)
        data = dict()
    
    return data


def get_data_with_count_trial(url):
    counts = [100, 500] + [(i+1) * 1000 for i in range(10)]    
    last_result = None
    
    for idx, count in tqdm(enumerate(counts)):
        transit_url = url.format(count=count)
        response = requests.get(transit_url)

        if response.status_code == 200:
            last_result = response.json()
        
        else:
            break
        
    return last_result["data"]["articles"]

print_ = lambda x: print(x, "news")



In [2]:
all_news_url = "https://inshorts.me/news/all?offset=0&limit={count}"
top_news_url = "https://inshorts.me/news/top?offset=0&limit={count}"
trending_news_url = "https://inshorts.me/news/trending?offset=0&limit={count}"

In [3]:
from datetime import datetime

def timestamp_to_date(timestamp_ms):
    timestamp_seconds = timestamp_ms / 1000  # Convert milliseconds to seconds

    # Convert timestamp to datetime object
    date_object = datetime.fromtimestamp(timestamp_seconds)

    # Format the datetime object as mm-dd-yy
    formatted_date = date_object.strftime('%m-%d-%y')

    return formatted_date


dates = set()
for url in [all_news_url, top_news_url, trending_news_url]:
    timestamp = get_data(url.format(count=1))["data"]["articles"][0]["createdAt"]
    date = timestamp_to_date(timestamp)
    dates.add(date)

    
print(f"{len(dates)} unique dates")
date = dates.pop()
date

2 unique dates


'09-10-23'

# all news

In [4]:
all_news = get_data_with_count_trial(all_news_url)
print_(len(all_news))

5it [00:25,  5.09s/it]

2985 news





# top news

In [5]:
top_news = get_data_with_count_trial(top_news_url)
print_(len(top_news))

6it [00:31,  5.33s/it]

3954 news





# trending news

In [6]:
trending_news = get_data_with_count_trial(trending_news_url)
print_(len(trending_news))

12it [00:21,  1.75s/it]

193 news





# news by topic

In [7]:
# the below 2 are to be used together
get_all_topics = get_data("https://inshorts.me/news/topics")
topic_news_api = lambda topic: get_data(f"https://inshorts.me/news/topics/{topic}")["data"]["articles"]

# get topic names
all_topics = [topic["topic"] for topic in get_all_topics["data"]["topics"]]

# topic news
topic_news = []
for topic in tqdm(all_topics):
    topic_news.extend(topic_news_api(topic))
    
print_(len(topic_news))

100%|██████████| 18/18 [00:19<00:00,  1.09s/it]

180 news





# news by query

In [8]:
search_news_api = lambda query: get_data_with_count_trial("https://inshorts.me/news/search?query={query}&offset=0&limit={{count}}".format(query=query))

# topic news
search_news = []

for topic in tqdm(all_topics):
    search_news.extend(search_news_api(query=topic))
    
print_(len(search_news))

  0%|          | 0/18 [00:00<?, ?it/s]
0it [00:00, ?it/s][A
1it [00:02,  2.14s/it][A
2it [00:18,  9.22s/it][A
  6%|▌         | 1/18 [00:18<05:13, 18.45s/it]
0it [00:00, ?it/s][A
1it [00:01,  1.29s/it][A
2it [00:05,  3.29s/it][A
3it [00:25,  8.46s/it][A
 11%|█         | 2/18 [00:43<06:00, 22.54s/it]
0it [00:00, ?it/s][A
1it [00:02,  2.83s/it][A
2it [00:21, 10.66s/it][A
 17%|█▋        | 3/18 [01:05<05:29, 21.99s/it]
0it [00:00, ?it/s][A
1it [00:02,  2.21s/it][A
2it [00:08,  4.78s/it][A
3it [00:30, 10.23s/it][A
 22%|██▏       | 4/18 [01:35<05:56, 25.43s/it]
0it [00:00, ?it/s][A
1it [00:02,  2.94s/it][A
2it [00:10,  5.85s/it][A
3it [00:32, 10.82s/it][A
 28%|██▊       | 5/18 [02:08<06:03, 27.97s/it]
0it [00:00, ?it/s][A
1it [00:03,  3.02s/it][A
2it [00:11,  6.09s/it][A
3it [00:32, 10.85s/it][A
 33%|███▎      | 6/18 [02:40<05:54, 29.52s/it]
0it [00:00, ?it/s][A
1it [00:02,  2.72s/it][A
2it [00:19,  9.94s/it][A
 39%|███▉      | 7/18 [03:00<04:50, 26.37s/it]
0it [00:00

8871 news





In [9]:
clubbed_overall_news = []


for news in all_news + top_news + trending_news + topic_news + search_news:
    clubbed_overall_news.append(
        {"title": news["title"].strip(),
         "summary": news["content"].strip(),
         "link": news["sourceUrl"],
         "image_link": news["imageUrl"],
         "source": "inshorts"}
    )
    
print_(len(clubbed_overall_news))

16183 news


In [10]:
clubbed_overall_news[0]

{'title': "How is India-Middle East-Europe Corridor different from China's Belt & Road Initiative?",
 'summary': 'The India-Middle East-Europe Economic Corridor will be substantially different from China\'s Belt and Road Initiative, Railways Minister Ashwini Vaishnaw said. Unlike the BRI, where a huge debt burden gets imposed on host nations, the G20 project will bring in revenue and be bankable, Vaishnaw added. "The BRI came with...conditions...[Now] countries can decide on basis of its needs," he added.',
 'link': 'https://www.news18.com/amp/videos/india/railway-minister-on-how-india-middle-east-eu-corridor-differs-from-china-s-bri-g20-summit-news18-8572165.html?utm_campaign=fullarticle&utm_medium=referral&utm_source=inshorts',
 'image_link': 'https://static.inshorts.com/inshorts/images/v1/variants/jpg/m/2023/09_sep/10_sun/img_1694367018282_376.jpg?',
 'source': 'inshorts'}

# Deduplicate News

In [11]:
print(len({n["link"] for n in clubbed_overall_news}), " unique links")
print(len(clubbed_overall_news), " total links")

11024  unique links
16183  total links


In [12]:
import sys

sys.path.append('../../../')
from saar.utils import deduplicate_list_of_dicts, get_full_news

keys_to_check = ['title', 'summary', 'link']
clubbed_overall_news = deduplicate_list_of_dicts(clubbed_overall_news, keys_to_check)

print_(len(clubbed_overall_news))

[nltk_data] Downloading package punkt to /home/qblocks/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


11135 news


# **RSS FEEDS**

In [13]:
# import rss_feeds
# import feedparser

In [14]:
# feed_urls = list(set(rss_feeds.rss_feeds))
# feed_news = []

# for feed_url in tqdm(feed_urls):
#     feed = feedparser.parse(feed_url)
    
#     # Iterate through the entries in the feed
#     for entry in feed.entries:
#         try:
#             feed_news.append(
#                 {"title": entry.title,
#                  "summary": entry.summary,
#                  "link": entry.link,
#                  "source": "rss feed"}
#             )
#         except:
#             continue
            
# print_(len(feed_news))

100%|██████████| 264/264 [02:52<00:00,  1.53it/s]

3851 news





In [15]:
# feed_news[0]

{'title': 'Families of babies murdered by nurse Lucy Letby vow to continue their search for answers',
 'summary': 'The families of babies murdered by Lucy Letby have vowed to continue their search for answers as questions swirled around what more could have been done to stop her killing spree.',
 'link': 'https://news.sky.com/story/families-of-babies-murdered-by-lucy-letby-vow-to-continue-their-search-for-answers-12942744',
 'source': 'rss feed'}

# **AGGREGATE**

In [13]:
news = clubbed_overall_news # + feed_news

In [None]:
# FILTER OUT NEWS THAT WE ALREADY HAVE 
from os.path import join
import json

folder_path = "../../../data/training/"
path = join(folder_path, "inshorts.json")

with open(path, 'r') as json_file:
    existing_data = json.load(json_file)
                
# Create a set of tuples containing (key1, key2) from data for faster look-up
# Filter news based on the presence of (link, summary) in data
set_data_keys = {(d['link'], d['summary']) for d in existing_data}
news = [d for d in news if (d['link'], d['summary']) not in set_data_keys]

# Full Text from URL

In [23]:
"""
get full news
"""
news = [get_full_news(new) for new in tqdm(news)]
news = [new for new in news if new is not None]

In [None]:
# save checkpoint
import os
import json

path = join(folder_path, f"{date}.json")

with open(path, 'w') as json_file:
    json.dump(news, json_file)

In [2]:
import os
import sys

os.environ["CUDA_VISIBLE_DEVICES"] = "1"

sys.path.append('../../../')
from saar.infer import Infer

2023-09-11 07:43:44.390496: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-09-11 07:43:45.931315: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-09-11 07:43:45.931434: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory



Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 8.6
CUDA SETUP: Detected CUDA version 113
CUDA SETUP: Loading binary /home/qblocks/.local/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cuda113.so...


  warn(msg)
[nltk_data] Downloading package punkt to /home/qblocks/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
len(news)

7849

In [4]:
import logging
from tqdm import tqdm
from dotenv import load_dotenv


load_dotenv()


"""
run inference
"""
if len(news) > 0:
    # summary adapter, title adapter path
    summary_adapter_path = os.environ["SUMMARY_ADAPTER_PATH"]
    title_adapter_path = os.environ["TITLE_ADAPTER_PATH"]

    logging.info("loading models..")
    infer = Infer(
        summary_adapter_path=summary_adapter_path, title_adapter_path=title_adapter_path
    )

    # batching
    batch = lambda data, batch_size: [data[i:i + batch_size] for i in range(0, len(data), batch_size)]
    
    batch_size = 60 # int(os.environ["BATCH_SIZE"])
    news = batch(news, batch_size=batch_size)
    
    data = []
    
    logging.info("running inference..")
    for news_batch in tqdm(news):
        # generate summary
        news_batch = infer(mode="summary", data=news_batch)

        # generate title
        # NOTE: Title generation needs summary already generated as "generated_summary" key in the news dict
        news_batch = infer(mode="title", data=news_batch)
        data.extend(news_batch)

In [5]:
len(data), len(news)

(7849, 131)

# FOR SOME REASON, THIS CODE DOESN'T PRODUCE DIFFERENT TEXT EVERYTIME IT RUNS

In [None]:
# sample multiple outputs for one news article from the model
from transformers import GenerationConfig


infer.summary_generation_config = GenerationConfig(
            max_new_tokens=200,
            num_beams=8,
            do_sample=True,
            temperature=2.,
            top_k=30,
            top_p=0.8,
        )

infer.title_generation_config = GenerationConfig(
            max_new_tokens=25,
            num_beams=8,
            do_sample=True,
            temperature=2.,
            top_k=30,
            top_p=0.8,
        )

num_of_model_output_samples = 5

for news_batch in tqdm(news):
    for _ in range(num_of_model_output_samples):
        # generate summary
        news_batch = infer(mode="summary", data=news_batch)

        # generate title
        # NOTE: Title generation needs summary already generated as "generated_summary" key in the news dict
        news_batch = infer(mode="title", data=news_batch)
        data.extend(news_batch.copy())

  1%|          | 1/131 [02:51<6:12:17, 171.83s/it]

In [None]:
# deduplicate
data = deduplicate_list_of_dicts(data, ["link", "generated_summary", "generated_title"])

In [None]:
existing_data.extend(data)

In [None]:
with open(path, 'w') as json_file:
    json.dump(existing_data, json_file)

In [None]:
len(data)