# **INSHORTS** (https://inshorts.me/)

In [1]:
from tqdm import tqdm
import requests


def get_data(url):
    """Fetch ``url`` with a GET request and return the decoded JSON body.

    Any status code other than 200 is reported on stdout and an empty dict
    is returned in place of the payload.
    """
    reply = requests.get(url)

    # Guard clause: anything but 200 counts as a failed fetch.
    if reply.status_code != 200:
        print("Failed to fetch data. Status code:", reply.status_code)
        return dict()

    return reply.json()


def get_data_with_count_trial(url):
    """Fetch articles from a listing URL, probing for the largest page the API serves.

    ``url`` must contain a ``{count}`` placeholder. It is requested with
    increasing limits (100, 500, 1000, 2000, ... 10000). Probing stops at the
    first response whose status code is not 200, and the articles of the last
    successful response are returned.

    Returns:
        The ``["data"]["articles"]`` value of the last successful response, or
        an empty list when not even the smallest request succeeded.
    """
    counts = [100, 500] + [(i + 1) * 1000 for i in range(10)]
    last_result = None

    for count in tqdm(counts):
        response = requests.get(url.format(count=count))

        # The first rejected limit ends the probe; keep what we already have.
        if response.status_code != 200:
            break

        last_result = response.json()

    # No request succeeded: return an empty batch instead of failing on
    # ``None["data"]`` so callers can still use len()/extend() on the result.
    if last_result is None:
        return []

    return last_result["data"]["articles"]

def print_(x):
    """Print ``x`` followed by the word "news" (used to report article counts)."""
    print(x, "news")



In [2]:
# The three listing endpoints share one URL shape. `{count}` is left as a
# placeholder so each request can choose its own page size.
_LISTING_URL = "https://inshorts.me/news/{feed}?offset=0&limit={{count}}"

all_news_url = _LISTING_URL.format(feed="all")
top_news_url = _LISTING_URL.format(feed="top")
trending_news_url = _LISTING_URL.format(feed="trending")

In [3]:
from datetime import datetime

def timestamp_to_date(timestamp_ms):
    """Render a millisecond epoch timestamp as an ``mm-dd-yy`` string.

    The conversion uses ``datetime.fromtimestamp`` without a ``tz`` argument,
    so the calendar date is the one in the machine's local timezone.
    """
    moment = datetime.fromtimestamp(timestamp_ms / 1000)  # milliseconds -> seconds
    return moment.strftime('%m-%d-%y')


# Sample the first article of each feed and collect the distinct creation
# dates; the value kept in `date` later names the output file.
dates = set()
for url in (all_news_url, top_news_url, trending_news_url):
    first_article = get_data(url.format(count=1))["data"]["articles"][0]
    dates.add(timestamp_to_date(first_article["createdAt"]))

print(f"{len(dates)} unique dates")

# set.pop() returns an arbitrary element, so `date` is only well defined when
# all feeds agree on a single date.
date = dates.pop()
date

1 unique dates


'08-19-23'

# all news

In [4]:
# Fetch the "all" feed through the escalating-limit helper and report how many articles came back.
all_news = get_data_with_count_trial(all_news_url)
print_(len(all_news))

5it [00:12,  2.43s/it]

3000 news





# top news

In [5]:
# Fetch the "top" feed through the escalating-limit helper and report how many articles came back.
top_news = get_data_with_count_trial(top_news_url)
print_(len(top_news))

5it [00:37,  7.47s/it]

2981 news





# trending news

In [6]:
# Fetch the "trending" feed through the escalating-limit helper and report how many articles came back.
trending_news = get_data_with_count_trial(trending_news_url)
print_(len(trending_news))

12it [00:43,  3.66s/it]

194 news





# news by topic

In [7]:
# The topic listing and the per-topic fetcher below are meant to be used together.
get_all_topics = get_data("https://inshorts.me/news/topics")


def topic_news_api(topic):
    """Return the articles served by the per-topic endpoint for ``topic``."""
    payload = get_data(f"https://inshorts.me/news/topics/{topic}")
    return payload["data"]["articles"]


# Names of every topic reported by the listing endpoint.
all_topics = [entry["topic"] for entry in get_all_topics["data"]["topics"]]

# Gather the articles of each topic into one flat list.
topic_news = []
for topic in tqdm(all_topics):
    topic_news += topic_news_api(topic)

print_(len(topic_news))

100%|██████████| 20/20 [00:24<00:00,  1.25s/it]

200 news





# news by query

In [8]:
def search_news_api(query):
    """Search Inshorts for ``query`` and return the articles of the largest page served.

    The query is percent-encoded before it is placed in the URL. Without that,
    characters such as ``&`` or ``#`` would change the meaning of the query
    string, and a literal ``{`` or ``}`` would break the ``{count}`` formatting
    done later by ``get_data_with_count_trial``.
    """
    from urllib.parse import quote  # local import keeps this cell self-contained

    encoded_query = quote(query, safe="")
    return get_data_with_count_trial(
        "https://inshorts.me/news/search?query={query}&offset=0&limit={{count}}".format(query=encoded_query)
    )


# search news: every topic name is also run as a free-text query
search_news = []

for topic in tqdm(all_topics):
    search_news.extend(search_news_api(query=topic))

print_(len(search_news))

  0%|          | 0/20 [00:00<?, ?it/s]
0it [00:00, ?it/s][A
1it [00:01,  1.89s/it][A
2it [00:18,  9.25s/it][A
  5%|▌         | 1/20 [00:18<05:51, 18.51s/it]
0it [00:00, ?it/s][A
1it [00:03,  3.09s/it][A
2it [00:20, 10.44s/it][A
 10%|█         | 2/20 [00:39<05:58, 19.91s/it]
0it [00:00, ?it/s][A
1it [00:01,  1.92s/it][A
2it [00:07,  3.99s/it][A
3it [00:26,  8.76s/it][A
 15%|█▌        | 3/20 [01:05<06:27, 22.82s/it]
0it [00:00, ?it/s][A
1it [00:02,  2.16s/it][A
2it [00:08,  4.62s/it][A
3it [00:28,  9.54s/it][A
 20%|██        | 4/20 [01:34<06:41, 25.11s/it]
0it [00:00, ?it/s][A
1it [00:03,  3.56s/it][A
2it [00:23, 11.62s/it][A
 25%|██▌       | 5/20 [01:57<06:06, 24.44s/it]
0it [00:00, ?it/s][A
1it [00:02,  2.23s/it][A
2it [00:20, 10.26s/it][A
 30%|███       | 6/20 [02:18<05:23, 23.11s/it]
0it [00:00, ?it/s][A
1it [00:01,  1.49s/it][A
2it [00:18,  9.48s/it][A
 35%|███▌      | 7/20 [02:37<04:42, 21.76s/it]
0it [00:00, ?it/s][A
1it [00:01,  1.85s/it][A
2it [00:19,  9

7167 news





In [9]:
# Concatenate every Inshorts batch, then map each raw article onto the
# common record layout (title / summary / link / image_link / source).
inshorts_articles = all_news + top_news + trending_news + topic_news + search_news

clubbed_overall_news = [
    {
        "title": article["title"].strip(),
        "summary": article["content"].strip(),
        "link": article["sourceUrl"],
        "image_link": article["imageUrl"],
        "source": "inshorts",
    }
    for article in inshorts_articles
]

print_(len(clubbed_overall_news))

13542 news


In [10]:
# Peek at the first normalized record to sanity-check the field mapping.
clubbed_overall_news[0]

{'title': "ED seizes ₹36-cr asset of Goa miner's son in Pandora Papers case",
 'summary': "Enforcement Directorate said it has seized an immovable property worth ₹36.8 crore of Rohan Timblo, who is the son of Goa-based miner Radha Timblo. The seizure has been made as part of ED's investigation into the Pandora Papers leak, which allegedly revealed Rohan held undisclosed foreign exchange outside India. He allegedly contravened the provisions of FEMA for about ₹37 crore.",
 'link': 'https://www.bqprime.com/nation/pandora-papers-leak-ed-seizes-over-rs-36-crore-worth-asset-of-goa-miners-son?utm_campaign=fullarticle&utm_medium=referral&utm_source=inshorts',
 'image_link': 'https://static.inshorts.com/inshorts/images/v1/variants/jpg/m/2023/08_aug/19_sat/img_1692444427680_806.jpg?',
 'source': 'inshorts'}

# Deduplicate News

In [11]:
# Compare the number of distinct links with the number of records to see
# how much duplication the combined Inshorts batches contain.
distinct_links = set(record["link"] for record in clubbed_overall_news)
print(len(distinct_links), " unique links")
print(len(clubbed_overall_news), " total links")

8742  unique links
13542  total links


In [12]:
def deduplicate_list_of_dicts(input_list, keys_to_check):
    """Drop later dictionaries that repeat an earlier one on the selected keys.

    input_list    - list of dictionaries
    keys_to_check - only these keys take part in the comparison; a key that a
                    dictionary lacks is simply left out of its fingerprint

    Returns a new list holding the first dictionary seen for each distinct
    fingerprint, in the original order.
    """
    fingerprints = set()
    unique_items = []

    for item in input_list:
        # (key, value) pairs in keys_to_check order form a hashable fingerprint.
        fingerprint = tuple((key, item[key]) for key in keys_to_check if key in item)

        if fingerprint in fingerprints:
            continue

        fingerprints.add(fingerprint)
        unique_items.append(item)

    return unique_items

# Two records count as duplicates only when title, summary and link all match,
# which is stricter than de-duplicating on the link alone.
keys_to_check = ['title', 'summary', 'link']

# Rebinds clubbed_overall_news to the de-duplicated list.
clubbed_overall_news = deduplicate_list_of_dicts(clubbed_overall_news, keys_to_check)
print_(len(clubbed_overall_news))

8827 news


# **RSS FEEDS**

In [13]:
import rss_feeds
import feedparser

In [14]:
# De-duplicate the configured feed URLs before fetching. Going through a set
# means the fetch order is not the order listed in rss_feeds.
feed_urls = list(set(rss_feeds.rss_feeds))
feed_news = []

for feed_url in tqdm(feed_urls):
    feed = feedparser.parse(feed_url)

    # Keep only the entries that expose all three fields used downstream.
    for entry in feed.entries:
        try:
            item = {"title": entry.title,
                    "summary": entry.summary,
                    "link": entry.link,
                    "source": "rss feed"}
        except (AttributeError, KeyError):
            # A field the entry lacks surfaces as AttributeError (KeyError for
            # item access); skip just that entry. Anything else, including
            # KeyboardInterrupt, is no longer swallowed.
            continue

        feed_news.append(item)

print_(len(feed_news))

100%|██████████| 264/264 [02:52<00:00,  1.53it/s]

3851 news





In [15]:
# Peek at the first RSS record to sanity-check the field mapping.
feed_news[0]

{'title': 'Families of babies murdered by nurse Lucy Letby vow to continue their search for answers',
 'summary': 'The families of babies murdered by Lucy Letby have vowed to continue their search for answers as questions swirled around what more could have been done to stop her killing spree.',
 'link': 'https://news.sky.com/story/families-of-babies-murdered-by-lucy-letby-vow-to-continue-their-search-for-answers-12942744',
 'source': 'rss feed'}

# **AGGREGATE**

In [16]:
# Combined corpus of Inshorts and RSS records. The concatenation builds a new
# list but holds the same dict objects, so later in-place updates to NEWS
# entries are also visible through clubbed_overall_news and feed_news.
NEWS = clubbed_overall_news + feed_news

# Full Text from URL

In [17]:
from newspaper import Article
import re


# Marker phrases (matched case-sensitively): curate_article() keeps only the
# text that precedes the first occurrence of any of them.
phrases_to_remove = ["Sign In", "Want to read more?", "Already have an account?", "To continue reading"]

def remove_phrases(string, phrases):
    """Truncate ``string`` at the first occurrence of any of ``phrases``.

    Despite the name, nothing is removed from the middle of the text: the
    part before the earliest matching phrase is returned and the rest is
    discarded. Matching is literal and case-sensitive.

    Args:
        string: Text to truncate.
        phrases: Iterable of stop phrases. Empty phrases are ignored.

    Returns:
        The text preceding the first stop phrase, or ``string`` unchanged when
        no phrase occurs in it or there are no usable phrases.
    """
    # An empty alternative would match at position 0 and wipe out the whole
    # text, so empty phrases (and an empty phrase list) are skipped.
    escaped = [re.escape(phrase) for phrase in phrases if phrase]
    if not escaped:
        return string

    first_hit = re.search('|'.join(escaped), string)
    if first_hit is None:
        return string

    return string[:first_hit.start()]


def curate_article(article):
    """Clean raw article text into a single whitespace-normalized line.

    Steps, in order: delete every "Advertisement" marker, squeeze runs of
    three or more newlines down to two, cut the text at the first stop phrase
    from ``phrases_to_remove``, then collapse all whitespace runs to single
    spaces and trim both ends.
    """
    without_ads = re.sub(r'Advertisement', '', article)
    squeezed = re.sub(r'\n{3,}', '\n\n', without_ads)

    # Everything from the first stop phrase onward is dropped.
    visible_part = remove_phrases(squeezed, phrases_to_remove)

    return re.sub(r'\s+', ' ', visible_part).strip()


def extract_article_text(url):
    """Download and parse the page at ``url``, returning its curated body text."""
    page = Article(url)
    page.download()
    page.parse()

    raw_text = page.text
    return curate_article(raw_text)

In [18]:
# Attach the full article text to every record. A record whose page cannot be
# fetched or parsed gets an empty string and is counted as failed.
pbar = tqdm(NEWS)

failed = 0
total = 0

for news in pbar:
    try:
        full_text = extract_article_text(news["link"])

    except Exception:
        # Best effort per article. `Exception` (not a bare except) lets
        # KeyboardInterrupt stop this long-running loop.
        full_text = ""

    if full_text == "":
        failed += 1
    total += 1

    pbar.set_description(f"failed: {failed}, total: {total}, failed percentage: {round(100 * failed / total, 2)}")
    news["full_text"] = full_text

failed: 2025, total: 12678, percentage: 15.97: 100%|██████████| 12678/12678 [3:43:22<00:00,  1.06s/it]   


In [19]:
import json

import os

# One JSON file per run, named after the date sampled from the feeds.
path = f'./data/{date}.json'

# Create the target directory first so a missing ./data folder cannot discard
# the scraped results at the final step.
os.makedirs(os.path.dirname(path), exist_ok=True)

with open(path, 'w', encoding='utf-8') as json_file:
    json.dump(NEWS, json_file)