In [1]:
# !pip install GoogleNews
# !pip install gnewsclient

In [2]:
from tqdm import tqdm
from GoogleNews import GoogleNews

In [3]:
import pickle
import os

# Location of the pickled set of article links that were already processed.
title_store_path = ""

# Load the stored links when the file exists, otherwise start with an empty set.
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only point title_store_path at a file this notebook wrote itself.
if os.path.isfile(title_store_path):
    # context manager so the file handle is closed right after loading
    with open(title_store_path, "rb") as title_store_file:
        title_store = pickle.load(title_store_file)
else:
    title_store = set()

In [4]:
# Topic names queried one by one in get_google_news(); each fetched entry is
# tagged with the topic it was retrieved for.
topics = [
    "Top Stories",
    "World",
    "Nation",
    "Business",
    "Technology",
    "Entertainment",
    "Sports",
    "Science",
    "Health",
    "Politics",
]

In [5]:
def get_google_news(use_method="search"):
    """
    Collect Google News entries for every topic in the module-level ``topics`` list.

    use_method - "search": run a news search for the topic and additionally
                 request result pages 2 through 9;
                 any other value: call ``get_news`` for the topic instead
    returns    - list of result dicts, each tagged with a "category" key and
                 stripped of "img"/"desc", ordered newest first by "datetime".
                 Entries whose "datetime" is not a datetime object keep their
                 relative order and are placed at the end.
    """
    # local import: this function is defined before the notebook's own
    # `from datetime import datetime` cell
    from datetime import datetime as _datetime

    googlenews = GoogleNews(lang='en',
                            period="2d",
                            encode='utf-8')

    results = []

    for topic in tqdm(topics):
        if use_method == "search":
            # search news on "topic"
            googlenews.search(topic)

            # request the following result pages as well
            for page in range(2, 10):
                googlenews.get_page(page)

        else:
            # directly call the news API
            googlenews.get_news(topic)

        # get results
        result = googlenews.results()

        for entry in result:
            # keep only the part of the link in front of "&ved"
            entry["link"] = entry["link"].split("&ved")[0]
            entry["category"] = topic

            # pop() instead of del: an entry lacking one of these keys must
            # not abort the whole collection run with a KeyError
            entry.pop("img", None)
            entry.pop("desc", None)

        results.extend(result)
        # empty the client's accumulated results before the next topic
        googlenews.clear()

    # Entries without a usable datetime (missing key, None, NaN, ...) cannot be
    # compared with datetime objects, so sort only the dated ones and append
    # the rest afterwards instead of letting sorted() raise a TypeError.
    dated = [entry for entry in results
             if isinstance(entry.get("datetime"), _datetime)]
    undated = [entry for entry in results
               if not isinstance(entry.get("datetime"), _datetime)]
    dated.sort(key=lambda d: d["datetime"], reverse=True)

    return dated + undated

In [7]:
# fetch news with both retrieval methods: search results first, get_news results after
news = get_google_news(use_method="search") + get_google_news(use_method="get_news")

100%|██████████| 1/1 [00:16<00:00, 16.20s/it]
100%|██████████| 1/1 [00:02<00:00,  2.52s/it]


In [8]:
# number of entries currently held in `news`
len(news)

188

In [9]:
# inspect the first entry to check which keys a fetched item carries
news[0]

{'title': 'News: Today’s News Headlines, Breaking News India, World News and Cricket News',
 'media': 'Hindustan Times',
 'date': '0 hours ago',
 'datetime': datetime.datetime(2023, 9, 7, 22, 47, 0, 480906),
 'link': 'https://www.hindustantimes.com/infographic/understanding-the-fosbury-flop-101682536635133.html',
 'category': 'Top Stories'}

In [12]:
# # get newspapers directly
# import time
# import newspaper
# from tqdm import tqdm


# news_sites = [
#     "http://cnn.com",
#     "http://bbc.com",
#     "http://nytimes.com",
#     "http://washingtonpost.com",
#     "http://reuters.com",
#     "http://apnews.com",
#     "http://nbcnews.com",
#     "http://usatoday.com",
#     "http://theguardian.com",
#     "http://aljazeera.com",
#     "http://bloomberg.com",
#     "http://huffpost.com",
#     "http://abcnews.go.com",
#     "http://time.com",
#     "http://forbes.com",
#     "http://cnbc.com",
#     "http://wsj.com",
#     "http://npr.org",
#     "http://bostonglobe.com",
#     "http://latimes.com",
#     "http://chicagotribune.com",
#     "http://foxnews.com",
#     "http://usatoday.com",
#     "http://news.yahoo.com",
#     "http://cbc.ca",
#     "http://independent.co.uk",
#     "http://usatoday.com",
#     "http://thetimes.co.uk",
#     "http://mirror.co.uk",
#     "http://news.sky.com",
#     "http://dailynews.com",
# ]

# results = []

# for news_site in tqdm(news_sites):
#     paper = newspaper.build(news_site)

#     for article in paper.articles:
#         results.append({"link": article.url,
#                      "media": news_site})
        
#     time.sleep(5)

In [9]:
def deduplicate_list_of_dicts(input_list, keys_to_check):
    """
    Drop dictionaries that repeat an earlier one on the selected keys.

    input_list    - list of dictionaries
    keys_to_check - only these keys are compared; a key that is absent from a
                    dictionary is simply left out of that dictionary's comparison
    returns       - new list holding the first dictionary seen for every distinct
                    combination of compared values, in the original order
    """
    unique_items = []
    known_fingerprints = set()

    for item in input_list:
        # hashable summary of the compared (key, value) pairs
        fingerprint = tuple((key, item[key]) for key in keys_to_check if key in item)

        if fingerprint in known_fingerprints:
            continue

        known_fingerprints.add(fingerprint)
        unique_items.append(item)

    return unique_items

# deduplicate news: keep only the first entry seen for each link
news = deduplicate_list_of_dicts(news, keys_to_check=["link"])

In [10]:
# drop every entry whose link is already recorded in the title store
news = [entry for entry in news if entry["link"] not in title_store]

# add the links of the newly inserted news to the title store and save the title store
title_store = title_store.union(entry["link"] for entry in news)
# pickle.dump(title_store, open(title_store_path, "wb"))

In [11]:
# bring full news data
from newspaper import Article
import re
import nltk
from datetime import datetime


# tokenizer data; presumably required by Article.nlp() below — TODO confirm
nltk.download('punkt')
# Matches every single character that is NOT an ASCII letter, a digit, a space,
# a hyphen or one of the punctuation symbols listed in the class.
# Note that newlines and tabs are not listed, so they are matched as well.
pattern = re.compile(r'[^a-zA-Z0-9`~!@#$%^&*()_+={}\[\]:;"\'<>,.?/\\| -]')
# Stop phrases: curate_article() keeps only the text in front of the first one found.
phrases_to_remove = ["Sign In", "Want to read more?", "Already have an account?", "To continue reading"]


def remove_phrases(string, phrases):
    """
    Cut ``string`` off at the first occurrence of any of ``phrases``.

    string  - text to truncate
    phrases - iterable of literal phrases (not regular expressions)
    returns - the part of ``string`` in front of the earliest phrase match,
              or ``string`` unchanged when no phrase occurs or there are no
              non-empty phrases
    """
    # Empty phrases would make the alternation match the empty string, which
    # splits at position 0 and silently returns '' for every input.
    alternatives = [re.escape(phrase) for phrase in phrases if phrase]
    if not alternatives:
        return string

    # Only the text before the first match is needed, so stop after one split.
    head = re.split('|'.join(alternatives), string, maxsplit=1)[0]
    return head


def curate_article(article):
    """
    Clean raw article text into a single line of plain text.

    article - raw article text
    returns - text with whitespace runs collapsed to single spaces, characters
              outside the allowed set (module-level ``pattern``) removed,
              "Advertisement" markers removed, everything from the first stop
              phrase (``phrases_to_remove``) onwards cut off, and leading and
              trailing whitespace stripped
    """
    # Turn every whitespace run (newlines, tabs, ...) into one space FIRST.
    # The character filter below does not allow newlines or tabs, so running
    # it on the raw text would delete them and glue neighbouring words together.
    curated_article = re.sub(r'\s+', ' ', article)

    # Remove characters not on the QWERTY keyboard
    curated_article = pattern.sub('', curated_article)

    # Remove "Advertisement" markers
    curated_article = curated_article.replace('Advertisement', '')

    # Remove everything after the stop phrases
    curated_article = remove_phrases(curated_article, phrases_to_remove)

    # routine curation: the removals above can leave doubled spaces behind
    curated_article = re.sub(r'\s+', ' ', curated_article)
    curated_article = curated_article.strip()

    return curated_article


def get_full_news(news):
    """
    Download the article behind ``news["link"]`` and enrich ``news`` with it.

    news    - dict with at least a "link" key; it is updated in place
    returns - the same dict with "full_text" set and "image_url", "date",
              "datetime" and "title" filled in when they were missing, or
              None when the article cannot be downloaded/parsed, when
              ``article.is_media_news()`` is true, or when the cleaned text
              has fewer than 50 words
    """
    url = news["link"]

    try:
        article = Article(url)
        article.download()
        article.parse()
        article.nlp()

    # Exception instead of a bare except: Ctrl-C (KeyboardInterrupt) and
    # SystemExit must still be able to stop a long-running fetch loop.
    except Exception:
        return None

    full_text = curate_article(article.text)

    # failure criteria
    if article.is_media_news() or len(full_text.split()) < 50:
        return None

    news["full_text"] = full_text

    if "image_url" not in news:
        news["image_url"] = article.top_image

    # check for date: only parse the page's "pdate" when a field is actually missing
    if "pdate" in article.meta_data and ("date" not in news or "datetime" not in news):
        try:
            published = datetime.strptime(str(article.meta_data["pdate"]), "%Y%m%d")
        except ValueError:
            # unexpected pdate format: leave the date fields unset instead of
            # crashing the whole run on a single article
            published = None

        if published is not None:
            news.setdefault("date", published)
            news.setdefault("datetime", published)

    # check for title
    if "title" not in news:
        news["title"] = article.title

    return news

[nltk_data] Downloading package punkt to /home/qblocks/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# fetch the full article for every entry and keep only the successful ones
news = [
    enriched
    for enriched in (get_full_news(entry) for entry in tqdm(news))
    if enriched is not None
]

  0%|          | 77/1106500 [00:27<87:51:58,  3.50it/s] 

In [None]:
# number of entries left in `news` at this point
len(news)

In [None]:
# inspect the first remaining entry
news[0]

In [9]:
# placeholder for the inference step; nothing happens when no news is left
if news:
    # infer
    pass

899

In [16]:
# from gnewsclient import gnewsclient
 
# client = gnewsclient.NewsClient(language='english',
#                                 location='india',
#                                 topic='Science',
#                                 max_results=300)
 
# news_list = client.get_news()
# client.topics