In [2]:
from gnews import GNews
from newspaper import Article, ArticleException
from bs4 import BeautifulSoup
import nltk
import requests
import time
import urllib3
import random, math
from datetime import datetime, timedelta


#hl: language
#gl: country
#ceid: country: language
#(en, jp, fr, de)
lang_codes = ['en', 'jp', 'fr', 'de', 'it', 'es']
def get_news_meta_data(keyword, start_date, end_date, language):
    #yyyy-mm-dd'
    
    lang_string = '&ceid=' + language + ':' + language + '&hl=' + language + '-' + language + '&gl=' + language
    query = 'https://news.google.com/rss/search?q=' + keyword + '+after:' + start_date + '+before:' + end_date + lang_string
    #'&ceid=US:en&hl=en-US&gl=US'
    http = urllib3.PoolManager()
    response = http.request("GET", query)
    soupy = BeautifulSoup(response.data, 'html.parser')
    
    items = soupy.contents[1].find_all('item')
    
    if len(items) == 100:
        #parse start_date and end_date to get less articles.
        d = int(days_between(start_date, end_date)/2)
        mid_date = datetime.strptime(start_date, "%Y-%m-%d") + timedelta(days=d)
        mid_date = mid_date.strftime("%Y-%m-%d")
        
        if mid_date == start_date:
            return items
        
        items1 = get_news_meta_data(keyword, start_date, mid_date, language)
        items2 = get_news_meta_data(keyword, mid_date, end_date, language)
        
        items = items1 + items2
        
    print('Number of articles retrieved: ' + str(len(items)))
    return items

def days_between(d1, d2):
    d1 = datetime.strptime(d1, "%Y-%m-%d")
    d2 = datetime.strptime(d2, "%Y-%m-%d")
    return abs((d2 - d1).days)


#returns a list of dicts
'''
{'title' : the page title,
 'pubdate' : the date the article was published,
 'link' : the link to the article online}
 '''
def news_items_to_dict(items):
    article_meta_data = []
    
    for item in items:
        title = item.find('title')
        pubdate = item.find('pubdate')

        stringed = str(item)
        start = stringed.find('href')
        stringed = stringed[start+6:]
        link = stringed[:stringed.find('"')]
        
        article_meta_data.append({'title' : title,
                                 'pubdate' : pubdate,
                                 'link' : link})
    return article_meta_data

def get_article_text(meta_data):
    a = Article(meta_data['link'])
    a.download()
    a.parse()
    return(a.text)

def build_corpus(article_meta_data):
    articles = []
    for article in article_meta_data:
        try:
            articles.append(get_article_text(article))
        except ArticleException:
            print(ArticleException)
    return articles

def get_news_articles_text(keyword, start_date, end_date, language):
    meta_data = get_news_meta_data(keyword, start_date, end_date, language)
    articles_text = build_corpus(meta_data)
    return articles_text

In [22]:
# use CountVectorizer to turn the docs into vectors
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA
from nltk.corpus import stopwords
import pandas as pd


# turn the documents into vectors
def create_count_vectorizer(documents, stopWords, num_features):
    count_vectorizer = CountVectorizer(stop_words=set(stopWords), tokenizer=tokenize, max_features=num_features)
    data = count_vectorizer.fit_transform(documents)
    return (count_vectorizer, data)

# create the LDA model (note that usually num_topics is unknown)
def create_and_fit_lda(data, num_topics):
    lda = LDA(n_components=num_topics, n_jobs=-1)
    lda.fit(data)
    return lda

# identify & print the most common topic words
def get_most_common_words_for_topics(model, vectorizer, n_top_words):
    words = vectorizer.get_feature_names()
    word_dict = {}
    for topic_index, topic in enumerate(model.components_):
        this_topic_words = [words[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
        word_dict[topic_index] = this_topic_words
    return word_dict

def print_topic_words(word_dict):
    for key in word_dict.keys():
        print(f"Topic {key}")
        print("\t", word_dict[key])
        
def tokenize(s):
    tokens = nltk.word_tokenize(s)
    return tokens

def tokenize_articles(articles):
    t = []
    for a in articles:
        t += tokenize(a)
    return t

#displays the lda.  also returns the words of the topics.
def create_display_lda(articles, language, num_features, num_topics, num_words):
    stopWords = set(stopwords.words(language))
    stopWords.update({',', '.', '’', '“', '”', ')', '(', '—', '``', '?', ':', ';', "''", '/', '–', '‘',
                     '$', '%', '[', ']', "'s", '!', "'", '-',})
    
    df = pd.DataFrame(articles)

    (vectorizer, data) = create_count_vectorizer(df[0], stopWords, num_features)
    lda = create_and_fit_lda(data, num_topics)
    topic_words = get_most_common_words_for_topics(lda, vectorizer, num_words)
    print_topic_words(topic_words)
    
    words = []
    for key in topic_words.keys():
        words += topic_words[key]
        
    return list(set(words))

In [41]:
articles_en = get_news_meta_data('martin heidegger', '2002-07-13', '2010-07-15', 'en')
articles_en = news_items_to_dict(articles_en)
print(len(articles_en))
articles_en = build_corpus(articles_en)
#common_words = create_display_lda(articles_en, 'english', 3000, 1, 10)

08/19/2022 04:17:55 AM - Redirecting https://news.google.com/rss/search?q=martin heidegger+after:2002-07-13+before:2010-07-15&ceid=en:en&hl=en-en&gl=en -> https://news.google.com/rss/search?q=martin+heidegger+after:2002-07-13+before:2010-07-15&ceid=US:en&hl=en-US&gl=US


Number of articles retrieved: 9
9


In [5]:
print(common_words)

#an object containing the related terms to this concept.
obj = requests.get('http://api.conceptnet.io/related/c/en/' + common_words[8] + '?filter=/c/en').json()

['one', 'heidegger', 'human', 'philosophy', 'world', 'would', 'death', 'martin', 'hannah', 'arendt']


In [6]:
obj['related'][1]['@id']

'/c/en/sadie'

In [32]:
from nltk.collocations import *

bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(tokenize_articles(articles_en))
finder.apply_freq_filter(7)

In [33]:
finder.nbest(bigram_measures.pmi, 20)

[('SUBSCRIBE', 'NOW'),
 ('please', 'SUBSCRIBE'),
 ('global', 'warming'),
 ('free', 'per'),
 ('complete', 'access'),
 ('Bill', 'Griffith'),
 ('Your', 'complimentary'),
 ('per', 'month'),
 ('complimentary', 'articles'),
 ('four', 'complimentary'),
 ('Freiburg', 'University'),
 ('articles', 'free'),
 ('ve', 'read'),
 ('your', 'four'),
 ('four', 'articles'),
 ('read', 'four'),
 ('Advertisement', 'Advertisement'),
 ('human', 'beings'),
 ('natural', 'world'),
 ('analytic', 'philosophers')]

In [36]:
articles_en[3]

'Will we ever be able to think of Hannah Arendt in the same way again? Two new and damning critiques, one of Arendt and one of her longtime Nazi-sycophant lover, the philosopher Martin Heidegger, were published within 10 days of each other last month. The pieces cast further doubt on the overinflated, underexamined reputations of both figures and shed new light on their intellectually toxic relationship.\n\nMy hope is that these revelations will encourage a further discrediting of the most overused, misused, abused pseudo-intellectual phrase in our language: the banality of evil. The banality of the banality of evil, the fatuousness of it, has long been fathomless, but perhaps now it will be consigned to the realm of the deceitful and disingenuous as well.\n\nAdvertisement\n\nThe first of the two new reports—and the one most overlooked here in America, perhaps because it’s not online—appeared in the sober pages of London’s Times Literary Supplement on Oct. 9. It was titled “Blame the V