In [63]:
#news stuff
import newspaper
from newsapi import NewsApiClient

#utilities
from datetime import date
import pandas as pd
import numpy as np

#cluster stuff
from sklearn.cluster import DBSCAN
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn.neighbors import NearestNeighbors

#nlp stuff
import spacy
# shared spaCy pipeline: used below to tokenize titles and to read doc.vector as a title embedding
nlp = spacy.load('en_core_web_lg')

#summarization
from rake_nltk import Rake
# shared RAKE keyword extractor: label_cluster() uses it to pull key phrases out of a title
rake_nltk_var = Rake()

In [37]:
# The NewsAPI key is a credential: read it from the environment instead of
# storing it in the notebook. A key that was ever committed should be revoked.
import os

api_key = os.environ.get('NEWSAPI_KEY')
if not api_key:
    raise RuntimeError("Set the NEWSAPI_KEY environment variable to your NewsAPI key before running this notebook.")
newsapi = NewsApiClient(api_key=api_key)

In [38]:
def get_trending_articles_today(num_trends, max_pages=4):
    """Fetch today's English-language articles for the current trending topics.

    Parameters
    ----------
    num_trends : int
        Maximum number of topics taken from ``newspaper.hot()``.
    max_pages : int, optional
        Upper bound on the result pages requested per topic. Defaults to 4,
        the number of pages this function has always requested.

    Returns
    -------
    pandas.DataFrame
        One row per article with the columns ``date``, ``title``, ``url``
        and ``topic`` (the trending topic the article was found under).
    """
    today = date.today().strftime("%Y-%m-%d")

    # slicing already copes with num_trends being larger than the topic list
    trending_topics = newspaper.hot()[:num_trends]

    data = []
    for topic in trending_topics:
        for page in range(1, max_pages + 1):
            response = newsapi.get_everything(q=topic,
                                              language='en',
                                              from_param=today,
                                              page=page)

            # The response is a mapping, so its own length never reaches 0;
            # the emptiness check has to look at the "articles" list. Stopping
            # here avoids spending further requests on exhausted topics.
            page_articles = response.get("articles") or []
            if not page_articles:
                break

            # add article info
            data.extend(
                (article['publishedAt'], article['title'], article['url'], topic)
                for article in page_articles
            )

    return pd.DataFrame(data, columns=["date", "title", "url", "topic"])

In [39]:
def preprocess_text(text):
    """Normalize *text* for embedding.

    Runs the shared ``nlp`` pipeline over the text, drops every token flagged
    as a stop word or as punctuation, and returns the lower-cased remaining
    tokens joined by single spaces.
    """
    kept_tokens = []
    for token in nlp(text):
        if token.is_stop or token.is_punct:
            continue
        kept_tokens.append(token.lower_)
    return " ".join(kept_tokens)

In [138]:
def get_best_eps_val(vectors, neighbors=2, metric='minkowski'):
    """Derive a DBSCAN ``eps`` from nearest-neighbour distances.

    Every vector is queried against the set it was fitted on, so column 0 of
    the distance matrix is the distance to the closest fitted point (the
    point itself, distance 0) and column 1 is the distance to the next one.
    The result is half of the smallest non-zero value in column 1.

    Parameters
    ----------
    vectors : array-like of shape (n_samples, n_features)
        Points to analyse.
    neighbors : int, optional
        ``n_neighbors`` for the neighbour search; must be at least 2 because
        column 1 of the result is read.
    metric : str or callable, optional
        Distance metric passed to ``NearestNeighbors``. The default,
        ``'minkowski'``, is what scikit-learn uses when no metric is given,
        so existing callers see no change. Pass the metric the clustering
        step uses (e.g. ``'cosine'``) to get eps in that metric's units.

    Returns
    -------
    float
        Half of the smallest non-zero nearest-neighbour distance.

    Raises
    ------
    IndexError
        If every nearest-neighbour distance is zero (e.g. all vectors are
        identical), so no eps can be derived.
    """
    knn = NearestNeighbors(n_neighbors=neighbors, metric=metric).fit(vectors)
    distances, _ = knn.kneighbors(vectors)

    # only the nearest-other-point column is needed, so sort just that one
    nearest = np.sort(distances[:, 1])
    non_zero = np.flatnonzero(nearest)
    if non_zero.size == 0:
        raise IndexError("no non-zero nearest-neighbour distance found; cannot derive eps")

    # return half of first non zero distance value
    return nearest[non_zero[0]] / 2

In [139]:
def cluster_articles(df, min_articles=5):
    """Cluster article titles with DBSCAN over their spaCy document vectors.

    Parameters
    ----------
    df : pandas.DataFrame
        Must expose a ``title`` column. Repeated titles are embedded once.
    min_articles : int, optional
        ``min_samples`` for DBSCAN (default 5).

    Returns
    -------
    pandas.DataFrame
        Columns ``label`` (DBSCAN label), ``title`` and ``vectors`` (the
        embedding of the preprocessed title). The frame is empty when no
        title could be embedded.
    """
    sent_vecs = {}
    for title in df.title:
        try:
            sent_vecs[title] = nlp(preprocess_text(title)).vector
        except Exception as e:
            # best effort: report the problem and leave this title out
            print(e)

    # Nothing to cluster: return an empty result rather than passing an
    # empty array to the neighbour search / DBSCAN.
    if not sent_vecs:
        return pd.DataFrame({'label': [], 'title': [], 'vectors': []})

    titles = list(sent_vecs)
    vectors = list(sent_vecs.values())

    # create clusters out of news titles
    x = np.array(vectors)

    # NOTE(review): eps is estimated by get_best_eps_val() from its own
    # nearest-neighbour distances, whereas DBSCAN below measures cosine
    # distance - confirm the two are meant to differ.
    eps = get_best_eps_val(x)

    # clusters articles using dbscan
    dbscan = DBSCAN(eps=eps, min_samples=min_articles, metric='cosine').fit(x)

    return pd.DataFrame({'label': dbscan.labels_, 'title': titles, 'vectors': vectors})

In [140]:
def get_mean_vec(vectors):
    """Return the element-wise mean of a collection of equal-length vectors.

    The vector length is taken from the input instead of being fixed at 300,
    so embeddings of any dimensionality are accepted.

    Parameters
    ----------
    vectors : sequence of array-like
        Non-empty collection of 1-D vectors that all share one length.

    Returns
    -------
    numpy.ndarray
        float64 array holding the mean of each component.

    Raises
    ------
    ValueError
        If ``vectors`` is empty (a mean is undefined for zero vectors).
    """
    if len(vectors) == 0:
        raise ValueError("get_mean_vec() needs at least one vector")

    # accumulate in float64 whatever the dtype of the incoming vectors
    return np.asarray(vectors, dtype=float).mean(axis=0)


In [141]:
def get_central_vec_title(cluster):
    """Return the title whose vector is closest to the cluster's mean vector.

    ``cluster`` is a frame with ``vectors`` and ``title`` columns. The mean
    of its vectors is computed with ``get_mean_vec`` and the row nearest to
    that mean (per ``pairwise_distances_argmin_min``) supplies the title.
    """
    member_vecs = cluster.vectors.to_list()

    # the distance helper expects 2-D input, so wrap the centroid in a batch of one
    centroid = np.array([get_mean_vec(member_vecs)])
    nearest_positions, _ = pairwise_distances_argmin_min(centroid, member_vecs)

    return cluster.title.iloc[nearest_positions[0]]

In [142]:
def get_categorized_news(clusters, article_df):
    """Pick one representative article per cluster.

    Parameters
    ----------
    clusters : pandas.DataFrame
        Frame with ``label``, ``title`` and ``vectors`` columns; rows
        labelled ``-1`` (the unclustered category) are ignored.
    article_df : pandas.DataFrame
        Original articles; must expose a ``title`` column.

    Returns
    -------
    pandas.DataFrame
        The rows of ``article_df`` whose title is the most central title of
        a cluster, plus a ``num_articles`` column holding that cluster's
        size. Empty (same columns) when there is no cluster at all.
    """
    summarized_news = []
    for label in clusters.label.unique():
        # unclustered category
        if label == -1:
            continue

        # get best article from cluster
        members = clusters.loc[clusters.label == label]
        best_article = get_central_vec_title(members)

        # look up in original df
        representative = article_df.loc[article_df.title == best_article].copy()
        representative["num_articles"] = len(members)

        summarized_news.append(representative)

    if not summarized_news:
        # pd.concat() raises on an empty list; return an empty frame with the
        # columns callers expect instead.
        return article_df.iloc[:0].assign(num_articles=pd.Series(dtype='int64'))

    return pd.concat(summarized_news)

In [44]:
def label_cluster(row):
    """Return a short label for a cluster row.

    The row's ``title`` is passed to the shared RAKE extractor and the
    longest extracted phrase (by character count) becomes the label.

    Parameters
    ----------
    row : object with a ``title`` attribute
        Typically a DataFrame row passed by ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    str
        The longest extracted phrase, or the title itself when the extractor
        returns no phrases.
    """
    document = row.title

    # extract longest keyword
    rake_nltk_var.extract_keywords_from_text(document)
    keyword_extracted = rake_nltk_var.get_ranked_phrases()

    # max() raises ValueError on an empty sequence, so fall back to the title
    return max(keyword_extracted, key=len, default=document)

In [45]:
# Fetch articles for up to 100 trending topics and keep one row per distinct title.
# Each topic can cost several get_everything requests, so this call can run into
# the NewsAPI request quota.
trending_news = get_trending_articles_today(100).drop_duplicates(subset=['title'])

NewsAPIException: {'status': 'error', 'code': 'rateLimited', 'message': 'You have made too many requests recently. Developer accounts are limited to 100 requests over a 24 hour period (50 requests available every 12 hours). Please upgrade to a paid plan if you need more requests.'}

In [145]:
# group the trending articles into clusters and keep the most central article of each one
clusters = cluster_articles(trending_news)
summarized_news = get_categorized_news(clusters, trending_news).reset_index(drop=True)

# label every cluster with a key phrase taken from its representative title
summarized_news["cluster_title"] = summarized_news.apply(label_cluster, axis=1)

In [146]:
# show the representative article selected for each cluster
summarized_news

Unnamed: 0,date,title,url,topic,num_articles,cluster_title
0,2021-08-10T11:36:16Z,Christina Applegate reveals multiple sclerosis...,https://www.usatoday.com/story/entertainment/c...,Christina Applegate,52,christina applegate reveals multiple sclerosis...
1,2021-08-10T16:12:00Z,New York Gov. Andrew Cuomo resigns over sexual...,https://www.wgrz.com/article/news/nation-world...,Kerry Kennedy,67,sexual harassment allegations
2,2021-08-10T13:13:44Z,Senate Set To Pass $1.2 Trillion Infrastructur...,https://www.forbes.com/sites/andrewsolender/20...,Infrastructure bill 2021,35,2 trillion infrastructure bill
3,2021-08-10T15:10:37Z,CDC investigating 2 deaths from rare bacterial...,https://thehill.com/policy/healthcare/567135-c...,Melioidosis,6,cdc investigating 2 deaths


In [148]:
# count how many of the fetched articles belong to each trending topic
trending_news.topic.value_counts()

Infrastructure bill 2021    78
Chris Cuomo                 71
Christina Applegate         56
Kerry Kennedy               39
Melioidosis                 14
Kelly Clarkson               2
Name: topic, dtype: int64