In [12]:
#news stuff
import newspaper
from newsapi import NewsApiClient

#utilities
from datetime import date
import pandas as pd
import numpy as np

#cluster stuff
from sklearn.cluster import DBSCAN
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn.neighbors import NearestNeighbors

#nlp stuff
import spacy
nlp = spacy.load('en_core_web_lg')

#summarization
from rake_nltk import Rake
rake_nltk_var = Rake()

In [13]:
# Credentials must not live in the notebook: read the NewsAPI key from the
# NEWSAPI_KEY environment variable instead.
# NOTE(review): a key was previously committed in this cell; treat it as
# leaked and revoke it.
import os

api_key = os.environ.get("NEWSAPI_KEY")
if not api_key:
    raise RuntimeError(
        "Set the NEWSAPI_KEY environment variable before running this notebook."
    )
newsapi = NewsApiClient(api_key=api_key)

In [14]:
def get_trending_articles_today(num_trends, max_pages=4):
    """Fetch today's English-language articles for the current trending topics.

    Topics come from ``newspaper.hot()``. Each topic is used as the query of
    ``newsapi.get_everything`` with ``from_param`` set to today's date, and
    result pages are requested until one comes back without articles or
    ``max_pages`` pages have been read.

    Args:
        num_trends: Maximum number of trending topics to query.
        max_pages: Maximum number of result pages requested per topic
            (defaults to 4, the limit that used to be hard-coded).

    Returns:
        A DataFrame with one row per article and the columns
        ``date``, ``title``, ``url`` and ``topic``.
    """
    today = date.today().strftime("%Y-%m-%d")

    # Slicing already handles num_trends >= len(...); `or []` treats a
    # missing trend list as "no topics" instead of failing on it.
    trending_topics = (newspaper.hot() or [])[:num_trends]

    data = []
    for topic in trending_topics:
        for page in range(1, max_pages + 1):
            response = newsapi.get_everything(q=topic,
                                              language='en',
                                              from_param=today,
                                              page=page)

            # The response is a mapping indexed by "articles"; its own length
            # says nothing about how many articles it holds, so the stop
            # condition has to look at the article list itself.
            page_articles = response.get("articles") or []
            if not page_articles:
                break

            # add article info
            data.extend(
                (article['publishedAt'], article['title'], article['url'], topic)
                for article in page_articles
            )

    return pd.DataFrame(data, columns=["date", "title", "url", "topic"])

In [15]:
def preprocess_text(text):
    """Lower-case ``text`` and drop its stop words and punctuation.

    The text is run through the module-level spaCy pipeline ``nlp``; the
    lower-cased form of every remaining token is joined with single spaces.
    """
    kept = []
    for token in nlp(text):
        if token.is_stop or token.is_punct:
            continue
        kept.append(token.lower_)
    return " ".join(kept)

In [16]:
def get_best_eps_val(vectors, neighbors=2):
    """Estimate a DBSCAN ``eps`` from nearest-neighbour distances.

    A ``NearestNeighbors`` model is fitted on ``vectors`` and queried with the
    same vectors. Each column of the distance matrix is sorted independently,
    and the result is half of the smallest non-zero value in the second
    column.

    NOTE(review): the distances use ``NearestNeighbors``' default metric,
    while ``cluster_articles`` feeds the result to a DBSCAN configured with
    ``metric='cosine'`` - confirm the two are meant to differ.

    Args:
        vectors: 2-D array-like of sample vectors.
        neighbors: Number of neighbours to query; must be at least 2 because
            the second distance column is the one that is read.

    Returns:
        Half of the smallest non-zero second-column distance.

    Raises:
        ValueError: If ``neighbors`` is below 2, or if every second-column
            distance is zero so no eps can be derived.
    """
    if neighbors < 2:
        raise ValueError("neighbors must be at least 2 to read the second distance column")

    model = NearestNeighbors(n_neighbors=neighbors).fit(vectors)
    distances, _ = model.kneighbors(vectors)

    # return half of first non zero distance value
    second_col = np.sort(distances, axis=0)[:, 1]
    positive = second_col[second_col != 0]
    if positive.size == 0:
        # Previously surfaced as an opaque IndexError on the empty nonzero() result.
        raise ValueError("all nearest-neighbour distances are zero; cannot derive eps")

    return positive[0] / 2

In [32]:
def cluster_articles(df, min_articles=6):
    """Cluster article titles with DBSCAN over their spaCy document vectors.

    Each title in ``df.title`` is preprocessed, embedded with the module-level
    ``nlp`` pipeline and clustered using cosine distance. Titles that cannot
    be embedded are reported with ``print`` and skipped. Titles are used as
    dictionary keys, so a repeated title is kept once.

    Args:
        df: DataFrame exposing a ``title`` column.
        min_articles: ``min_samples`` passed to DBSCAN.

    Returns:
        A DataFrame with the columns ``label`` (DBSCAN label), ``title`` and
        ``vectors``; it is empty when no title could be embedded.
    """
    sent_vecs = {}
    for title in df.title:
        try:
            sent_vecs[title] = nlp(preprocess_text(title)).vector
        except Exception as e:
            # Best effort: report the failing title's error and keep going.
            print(e)

    titles = list(sent_vecs.keys())
    vectors = list(sent_vecs.values())

    # With nothing to cluster, the estimators below would fail on an empty
    # array; hand back an empty frame of the usual shape instead.
    if not vectors:
        return pd.DataFrame({'label': [], 'title': [], 'vectors': []})

    # create clusters out of news titles
    x = np.array(vectors)

    #finds best eps value for dbscan
    eps = get_best_eps_val(x)

    #clusters articles using dbscan
    dbscan = DBSCAN(eps=eps, min_samples=min_articles, metric='cosine').fit(x)
    return pd.DataFrame({'label': dbscan.labels_, 'title': titles, 'vectors': vectors})

In [18]:
def get_mean_vec(vectors):
    """Return the element-wise mean of a sequence of equally sized vectors.

    The vector length is taken from the input rather than assumed to be 300,
    so the function works with any embedding size.

    Args:
        vectors: Non-empty sequence of 1-D numeric arrays of the same length.

    Returns:
        A float64 array holding the mean of ``vectors`` along the first axis.

    Raises:
        ValueError: If ``vectors`` is empty.
    """
    if len(vectors) == 0:
        raise ValueError("get_mean_vec() needs at least one vector")

    # Accumulate in float64, as the previous zeros-initialised sum did.
    return np.asarray(vectors, dtype=np.float64).mean(axis=0)


In [19]:
def get_central_vec_title(cluster):
    """Return the title whose vector is nearest to the cluster's mean vector.

    ``cluster`` must expose ``vectors`` and ``title`` columns. The mean of the
    vectors is computed with ``get_mean_vec`` and the closest member is found
    with ``pairwise_distances_argmin_min`` using its default metric.
    """
    member_vecs = cluster.vectors.to_list()
    centroid = np.array([get_mean_vec(member_vecs)])

    # First element of the result is the array of argmin indices, one per query row.
    nearest_idx, _ = pairwise_distances_argmin_min(centroid, member_vecs)

    return cluster.title.iloc[nearest_idx[0]]

In [20]:
def get_categorized_news(clusters, article_df):
    """Pick one representative article per cluster.

    Args:
        clusters: DataFrame with ``label``, ``title`` and ``vectors`` columns.
            Rows labelled -1 are treated as unclustered and ignored.
        article_df: DataFrame with a ``title`` column; the rows matching each
            cluster's central title are returned.

    Returns:
        The matching rows of ``article_df`` with an extra ``num_articles``
        column holding the size of the cluster they represent. The frame is
        empty (same columns) when there is no cluster besides the
        unclustered one.
    """
    summarized_news = []
    for label in clusters.label.unique():
        #unclustered category
        if label == -1:
            continue

        # get best article from cluster
        members = clusters.loc[clusters.label == label]
        best_article = get_central_vec_title(members)

        #look up in original df
        representative = article_df.loc[article_df.title == best_article].copy()
        representative["num_articles"] = len(members)

        summarized_news.append(representative)

    if not summarized_news:
        # pd.concat refuses an empty list; return an empty frame with the
        # columns callers expect instead of raising.
        return article_df.iloc[0:0].assign(num_articles=pd.Series(dtype="int64"))

    return pd.concat(summarized_news)

In [21]:
def label_cluster(row):
    """Label a cluster with the longest RAKE key phrase of its title.

    Args:
        row: Object exposing a ``title`` attribute (a row of the summarized
            news frame when used with ``DataFrame.apply(..., axis=1)``).

    Returns:
        The longest phrase returned by the module-level ``rake_nltk_var`` for
        the title, or the title itself when no phrase is returned.
    """
    document = row.title

    # extract longest keyword
    rake_nltk_var.extract_keywords_from_text(document)
    keyword_extracted = rake_nltk_var.get_ranked_phrases()

    # max() raises ValueError on an empty list; fall back to the raw title
    # when the extractor yields no phrase at all.
    return max(keyword_extracted, key=len, default=document)

In [22]:
# Fetch today's articles for up to 100 trending topics and keep only the
# first occurrence of each title.
trending_news = get_trending_articles_today(100).drop_duplicates(subset=['title'])

In [30]:
# Cluster the trending headlines and keep, for each cluster, the article whose
# title vector lies closest to the cluster mean.
clusters = cluster_articles(trending_news)
summarized_news = get_categorized_news(clusters, trending_news).reset_index(drop=True)

# Label each cluster with the longest key phrase of its representative title.
summarized_news["cluster_title"] = summarized_news.apply(label_cluster, axis=1)

In [31]:
# Display the representative article of every cluster with its size and label.
summarized_news

Unnamed: 0,date,title,url,topic,num_articles,cluster_title
0,2021-08-10T13:29:47Z,Christina Applegate Reveals Multiple Sclerosis...,https://www.tmz.com/2021/08/10/christina-apple...,Christina Applegate,53,christina applegate reveals multiple sclerosis...
1,2021-08-10T16:12:00Z,New York Gov. Andrew Cuomo resigns over sexual...,https://www.wgrz.com/article/news/nation-world...,Kerry Kennedy,152,sexual harassment allegations
2,2021-08-10T13:13:44Z,Senate Set To Pass $1.2 Trillion Infrastructur...,https://www.forbes.com/sites/andrewsolender/20...,Infrastructure bill 2021,39,2 trillion infrastructure bill
3,2021-08-10T15:10:37Z,CDC investigating 2 deaths from rare bacterial...,https://thehill.com/policy/healthcare/567135-c...,Melioidosis,6,cdc investigating 2 deaths
4,2021-08-10T16:23:02Z,Dominion sues Newsmax and One America News ove...,https://www.washingtonpost.com/media/2021/08/1...,Mike Lindell,6,election fraud claims
5,2021-08-10T15:53:27Z,Volunteer High School in Tennessee on lockdown...,https://slashdot.org/firehose.pl?op=view&amp;i...,Volunteer High School,23,‘ emergency situation ’
6,2021-08-10T02:54:00Z,Jackass Star Bam Margera Sues Johnny Knoxville...,https://comicbook.com/movies/news/bam-margera-...,Bam Margera,13,jackass star bam margera sues johnny knoxville
7,2021-08-10T02:15:00Z,Katie Thurston Gets Engaged to Blake Moynes on...,https://people.com/tv/katie-thurston-gets-enga...,Bachelorette finale,34,katie thurston gets engaged
8,2021-08-10T12:06:57Z,Tropical Storm Fred likely to form Tuesday; Fl...,https://www.usatoday.com/story/news/nation/202...,National Hurricane Center,13,tropical storm fred likely
9,2021-08-10T16:47:36Z,Legendary hip-hop and R&B producer Chucky Thom...,https://djmag.com/news/legendary-hip-hop-and-r...,Chucky Thompson,6,b producer chucky thompson dies


In [28]:
# Full URL of the article at position 4 of the summary (the table view truncates it).
summarized_news.iloc[4].url

'https://www.washingtonpost.com/media/2021/08/10/dominion-sues-oan-newsmax/'

In [25]:
# Number of title-de-duplicated articles fetched per trending topic.
trending_news.topic.value_counts()

Epic                         78
Infrastructure bill 2021     78
Bachelorette finale          74
Chris Cuomo                  72
Prince Andrew                71
Mike Lindell                 69
Cuomo                        64
Christina Applegate          57
Volunteer High School        41
Kerry Kennedy                41
National Hurricane Center    29
Saweetie                     25
Nine Perfect Strangers       19
Reservation Dogs             18
Bam Margera                  18
Melioidosis                  14
Chucky Thompson              14
Aubrey Huff                   8
Bob Jenkins                   4
Kelly Clarkson                2
Name: topic, dtype: int64