In [269]:
#news stuff
import newspaper
from newsapi import NewsApiClient

from datetime import date
import pandas as pd
import numpy as np

#cluster stuff
from sklearn.cluster import DBSCAN
from sklearn.metrics import pairwise_distances_argmin_min

#nlp stuff
import spacy
# Load the spaCy English pipeline once at module level; preprocess_text and
# cluster_articles both call this shared `nlp` object.
nlp = spacy.load('en_core_web_lg')

In [26]:
# Read the NewsAPI key from the environment so the secret is never stored in
# the notebook. NOTE(review): any key that was previously committed in this
# cell should be treated as leaked and rotated.
import os

api_key = os.environ.get("NEWSAPI_KEY")
if not api_key:
    raise RuntimeError(
        "NEWSAPI_KEY is not set; export your NewsAPI key in the environment "
        "before running this notebook."
    )
newsapi = NewsApiClient(api_key=api_key)

In [368]:
def get_trending_articles_today(num_trends, max_pages=4):
    """Collect today's English-language articles for the top trending topics.

    Topics come from ``newspaper.hot()``. Each topic is queried through the
    module-level ``newsapi`` client one result page at a time, stopping as
    soon as a page comes back without articles.

    Parameters
    ----------
    num_trends : int
        Maximum number of trending topics to query.
    max_pages : int, optional
        Upper bound on the number of result pages requested per topic.
        Defaults to 4, the value that used to be hard-coded.

    Returns
    -------
    pandas.DataFrame
        One row per article with the columns ``date``, ``title``, ``url``
        and ``topic``.
    """
    today = date.today().strftime("%Y-%m-%d")

    # Slicing already copes with num_trends being larger than the list, so no
    # explicit length check is needed. `or []` guards against a falsy result.
    trending_topics = (newspaper.hot() or [])[:num_trends]

    rows = []
    for topic in trending_topics:
        for page in range(1, max_pages + 1):
            response = newsapi.get_everything(q=topic,
                                              language='en',
                                              from_param=today,
                                              page=page)

            # The response is a dict; the emptiness test must look at its
            # "articles" list. Measuring the dict itself never yields 0, so
            # the loop used to request every page unconditionally.
            page_articles = response.get("articles") or []
            if not page_articles:
                break

            rows.extend(
                (article['publishedAt'], article['title'], article['url'], topic)
                for article in page_articles
            )

    return pd.DataFrame(rows, columns=["date", "title", "url", "topic"])

In [288]:
def preprocess_text(text):
    """Lower-case `text` and strip out stop words and punctuation.

    The text is tokenized by the module-level spaCy pipeline `nlp`; the
    tokens that survive filtering are joined back with single spaces.
    """
    kept = []
    for token in nlp(text):
        # Skip anything spaCy flags as a stop word or as punctuation.
        if token.is_stop or token.is_punct:
            continue
        kept.append(token.lower_)
    return " ".join(kept)

In [287]:
def cluster_articles(df, eps):
    """Cluster article titles by the similarity of their spaCy vectors.

    Each title in ``df.title`` is cleaned with ``preprocess_text`` and run
    through ``nlp``; the resulting ``doc.vector`` is used as its embedding.
    The embeddings are grouped with DBSCAN (cosine metric, ``min_samples=2``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``title`` column.
    eps : float
        DBSCAN neighbourhood radius.

    Returns
    -------
    pandas.DataFrame
        Columns ``label`` (DBSCAN label), ``title`` and ``vectors``. Empty,
        with the same columns, when no title could be embedded.
    """
    # Keyed by title, so repeated titles collapse into a single entry.
    sent_vecs = {}
    for title in df.title:
        try:
            sent_vecs[title] = nlp(preprocess_text(title)).vector
        except Exception as e:
            # Best effort: a title that cannot be embedded is reported and
            # skipped rather than aborting the whole run.
            print("Skipping title {!r}: {}".format(title, e))

    titles = list(sent_vecs.keys())
    vectors = list(sent_vecs.values())

    # DBSCAN cannot be fitted on an empty array; return an empty result with
    # the usual columns instead of crashing.
    if not vectors:
        return pd.DataFrame(columns=['label', 'title', 'vectors'])

    # create clusters out of news titles
    x = np.array(vectors)
    dbscan = DBSCAN(eps=eps, min_samples=2, metric='cosine').fit(x)
    clusters = pd.DataFrame({'label': dbscan.labels_, 'title': titles, 'vectors': vectors})

    return clusters

In [348]:
def get_mean_vec(vectors):
    """Return the element-wise mean of a collection of equal-length vectors.

    Parameters
    ----------
    vectors : sequence of array-like
        Non-empty sequence of 1-D vectors that all share the same length.
        Any dimensionality is accepted (it is no longer fixed at 300).

    Returns
    -------
    numpy.ndarray
        float64 array with the same length as each input vector.

    Raises
    ------
    ValueError
        If ``vectors`` is empty, since the mean would be undefined.
    """
    if len(vectors) == 0:
        raise ValueError("get_mean_vec() requires at least one vector")

    # Averaging in float64 keeps the precision of the earlier float64
    # running-total implementation even when the inputs are float32.
    return np.asarray(vectors, dtype=np.float64).mean(axis=0)


In [355]:
def get_central_vec_title(cluster):
    """Return the title whose vector lies closest to the cluster's mean vector.

    `cluster` is a frame with `vectors` and `title` columns. The mean vector
    comes from `get_mean_vec`, and the nearest member is found with
    `pairwise_distances_argmin_min`.
    """
    member_vecs = cluster["vectors"].tolist()
    centroid = get_mean_vec(member_vecs)

    # A single query row (the centroid) is compared against all member vectors.
    # NOTE(review): no metric is passed here, so the library default applies,
    # while clustering elsewhere in this file uses cosine - confirm intended.
    nearest_idx, _nearest_dist = pairwise_distances_argmin_min(
        np.array([centroid]), member_vecs
    )
    position = nearest_idx[0]

    return cluster["title"].iloc[position]

In [362]:
def get_categorized_news(clusters, article_df):
    """Pick one representative article per cluster.

    For every label in ``clusters.label`` other than -1 (the unclustered
    category), the title chosen by ``get_central_vec_title`` is looked up in
    ``article_df`` and the matching rows are tagged with the cluster label.

    Parameters
    ----------
    clusters : pandas.DataFrame
        Frame with ``label``, ``title`` and ``vectors`` columns.
    article_df : pandas.DataFrame
        Original article frame; must contain a ``title`` column.

    Returns
    -------
    pandas.DataFrame
        Rows of ``article_df`` for the representative titles, with an extra
        ``cluster`` column. Empty, with those same columns, when there is no
        cluster other than -1.
    """
    summarized_news = []
    for cluster in clusters.label.unique():
        # unclustered category
        if cluster == -1:
            continue

        # get best article from cluster
        cluster_titles = clusters.loc[clusters.label == cluster]
        best_article = get_central_vec_title(cluster_titles)

        # look up in original df
        cluster_df = article_df.loc[article_df.title == best_article].copy()
        # label the cluster it's part of
        cluster_df["cluster"] = cluster

        summarized_news.append(cluster_df)

    # pd.concat raises on an empty list, which happens whenever every title
    # ended up unclustered; return an empty frame instead.
    if not summarized_news:
        return pd.DataFrame(columns=list(article_df.columns) + ["cluster"])

    return pd.concat(summarized_news)

In [283]:
# Fetch today's articles for up to six trending topics and keep only the
# first row for each distinct title.
trending_news = get_trending_articles_today(6).drop_duplicates(subset=['title'])

In [363]:
# Cluster the de-duplicated headlines (eps of 0.12), then reduce every cluster
# to its single most central article.
clusters = cluster_articles(df=trending_news, eps=0.12)
summarized_news = get_categorized_news(clusters=clusters, article_df=trending_news)
summarized_news