In [2]:
#news stuff
import newspaper
from newsapi import NewsApiClient

#utilities
import os
from datetime import date
import pandas as pd
import numpy as np

#cluster stuff
from sklearn.cluster import DBSCAN
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn.neighbors import NearestNeighbors

#nlp stuff
import spacy
# load the spaCy pipeline once at module level; preprocess_text and
# cluster_articles both call this shared `nlp` object
nlp = spacy.load('en_core_web_lg')

#summarization
from rake_nltk import Rake
# NOTE(review): rake_nltk_var is not referenced by any of the cells visible
# here -- confirm whether it is still needed
rake_nltk_var = Rake()

In [3]:
# Read the NewsAPI key from the environment instead of committing it to the
# notebook. Any key that was previously stored in this cell should be treated
# as compromised and rotated.
api_key = os.environ.get('NEWSAPI_KEY')
if not api_key:
    raise RuntimeError("Set the NEWSAPI_KEY environment variable to your NewsAPI key.")
newsapi = NewsApiClient(api_key=api_key)

In [4]:
def get_trending_articles_today(num_trends, max_pages=4):
    """Collect today's English-language articles for the top trending topics.

    Topics come from ``newspaper.hot()``; for each topic the NewsAPI
    ``get_everything`` endpoint is queried page by page.

    Parameters
    ----------
    num_trends : int
        Maximum number of trending topics to query.
    max_pages : int, optional
        Maximum number of result pages requested per topic (default 4, the
        value that was previously hard-coded).

    Returns
    -------
    pandas.DataFrame
        One row per article with columns ``date`` (the article's
        ``publishedAt`` value), ``title``, ``url`` and ``topic``.
    """
    today = date.today().strftime("%Y-%m-%d")

    # Slicing already copes with num_trends >= len(topics), so no explicit
    # length check is needed; `or []` tolerates an empty/None topic list.
    trending_topics = (newspaper.hot() or [])[:num_trends]

    data = []
    for topic in trending_topics:
        for page in range(1, max_pages + 1):
            response = newsapi.get_everything(q=topic,
                                              language='en',
                                              from_param=today,
                                              page=page)

            # The response is a mapping that wraps the article list, so its
            # own length says nothing about whether the page had results.
            # Stop paging on the first page with no articles.
            page_articles = response.get("articles") or []
            if not page_articles:
                break

            # add article info
            data.extend(
                (article['publishedAt'], article['title'], article['url'], topic)
                for article in page_articles
            )

    return pd.DataFrame(data, columns=["date", "title", "url", "topic"])

In [5]:
def preprocess_text(text):
    """Return `text` lower-cased with stop words and punctuation removed.

    The text is run through the module-level spaCy pipeline ``nlp``; the
    surviving tokens are joined back together with single spaces.
    """
    kept = []
    for token in nlp(text):
        # drop tokens that carry no content for the title embedding
        if token.is_stop or token.is_punct:
            continue
        kept.append(token.lower_)
    return " ".join(kept)

In [6]:
def get_best_eps_val(vectors, neighbors=2):
    """Estimate a DBSCAN ``eps`` from nearest-neighbour distances.

    Fits ``NearestNeighbors`` on `vectors`, queries the same points, sorts the
    resulting distance matrix column-wise and takes column 1.

    Returns
    -------
    tuple
        ``(eps, distances)`` where ``distances`` is the sorted column-1
        distance array and ``eps`` is half of its first non-zero entry.
    """
    knn = NearestNeighbors(n_neighbors=neighbors).fit(vectors)
    neighbor_dists, _ = knn.kneighbors(vectors)

    # sort each column independently, then keep the second column
    nearest = np.sort(neighbor_dists, axis=0)[:, 1]

    # half of the first non-zero distance value
    first_positive = np.flatnonzero(nearest)[0]
    return nearest[first_positive] / 2, nearest

In [18]:
import hdbscan

ModuleNotFoundError: No module named 'hdbscan'

In [7]:
def get_best_min_sample_val(num_total_articles, factor=132):
    """Derive DBSCAN's ``min_samples`` from the number of articles.

    Parameters
    ----------
    num_total_articles : int
        Number of articles being clustered.
    factor : int, optional
        Divisor applied to the article count (default 132).

    Returns
    -------
    int
        ``num_total_articles / factor`` truncated to an integer, but never
        less than 1: a ``min_samples`` of 0 is not a meaningful neighbourhood
        size, and small corpora (fewer than `factor` articles) used to
        produce exactly that.
    """
    return max(1, int(num_total_articles / factor))


def cluster_articles(df, eps=0.25):
    """Embed article titles with spaCy and cluster them with DBSCAN.

    Parameters
    ----------
    df : pandas.DataFrame
        Must expose a ``title`` column; each title is preprocessed and turned
        into a document vector.
    eps : float, optional
        DBSCAN neighbourhood radius under the cosine metric. Defaults to
        0.25, the value that was previously hard-coded.

    Returns
    -------
    tuple
        ``(clusters, distances)`` where ``clusters`` is a DataFrame with
        columns ``label``, ``title`` and ``vectors`` and ``distances`` is the
        sorted nearest-neighbour distance array from ``get_best_eps_val``.
    """
    sent_vecs = {}
    # make each article title into a vector
    for title in df.title:
        try:
            sent_vecs[title] = nlp(preprocess_text(title)).vector
        except Exception as e:
            # best effort: report the failure and continue without this title
            print(e)

    titles = list(sent_vecs.keys())
    vectors = list(sent_vecs.values())
    x = np.array(vectors)

    # The nearest-neighbour estimate is only printed as a diagnostic: it is
    # computed with NearestNeighbors' default metric, whereas DBSCAN below
    # uses cosine distance, so it is not fed into the clustering.
    estimated_eps, distances = get_best_eps_val(x)
    print(estimated_eps)

    # guard against min_samples == 0 for small corpora
    min_articles = max(1, get_best_min_sample_val(len(df)))

    # clusters articles using dbscan
    dbscan = DBSCAN(eps=eps, min_samples=min_articles, metric='cosine').fit(x)

    return pd.DataFrame({'label': dbscan.labels_, 'title': titles, 'vectors': vectors}), distances

In [8]:
def get_mean_vec(vectors):
    """Return the element-wise mean of a sequence of equal-length vectors.

    The vector dimension is taken from the input rather than assumed to be
    300, so the function works for any embedding size.

    Parameters
    ----------
    vectors : sequence of array-like
        Non-empty collection of 1-D vectors that all share the same length.

    Returns
    -------
    numpy.ndarray
        float64 array holding the mean vector.

    Raises
    ------
    ValueError
        If `vectors` is empty (the mean is undefined; previously this
        produced an array of NaNs).
    """
    stacked = np.asarray(vectors, dtype=np.float64)
    if stacked.shape[0] == 0:
        raise ValueError("get_mean_vec() requires at least one vector")

    return stacked.sum(axis=0) / stacked.shape[0]


In [9]:
def get_central_vec_title(cluster):
    """Return the title whose vector lies closest to the cluster's mean vector.

    `cluster` is expected to provide ``vectors`` and ``title`` columns; the
    mean of the vectors is computed with ``get_mean_vec`` and the nearest
    member is located with ``pairwise_distances_argmin_min``.
    """
    member_vecs = cluster["vectors"].to_list()
    centroid = np.array([get_mean_vec(member_vecs)])

    # argmin has one entry per query row; there is exactly one query (the centroid)
    nearest_idx, _ = pairwise_distances_argmin_min(centroid, member_vecs)

    return cluster["title"].iloc[nearest_idx[0]]

In [10]:
def get_categorized_news(clusters, article_df):
    """Pick one representative article per cluster.

    Parameters
    ----------
    clusters : pandas.DataFrame
        Cluster assignments with ``label``, ``title`` and ``vectors`` columns.
        Rows labelled ``-1`` (the unclustered category) are ignored.
    article_df : pandas.DataFrame
        Original article table; must contain a ``title`` column.

    Returns
    -------
    pandas.DataFrame
        The rows of `article_df` whose title is the most central title of each
        cluster, with an extra ``num_articles`` column holding the cluster
        size. If no cluster exists (every label is ``-1`` or `clusters` is
        empty) an empty frame with those same columns is returned instead of
        letting ``pd.concat`` raise on an empty list.
    """
    summarized_news = []
    for cluster in clusters.label.unique():
        # unclustered category
        if cluster == -1:
            continue

        # get best article from cluster
        cluster_titles = clusters.loc[clusters.label == cluster]
        best_article = get_central_vec_title(cluster_titles)

        # look up in original df
        cluster_df = article_df.loc[article_df.title == best_article].copy()
        cluster_df["num_articles"] = len(cluster_titles)

        summarized_news.append(cluster_df)

    if not summarized_news:
        return pd.DataFrame(columns=[*article_df.columns, "num_articles"])

    return pd.concat(summarized_news)

In [11]:
# fetch today's articles for up to 4 trending topics, keeping one row per distinct title
trending_news = get_trending_articles_today(4).drop_duplicates(subset=['title'])

In [12]:
# cluster all trending headlines, then keep the most central article
# (title vector closest to the cluster mean) of each cluster
clusters, distances = cluster_articles(trending_news)
summarized_news = get_categorized_news(clusters, trending_news)

# renumber the rows 0..n-1; no text summarization is applied here
summarized_news = summarized_news.reset_index(drop=True)

0.3301095370732062


In [13]:
# one representative article per cluster; num_articles is the size of that cluster
summarized_news

Unnamed: 0,date,title,url,topic,num_articles
0,2021-08-11T07:19:28Z,MLB roundup: Rays top Red Sox to pad AL East lead,https://www.channelnewsasia.com/sport/mlb-roun...,Phillies,2
1,2021-08-11T04:39:14Z,"Seager, Muncy HR, Dodgers end Phillies' 8-game...",https://sports.yahoo.com/seager-muncy-hr-dodge...,Phillies,3
2,2021-08-11T10:20:13Z,Call from New England Patriots felt like MLB d...,https://www.espn.com/blog/new-england-patriots...,Phillies,3
3,2021-08-11T04:42:57Z,Orioles become first team in 19 years to allow...,https://www.cbssports.com/mlb/news/orioles-bec...,Phillies,1
4,2021-08-11T15:32:54Z,How smooth was Trea Turner's slide vs. Phillie...,https://news.yahoo.com/smooth-trea-turners-sli...,Phillies,6
5,2021-08-11T15:57:32Z,"MLB DFS: Top DraftKings, FanDuel daily Fantasy...",https://www.cbssports.com/mlb/news/mlb-dfs-top...,Phillies,3
6,2021-08-11T16:48:53Z,An MLB player had a mesmerizing slide and fans...,https://www.insider.com/video-trae-turner-slid...,Phillies,1
7,2021-08-11T01:26:12Z,Noah Syndergaard Likely To Work In Relief Upon...,https://www.mlbtraderumors.com/2021/08/noah-sy...,Phillies,1
8,2021-08-11T17:11:13Z,Braves activate Travis d'Arnaud as Atlanta con...,https://www.cbssports.com/mlb/news/braves-acti...,Phillies,1
9,2021-08-11T14:25:36Z,Trea Turner Did The Smoothest Slide You’ll Eve...,https://brobible.com/sports/article/trea-turne...,Phillies,2


In [14]:
# number of de-duplicated articles collected per trending topic
trending_news.topic.value_counts()

Phillies           37
Kylie Jenner       25
Kissing Booth 3    23
Name: topic, dtype: int64