In [7]:
#news stuff
import newspaper
from newsapi import NewsApiClient

#utilities
import os
from datetime import date
import pandas as pd
import numpy as np

#cluster stuff
from sklearn.cluster import DBSCAN
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn.neighbors import NearestNeighbors

# NLP: load the spaCy pipeline once at module level; `preprocess_text`
# below calls this shared `nlp` object.
# NOTE(review): the 'en_core_web_lg' model has to be installed in the
# kernel's environment for this load to succeed.
import spacy
nlp = spacy.load('en_core_web_lg')

# Summarization: a RAKE keyword extractor built with its default settings.
# NOTE(review): `rake_nltk_var` is not referenced by the cells visible
# here; presumably it is used later in the notebook, so confirm.
from rake_nltk import Rake
rake_nltk_var = Rake()

In [8]:
# The NewsAPI key is read from the environment so the secret never lives in
# the notebook or its version history. Export NEWSAPI_KEY before starting
# the kernel.
# NOTE(review): the key that used to be hard-coded in this cell has been
# exposed and should be revoked/rotated.
api_key = os.environ.get("NEWSAPI_KEY")
if not api_key:
    raise RuntimeError(
        "NEWSAPI_KEY is not set; export your NewsAPI key in the environment "
        "before running this notebook."
    )
newsapi = NewsApiClient(api_key=api_key)

In [9]:
def get_trending_articles_today(num_trends, max_pages=4):
    """Collect today's NewsAPI articles for the current trending topics.

    Topics come from ``newspaper.hot()``; for each topic the module-level
    ``newsapi`` client is queried page by page until a page comes back
    without articles or ``max_pages`` pages have been requested.

    Parameters
    ----------
    num_trends : int
        Number of topics to keep from the front of the trending list.
    max_pages : int, optional
        Upper bound on result pages requested per topic. Defaults to 4,
        the value that was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        One row per article with columns ``date``, ``title``, ``url`` and
        ``topic`` (the trending term the article was found under).
    """
    today = date.today().strftime("%Y-%m-%d")

    # Slicing already copes with num_trends >= len(topics), so no length
    # check is needed. `or []` keeps a falsy result (e.g. None) from
    # crashing the slice -- NOTE(review): confirm whether hot() can
    # actually return None on failure.
    trending_topics = (newspaper.hot() or [])[:num_trends]

    rows = []
    for topic in trending_topics:
        for page in range(1, max_pages + 1):
            response = newsapi.get_everything(q=topic,
                                              language='en',
                                              from_param=today,
                                              page=page)

            # The response is a dict that always carries metadata keys, so
            # its own length is never 0; the "no more results" signal is
            # an empty "articles" list.
            page_articles = response.get("articles") or []
            if not page_articles:
                break

            rows.extend(
                (article['publishedAt'], article['title'], article['url'], topic)
                for article in page_articles
            )

    return pd.DataFrame(rows, columns=["date", "title", "url", "topic"])

In [10]:
def preprocess_text(text):
    """Return `text` lower-cased with stop words and punctuation removed.

    The text is run through the module-level `nlp` pipeline; every token
    that is neither a stop word nor punctuation contributes its lower-cased
    form, and the kept tokens are joined with single spaces.
    """
    kept = (
        token.lower_
        for token in nlp(text)
        if not token.is_stop and not token.is_punct
    )
    return " ".join(kept)

In [11]:
def get_best_eps_val(vectors, neighbors=2):
    """Derive an eps candidate from nearest-neighbour distances.

    A `NearestNeighbors` model with `neighbors` neighbours is fitted on
    `vectors` and queried with the same vectors. Each column of the
    resulting distance matrix is sorted independently in ascending order
    and column index 1 is kept.
    NOTE(review): column 1 is presumably the distance to the closest
    neighbour other than the point itself; confirm for `neighbors` > 2.

    Returns a tuple `(eps, sorted_distances)`, where `eps` is half of the
    smallest non-zero entry of that column and `sorted_distances` is the
    sorted column itself. Raises IndexError when the column has no
    non-zero entry.
    """
    knn_model = NearestNeighbors(n_neighbors=neighbors).fit(vectors)
    knn_distances, _ = knn_model.kneighbors(vectors)

    # Column-wise sort, then keep only column index 1.
    sorted_col = np.sort(knn_distances, axis=0)[:, 1]

    # Position of the first entry that is not exactly zero.
    first_positive = np.flatnonzero(sorted_col)[0]

    return sorted_col[first_positive] / 2, sorted_col

In [23]:
# Make the project's packaged modules importable; the path is relative to
# the kernel's working directory and must be added before the imports below.
import sys
sys.path.append("../src/zinfo")

# news scraping
# NOTE(review): this rebinds `get_trending_articles_today`, which is also
# defined in an earlier cell of this notebook; whichever of the two ran
# last is the one later calls use.
from news_scraper import get_trending_articles_today

# clustering
from article_clustering import get_vectorized_titles
from article_clustering import cluster_articles

# article selecting
from article_selector import get_best_article_all_clusters

In [None]:
# Fetch the article table, passing 5 as the number of trending topics.
# NOTE(review): the name is both defined in this notebook and imported from
# `news_scraper`, so the implementation used depends on cell execution order.
trending_news = get_trending_articles_today(5)