In [18]:
import math
import sys
import os
import re
from datetime import datetime, timezone
from importlib import util as importlibutil

In [3]:
from pymongo import MongoClient
from newsapi import NewsApiClient

In [72]:
import numpy as np

In [63]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

In [4]:
# Constants
SECONDS_IN_MINUTE = MINUTES_IN_HOUR = 60  # time-unit conversion factors
PAGE_SIZE = 100  # value passed as page_size to newsapi.get_top_headlines

In [7]:
# Cache HTTP responses (12-hour expiry) when the optional requests_cache
# package is importable; without it the notebook simply runs uncached.
if importlibutil.find_spec("requests_cache") is not None:
    import requests_cache
    requests_cache.install_cache(
        '.requests_cache',
        expire_after=12 * MINUTES_IN_HOUR * SECONDS_IN_MINUTE,
    )

In [8]:
# Config
update_interval = 360  # minutes
# Country codes to query; an empty list makes FetchNews query without a
# country filter.
countries = ['in', 'us', 'gb', 'au', 'ca', 'nz']
categories = {
    'business', 'entertainment', 'general', 'health',
    'science', 'sports', 'technology',
}

In [9]:
# Keys
# The NewsAPI key must come from the environment. A key literal used to be
# committed here as a fallback; it has been removed and should be treated as
# compromised (revoke/rotate it with the provider).
NEWSAPI_ORG_KEY = os.environ.get('NEWSAPI_ORG_KEY')
if not NEWSAPI_ORG_KEY:
    raise RuntimeError(
        'NEWSAPI_ORG_KEY is not set; export it before running this notebook')
# A local MongoDB instance is a harmless default, so this fallback stays.
MONGODB_URL = os.environ.get('MONGODB_URL') or 'mongodb://localhost:27017'

In [10]:
def FetchNews(newsapi):
    """Download top headlines for every configured category.

    One request (page 1, PAGE_SIZE articles) is made per (category, country)
    pair taken from the module-level ``categories`` and ``countries``. When
    ``countries`` is empty, a single request without a country filter is made
    per category instead.

    Args:
        newsapi: client exposing ``get_top_headlines(**kwargs)`` that returns
            a dict with ``'status'`` and ``'articles'`` keys.

    Returns:
        dict mapping each category to its list of article dicts. Articles
        fetched with a country filter get a ``'country'`` key holding that
        country code; articles fetched without a filter are left untouched.
    """
    news = {category: [] for category in categories}
    # None stands for "no country filter"; truthiness (rather than `== []`)
    # also covers an empty tuple or other empty sequence.
    country_filters = list(countries) if countries else [None]
    for category in categories:
        for country in country_filters:
            print('newsapi.get_top_headlines(category=', category,
                  ', page=1', 'country=' + country if country is not None else '',
                  ', page_size=', PAGE_SIZE, ')')
            params = {'category': category, 'page': 1, 'page_size': PAGE_SIZE}
            if country is not None:
                params['country'] = country
            r = newsapi.get_top_headlines(**params)
            # Apply the same status check in both modes; previously only the
            # country-less request was checked before reading 'articles'.
            if r.get('status') != 'ok':
                continue
            fetched = r.get('articles') or []
            if country is not None:
                for fetched_article in fetched:
                    fetched_article['country'] = country
            news[category].extend(fetched)

    for category_news in news:
        print(category_news, len(news[category_news]))
    return news

In [11]:
def MakeReadyForImport(data):
    """Flatten per-category article lists into one list tagged by category.

    Args:
        data: dict mapping a category name to a list of article dicts (the
            shape returned by FetchNews).

    Returns:
        A single list holding every article, in the iteration order of
        ``data``. Each article dict is updated in place with a ``'category'``
        key; the same dict objects are returned (no copies are made).
    """
    r = []
    # Walk the categories actually present in `data` instead of the global
    # `categories`: a category missing from `data` can no longer raise
    # KeyError, and the function works on any dict of this shape.
    for category, category_articles in data.items():
        for item in category_articles:
            item['category'] = category
            r.append(item)
    return r

In [12]:
# Init newsapi
newsapi = NewsApiClient(api_key=NEWSAPI_ORG_KEY)

# Init Mongo
mongo_client = MongoClient(MONGODB_URL)
db = mongo_client['feed']
article = db.article

# Decide whether the stored articles are recent enough, based on the creation
# time embedded in the newest document's ObjectId.
should_fetch = True
for row in article.find().sort('_id', -1).limit(1):
    diff = datetime.now(timezone.utc) - row['_id'].generation_time
    # Seconds -> whole minutes. This used to divide by MINUTES_IN_HOUR, which
    # only gave the right answer because both constants equal 60.
    (m, s) = divmod(diff.total_seconds(), SECONDS_IN_MINUTE)
    if (m <= update_interval):
        print('Not fetching/updating, last update:', m, 'minutes ago')
        should_fetch = False

if should_fetch:
    news = MakeReadyForImport(FetchNews(newsapi))
    if news:
        # Replace the collection only when there is something to store:
        # insert_many rejects an empty list, which previously left the
        # collection dropped and empty.
        article.drop()
        article.insert_many(news)
    else:
        print('No articles fetched; keeping the existing collection')
else:
    # A flag replaces exit(0) so the cells below still find `news` defined.
    # NOTE(review): under a notebook kernel exit() appears to only request a
    # shutdown rather than stop the cell immediately - confirm.
    # The stored documents have the same fields as freshly inserted ones.
    news = list(article.find())

newsapi.get_top_headlines(category= entertainment , page=1 country=in , page_size= 100 )
newsapi.get_top_headlines(category= entertainment , page=1 country=us , page_size= 100 )
newsapi.get_top_headlines(category= entertainment , page=1 country=gb , page_size= 100 )
newsapi.get_top_headlines(category= entertainment , page=1 country=au , page_size= 100 )
newsapi.get_top_headlines(category= entertainment , page=1 country=ca , page_size= 100 )
newsapi.get_top_headlines(category= entertainment , page=1 country=nz , page_size= 100 )
newsapi.get_top_headlines(category= sports , page=1 country=in , page_size= 100 )
newsapi.get_top_headlines(category= sports , page=1 country=us , page_size= 100 )
newsapi.get_top_headlines(category= sports , page=1 country=gb , page_size= 100 )
newsapi.get_top_headlines(category= sports , page=1 country=au , page_size= 100 )
newsapi.get_top_headlines(category= sports , page=1 country=ca , page_size= 100 )
newsapi.get_top_headlines(category= sports , page=1 coun

<pymongo.results.InsertManyResult at 0x7f44a2307e08>

In [48]:
def ExtractText(data):
    """Build one text string per article.

    For each article the title, description and content are joined with
    single spaces, followed by ``country_<code>`` and ``category_<name>``
    tokens. Missing or empty fields contribute an empty string, so the number
    of joined parts (and therefore the spacing) is always the same.

    Args:
        data: iterable of article dicts; the keys read are ``title``,
            ``description``, ``content``, ``country`` and ``category``.

    Returns:
        list of str, one entry per article, in input order.
    """
    # Only a final bracketed group with no brackets inside it is removed.
    # The earlier greedy r'\[.+\]$' started at the FIRST '[' in the content
    # and so also deleted legitimate text that preceded the trailing marker.
    trailing_bracket = re.compile(r'\[[^\[\]]+\]$')

    def Prune(s):
        return s if s else ''

    def StripFromEnd(s):
        return trailing_bracket.sub('', Prune(s))

    def Special(pre, text):
        # Articles fetched without a country filter carry no 'country' value;
        # emit nothing for them instead of failing on None.
        return pre + '_' + text if text else ''

    def ExtractItem(item):
        return ' '.join([
            Prune(item.get('title')),
            Prune(item.get('description')),
            StripFromEnd(item.get('content')),
            Special('country', item.get('country')),
            Special('category', item.get('category')),
        ])

    return [ExtractItem(item) for item in data]

In [49]:
# One whitespace-joined text string per article, in the same order as `news`.
extracted_news = ExtractText(news)

In [64]:
# TF-IDF vectorizer limited to 50 features, with English stop words removed.
tf = TfidfVectorizer(max_features=50, stop_words='english')

In [66]:
# Learn the vocabulary from the article texts and vectorize them in one step;
# the result is sparse (the next cell calls .todense() on a row).
x = tf.fit_transform(extracted_news)

In [81]:
# Sanity check: norm of a single row (index 2) of the TF-IDF matrix.
# TfidfVectorizer L2-normalises rows by default, so this should be ~1.
np.linalg.norm(x[2].todense())

0.9999999999999999

In [68]:
def AddTfIdfScores(data):
    """Placeholder: the body of this function was never written.

    NOTE(review): presumably meant to attach TF-IDF scores to the articles in
    ``data`` - confirm the intended behaviour before implementing.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError('AddTfIdfScores has not been implemented yet')

array([3.99831514, 2.76545333, 3.2671084 , 3.79607828, 3.80314544,
       4.0069732 , 4.59302225, 2.90836092, 2.80126278, 3.31818333,
       2.95281268, 3.28807353, 2.79348064, 2.7092725 , 3.87670801,
       3.58711936, 2.69044841, 2.6788608 , 2.91417489, 2.66513061,
       2.98671423, 2.849272  , 3.6893103 , 4.23011676, 3.8615562 ,
       3.8615562 , 3.75469306, 4.05142497, 4.05142497, 3.83192441,
       3.59861374, 4.30925408, 3.7346255 , 4.33306473, 2.3533104 ,
       2.88260842, 3.91562343, 3.65203891, 3.56451953, 3.79607828,
       3.95611479, 3.74795903, 4.24104583, 3.89987507, 3.49440996,
       3.31382602, 3.96441359, 3.57012178, 2.46404677, 3.47890577])