In [18]:
import math
import sys
import os
import re
from datetime import datetime, timezone
from importlib import util as importlibutil

In [3]:
from pymongo import MongoClient
from newsapi import NewsApiClient

In [72]:
import numpy as np

In [63]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

In [4]:
# Constants
PAGE_SIZE = 100
MINUTES_IN_HOUR = 60
SECONDS_IN_MINUTE = 60

In [7]:
# Enable requests cache
if importlibutil.find_spec("requests_cache"):
    import requests_cache
    requests_cache.install_cache('.requests_cache', expire_after=SECONDS_IN_MINUTE * MINUTES_IN_HOUR * 12)

In [8]:
# Config
update_interval = 360  # minutes
countries = ['in', 'us', 'gb', 'au', 'ca', 'nz'] # 'in' ['in', 'us', 'gb', 'au', 'ca', 'nz']
categories = {
    'business',
    'entertainment',
    'general',
    'health',
    'science',
    'sports',
    'technology'
}

In [9]:
#Keys
NEWSAPI_ORG_KEY = os.environ.get('NEWSAPI_ORG_KEY')
if not NEWSAPI_ORG_KEY:  # TODO(devansh): Remove
    NEWSAPI_ORG_KEY = 'ea0f26bbe06b44b898f0f0a80af00c7d'
MONGODB_URL = os.environ.get('MONGODB_URL')
if not MONGODB_URL:
    MONGODB_URL = 'mongodb://localhost:27017'

In [82]:
def FetchNews(newsapi):
    def Transform(category, article):
        article['category'] = category
        return article
    news = {category: [] for category in categories}
    for category in categories:
        articles = []
        keep_downloading = True
        cindex = 0
        while keep_downloading:
            print('newsapi.get_top_headlines(category=', category,
                  ', page=1', 'country='+countries[cindex] if countries != [] else '', 
                  ', page_size=', PAGE_SIZE, ')')
            r = None
            if countries == []:
                r = newsapi.get_top_headlines(category=category,
                                              page=1,
                                              page_size=PAGE_SIZE)
                if (r['status'] == 'ok'):
                    news[category] = r['articles']
                keep_downloading = False
            else:
                r = newsapi.get_top_headlines(country=countries[cindex],
                                              category=category,
                                              page=1,
                                              page_size=PAGE_SIZE)
                for c in r['articles']:
                    c['country'] = countries[cindex]
                # print(r['articles'])
                news[category].extend(r['articles'])
                cindex += 1
                keep_downloading = cindex < len(countries)

    for category_news in news:
        print(category_news, len(news[category_news]))
    r = []
    for category in categories:
        r.extend(list(map(lambda x: Transform(category, x), data[category])))
    return r

In [12]:
# Init newsapi
newsapi = NewsApiClient(api_key=NEWSAPI_ORG_KEY)

# Init Mongo
mongo_client = MongoClient(MONGODB_URL)
db = mongo_client['feed']
article = db.article

for row in article.find().sort('_id', -1).limit(1):
    diff = datetime.now(timezone.utc) - row['_id'].generation_time
    (m, s) = divmod(diff.total_seconds(), MINUTES_IN_HOUR)
    if (m <= update_interval):
        print('Not fetching/updating, last update:', m, 'minutes ago')
        exit(0)
news = FetchNews(newsapi)

# Drop collection articles
article.drop()
article.insert_many(news)

newsapi.get_top_headlines(category= entertainment , page=1 country=in , page_size= 100 )
newsapi.get_top_headlines(category= entertainment , page=1 country=us , page_size= 100 )
newsapi.get_top_headlines(category= entertainment , page=1 country=gb , page_size= 100 )
newsapi.get_top_headlines(category= entertainment , page=1 country=au , page_size= 100 )
newsapi.get_top_headlines(category= entertainment , page=1 country=ca , page_size= 100 )
newsapi.get_top_headlines(category= entertainment , page=1 country=nz , page_size= 100 )
newsapi.get_top_headlines(category= sports , page=1 country=in , page_size= 100 )
newsapi.get_top_headlines(category= sports , page=1 country=us , page_size= 100 )
newsapi.get_top_headlines(category= sports , page=1 country=gb , page_size= 100 )
newsapi.get_top_headlines(category= sports , page=1 country=au , page_size= 100 )
newsapi.get_top_headlines(category= sports , page=1 country=ca , page_size= 100 )
newsapi.get_top_headlines(category= sports , page=1 coun

<pymongo.results.InsertManyResult at 0x7f44a2307e08>

In [48]:
def ExtractText(data):
    def Prune(s):
        if not s:
            return ''
        return s
    def StripFromEnd(s):
        return re.sub(r'\[.+\]$', '', Prune(s))
    def Special(pre, text):
        return pre + '_' + text
    def ExtractItem(item):
        return ' '.join([Prune(item['title']), Prune(item['description']), StripFromEnd(item['content']),
                         Special('country', item['country']), Special('category', item['category'])])
    result = [''] * len(data)
    for i, item in enumerate(data):
#         print(i)
        result[i] = ExtractItem(item)
    return result

In [49]:
extracted_news = ExtractText(news)

In [64]:
tf = TfidfVectorizer(max_features=50, stop_words='english')

In [83]:
x = tf.fit_transform(extracted_news)

In [89]:
for ii in x:
    print(ii.todense())

[[0.         0.         0.         0.         0.         0.
  0.         0.         0.32975089 0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.31372608 0.         0.         0.         0.
  0.         0.45456342 0.         0.         0.         0.
  0.         0.         0.         0.         0.27702013 0.3393265
  0.         0.         0.         0.44685568 0.         0.44119132
  0.         0.         0.         0.         0.         0.
  0.         0.        ]]
[[0.         0.         0.         0.         0.         0.53582508
  0.         0.         0.37459368 0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.35638966 0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.47665878 0.         0.         0.
  0.         0.         0.46728351 0.         0.   

  0.         0.37440422]]
[[0.         0.         0.         0.         0.         0.
  0.         0.         0.30416767 0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.31642793 0.         0.         0.         0.         0.
  0.         0.41929681 0.         0.         0.         0.
  0.         0.         0.40551438 0.         0.         0.
  0.42516756 0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.53510393 0.        ]]
[[0.         0.23060223 0.         0.         0.         0.
  0.         0.         0.23358826 0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.24300364 0.         0.         0.         0.         0.
  0.         0.64400541 0.         0.         0.         0.
  0.         0.         0.         0.         0.19623496 0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.

  0.         0.        ]]
[[0.         0.56778395 0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.28676916 0.         0.         0.         0.         0.
  0.         0.27359319 0.         0.         0.         0.
  0.         0.         0.         0.         0.41590542 0.
  0.         0.44237328 0.         0.         0.         0.
  0.         0.         0.         0.3896924  0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.        ]]
[[0.         0.         0.         0.89136287 0.         0.
  0.         0.         0.         0.         0.         0.
  0.3279707  0.         0.         0.         0.         0.
  0.         0.31290167 0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.

[[0.         0.         0.46202897 0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.39504933 0.         0.         0.         0.38047868 0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.69692269 0.        ]]
[[0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.72026481 0.         0.         0.         0.69369921 0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0

[[0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.22150925 0.
  0.         0.         0.         0.         0.         0.
  0.         0.19992839 0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.95444344 0.         0.         0.         0.         0.
  0.         0.        ]]
[[0.         0.         0.         0.48536722 0.         0.
  0.         0.         0.         0.         0.37754714 0.
  0.         0.         0.         0.         0.         0.
  0.         0.34076406 0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.46011937 0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.54226085 0.         0.         0.         0.         0.
  0.         0

[[0.         0.25884241 0.         0.         0.         0.
  0.         0.         0.         0.         0.27637897 0.
  0.         0.         0.         0.         0.         0.25073747
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.75841496 0.         0.
  0.         0.         0.         0.         0.22026643 0.
  0.         0.34182553 0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.23063119 0.        ]]
[[0.         0.         0.39248386 0.         0.         0.
  0.         0.         0.         0.         0.35472693 0.
  0.         0.         0.         0.         0.         0.32181658
  0.         0.         0.         0.         0.         0.
  0.         0.         0.45105832 0.48670529 0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.41979003 0.         0.         0

[[0.         0.         0.         0.         0.65032717 0.
  0.         0.16577384 0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.16240583 0.         0.72333806
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.        ]]
[[0.         0.         0.         0.         0.         0.
  0.         0.4603404  0.         0.         0.         0.
  0.         0.         0.61361205 0.         0.         0.
  0.         0.         0.         0.4509877  0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.45626425
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0

  0.4547937  0.        ]]
[[0.         0.35349283 0.         0.         0.         0.
  0.         0.37175993 0.         0.         0.         0.
  0.         0.         0.         0.45852192 0.         0.34242418
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.45999119 0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.44468957]]
[[0.         0.         0.         0.         0.         0.
  0.         0.35675863 0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.32860664
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.87449469 0.         0.         0.
  0.         0.         0.      

[[0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.37097865
  0.         0.         0.         0.40471866 0.         0.3022439
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.2655135  0.
  0.         0.         0.         0.42829456 0.44635076 0.
  0.         0.         0.         0.         0.         0.
  0.         0.3925094 ]]
[[0.         0.38556699 0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.22921606
  0.         0.         0.         0.         0.         0.18674701
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.30206381 0.32810491 0.
  0.27296341 0.         0.         0.         0.55157224 0.2612753
  0.         0.         0.         0

[[0.         0.         0.52858713 0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.43833457 0.         0.         0.43528901 0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.58222154 0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.        ]]
[[0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.45308648 0.         0.59989361 0.44993842 0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.48207439
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.  

In [81]:
np.linalg.norm(x[2].todense())

0.9999999999999999