In [1]:
from pymongo import MongoClient
import json
from IPython.display import clear_output

# MongoClient() is called with no arguments, so PyMongo's default connection
# settings apply; all notebook data lives in the 'blog_data' database.
client = MongoClient()
db = client['blog_data']

## Initialize Mongo Database
#### Clear and update collections

In [23]:
# List the collections currently in the database. `collection_names()` was
# deprecated in PyMongo 3.7 and removed in 4.0; this is its replacement.
db.list_collection_names()

['reuters', 'linkedin', 'spoon_tamago']

In [24]:
# Collection name -> path of the scraped JSON file that populates it.
json_files = dict([
    ('linkedin', 'blog_spider/linkedin.json'),
    ('reuters', 'blog_spider/reuters_tech.json'),
    ('spoon_tamago', 'blog_spider/spoon_tamago.json'),
])

In [25]:
def clear_collections(db, json_files):
    """Drop every collection named in ``json_files`` and report what is left.

    Args:
        db: Database handle supporting ``db[name].drop()`` and either
            ``list_collection_names()`` or the legacy ``collection_names()``.
        json_files: Mapping whose keys are the collection names to drop
            (the values are not used here).

    Prints one "Dropped" line per collection, followed by the collections
    still present in ``db`` (or " EMPTY" when none remain).
    """
    for collection in json_files:
        db[collection].drop()
        print("Dropped '%s'" % collection)
    print("----------------------")
    print("Remaining Collections:")
    # `collection_names()` was deprecated in PyMongo 3.7 and removed in 4.0.
    # Check the class rather than the instance: a PyMongo Database turns any
    # unknown attribute into a Collection, so hasattr(db, ...) is always True.
    if hasattr(type(db), "list_collection_names"):
        remaining = db.list_collection_names()
    else:
        remaining = db.collection_names()
    # Query once so the EMPTY check and the listing cannot disagree.
    if not remaining:
        print(" EMPTY")
    for name in remaining:
        print(" - %s" % name)
        
def update_collections(db, json_files):
    """Load each JSON file and bulk-insert its records into the matching collection.

    Args:
        db: Database handle supporting ``db[name].insert_many(list)``.
        json_files: Mapping of collection name -> path of a JSON file.
            NOTE(review): each file is presumably a JSON array of documents,
            which is what ``insert_many`` expects — confirm against the
            spider output.

    Prints a short report per collection. A file with no entries is reported
    and skipped, because ``insert_many`` rejects an empty list.
    """
    for collection, json_file in json_files.items():
        # JSON text is UTF-8; do not depend on the platform's default encoding.
        with open(json_file, 'r', encoding='utf-8') as fn:
            my_list = json.load(fn)
        list_length = len(my_list)
        if list_length == 0:
            print("'%s' skipped: no entries in %s" % (collection, json_file))
            print('---------')
            continue
        # Insert first so the report below is only printed on success.
        db[collection].insert_many(my_list)
        print("'%s' added to database" % collection)
        print("Number of entries: %d" % list_length)
        print('---------')

In [26]:
# Destructive: drops every collection named in json_files from the database.
clear_collections(db=db, json_files=json_files)

Dropped 'reuters'
Dropped 'linkedin'
Dropped 'spoon_tamago'
----------------------
Remaining Collections:
 EMPTY


In [27]:
# Repopulate each collection from its JSON file.
update_collections(db=db, json_files=json_files)

'reuters' added to database
Number of entries: 598
---------
'linkedin' added to database
Number of entries: 1620
---------
'spoon_tamago' added to database
Number of entries: 2867
---------


---
## Read from database

In [28]:
# Short handles for the three blog collections.
linkedin, reuters, spoon = (
    db['linkedin'],
    db['reuters'],
    db['spoon_tamago'],
)

In [34]:
# Peek at a single stored LinkedIn document to see which fields it carries.
linkedin.find_one()

{'_id': ObjectId('5a8f0a286b4a7531a96f8399'),
 'author': 'Keren Baruch',
 'content': 'In 2016, we introduced -- a tool that allows users to see a detailed breakdown of salaries by job title and location based on information privately submitted by LinkedIn members. Since then, we’ve been hard at work identifying ways to make this experience even more valuable, and encourage even greater transparency when it comes to conversations about pay. That’s why today we are announcing Salary Insights: a new way for you to explore compensation details on open roles. This feature will appear on job listings and will show an estimated or expected salary range for the role, based on data from our 546+ million members and employer provided information. We know salary insights on job listings is important to you. In fact, a recent survey found that more than 70% of professionals want to hear about salary in the first message from a recruiter. By surfacing this information early on in the process, we ho

In [38]:
# Fetch only the 'content' field of one LinkedIn document; the projection
# also suppresses '_id'.
text = linkedin.find_one(
    filter={},
    projection={'content': 1, '_id': 0},
)['content']
# Show a 100-character preview of the article body.
'{}...'.format(text[:100])

'In 2016, we introduced -- a tool that allows users to see a detailed breakdown of salaries by job ti...'

In [39]:
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer as Summarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words

In [44]:
# Configure the LSA summarizer: the stemmer and the stop-word list must use
# the same language, so both read it from one constant.
LANGUAGE = 'english'
stemmer = Stemmer(LANGUAGE)
summarizer = Summarizer(stemmer)
summarizer.stop_words = get_stop_words(LANGUAGE)

1.0

In [1]:
import datetime as dt

In [9]:
# Date window (both endpoints at midnight) and its length as a timedelta.
start = dt.datetime(year=2018, month=1, day=1)
end = dt.datetime(year=2018, month=2, day=1)
delta = end - start

In [6]:
# One-day increment.
# NOTE(review): no visible cell references `step` — confirm it is still needed.
step = dt.timedelta(days=1)

In [14]:
# Every calendar day from `start` through `end` (inclusive, hence the +1),
# formatted as 'YYYY-MM-DD'. The comprehension keeps the loop index out of
# the notebook's global namespace.
time_range = [
    (start + dt.timedelta(days=offset)).strftime('%Y-%m-%d')
    for offset in range(delta.days + 1)
]

In [15]:
# Display the generated list of date strings.
time_range

['2018-01-01',
 '2018-01-02',
 '2018-01-03',
 '2018-01-04',
 '2018-01-05',
 '2018-01-06',
 '2018-01-07',
 '2018-01-08',
 '2018-01-09',
 '2018-01-10',
 '2018-01-11',
 '2018-01-12',
 '2018-01-13',
 '2018-01-14',
 '2018-01-15',
 '2018-01-16',
 '2018-01-17',
 '2018-01-18',
 '2018-01-19',
 '2018-01-20',
 '2018-01-21',
 '2018-01-22',
 '2018-01-23',
 '2018-01-24',
 '2018-01-25',
 '2018-01-26',
 '2018-01-27',
 '2018-01-28',
 '2018-01-29',
 '2018-01-30',
 '2018-01-31',
 '2018-02-01']