#Project Fletcher
Description: Getting New York Times articles to make personalized article suggestions

Sources:
* http://dlab.berkeley.edu/blog/scraping-new-york-times-articles-python-tutorial
* http://developer.nytimes.com/docs/read/article_search_api_v2
* http://brooksandrew.github.io/simpleblog/articles/new-york-times-api-to-mongodb/
* https://docs.mongodb.org/getting-started/python/insert/
* http://open.blogs.nytimes.com/2015/08/11/building-the-next-new-york-times-recommendation-engine/

KEY: 888f546089cc789d146a2d70b4f2c804:9:74609839

In [3]:
api_key = '888f546089cc789d146a2d70b4f2c804:9:74609839'

####IMPORTS

In [1]:
from nytimesarticle import articleAPI
from pymongo import MongoClient
from gensim import corpora, models, similarities
from gensim.matutils import Sparse2Corpus
from sklearn.feature_extraction.text import CountVectorizer
from datetime import datetime
import requests

In [None]:
import requests
from bs4 import BeautifulSoup

In [None]:
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals
from sumy.parsers.html import HtmlParser
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer as Summarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words

####FUNCTIONS

Build_query helps build the API query to fetch metadata on news articles from the New York Times

In [4]:
def build_query(year,key,page):
    base  = 'http://api.nytimes.com/svc/search/v2/articlesearch.json?'
    filtr = 'fq=source:("The+New+York+Times")'
    start = '&begin_date=' + str(year) + '0101'
    end = '&end_date=' + str(year) + '1231'
    pages = '&page=' + str(page)
    query = base + filtr + start + end + pages + '&sort=oldest&api-key=' + key
    return query

Get_articles returns in an array all the articles fetched from the New York Times

In [5]:
def get_articles(year):
    '''
    This function accepts a year in string format (e.g.'1980')
    and a query (e.g.'Amnesty International') and it will 
    return a list of parsed articles (in dictionaries)
    for that year.
    '''
    all_articles = []
    for i in range(0,100): #NYT limits pager to first 100 pages. But rarely will you find over 100 pages of results anyway.
        q = build_query(year,api_key,i)
        print("Page #%i, %s" % (i,q)) 
        response = requests.get(q)
        articles = response.json()
        print(response)
        articles = parse_articles(articles)
        all_articles = all_articles + articles
    return(all_articles)

Parse_articles converts each json response from the API into a dictionary with the meta information that I would like to keep from each article

In [6]:
def parse_articles(articles):
    '''
    This function takes in a response to the NYT api and parses
    the articles into a list of dictionaries
    
    * _id
    * author name: byline.person.firstname & byline.person.lastname
    * document_type
    * headline
    * keywords: name & value
    * lead_paragraph
    * news_desk
    * pub_date
    * section_name
    * source
    * subsection_name
    * type_of_material
    * web_url
    * word_count
    '''
    news = []
    v = 0
    for i in articles['response']['docs']:
        dic = {}
        if i['lead_paragraph'] is None:
            continue
        dic['id'] = i['_id']
        if i['byline'] is not None and len(i['byline']) > 1 and len(i['byline']['person']) > 1 and 'firstname' and 'lastname' in i['byline']['person'][0]:
            dic['author'] = {'firstname':i['byline']['person'][0]['firstname'],'lastname':i['byline']['person'][0]['lastname']}
        dic['doc_type'] = i['document_type']
        #if i['abstract'] is not None:
            #dic['abstract'] = i['abstract'].encode("utf8")
        dic['headline'] = i['headline']['main']#.encode("utf8")
        dic['lead_paragraph'] = i['lead_paragraph']
        #if i['snippet'] is not None:
        #    dic['snippet'] = i['snippet']
        dic['summary'] = scrape_summary(i['web_url'])
        #dic['desk'] = i['news_desk']
        dic['date'] = i['pub_date'][0:10] # cutting time of day.
        dic['section'] = i['section_name']
        dic['subsection'] = i['subsection_name']
        #if i['snippet'] is not None:
        #    dic['snippet'] = i['snippet'].encode("utf8")
        dic['source'] = i['source']
        dic['type'] = i['type_of_material']
        dic['url'] = i['web_url']
        dic['word_count'] = i['word_count']
        # locations
        #locations = []
        #for x in range(0,len(i['keywords'])):
        #    if 'glocations' in i['keywords'][x]['name']:
        #        locations.append(i['keywords'][x]['value'])
        #dic['locations'] = locations
        # subject
        subjects = []
        for x in range(0,len(i['keywords'])):
            if 'subject' in i['keywords'][x]['name']:
                subjects.append(i['keywords'][x]['value'])
        dic['keywords'] = subjects   
        news.append(dic)
        v += 1
    return(news)

Get_url_soup_content gets the BeautifulSoup content from the requested url

In [None]:
#Get article off of the New York Times
def get_url_soup_content(url):
    response = requests.get(url)
    if response.ok:
        return BeautifulSoup(response.text)

Scrape_summary goes to the url page of the New York Times gets the article content and summarizes it using Sumy. It returns that summary

In [None]:
def scrape_summary(url, language='english', sentences_cnt = 3):
    soup = get_url_soup_content(url)
    paragraphs = soup.body.find_all('p',{'itemprop':'articleBody'})
    text = ''
    for i in range(len(paragraphs)):
        text += paragraphs[i].text + ' '
    parser = PlaintextParser.from_string(text, Tokenizer(language))
    stemmer = Stemmer(language)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(language)
    summary_obj = summarizer(parser.document, sentences_cnt)
    summary = ''
    for sentences in summary_obj:
        summary += sentences._text + ' '
        #print(sentences)
    return summary

find_article: given an article_id and a list of articles find the article with that id

In [None]:
def find_article(article_id,articles):
    for article in articles:
        if article['id'] == article_id:
            return article

find_most_similar_articles: given an article and a list of articles find similar_cnt articles that are similar to the article we want

In [None]:
def find_most_similar_articles(article,articles,similar_cnt = 5):
    cnt = 1
    suggestions = []
    sorted_similars = ((k, article['similar_docs'][k]) for k in sorted(article['similar_docs'], key = article['similar_docs'].get, reverse=True))
    for k,v in sorted_similars:
        found_article = find_article(k,articles)
        if found_article['section'] != "Paid Death Notices":
            suggestions.append(found_article)
            cnt += 1
        if cnt == 10:
            break
    return suggestions

________

In [58]:
result = get_articles(2016)

Page #0, http://api.nytimes.com/svc/search/v2/articlesearch.json?fq=source:("The+New+York+Times")&begin_date=20160101&end_date=20161231&page=0&sort=oldest&api-key=888f546089cc789d146a2d70b4f2c804:9:74609839
<Response [200]>
Page #1, http://api.nytimes.com/svc/search/v2/articlesearch.json?fq=source:("The+New+York+Times")&begin_date=20160101&end_date=20161231&page=1&sort=oldest&api-key=888f546089cc789d146a2d70b4f2c804:9:74609839
<Response [200]>
Page #2, http://api.nytimes.com/svc/search/v2/articlesearch.json?fq=source:("The+New+York+Times")&begin_date=20160101&end_date=20161231&page=2&sort=oldest&api-key=888f546089cc789d146a2d70b4f2c804:9:74609839
<Response [200]>
Page #3, http://api.nytimes.com/svc/search/v2/articlesearch.json?fq=source:("The+New+York+Times")&begin_date=20160101&end_date=20161231&page=3&sort=oldest&api-key=888f546089cc789d146a2d70b4f2c804:9:74609839
<Response [200]>
Page #4, http://api.nytimes.com/svc/search/v2/articlesearch.json?fq=source:("The+New+York+Times")&begin_

In [59]:
len(result)

900

In [60]:
result[0:10]

[{'date': '2016-01-01',
  'doc_type': 'article',
  'headline': '50 Comics Walk Into a Club. No, You Haven’t Heard This One.',
  'id': '56842cab79881066fdb9523e',
  'keywords': ['Comedy and Humor', 'New Year'],
  'lead_paragraph': '“50 First Jokes NYC” celebrates its 10th anniversary with a show at the Bell House in Brooklyn on Saturday, with 50 comedians trying for new laughs in the new year.',
  'section': 'Arts',
  'source': 'The New York Times',
  'subsection': None,
  'summary': 'A group of New York comics found that out almost by accident a decade ago, and on Saturday “50 First Jokes NYC” celebrates its 10th anniversary with a show at the Bell House in Gowanus, Brooklyn, that will no doubt generate lots of laughs and perhaps the occasional groan. “They’re really putting it on the line and doing something that’s completely untested.” Mr. O’Donnell, one of the show’s creators and the event’s host, said the idea was born in the middle of the last decade when he and other comics and t

In [17]:
val = []
s = set()
for i in range(len(result)):
    val.append(result[i]['lead_paragraph'])
    s.add(result[i]['subsection'])#used to check how many topics i should define

In [20]:
s

{'Credit and Debit Cards',
 'Media',
 'Personal Tech',
 'Middle East',
 'Soccer',
 'Sunday Review',
 'Room For Debate',
 'Fashion & Beauty',
 'Music',
 'Olympics',
 'Dance',
 'College Football',
 'International Arts',
 'DealBook',
 'Asia Pacific',
 'Sunday Book Review',
 None,
 'Pro Basketball',
 'Hockey',
 'Art & Design',
 'Auto Racing',
 'Skiing',
 'Energy & Environment ',
 'Africa',
 'Economy',
 'Politics',
 'Television',
 'College Basketball',
 'Baseball',
 'New Cars',
 'International Business',
 'Campaign Stops',
 'Pro Football',
 'Tennis',
 'Weddings',
 'Europe',
 'Horse Racing',
 'Americas',
 'Men’s Style',
 'Korean',
 'América'}

In [102]:
cnt = 0
new_val = []
for i in range(len(val)):
    if val[i] is not None:
        new_val.append(val[i])
        cnt += 1
print(cnt)

900


Suggestions: Content base, collaborative filtering

______________

##MongoDB

In [7]:
client = MongoClient('localhost', 27017)
db = client.nytimes
collection = db.articles_info

Insert result into MongoDB database

In [40]:
result[0:10]

[{'date': '2016-01-01',
  'doc_type': 'article',
  'headline': '50 Comics Walk Into a Club. No, You Haven’t Heard This One.',
  'id': '56842cab79881066fdb9523e',
  'keywords': ['Comedy and Humor', 'New Year'],
  'lead_paragraph': '“50 First Jokes NYC” celebrates its 10th anniversary with a show at the Bell House in Brooklyn on Saturday, with 50 comedians trying for new laughs in the new year.',
  'section': 'Arts',
  'snippet': '“50 First Jokes NYC” celebrates its 10th anniversary with a show at the Bell House in Brooklyn on Saturday, with 50 comedians trying for new laughs in the new year.',
  'source': 'The New York Times',
  'subsection': None,
  'summary': 'A group of New York comics found that out almost by accident a decade ago, and on Saturday “50 First Jokes NYC” celebrates its 10th anniversary with a show at the Bell House in Gowanus, Brooklyn, that will no doubt generate lots of laughs and perhaps the occasional groan. “They’re really putting it on the line and doing somethin

In [61]:
#collection.insert(result)

[ObjectId('56e0ebba9d1fa206b01dd128'),
 ObjectId('56e0ebba9d1fa206b01dd129'),
 ObjectId('56e0ebba9d1fa206b01dd12a'),
 ObjectId('56e0ebba9d1fa206b01dd12b'),
 ObjectId('56e0ebba9d1fa206b01dd12c'),
 ObjectId('56e0ebba9d1fa206b01dd12d'),
 ObjectId('56e0ebba9d1fa206b01dd12e'),
 ObjectId('56e0ebba9d1fa206b01dd12f'),
 ObjectId('56e0ebba9d1fa206b01dd130'),
 ObjectId('56e0ebba9d1fa206b01dd131'),
 ObjectId('56e0ebba9d1fa206b01dd132'),
 ObjectId('56e0ebba9d1fa206b01dd133'),
 ObjectId('56e0ebba9d1fa206b01dd134'),
 ObjectId('56e0ebba9d1fa206b01dd135'),
 ObjectId('56e0ebba9d1fa206b01dd136'),
 ObjectId('56e0ebba9d1fa206b01dd137'),
 ObjectId('56e0ebba9d1fa206b01dd138'),
 ObjectId('56e0ebba9d1fa206b01dd139'),
 ObjectId('56e0ebba9d1fa206b01dd13a'),
 ObjectId('56e0ebba9d1fa206b01dd13b'),
 ObjectId('56e0ebba9d1fa206b01dd13c'),
 ObjectId('56e0ebba9d1fa206b01dd13d'),
 ObjectId('56e0ebba9d1fa206b01dd13e'),
 ObjectId('56e0ebba9d1fa206b01dd13f'),
 ObjectId('56e0ebba9d1fa206b01dd140'),
 ObjectId('56e0ebba9d1fa2

In [9]:
cursor = collection.find()

In [10]:
summaries = []
doc_ids = []
database_articles = []
for article in cursor:
    database_articles.append(article)
    doc_ids.append(article['_id'])
    lda_text = ''
    for keyword in article['keywords']:
        lda_text += keyword + ' '
    lda_text += article['headline'] + ' ' + article['section'] + ' ' + article['type'] + ' ' + article['summary']
    summaries.append(lda_text)

In [11]:
print(doc_ids[1])
print(summaries[1])

56e0ebba9d1fa206b01dd129
Deaths (Obituaries) Public Relations and Publicity Awards, Decorations and Honors Movies Academy Awards (Oscars) Television Murray Weissman, Publicist With a Focus on Oscar Campaigning, Dies at 90 Movies Obituary Murray Weissman, who helped lead a generation of Hollywood publicists in their transition from press agent to Oscar campaigner, died on Monday in Los Angeles. He went to great lengths in his pursuit of individual awards votes, going so far, he once told The Los Angeles Times, as to send a limousine to carry a member of the Academy of Motion Picture Arts and Sciences to a screening of a film that voter had not yet seen. Mr. Weissman began his entertainment-industry career with jobs at TV Guide and a Los Angeles television station, then moved to ABC and CBS, where he worked with Frank Sinatra, Judy Garland and other stars. 


In [12]:
database_articles[1]

{'_id': ObjectId('56e0ebba9d1fa206b01dd129'),
 'date': '2016-01-01',
 'doc_type': 'article',
 'headline': 'Murray Weissman, Publicist With a Focus on Oscar Campaigning, Dies at 90',
 'id': '5684743579881066fdb952d4',
 'keywords': ['Deaths (Obituaries)',
  'Public Relations and Publicity',
  'Awards, Decorations and Honors',
  'Movies',
  'Academy Awards (Oscars)',
  'Television'],
 'lead_paragraph': 'Mr. Weissman, who started at a time when publicity meant hot tips for newspaper reporters, worked on behalf of at least 38 best-picture nominees.',
 'section': 'Movies',
 'similar_docs': {'55f1d5b379881015b22f3201': '0.0',
  '567c27b479881009570cd242': '0.0',
  '568163cd79881067a393555b': '0.0',
  '5681677e79881067a3935561': '0.0',
  '568180d97988100cd95e574e': '0.0',
  '5681b0627988100cd95e57a9': '0.0148683',
  '568259a97988100cd95e58ae': '0.0',
  '568275c87988100cd95e58f4': '0.0',
  '568275c97988100cd95e58f5': '0.0',
  '568275ce7988100cd95e58f6': '0.0',
  '5682928c7988100cd95e5934': '0.0

____________

In [13]:
summaries[0]

'Comedy and Humor New Year 50 Comics Walk Into a Club. No, You Haven’t Heard This One. Arts News A group of New York comics found that out almost by accident a decade ago, and on Saturday “50 First Jokes NYC” celebrates its 10th anniversary with a show at the Bell House in Gowanus, Brooklyn, that will no doubt generate lots of laughs and perhaps the occasional groan. “They’re really putting it on the line and doing something that’s completely untested.” Mr. O’Donnell, one of the show’s creators and the event’s host, said the idea was born in the middle of the last decade when he and other comics and their friends convened on the Lower East Side for some informal joke-swapping to start the year. Five participating comics agreed to answer five frivolous questions by email:  For Anthony DeVito, who is doing the show for the fifth time:   Singers, athletes, actors all have routines they go through before a concert, a game, a performance to prepare physically or mentally or both. '

###LDA

In [14]:
count_vectorizer = CountVectorizer(analyzer='word',stop_words='english',token_pattern='\\b[a-z][a-z]+\\b')

In [15]:
ng_vecs = count_vectorizer.fit_transform(summaries).transpose()

In [16]:
ng_vecs.shape

(13575, 900)

In [17]:
corpus = Sparse2Corpus(ng_vecs)

In [18]:
count_vectorizer.vocabulary_

{'phylloxera': 9009,
 'contributes': 2641,
 'sensitive': 10865,
 'shrouded': 11062,
 'thing': 12283,
 'lebron': 6979,
 'necks': 8140,
 'electronics': 3910,
 'stingel': 11656,
 'kagame': 6633,
 'wiring': 13371,
 'harkness': 5564,
 'giants': 5132,
 'mehrishi': 7669,
 'bnai': 1303,
 'kuaidi': 6805,
 'lighthearted': 7106,
 'condom': 2494,
 'leadership': 6955,
 'buenos': 1587,
 'bookshop': 1350,
 'alleging': 354,
 'moskowitz': 7960,
 'yesterday': 13509,
 'affirmative': 234,
 'hurst': 5950,
 'role': 10395,
 'fortunes': 4838,
 'steer': 11623,
 'uong': 12856,
 'reprieve': 10104,
 'efficient': 3867,
 'exhibition': 4325,
 'speculation': 11439,
 'velez': 12959,
 'rejected': 9996,
 'requirements': 10119,
 'individual': 6142,
 'broken': 1532,
 'creed': 2838,
 'devising': 3350,
 'meaningless': 7633,
 'cemitas': 1915,
 'spartan': 11413,
 'belgravia': 1104,
 'spies': 11454,
 'ugly': 12701,
 'bengal': 1138,
 'darren': 3032,
 'calling': 1699,
 'potentially': 9253,
 'evenings': 4258,
 'mitigate': 7873,
 

In [33]:
id2word = dict((count_vectorizer.vocabulary_[k],k) for k in count_vectorizer.vocabulary_)

In [34]:
id2word

{0: 'aaron',
 1: 'abandoned',
 2: 'abathembu',
 3: 'abba',
 4: 'abbas',
 5: 'abbey',
 6: 'abbott',
 7: 'abc',
 8: 'abdelaziz',
 9: 'abdul',
 10: 'abdullah',
 11: 'abe',
 12: 'abiertamente',
 13: 'ability',
 14: 'able',
 15: 'aboard',
 16: 'abolish',
 17: 'abortion',
 18: 'abraham',
 19: 'abrahamson',
 20: 'abroad',
 21: 'abs',
 22: 'absence',
 23: 'absent',
 24: 'absolute',
 25: 'absolutely',
 26: 'absorbed',
 27: 'abstract',
 28: 'abstraction',
 29: 'absurdity',
 30: 'abu',
 31: 'abuse',
 32: 'abused',
 33: 'abusers',
 34: 'abuses',
 35: 'abusing',
 36: 'abusive',
 37: 'abuts',
 38: 'abysmal',
 39: 'abyss',
 40: 'ac',
 41: 'academia',
 42: 'academic',
 43: 'academics',
 44: 'academy',
 45: 'acceleration',
 46: 'accelerator',
 47: 'accent',
 48: 'accents',
 49: 'accept',
 50: 'acceptable',
 51: 'acceptance',
 52: 'accepted',
 53: 'accepter',
 54: 'access',
 55: 'accessible',
 56: 'accessories',
 57: 'accident',
 58: 'accidental',
 59: 'accidentally',
 60: 'accidents',
 61: 'acclaimed',

In [21]:
lda = models.LdaModel(corpus, id2word=id2word, num_topics=11)



In [22]:
lda.print_topics(num_words=10)

[(6,
  '0.010*paid + 0.009*news + 0.008*notice + 0.008*death + 0.006*new + 0.005*mr + 0.004*year + 0.004*york + 0.003*deaths + 0.003*notices'),
 (2,
  '0.007*news + 0.005*said + 0.004*new + 0.004*mr + 0.003*paid + 0.002*region + 0.002*like + 0.002*world + 0.002*deaths + 0.002*football'),
 (10,
  '0.008*news + 0.006*said + 0.006*new + 0.005*mr + 0.004*year + 0.003*sports + 0.003*world + 0.003*like + 0.002*police + 0.002*long'),
 (7,
  '0.009*news + 0.006*said + 0.006*mr + 0.004*paid + 0.003*death + 0.003*world + 0.003*notice + 0.002*years + 0.002*government + 0.002*new'),
 (0,
  '0.010*news + 0.006*said + 0.006*new + 0.006*mr + 0.005*year + 0.003*game + 0.002*york + 0.002*people + 0.002*region + 0.002*real'),
 (8,
  '0.031*paid + 0.021*death + 0.020*notice + 0.011*deaths + 0.010*notices + 0.007*new + 0.006*news + 0.005*mr + 0.004*books + 0.003*years'),
 (4,
  '0.009*news + 0.006*paid + 0.006*death + 0.006*mr + 0.004*notice + 0.004*new + 0.003*said + 0.003*deaths + 0.003*review + 0.002*s

In [23]:
lda_corpus = lda[corpus]

In [24]:
lda_docs = [doc for doc in lda_corpus]

In [25]:
lda_docs[0:10]

[[(3, 0.98891289588613562)],
 [(2, 0.98954975846071513)],
 [(2, 0.76980336986235476),
  (4, 0.17831481149321379),
  (6, 0.045147278786195968)],
 [(3, 0.36704563844745053), (4, 0.62376060286316082)],
 [(7, 0.98877597080027313)],
 [(1, 0.98556910377087403)],
 [(4, 0.46412083405056748), (7, 0.52565133094966909)],
 [(9, 0.9891768387204749)],
 [(9, 0.98989843156214175)],
 [(0, 0.05304601305185358), (2, 0.93646372444653281)]]

In [26]:
doc_vecs = [doc for doc in lda_corpus]

In [27]:
index = similarities.MatrixSimilarity(doc_vecs)



In [28]:
sims = sorted(enumerate(index[doc_vecs[0]]), key=lambda item: -item[1])

In [30]:
print("Original doc: ")
print(database_articles[0]['section'])
print(database_articles[0]['url'])
print("Similar documents: ")
cnt = 0
for sim in sims[1:]:
    #if database_articles[sim[0]]['section'] != 'Paid Death Notices':
    print(database_articles[sim[0]]['section'])
    print(database_articles[sim[0]]['url'])
    print("________________________________")
    cnt += 1
    if cnt >= 10:
        break

Original doc: 
Arts
http://www.nytimes.com/2016/01/01/arts/50-comics-walk-into-a-club-no-you-havent-heard-this-one.html
Similar documents: 
Business Day
http://www.nytimes.com/2016/01/01/business/dealbook/a-roller-coaster-year-ends-with-us-markets-mostly-down.html
________________________________
Sports
http://www.nytimes.com/2016/01/01/sports/chris-mullin-sees-mixed-signs-in-st-johns-loss.html
________________________________
Food
http://www.nytimes.com/2016/01/01/dining/happy-new-year.html
________________________________
Arts
http://www.nytimes.com/2016/01/02/arts/music/natalie-cole-grammy-award-winning-singer-dies-at-65.html
________________________________
Sports
http://www.nytimes.com/2016/01/02/sports/hockey/dallas-stars-visit-new-york-and-theyre-bringing-a-defense.html
________________________________
Your Money
http://www.nytimes.com/2016/01/02/your-money/want-to-keep-new-years-resolutions-consider-the-consequences-of-failing.html
________________________________
Business Day


In [35]:
dic = {}
for i in range(0,len(doc_vecs)):
    sims = sorted(enumerate(index[doc_vecs[i]]), key=lambda item: -item[1])
    #dic[database_articles[i]['_id']] = {}
    dic2 = {}
    for sim in sims[1:]:
        dic2[database_articles[sim[0]]['id']] = str(sim[1])
    dic[database_articles[i]['id']] = dic2

Update database

In [183]:
cnt = 0
for key in dic:
    modified_doc = db.articles_info.update_one({"id": key},{"$set": {"similar_docs": dic[key]}})
    if modified_doc.matched_count == 1:
        cnt += 1
print(cnt)

889


In [174]:
dic

{ObjectId('56e0ebba9d1fa206b01dd128'): {ObjectId('56e0ebba9d1fa206b01dd129'): '0.0',
  ObjectId('56e0ebba9d1fa206b01dd12a'): '0.0',
  ObjectId('56e0ebba9d1fa206b01dd12b'): '0.0',
  ObjectId('56e0ebba9d1fa206b01dd12c'): '0.0',
  ObjectId('56e0ebba9d1fa206b01dd12d'): '0.0',
  ObjectId('56e0ebba9d1fa206b01dd12e'): '0.0',
  ObjectId('56e0ebba9d1fa206b01dd12f'): '0.0',
  ObjectId('56e0ebba9d1fa206b01dd130'): '0.0',
  ObjectId('56e0ebba9d1fa206b01dd131'): '0.0',
  ObjectId('56e0ebba9d1fa206b01dd132'): '0.190837',
  ObjectId('56e0ebba9d1fa206b01dd133'): '0.0',
  ObjectId('56e0ebba9d1fa206b01dd134'): '0.0',
  ObjectId('56e0ebba9d1fa206b01dd135'): '0.0',
  ObjectId('56e0ebba9d1fa206b01dd136'): '0.0',
  ObjectId('56e0ebba9d1fa206b01dd137'): '0.748425',
  ObjectId('56e0ebba9d1fa206b01dd138'): '0.0',
  ObjectId('56e0ebba9d1fa206b01dd139'): '0.0',
  ObjectId('56e0ebba9d1fa206b01dd13a'): '0.146628',
  ObjectId('56e0ebba9d1fa206b01dd13b'): '0.0',
  ObjectId('56e0ebba9d1fa206b01dd13c'): '1.0',
  Objec

In [36]:
db.articles_info.find_one({'id':database_articles[125]['id']})['summary']

'Mr. Adelson has been involved in some fairly complex court proceedings, which revolve around claims of misconduct in his operations in Macau, including links to organized crime and prostitution. I find myself thinking, for example, of the hedge-fund billionaire Paul Singer, another big power in the G.O.P., who published an investor’s letter declaring that inflation was running rampant — he could tell from the prices of Hamptons real estate and high-end art. Or think of the various billionaires who, a few years ago, were declaring with straight faces, and no sign of self-awareness, that President Obama was holding back the economy by suggesting that some businesspeople had misbehaved. '

In [37]:
cursor = collection.find()

In [38]:
new_database_articles = []
for article in cursor:
    new_database_articles.append(article)

In [39]:
new_database_articles[0]

{'_id': ObjectId('56e0ebba9d1fa206b01dd128'),
 'date': '2016-01-01',
 'doc_type': 'article',
 'headline': '50 Comics Walk Into a Club. No, You Haven’t Heard This One.',
 'id': '56842cab79881066fdb9523e',
 'keywords': ['Comedy and Humor', 'New Year'],
 'lead_paragraph': '“50 First Jokes NYC” celebrates its 10th anniversary with a show at the Bell House in Brooklyn on Saturday, with 50 comedians trying for new laughs in the new year.',
 'section': 'Arts',
 'similar_docs': {'55f1d5b379881015b22f3201': '0.0',
  '567c27b479881009570cd242': '0.0',
  '568163cd79881067a393555b': '0.943247',
  '5681677e79881067a3935561': '0.0',
  '568180d97988100cd95e574e': '0.0',
  '5681b0627988100cd95e57a9': '0.0148684',
  '568259a97988100cd95e58ae': '0.0',
  '568275c87988100cd95e58f4': '0.0',
  '568275c97988100cd95e58f5': '0.0',
  '568275ce7988100cd95e58f6': '0.0',
  '5682928c7988100cd95e5934': '0.0',
  '568296287988100cd95e593b': '0.0',
  '568297a07988100cd95e593f': '0.0',
  '5682ac7f7988100cd95e5970': '0.0

In [46]:
similar_articles = find_most_similar_articles(new_database_articles[0],new_database_articles)

In [48]:
for i in range(0,len(similar_articles)):
    print(similar_articles[i]['url'])

http://www.nytimes.com/2016/01/01/opinion/a-better-standard-for-the-use-of-deadly-force.html
http://www.nytimes.com/2016/01/03/magazine/letter-of-recommendation-terro-liquid-ant-bait.html
http://www.nytimes.com/2016/01/03/opinion/sunday/the-strip-brian-mcfadden-comics.html
http://www.nytimes.com/2016/01/03/nyregion/a-different-note-on-race-at-yale.html
http://www.nytimes.com/2016/01/01/world/europe/bataclan-hostage-paris-terror-attack.html
http://www.nytimes.com/2016/01/03/fashion/weddings/fabio-monteiro-mitchell-travers.html
http://www.nytimes.com/2016/01/01/world/europe/monitoring-of-terrorism-threats-has-risen-official-says.html
http://www.nytimes.com/2016/01/03/fashion/so-you-think-your-sisters-boyfriend-is-gay.html
http://www.nytimes.com/2016/01/01/arts/new-years-day-offers-plenty-of-free-chances-at-a-clean-slate.html


_______

###Scrape

In [42]:
print(result[6]['keywords'])
print(result[6]['url'])

['Shopping and Retail', 'Tax Credits, Deductions and Exemptions', 'Airports', 'Value-Added Tax']
http://www.nytimes.com/2016/01/01/business/international/britain-to-review-how-airport-shops-collect-sales-tax.html


In [43]:
soup = get_url_soup_content(str(result[6]['url']))

In [45]:
#Paragraphs
paragraphs = soup.body.find_all('p',{'itemprop':'articleBody'})
new_paragraphs = ''
for i in range(len(paragraphs)):
    new_paragraphs += ("<p>" + paragraphs[i].text + "</p>")
print(new_paragraphs)

<p>George Osborne, the British chancellor of the Exchequer, announced a review on Thursday of how retailers at major airports in Britain collect sales tax from their customers.</p><p>The investigation, which is expected to be completed by early 2016, comes after several retailers were accused of not passing along savings to consumers who bought items at major hubs including Heathrow Airport near London.</p><p>Regardless of their nationalities, passengers at British airports who are flying to destinations outside the European Union are exempt from paying the valued-added tax, which can reach 20 percent. The measure applies not only to duty-free stores but also to retailers as varied as Boots, WHSmith or Dixons.</p><p>But some retailers at the country’s largest airports were found by the government to have been charging the additional tax in cases they were not allowed to, and keeping it rather than passing it on to the tax authorities.</p><p>As a result, some travelers have refused to p

In [46]:
paragraphs_string = ''
for i in range(len(paragraphs)):
    paragraphs_string += paragraphs[i].text + ' '
paragraphs_string

'George Osborne, the British chancellor of the Exchequer, announced a review on Thursday of how retailers at major airports in Britain collect sales tax from their customers. The investigation, which is expected to be completed by early 2016, comes after several retailers were accused of not passing along savings to consumers who bought items at major hubs including Heathrow Airport near London. Regardless of their nationalities, passengers at British airports who are flying to destinations outside the European Union are exempt from paying the valued-added tax, which can reach 20 percent. The measure applies not only to duty-free stores but also to retailers as varied as Boots, WHSmith or Dixons. But some retailers at the country’s largest airports were found by the government to have been charging the additional tax in cases they were not allowed to, and keeping it rather than passing it on to the tax authorities. As a result, some travelers have refused to provide their boarding pass

_______

##Summerization

In [74]:
LANGUAGE = "english"
SENTENCES_COUNT = 3

In [69]:
parser = PlaintextParser.from_string(paragraphs_string, Tokenizer(LANGUAGE))

In [75]:
stemmer = Stemmer(LANGUAGE)
summarizer = Summarizer(stemmer)
summarizer.stop_words = get_stop_words(LANGUAGE)
summary = summarizer(parser.document, SENTENCES_COUNT)

In [76]:
for sentences in summary:
    print(sentences)

George Osborne, the British chancellor of the Exchequer, announced a review on Thursday of how retailers at major airports in Britain collect sales tax from their customers.
The investigation, which is expected to be completed by early 2016, comes after several retailers were accused of not passing along savings to consumers who bought items at major hubs including Heathrow Airport near London.
relief at airports is intended to cut prices for those travelers, not be a windfall gain for shops,” Mr. Osborne said in a statement on Thursday, adding that some retailers had been keeping half of every pound, or $1.48, of potential tax savings owed to individuals.


_______

###Scrape and Summerize Article

In [3]:
import requests
from bs4 import BeautifulSoup
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals
from sumy.parsers.html import HtmlParser
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer as Summarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words