In [198]:
import feedparser as fp
import json
import newspaper
from newspaper import Article
from time import mktime
from datetime import datetime
import pandas as pd
import nltk as nlp
from nltk.tag.stanford import StanfordNERTagger as nert

In [137]:
# Set the limit for number of articles to download
LIMIT = 100

In [139]:
data = {}
data['newspapers'] = {}
news = pd.DataFrame([],columns=['Title','Text','Link','Published','Type'])


In [140]:
# Loads the JSON files with news sites
with open('NewsPapers.json') as data_file:
    companies = json.load(data_file)

In [141]:
companies

{'bbc': {'link': 'http://www.bbc.com/',
  'rss': 'http://feeds.bbci.co.uk/news/rss.xml'},
 'cnn': {'link': 'http://edition.cnn.com/'},
 'washingtonpost': {'link': 'https://www.washingtonpost.com/',
  'rss': 'http://feeds.washingtonpost.com/rss/world'}}

In [142]:
count = 1
lst = []
# Iterate through each news company
for company, value in companies.items():
    # If a RSS link is provided in the JSON file, this will be the first choice.
    # Reason for this is that, RSS feeds often give more consistent and correct data.
    # If you do not want to scrape from the RSS-feed, just leave the RSS attr empty in the JSON file.
    if 'rss' in value:
        d = fp.parse(value['rss'])
        print("Downloading articles from ", company)
        newsPaper = {
            "rss": value['rss'],
            "link": value['link'],
            "articles": []
        }
        for entry in d.entries:
            # Check if publish date is provided, if no the article is skipped.
            # This is done to keep consistency in the data and to keep the script from crashing.
            if hasattr(entry, 'published'):
                if count > LIMIT:
                    break
                article = {}
                article['link'] = entry.link
                date = entry.published_parsed
                article['published'] = datetime.fromtimestamp(mktime(date)).isoformat()
                try:
                    content = Article(entry.link)
                    content.download()
                    content.parse()
                except Exception as e:
                    # If the download for some reason fails (ex. 404) the script will continue downloading
                    # the next article.
                    print(e)
                    print("continuing...")
                    continue
                article['title'] = content.title
                article['text'] = content.text
                newsPaper['articles'].append(article)
                
                lst.append([content.title,content.text,content.url,datetime.fromtimestamp(mktime(date)).isoformat(),'RSS'])
                
                print(count, "articles downloaded from", company, ", url: ", entry.link)
                count = count + 1
                news

    else:
        # This is the fallback method if a RSS-feed link is not provided.
        # It uses the python newspaper library to extract articles
        print("Building site for ", company)
        paper = newspaper.build(value['link'], memoize_articles=False)
        newsPaper = {
            "link": value['link'],
            "articles": []
        }
        noneTypeCount = 0
        for content in paper.articles:
            if count > LIMIT:
                break
            try:
                content.download()
                content.parse()
            except Exception as e:
                print(e)
                print("continuing...")
                continue
            # Again, for consistency, if there is no found publish date the article will be skipped.
            # After 10 downloaded articles from the same newspaper without publish date, the company will be skipped.
            if content.publish_date is None:
                print(count, " Article has date of type None...")
                noneTypeCount = noneTypeCount + 1
                if noneTypeCount > 10:
                    print("Too many noneType dates, aborting...")
                    noneTypeCount = 0
                    break
                count = count + 1
                continue
            article = {}
            article['title'] = content.title
            article['text'] = content.text
            article['link'] = content.url
            article['published'] = content.publish_date.isoformat()
            
            lst.append([content.title,content.text,content.url,content.publish_date.isoformat(),'News'])
            
            newsPaper['articles'].append(article)
            print(count, "articles downloaded from", company, " using newspaper, url: ", content.url)
            count = count + 1
            noneTypeCount = 0
            news
            

Downloading articles from  bbc
1 articles downloaded from bbc , url:  http://www.bbc.co.uk/news/world-europe-44024184
2 articles downloaded from bbc , url:  http://www.bbc.co.uk/news/uk-44026796
3 articles downloaded from bbc , url:  http://www.bbc.co.uk/news/uk-politics-44026548
4 articles downloaded from bbc , url:  http://www.bbc.co.uk/news/uk-44023411
5 articles downloaded from bbc , url:  http://www.bbc.co.uk/news/world-middle-east-44026087
6 articles downloaded from bbc , url:  http://www.bbc.co.uk/news/uk-44026544
7 articles downloaded from bbc , url:  http://www.bbc.co.uk/news/uk-england-leeds-44025685
8 articles downloaded from bbc , url:  http://www.bbc.co.uk/sport/football/44025339
9 articles downloaded from bbc , url:  http://www.bbc.co.uk/news/uk-politics-44025187
10 articles downloaded from bbc , url:  http://www.bbc.co.uk/news/uk-wales-44024978
11 articles downloaded from bbc , url:  http://www.bbc.co.uk/news/entertainment-arts-44024889
12 articles downloaded from bbc , 

84 articles downloaded from cnn  using newspaper, url:  http://edition.cnn.com/style/article/meghan-markle-wedding-dress-designer/index.html
85 articles downloaded from cnn  using newspaper, url:  http://edition.cnn.com/style/article/carsten-holler-florence-experiment/index.html
86 articles downloaded from cnn  using newspaper, url:  http://edition.cnn.com/style/article/azzedine-alaia-the-couturier-design-museum-london/index.html
87 articles downloaded from cnn  using newspaper, url:  http://edition.cnn.com/style/article/xavier-cha-ruthless-logic/index.html
88 articles downloaded from cnn  using newspaper, url:  http://edition.cnn.com/style/article/facial-tattoos-yumna-al-arashi/index.html
89 articles downloaded from cnn  using newspaper, url:  http://edition.cnn.com/style/article/tom-hegen-aerial-photography/index.html
90 articles downloaded from cnn  using newspaper, url:  http://edition.cnn.com/style/article/transcendents-southeast-asia-spirit-mediums/index.html
91 articles download

In [59]:
paper = newspaper.build(value['link'], memoize_articles=False)

In [60]:
data['newspapers'][company] = newsPaper
try:
    with open('scraped_articles.json', 'w') as outfile:
        json.dump(data, outfile)
except Exception as e: print(e)

In [165]:
news = pd.DataFrame(lst,columns=['Title','Text','Link','Published','Type'])

In [166]:
news.head()

Unnamed: 0,Title,Text,Link,Published,Type
0,Putin set to be inaugurated for fourth term as...,Media playback is unsupported on your device M...,http://www.bbc.co.uk/news/world-europe-44024184,2018-05-07T01:55:34,RSS
1,Gun violence on London's streets 'must stop',Image copyright Press handout Image caption Rh...,http://www.bbc.co.uk/news/uk-44026796,2018-05-07T04:38:25,RSS
2,Iran nuclear deal: Johnson tells Trump 'don't ...,Image copyright EPA/AFP\n\nBoris Johnson has u...,http://www.bbc.co.uk/news/uk-politics-44026548,2018-05-07T03:40:01,RSS
3,Britain's 'best and worst' railway stations named,Image copyright Getty Images\n\nGlasgow Queen ...,http://www.bbc.co.uk/news/uk-44023411,2018-05-06T23:07:32,RSS
4,"Tutankhamun 'secret chamber' does not exist, r...",Image copyright Reuters Image caption A secret...,http://www.bbc.co.uk/news/world-middle-east-44...,2018-05-06T22:55:06,RSS


In [167]:
news['Text']

0     Media playback is unsupported on your device M...
1     Image copyright Press handout Image caption Rh...
2     Image copyright EPA/AFP\n\nBoris Johnson has u...
3     Image copyright Getty Images\n\nGlasgow Queen ...
4     Image copyright Reuters Image caption A secret...
5     Image copyright PA Image caption A dog walker ...
6     Video\n\nA support vehicle crashed through a t...
7     Sir Alex Ferguson presented outgoing Arsenal m...
8     Image copyright PA\n\nLabour peer Lord Adonis ...
9     Video\n\nCardiff City manager Neil Warnock's p...
10    Image copyright Getty Images Image caption Sur...
11    The price of vanilla has soared over the last ...
12    Image caption Theresa May is pressing ahead wi...
13    Image copyright Annie Segarra\n\nA US YouTuber...
14    I'm back and ready - Serena Williams\n\nAs she...
15    Video\n\nLife expectancy in Fleetwood, Lancash...
16    Could Trump's 'Space Force' become a reality?\...
17    Video\n\nA vintage light aeroplane has mad

In [177]:
def tokenize(news_text):
     return nlp.word_tokenize(news_text)

def posTag(tokens):
    return 

In [192]:
token = []
postag = []

for i in range(0,len(news)):
    token.append(tokenize(news['Text'].iloc[i]))

for i in range(0,len(token)):
    postag.append(nlp.pos_tag(token[i]))

In [193]:
postag

[[('Media', 'NNP'),
  ('playback', 'NN'),
  ('is', 'VBZ'),
  ('unsupported', 'JJ'),
  ('on', 'IN'),
  ('your', 'PRP$'),
  ('device', 'NN'),
  ('Media', 'NNP'),
  ('caption', 'NN'),
  ('Police', 'NNP'),
  ('seized', 'VBD'),
  ('opposition', 'NN'),
  ('leader', 'NN'),
  ('Alexei', 'NNP'),
  ('Navalny', 'NNP'),
  ('at', 'IN'),
  ('a', 'DT'),
  ('rally', 'NN'),
  ('in', 'IN'),
  ('Moscow', 'NNP'),
  ('Vladimir', 'NNP'),
  ('Putin', 'NNP'),
  ('is', 'VBZ'),
  ('due', 'JJ'),
  ('to', 'TO'),
  ('be', 'VB'),
  ('sworn', 'VBN'),
  ('in', 'IN'),
  ('for', 'IN'),
  ('a', 'DT'),
  ('fourth', 'JJ'),
  ('term', 'NN'),
  ('as', 'IN'),
  ('president', 'NN'),
  ('of', 'IN'),
  ('Russia', 'NNP'),
  ('on', 'IN'),
  ('Monday', 'NNP'),
  ('after', 'IN'),
  ('winning', 'VBG'),
  ('the', 'DT'),
  ('election', 'NN'),
  ('in', 'IN'),
  ('March', 'NNP'),
  ('.', '.'),
  ('He', 'PRP'),
  ('has', 'VBZ'),
  ('been', 'VBN'),
  ('in', 'IN'),
  ('power', 'NN'),
  ('for', 'IN'),
  ('18', 'CD'),
  ('years', 'NNS'),
  (

In [208]:
nert.tag(tokens= token[1],self=)

TypeError: tag() missing 1 required positional argument: 'self'