In [18]:
import json
import pprint
import time
from datetime import date
from datetime import datetime
from time import mktime

import dateutil
import dateutil.parser
import feedparser as fp
import newspaper
import pandas as pd
from newspaper import Article

#### 1 Website data ####

## 1A ##  From JSON file - for final version

#with open('NewsPapers.json') as data_file: #Loads the JSON files with news URLs
#    companies = json.load(data_file)

## 1B ## From variable - this is for testing, makes it way faster
website = {
    "cnn": {
        "link": "http://edition.cnn.com/",
        "rss": "http://rss.cnn.com/rss/cnn_topstories.rss",
    },
    "cnbc": {
        "link": "https://www.cnbc.com/",
        "rss": "https://www.cnbc.com/id/10000664/device/rss/rss.html",
    },
}


#### 2 Today's date - for filtering the articles ####
# ISO formatted (YYYY-MM-DD) string, the same shape as the published_date
# strings built while scraping, so the two can be compared directly.
today = date.today().isoformat()
print("Today's date:", today)


#### 3 Scraping the news articles ####

# Parallel per-article columns; index i in every list describes the same article.
article_list = []
date_list = []
time_list = []
title_list = []

for site in website.values():
    if 'rss' not in site:
        continue  # only sources that declare an RSS feed are scraped

    feed = fp.parse(site['rss'])

    for entry in feed.entries:
        if not hasattr(entry, 'published'):
            continue  # entries without a publication timestamp are skipped

        # Parse the feed's free-form timestamp. The result is deliberately NOT
        # named `date`: that would shadow the imported `datetime.date` class
        # and break any later `date.today()` call.
        published_at = dateutil.parser.parse(entry.published)
        # NOTE(review): the strings below are rendered in whatever timezone the
        # feed timestamp carries, while `today` comes from the local clock --
        # confirm the two agree around midnight.

        # Download the article page to extract its title.
        # Downloading all articles takes approx. 3 min.
        content = Article(entry.link)
        try:
            content.download()
            content.parse()
        except Exception as e:
            # Best effort: report the failure and move on to the next article.
            # The title recorded below is then whatever the Article object
            # holds at that point (presumably an empty string -- verify).
            print(e)
            print("continuing...")

        # Append to all four lists together so they can never get out of step.
        article_list.append(entry.link)
        date_list.append(published_at.strftime("%Y-%m-%d"))
        time_list.append(published_at.strftime("%H:%M:%S %Z"))  # hour:minute:second + timezone name
        title_list.append(content.title)

# One DataFrame built in a single step; column order matches the original
# link -> published_date -> published_time -> title layout.
news_df = pd.DataFrame({
    'link': article_list,
    'published_date': date_list,
    'published_time': time_list,
    'title': title_list,
})

#after approx. 3 min, the pandas DataFrame should be created with link, published_date, published_time and title



Today's date: 2019-10-23


In [20]:
# Keep only the rows whose published_date string equals today's date string.
is_today = news_df["published_date"] == today
news_df_daily = news_df.loc[is_today]

news_df_daily.head()

Unnamed: 0,link,published_date,published_time,title
0,http://rss.cnn.com/~r/rss/cnn_topstories/~3/md...,2019-10-23,18:09:23 UTC,Lawmaker stormed into hearing room. See what h...
1,http://rss.cnn.com/~r/rss/cnn_topstories/~3/8C...,2019-10-23,19:12:14 UTC,Trump couldn't be charged for a Fifth Avenue s...
2,http://rss.cnn.com/~r/rss/cnn_topstories/~3/Cb...,2019-10-23,16:32:50 UTC,Trump photos posted on private Instagram raise...
3,http://rss.cnn.com/~r/rss/cnn_topstories/~3/lo...,2019-10-23,18:52:17 UTC,Only 7(!) Republican senators are ruling out r...
4,http://rss.cnn.com/~r/rss/cnn_topstories/~3/YK...,2019-10-23,19:03:53 UTC,In photos: The Trump impeachment inquiry
