<a href="https://colab.research.google.com/github/iyoo2018/findatalake/blob/master/scrapeRSS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Code Exclusive to Colab

In [1]:
import os
# Run the Drive setup only when the COLAB_GPU environment variable is present
# (presumably set only on Colab runtimes -- TODO confirm), so this cell is a no-op elsewhere.
if 'COLAB_GPU' in os.environ:
  from google.colab import drive
  # Mount Google Drive under /content/gdrive; the cells below read files from it
  drive.mount('/content/gdrive')
  import sys
  # Add the Drive "Colab Notebooks" folder to the module search path
  sys.path.append('/content/gdrive/My Drive/Colab Notebooks')

Mounted at /content/gdrive


In [2]:
# Colab-only configuration. Outside Colab these variables must already be set,
# because main() reads "bucket" and "rssKey" straight from os.environ.
if 'COLAB_GPU' in os.environ:
  # Point the AWS SDK at a config file stored on the mounted Drive
  os.environ['AWS_CONFIG_FILE']="/content/gdrive/My Drive/cred-stockdata.txt"
  # S3 bucket that holds the RSS list and all saved article objects
  os.environ["bucket"] = "026090555438-stockdata"
  # Key of the JSON object listing the RSS feeds to scrape
  os.environ["rssKey"] = "rssList.json"

In [3]:
# Reference only: this bare string literal is never assigned or parsed, so none of the
# names inside it are defined. Each sample uses the same "network" / "feed" / "url" keys
# that main() reads from every item of the RSS list (presumably mirroring the contents of
# the rssList.json object -- TODO confirm).
"""
# CNBC Top News
topNews = {"network":"CNBC", "feed":"Top News", "url":"https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=100003114"}
# Earnings
earning = {"network":"CNBC", "feed":"Earnings", "url":"https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=15839135"}
# Economy
economy = {"network":"CNBC", "feed":"Economy", "url":"https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=20910258"}
# Finance
finance = {"network":"CNBC", "feed":"Finance", "url":"https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=10000664"}
# Tech
tech = {"network":"CNBC", "feed":"Tech", "url":"https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=19854910"}
# Investing
invest = {"network":"CNBC", "feed":"Investing", "url":"https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=15839069"}
"""

'\n# CNBC Top News\ntopNews = {"network":"CNBC", "feed":"Top News", "url":"https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=100003114"}\n# Earnings\nearning = {"network":"CNBC", "feed":"Earnings", "url":"https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=15839135"}\n# Economy\neconomy = {"network":"CNBC", "feed":"Economy", "url":"https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=20910258"}\n# Finance\nfinance = {"network":"CNBC", "feed":"Finance", "url":"https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=10000664"}\n# Tech\ntech = {"network":"CNBC", "feed":"Tech", "url":"https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=19854910"}\n# Investing\ninvest = {"network":"CNBC", "feed":"Investing", "url":"https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=15839069"}\n'

# Import Packages

In [4]:
import feedparser
import requests
from bs4 import BeautifulSoup
import json
import re
import boto3
import datetime

# Scrape RSS Feeds

In [5]:
def fetchRSS(source):
  """Download and parse an RSS feed and hand back its articles.

  Args:
    source [string]: link to the RSS feed.

  Returns:
    entries [list of dictionaries]: the `entries` of the parsed feed, one per article.
  """
  return feedparser.parse(source).entries

# Parse HTML

In [6]:
def scraper(url, timeout=30):
  """Download an article page and parse its HTML.

  Args:
    url [string]: URL of the article.
    timeout [number or (connect, read) tuple]: seconds to wait on the server before
      giving up; passed unchanged to requests.get. Defaults to 30.

  Returns:
    soup [BeautifulSoup object]: the page content parsed with "html.parser".

  Raises:
    requests.exceptions.Timeout: if the server does not answer within `timeout`.
  """
  # requests waits forever unless a timeout is given, so a single stalled
  # server would otherwise hang the whole scraping run
  page = requests.get(url, timeout=timeout)
  return BeautifulSoup(page.content, "html.parser")

def textExtract(soup):
  """Pull the article text out of a parsed page.

  Args:
    soup [BeautifulSoup object]: the parsed article page.

  Returns:
    result [string]: the text of every <p> found inside a <div class="group">,
      in document order, joined with newlines ("" when nothing matches).
  """
  # the article body lives in the "group" divs; each contributes its paragraphs
  articleGroups = soup.find_all("div", {"class": "group"})
  paragraphs = [
    para.get_text()
    for group in articleGroups
    for para in group.find_all("p")
  ]
  return "\n".join(paragraphs)

# Convert publication dates

In [14]:
def convertDates(unstrDates):
  """Parse publication date strings into comparable datetime objects.

  Args:
    unstrDates [list of str]: unstructured dates laid out as
      "%a, %d %b %Y %H:%M:%S %Z", e.g. "Thu, 12 Mar 2020 15:30:00 GMT".

  Returns:
    [list of datetime.datetime]: the parsed dates, in the same order as the input.

  Raises:
    ValueError: if a string does not match the expected layout.
  """
  dateFormat = "%a, %d %b %Y %H:%M:%S %Z"
  return [datetime.datetime.strptime(rawDate, dateFormat) for rawDate in unstrDates]

# AccessS3 Class

In [8]:
class AccessS3:
  """Small wrapper around a boto3 S3 client for the object operations used in this notebook."""

  def __init__(self):
    # A default boto3 session: no credentials or region are passed explicitly here
    session = boto3.Session()
    self.s3 = session.client('s3')
    # Listing goes through a paginator so scanFolder can walk every result page
    self.paginator = self.s3.get_paginator('list_objects_v2')

  def getObj(self, bucket, key):
    """Get an object.

    Args:
      bucket [str]: bucket name.
      key [str]: object key.

    Returns:
      The get_object response; callers read the payload from its 'Body' entry.
    """
    return self.s3.get_object(Bucket=bucket, Key=key)

  def deleteObj(self, bucket, key):
    """Delete an object and log the deleted key.

    Args:
      bucket [str]: bucket name.
      key [str]: object key.

    Returns:
      0, always.
    """
    self.s3.delete_object(Bucket=bucket, Key=key)
    print("Deleted object at {}".format(key))
    return 0

  def saveObj(self, data, bucket, key):
    """Save an object and log the saved key.

    Args:
      data: payload to store (passed as the put_object Body).
      bucket [str]: bucket name.
      key [str]: object key.

    Returns:
      0, always.
    """
    self.s3.put_object(
      Body=data,
      Bucket=bucket,
      Key=key
    )
    print("Saved object at {}".format(key))
    return 0

  def scanFolder(self, bucket, key):
    """List the keys of the objects stored under a key prefix.

    Args:
      bucket [str]: bucket name.
      key [str]: key prefix to list.

    Returns:
      objs [list of str]: every key under the prefix that does not end in "/";
        an empty list when nothing is stored under the prefix.
    """
    objs = []
    pages = self.paginator.paginate(Bucket=bucket, Prefix=key)
    for page in pages:
      # A page carries no 'Contents' entry when nothing matches the prefix
      # (e.g. a brand-new bucket), so default to an empty listing instead of
      # raising KeyError.
      for content in page.get('Contents', []):
        # keys ending in "/" are folder placeholders, not articles
        if not content['Key'].endswith("/"):
          objs.append(content['Key'])
    return objs

  def lookupObj(self, bucket, key, query, group):
    """Search the keys under a prefix with a regular expression.

    Args:
      bucket [str]: S3 bucket to look in.
      key [str]: key prefix to look in.
      query [str]: regular expression applied to each key with re.search.
      group [int or str]: which match group to return for each matching key.

    Returns:
      keys [list of str]: the requested group of every key that matched
        (empty when none did).
    """
    keys = []
    for obj in self.scanFolder(bucket, key):
      lookup = re.search(query, obj)
      if lookup:
        keys.append(lookup.group(group))
    return keys

# SaveStockData Class

In [16]:
class SaveStockData:
  """Builds the S3 keys and JSON payloads for one article and uploads them."""

  def __init__(self):
    self.s3Helper = AccessS3()

  def _fillKey(self, template, newEntry, network):
    """Fill a key template with network, publication year/month/day and article id."""
    articleId = newEntry["id"]
    published = newEntry["published_parsed"]
    # published_parsed is indexed as (year, month, day, ...)
    return template.format(network, published[0], published[1], published[2], articleId)

  def createMetaPath(self, newEntry, network):
    """Return the metadata key: metadata/<network>/<year>/<month>/<day>/<id>.json."""
    return self._fillKey("metadata/{}/{}/{}/{}/{}.json", newEntry, network)

  def createTextPath(self, newEntry, network):
    """Return the text key: textdata/<network>/<year>/<month>/<day>/<id>.json."""
    return self._fillKey("textdata/{}/{}/{}/{}/{}.json", newEntry, network)

  def collectMetaData(self, newEntry, feed):
    """Serialize the entry's title, link, date, feed, sponsor and type as a JSON string."""
    record = {
      "title": newEntry["title"],
      "link": newEntry["link"],
      "date": newEntry["published"],
      "feed": feed,
      "sponsor": newEntry["metadata_sponsored"],
      "type": newEntry["metadata_type"],
    }
    return json.dumps(record)

  def collectTextData(self, newEntry):
    """Serialize the entry's scraped text (its "text" value) as a JSON string."""
    return json.dumps(newEntry["text"])

  def saveData(self, data, bucket, key):
    """Upload `data` to `bucket` under `key` through the S3 helper; returns 0."""
    self.s3Helper.saveObj(data, bucket, key)
    return 0

# Save New Articles to S3

In [24]:
def saveData(newEntry, network, feed, bucket):
  """Scrape one article and store its text data and metadata in S3.

  Args:
    newEntry [dict]: new article to be saved; its scraped text is added to it
      under the "text" key (the entry is modified in place).
    network [string]: network that published the RSS feed.
    feed [string]: name of the RSS feed.
    bucket [string]: S3 bucket where data is stored.

  Returns:
    0, always.
  """
  stockSaver = SaveStockData()

  # scrape the article and attach its text to the entry
  newEntry["text"] = textExtract(scraper(newEntry["link"]))

  # create object paths
  metaKey = stockSaver.createMetaPath(newEntry, network)
  textKey = stockSaver.createTextPath(newEntry, network)

  # collect metadata and text data
  metaData = stockSaver.collectMetaData(newEntry, feed)
  textData = stockSaver.collectTextData(newEntry)

  # Upload the text first and the metadata last. saveNewEntries decides whether
  # an article is already stored by looking for its metadata object, so writing
  # the metadata last means a failed text upload is retried on the next run
  # instead of leaving a "saved" article that has no text.
  stockSaver.saveData(textData, bucket, textKey)
  stockSaver.saveData(metaData, bucket, metaKey)

  return 0

In [33]:
def _deleteStoredArticle(s3Helper, bucket, key):
  """Delete the metadata object and the text object stored under the relative `key`."""
  s3Helper.deleteObj(bucket, "metadata/"+key)
  s3Helper.deleteObj(bucket, "textdata/"+key)


def saveNewEntries(entries, network, feed, bucket):
  """Save new articles to S3, replace updated ones and remove stale duplicates.

  For each entry the stored metadata keys are searched for the article id:
    * no match             -> the article is scraped and saved;
    * entry is newer       -> every stored copy is deleted and the entry is saved;
    * several stored copies -> all but the newest-dated copies are deleted.

  Args:
    entries [list of dictionaries]: entries gathered from the feed.
    network [string]: network that published the RSS feed.
    feed [string]: name of the RSS feed.
    bucket [string]: S3 bucket where data is stored.

  Returns:
    0, always.
  """
  s3Helper = AccessS3()
  count = 0
  print("This feed has {} articles".format(len(entries)))
  for entry in entries:
    # get the article id
    articleId = entry["id"]
    # Look the id up to see whether it has been saved before. The id is escaped
    # and must directly follow a "/" so that it only matches a file named exactly
    # "<id>.<ext>": without this, id "5839135" would also match
    # ".../15839135.json", and regex metacharacters in an id would be interpreted.
    metaQuery = r"^metadata/(.*/" + re.escape(articleId) + r"\.[\D]{4})$"
    # NOTE: every lookup lists the whole "metadata" prefix of the bucket
    keys = s3Helper.lookupObj(bucket, "metadata", metaQuery, 1)

    # no stored copy: save the article
    if not keys:
      print("Saving new article id {}:".format(articleId))
      saveData(entry, network, feed, bucket)
      count += 1
      continue

    # otherwise it exists: read the publication date of every stored copy
    metaDates = []
    for key in keys:
      metaData = json.loads(s3Helper.getObj(bucket, "metadata/"+key)['Body'].read().decode())
      metaDates.append(metaData["date"])
    # convert dates to comparable datetime objects
    existingDates = convertDates(metaDates)
    newestDate = max(existingDates)
    entryDate = convertDates([entry["published"]])[0]

    # if the article has been updated,
    if entryDate > newestDate:
      print("Updating article id: {}".format(articleId))
      # delete the old one(s)
      for key in keys:
        _deleteStoredArticle(s3Helper, bucket, key)
      # and save the new one
      saveData(entry, network, feed, bucket)
      count += 1

    # otherwise check for existing duplicates
    elif len(existingDates) > 1:
      print("Deleting duplicate articles id: {}".format(articleId))
      for key, existingDate in zip(keys, existingDates):
        # delete every copy older than the newest one
        if existingDate != newestDate:
          _deleteStoredArticle(s3Helper, bucket, key)

  # report the total number of articles that have been saved
  if count==0:
    print('No new entries for {}'.format(feed))
  else:
    print("{} new articles for {}".format(count, feed))
  return 0

# main

In [12]:
def main(event, context):
  """Scrape every configured RSS feed and save its new articles to S3.

  The bucket name and the key of the RSS list are read from the "bucket" and
  "rssKey" environment variables. The RSS list is a JSON array whose items
  carry "network", "feed" and "url".

  Args:
    event: unused (the signature looks like an AWS Lambda handler -- TODO confirm).
    context: unused.

  Returns:
    dict: {'statusCode': 200} once every feed has been processed.
  """
  bucket, rssKey = os.environ["bucket"], os.environ["rssKey"]

  # Retrieve the list of RSS feeds to gather articles from
  rssObject = AccessS3().getObj(bucket, rssKey)
  rssList = json.loads(rssObject["Body"].read().decode())

  # Gather articles from each feed and save the new ones to S3
  for rss in rssList:
    rssNetwork, rssFeed, rssURL = rss["network"], rss["feed"], rss["url"]
    print("Currently scraping from {}".format(rssFeed))
    saveNewEntries(fetchRSS(rssURL), rssNetwork, rssFeed, bucket)

  return {'statusCode': 200}

# Test

In [34]:
# Manual end-to-end run, executed only when the COLAB_GPU variable is present.
# Note that this scrapes the live feeds and writes to the configured S3 bucket.
if 'COLAB_GPU' in os.environ:
  # main() does not use its event/context arguments, so None is passed for both
  result = main(None, None)
  print(result)

Currently scraping from Top News
This feed has 30 articles
No new entries for Top News
Currently scraping from Earnings
This feed has 30 articles
No new entries for Earnings
Currently scraping from Economy
This feed has 30 articles
No new entries for Economy
Currently scraping from Finance
This feed has 30 articles
No new entries for Finance
Currently scraping from Tech
This feed has 30 articles
No new entries for Tech
Currently scraping from Investing
This feed has 30 articles
No new entries for Investing
{'statusCode': 200}
