<a href="https://colab.research.google.com/github/iyoo2018/findatalake/blob/master/scrapeRSS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Code Exclusive to Colab

In [None]:
import os

In [None]:
# Colab-only setup: mount Google Drive and make the notebook folder importable.
# The COLAB_GPU environment variable is used as the "running on Colab" marker.
if os.getenv('COLAB_GPU') is not None:
  import sys
  from google.colab import drive

  # Drive contents become available under /content/gdrive
  drive.mount('/content/gdrive')
  # let modules stored alongside the notebooks on Drive be imported
  sys.path.append('/content/gdrive/My Drive/Colab Notebooks')

Mounted at /content/gdrive


In [None]:
import boto3

In [None]:
# Colab-only AWS setup: point boto3 at the credentials file kept on Google Drive,
# create the S3 client used by the next cell, and export the bucket/key that
# main() reads from the environment.
if 'COLAB_GPU' in os.environ:
  # set before the session is created so boto3 can pick up the config file
  os.environ['AWS_CONFIG_FILE']="/content/gdrive/My Drive/cred-stockdata.txt"

  s = boto3.Session()
  c = s.client("s3")

  import json
  # S3 location of the JSON list of RSS feeds; main() looks these up via os.environ
  os.environ["bucket"] = "026090555438-stockdata"
  os.environ["key"] = "rssList.json"

In [None]:
# Colab-only: (re)publish the list of CNBC RSS feeds to S3 as rssList.json.
# Each feed is described by its network, feed name and RSS url.
if 'COLAB_GPU' in os.environ:
  # CNBC Top News
  topNews = {
    "network": "CNBC",
    "feed": "Top News",
    "url": "https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=100003114",
  }

  # Earnings
  earning = {
    "network": "CNBC",
    "feed": "Earnings",
    "url": "https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=15839135",
  }

  # Economy
  economy = {
    "network": "CNBC",
    "feed": "Economy",
    "url": "https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=20910258",
  }

  # Finance
  finance = {
    "network": "CNBC",
    "feed": "Finance",
    "url": "https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=10000664",
  }

  # Tech
  tech = {
    "network": "CNBC",
    "feed": "Tech",
    "url": "https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=19854910",
  }

  # Investing
  invest = {
    "network": "CNBC",
    "feed": "Investing",
    "url": "https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=15839069",
  }

  # main() walks the feeds in the order they are listed here
  rssList = json.dumps([topNews, earning, economy, finance, tech, invest])

  # `c` is the S3 client created in the AWS setup cell
  c.put_object(
    Bucket='026090555438-stockdata',
    Key="rssList.json",
    Body=rssList,
  )

# fetchRSS

In [None]:
import feedparser

def fetchRSS(source):
    """Fetch the articles from an RSS feed.

    Arg: source [string] *contains RSS feed link*
    Returns: entries [list of dictionaries] *contains articles in RSS feed*
    """
    # parse the feed and hand back only its entries
    return feedparser.parse(source).entries

# checkID

In [None]:
def fileExists(bucket, path, s3):
    """Check if a file already exists in S3.

    Arg: bucket [string] **S3 bucket**
         path [string] **path within the bucket**
         s3 [boto3 s3 client object]
    Returns: boolean **indicator as to whether or not the file exists**
    Raises: the client's ClientError for any S3 error other than "not found";
            non-S3 failures (network, credentials, ...) propagate unchanged
    """
    # check if the file exists by requesting its metadata
    try:
        s3.head_object(Bucket=bucket, Key=path)
    except s3.exceptions.ClientError as err:
        # Only a genuine "not found" means the file is missing. Anything else
        # (throttling, 5xx, access denied, ...) must not be mistaken for
        # "new article", otherwise stored articles get re-scraped and overwritten.
        # NOTE(review): per AWS docs a HEAD on a missing key can come back as 403
        # when the caller lacks s3:ListBucket; that case now raises - confirm the
        # deployed role has ListBucket.
        code = str(err.response.get("Error", {}).get("Code", ""))
        if code in ("404", "NoSuchKey", "NotFound"):
            return False
        raise
    # head_object succeeded, so the file exists
    return True

def checkID(entries, s3, network):
    """Check which entries are new.

    An entry counts as new when no metadata file for it exists yet in S3.

    Arg: entries [list of dictionaries] **current entries published to an RSS feed**
         s3 [boto3 s3 client object]
         network [string] **network that published RSS feed**
    Returns: newEntries [list of dictionaries] **new entries that haven't been seen**
    """
    bucket = "026090555438-stockdata"

    def metadataPath(entry):
        # article id, then the (year, month, day) the article was published
        articleId = entry["id"]
        published = entry["published_parsed"]
        return "metadata/{}/{}/{}/{}/{}.json".format(
            network, published[0], published[1], published[2], articleId
        )

    # keep only the entries whose metadata file is not in the bucket yet
    return [
        entry
        for entry in entries
        if not fileExists(bucket, metadataPath(entry), s3)
    ]

# parseHTML

In [None]:
import requests
from bs4 import BeautifulSoup

def scraper(url, timeout=30):
  """Scrape an article using the url.

  Arg: url [string] **contains url for the article**
       timeout [number or (connect, read) tuple] **seconds to wait for the
               server; passed straight to requests.get**
  Returns: soup [BeautifulSoup object]
  """
  # requests waits indefinitely when no timeout is given, so a single
  # unresponsive article page would stall the whole scraping run
  page = requests.get(url, timeout=timeout)
  # parse the raw response body with the built-in html parser
  soup = BeautifulSoup(page.content, "html.parser")
  return soup

def textExtract(soup):
  """Extract text from the scraped/parsed article.

  Arg: soup [BeautifulSoup object]
  Returns: result [string] **contains the text of the article**
  """
  # the article body is taken from the <p> tags found inside each
  # <div class="group">, in document order
  paragraphs = [
    para.get_text()
    for group in soup.find_all("div", {"class": "group"})
    for para in group.find_all("p")
  ]
  # return the article joined as a single string, one paragraph per line
  return "\n".join(paragraphs)

def getAuthor(soup):
  """Obtain author of the article.

  Arg: soup [BeautifulSoup object]
  Returns: name [string] **author's name** (None when the page has no author link)
           profile [string] **url to author's profile** (None when the page
                   has no author link or the link has no href)
  """
  # the first <a class="Author-authorName"> on the page carries the author
  author = soup.find("a", {"class": "Author-authorName"})
  if author is None:
    # no author link on the page: report "unknown" instead of raising
    return None, None
  return author.get_text(), author.get("href")

# saveS3

In [None]:
def saves3(newEntries, s3, network, feed):
    """Save new files.

    For every entry two JSON objects are written to the bucket:
    textdata/<network>/<year>/<month>/<day>/<id>.json (the article text) and
    metadata/<network>/<year>/<month>/<day>/<id>.json (title, link, date, ...).

    Arg: newEntries [list of dictionaries] **contains new articles to be saved**
         s3 [boto3 s3 client object]
         network [string] **name of network that published RSS feed**
         feed [string] **name of RSS feed**
    Returns: 0
    """
    bucket = '026090555438-stockdata'
    for newEntry in newEntries:
        # get info for file save path
        articleId = newEntry["id"]
        published = newEntry["published_parsed"]
        suffix = "{}/{}/{}/{}/{}.json".format(
            network, published[0], published[1], published[2], articleId
        )
        metapath = "metadata/" + suffix
        textpath = "textdata/" + suffix

        # build both payloads before uploading anything, so an entry with a
        # missing field fails before a partial write
        # (author name/profile are not stored; author scraping is disabled in main)
        metadata = json.dumps({
            "title": newEntry["title"],
            "link": newEntry["link"],
            "date": newEntry["published"],
            "feed": feed,
            # these two fields are not guaranteed on every entry;
            # they are stored as null when absent
            "sponsor": newEntry.get("metadata_sponsored"),
            "type": newEntry.get("metadata_type"),
        })
        textdata = json.dumps(newEntry["text"])

        # Upload the text first and the metadata last: checkID treats the
        # metadata object as the "already saved" marker, so it must only appear
        # once the text is safely stored. Otherwise a failed text upload would
        # leave the article permanently marked as seen with no text.
        s3.put_object(Body=textdata, Bucket=bucket, Key=textpath)
        s3.put_object(Body=metadata, Bucket=bucket, Key=metapath)
    return 0

# main

In [None]:
import json
import boto3

def main(event, context):
    """Scrape every RSS feed listed in S3 and store the new articles.

    The feed list is a JSON array read from the S3 object named by the
    "bucket" and "key" environment variables; each item provides "network",
    "feed" and "url". For each feed the entries are fetched, filtered down to
    the ones not yet saved, scraped for their text, and written to S3.

    Arg: event, context **not read by this function**
         (NOTE(review): the signature looks like an AWS Lambda handler - confirm)
    Returns: dictionary {'statusCode': 200}
    """
    s3 = boto3.Session().client('s3')
    feedListObject = s3.get_object(Bucket=os.environ["bucket"], Key=os.environ["key"])
    rssList = json.loads(feedListObject["Body"].read().decode())

    totalSaved = 0
    for rss in rssList:
        network, feedName, feedURL = rss["network"], rss["feed"], rss["url"]

        print("Currently scraping from {}".format(feedName))
        print("Step 1: fetch entries from RSS feed")
        entries = fetchRSS(feedURL)

        print("Step 2: check for new entries")
        freshEntries = checkID(entries, s3, network)
        if not freshEntries:
            # nothing new on this feed, move on to the next one
            print('No new entries for {}'.format(feedName))
            continue
        print("{} new articles for {}".format(len(freshEntries), feedName))

        print("Step 3: scrape and parse text for new entries")
        for scraped, article in enumerate(freshEntries, start=1):
            # attach the article text to the entry so saves3 can upload it
            # (author extraction via getAuthor is currently disabled)
            article["text"] = textExtract(scraper(article["link"]))
            # progress report every fifth article
            if scraped % 5 == 0:
                print("Scraped {} articles".format(scraped))

        print("Step 4: save to S3")
        saves3(freshEntries, s3, network, feedName)
        totalSaved += len(freshEntries)

    print('{} new entries have been saved'.format(totalSaved))
    return {'statusCode': 200}

# Test to see that it works

In [None]:
# Colab-only smoke test: run main once with empty event/context
# and show the status dictionary it returns.
if os.getenv('COLAB_GPU') is not None:
  result = main(None, None)
  print(result)

Currently scraping from Top News
Step 1: fetch entries from RSS feed
Step 2: check for new entries
No new entries for Top News
Currently scraping from Earnings
Step 1: fetch entries from RSS feed
Step 2: check for new entries
No new entries for Earnings
Currently scraping from Economy
Step 1: fetch entries from RSS feed
Step 2: check for new entries
No new entries for Economy
Currently scraping from Finance
Step 1: fetch entries from RSS feed
Step 2: check for new entries
No new entries for Finance
Currently scraping from Tech
Step 1: fetch entries from RSS feed
Step 2: check for new entries
No new entries for Tech
Currently scraping from Investing
Step 1: fetch entries from RSS feed
Step 2: check for new entries
No new entries for Investing
0 new entries have been saved
{'statusCode': 200}
