<a href="https://colab.research.google.com/github/iyoo2018/findatalake/blob/master/createHTML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Code Exclusive to Colab

In [1]:
import os
# Colab-only setup: mount Google Drive and make the "Colab Notebooks"
# folder importable. Skipped when the COLAB_GPU variable is absent.
if os.environ.get('COLAB_GPU') is not None:
  import sys
  from google.colab import drive
  drive.mount('/content/gdrive')
  sys.path.append('/content/gdrive/My Drive/Colab Notebooks')

Mounted at /content/gdrive


In [13]:
# Colab-only configuration: point boto3 at the credentials file stored on
# Drive and set the bucket / key-prefix variables that main() reads.
if os.environ.get('COLAB_GPU') is not None:
  os.environ.update({
    'AWS_CONFIG_FILE': "/content/gdrive/My Drive/cred-stockdata.txt",
    'bucket': "026090555438-stockdata",
    'html': "htmldata",
    'meta': "metadata",
  })

# Import Packages

In [14]:
import boto3
import json
import re

# HTML Formatter

In [15]:
# Support class for working with HTML
# Formats individual tags as strings and accumulates them in self.text,
# which can then be printed (showHTML) or uploaded to S3 (saveHTML)
class HTMLformatter:
  def __init__(self):
    # HTML document text built up by openHTML/addHTML/closeHTML
    self.text = ""
    # S3 client used by saveHTML
    session = boto3.Session()
    self.s3 = session.client('s3')

  # Format head tag
  # Arg: title **tab name** [str], meta **metadata** [dict]
  # Returns: formatted head tag, including the closing </head> [str]
  def head(self, title, meta):
    # Local import so the class does not need an extra notebook-level import
    from html import escape
    lines = ["<head>", "<title>{}</title>".format(escape(str(title)))]
    for key, value in meta.items():
      # Attribute values are quoted and escaped so that values containing
      # spaces or quotes stay inside the attribute
      lines.append("<meta name=\"{}\" content=\"{}\">".format(
          escape(str(key), quote=True), escape(str(value), quote=True)))
    lines.append("</head>")
    return "\n".join(lines)

  # Format heading tag
  # Arg: heading **heading/section name** [str],
  #      size **heading level, used as the N in <hN>** [int or str]
  # Returns: formatted heading tag [str]
  def heading(self, heading, size):
    return "<h{}>{}</h{}>".format(size, heading, size)

  # Format a tag
  # Arg: url **link to website** [str], text **displayed text** [str]
  # Returns: formatted a tag [str]
  def a(self, url, text):
    from html import escape
    # The href is quoted and escaped; text is inserted as given
    return "<a href=\"{}\">{}</a>".format(escape(str(url), quote=True), text)

  # Format table tag
  # Arg: heading **table heading name** [str],
  #      column_names **column headings, also used as keys into each entry** [list of str],
  #      entries **objects to be put into the table, one row each** [list of dictionaries]
  # Returns: formatted table tag preceded by an <h2> heading [str]
  def table(self, heading, column_names, entries):
    table_heading = self.heading(heading, 2)
    rows = ["<table border=\"1\" cellpadding=\"10\">", "  <tr>"]
    for column_name in column_names:
      rows.append("    <th>{}</th>".format(column_name))
    rows.append("  </tr>")

    for entry in entries:
      # Every entry gets its own <tr>...</tr> pair
      rows.append("  <tr>")
      for column_name in column_names:
        # Cell values are inserted as given: they may already be HTML
        # (for example a tag produced by self.a)
        rows.append("    <td>{}</td>".format(entry[column_name]))
      rows.append("  </tr>")
    rows.append("</table>")
    return table_heading+"\n"+"\n".join(rows)

  # Format body tag
  # Arg: heading **body heading name** [str],
  #      text **paragraphs to be displayed** [list of str, or a single str]
  # Returns: <h2> heading followed by a body tag with one <p> per paragraph [str]
  def body(self, heading, text):
    # A bare string is treated as one paragraph instead of being iterated
    # character by character
    if isinstance(text, str):
      text = [text]
    body_heading = self.heading(heading, 2)
    body_text = "<body>"
    for para in text:
      body_text += "\n  <p>{}</p>".format(para)
    body_text += "\n</body>"
    return body_heading+"\n"+body_text

  # Begin the HTML file
  def openHTML(self):
    self.text += "<!DOCTYPE html>\n<html>"
    return 0

  # Add element(s) to HTML file, each on its own line
  # Arg: *elements **HTML elements produced by other functions** [str]
  def addHTML(self, *elements):
    for element in elements:
      self.text += "\n"+element
    return 0

  # End the HTML file
  def closeHTML(self):
    self.text += "\n</html>"
    return 0

  # Write the entire HTML file at once, discarding any previous content
  # Arg: *elements **HTML elements produced by other functions** [str]
  def fullWrite(self, *elements):
    self.clearHTML()
    self.openHTML()
    self.addHTML(*elements)
    self.closeHTML()
    return 0

  # Clear the HTML file
  def clearHTML(self):
    self.text = ""
    return 0

  # Display the HTML file
  def showHTML(self):
    print(self.text)
    return 0

  # Save the HTML file to S3 with content type text/html
  # Arg: bucket **bucket name** [str], path **object key to write** [str]
  def saveHTML(self, bucket, path):
    self.s3.put_object(
        Body=self.text,
        Bucket=bucket,
        Key=path,
        ContentType='text/html'
    )
    return 0

# S3 Helper

In [16]:
# Thin helper around a boto3 S3 client: get, delete, save, list and look up
# objects in a bucket
class AccessS3:
  def __init__(self):
    session = boto3.Session()
    self.s3 = session.client('s3')
    # Paginator used by scanFolder to walk listings page by page
    self.paginator = self.s3.get_paginator('list_objects_v2')

  # Get an object
  # Arg: bucket **bucket name** [str],
  #      key **object key** [str],
  # Returns: the get_object response for that key
  def getObj(self, bucket, key):
    return self.s3.get_object(Bucket=bucket, Key=key)

  # Delete an object
  # Arg: bucket **bucket name** [str],
  #      key **object key** [str]
  def deleteObj(self, bucket, key):
    self.s3.delete_object(Bucket=bucket, Key=key)
    print("Deleted object at {}".format(key))
    return 0

  # Save an object
  # Arg: data **data to be saved**
  #      bucket **bucket name** [str],
  #      key **object key** [str]
  def saveObj(self, data, bucket, key):
    self.s3.put_object(
      Body=data,
      Bucket=bucket,
      Key=key
    )
    print("Saved object at {}".format(key))
    return 0

  # Look at objects contained in a key
  # Arg: bucket **bucket name** [str],
  #      key **key prefix to list under** [str]
  # Returns: objs **keys under the prefix, excluding keys that end in "/"** [list of str]
  #          (empty list when nothing matches the prefix)
  def scanFolder(self, bucket, key):
    objs = []
    pages = self.paginator.paginate(Bucket=bucket, Prefix=key)
    for page in pages:
      # A page has no 'Contents' entry when the prefix matches no objects
      for content in page.get('Contents', []):
        if not content['Key'].endswith("/"):
          objs.append(content['Key'])
    return objs

  # Look up objects whose key matches a regular expression
  # Arg: bucket **S3 bucket to look in** [str]
  #      key **key prefix to look in** [str]
  #      query **regular expression searched for in each object key** [str]
  #      group **match group to extract from each hit** [int or str]
  # Returns: keys **the extracted group for every matching object key** [list of str]
  def lookupObj(self, bucket, key, query, group):
    keys = []
    objs = self.scanFolder(bucket, key)
    for obj in objs:
      lookup = re.search(query, obj)
      if lookup:
        keys.append(lookup.group(group))
    return keys

# StockData Object Class

In [17]:
# One stored article, addressed by its key relative to the metadata/ and
# textdata/ prefixes of a bucket
class StockData:
  # Arg: bucket **bucket name** [str],
  #      baseKey **object key without the leading metadata/ or textdata/ prefix** [str]
  def __init__(self, bucket, baseKey):
    self.baseKey = baseKey
    # id = file name up to its first "."; [-1] keeps this working when
    # baseKey has no folder part (e.g. "123.json")
    self.id = baseKey.rsplit("/", 1)[-1].split(".",1)[0]
    self.bucket = bucket

  # Load the metadata object stored at metadata/<baseKey>
  # Arg: s3Helper **object providing getObj(bucket, key)**
  # Returns: meta **the JSON-decoded object body**
  def getMeta(self, s3Helper):
    metaKey = "metadata/{}".format(self.baseKey)
    meta = json.loads(s3Helper.getObj(self.bucket, metaKey)['Body'].read().decode())
    return meta

  # Load the text object stored at textdata/<baseKey>
  # Arg: s3Helper **object providing getObj(bucket, key)**
  # Returns: text **the JSON-decoded body split on newlines** [list of str]
  # The body must decode to a JSON string, since .split is called on it
  def getText(self, s3Helper):
    textKey = "textdata/{}".format(self.baseKey)
    text = json.loads(s3Helper.getObj(self.bucket, textKey)['Body'].read().decode())
    text = text.split("\n")
    return text

# Scan for Files


In [18]:
# Select the metadata files that need an HTML page
# Arg: mode **"create" selects every metadata file; "update" selects only
#           those whose id has no existing HTML file** [str],
#      bucket **bucket name** [str],
#      metaKey **key prefix holding the metadata files** [str],
#      htmlKey **key prefix holding the generated HTML files** [str],
#      s3Helper **object providing scanFolder(bucket, key)**
# Returns: stockObjs **one StockData per selected metadata file** [list]
# Raises: ValueError if mode is neither "create" nor "update"
def checkHTML(mode, bucket, metaKey, htmlKey, s3Helper):
  if mode not in ("create", "update"):
    raise ValueError("mode must be 'create' or 'update', got {!r}".format(mode))

  # File name up to its first ".", the same rule StockData uses for its id
  def fileId(key):
    return key.rsplit("/", 1)[-1].split(".",1)[0]

  metas = s3Helper.scanFolder(bucket, metaKey)
  if mode=="update":
    htmls = s3Helper.scanFolder(bucket, htmlKey)
    # Compare ids exactly: a substring test would treat id "12" as already
    # rendered when only "123.html" exists
    existingIds = {fileId(html) for html in htmls}
    metas = [meta for meta in metas if fileId(meta) not in existingIds]

  # Drop the leading prefix segment to get the key StockData expects
  return [StockData(bucket, meta.split("/",1)[1]) for meta in metas]

# Create Multiple HTMLs

In [19]:
# Build an HTML page for every stock object selected by checkHTML
# Arg: mode, bucket, metaKey, htmlKey, s3Helper **forwarded unchanged to checkHTML**,
#      formatter **HTML formatter handed to createSingleHTML**
# Returns: number of stock objects that were processed [int]
def createAllHTML(mode, bucket, metaKey, htmlKey, s3Helper, formatter):
  pending = checkHTML(mode, bucket, metaKey, htmlKey, s3Helper)
  processed = 0
  # enumerate from 1 so the counter ends at the number of pages written
  for processed, stockObj in enumerate(pending, start=1):
    createSingleHTML(stockObj, s3Helper, formatter)
  return processed

# Create Each HTML

In [20]:
# Build the HTML page for one stock object and upload it
# Arg: stockObj **article to render; provides id, bucket, getMeta and getText**,
#      s3Helper **S3 helper handed to stockObj.getMeta / stockObj.getText**,
#      formatter **formatter used to build the tags and to save the page**
# Returns: 0
def createSingleHTML(stockObj, s3Helper, formatter):
  article_id = stockObj.id
  meta_content = stockObj.getMeta(s3Helper)

  # Head and page title
  head = formatter.head(
      f"{article_id}.html",
      {"description": f"data for article id: {article_id}"})
  heading = formatter.heading(meta_content['title'], 1)

  # Metadata table: add the article id and swap the raw 'link' value for a
  # formatted a tag stored under 'external-link'
  meta_content['id'] = article_id
  meta_content['external-link'] = formatter.a(meta_content.pop('link'), 'website')
  table = formatter.table("Metadata", [*meta_content], [meta_content])

  # Article text
  body = formatter.body("Text", stockObj.getText(s3Helper))

  # Link back to the bucket's index.html
  index_url = f"https://{stockObj.bucket}.s3.us-east-1.amazonaws.com/index.html"
  return_link = formatter.a(index_url, "Click to return to index.html")

  # Assemble the document and write it to htmldata/<id>.html
  formatter.fullWrite(head, heading, table, body, return_link)
  formatter.saveHTML(stockObj.bucket, f"htmldata/{article_id}.html")
  return 0

# main

In [21]:
# Entry point
# Arg: event **one of**
#        "create" - create all html files
#        "update" - only create html files that don't already exist
#        "review" - print how many html files and metadata entries exist
#        S3 upload event [dict] - create the html file for each uploaded object
#      context **unused**
# Returns: {'statusCode': 200}
# Reads the bucket name from the "bucket" environment variable and, for the
# string modes, the key prefixes from "meta" and "html"
def main(event, context):
  from urllib.parse import unquote_plus

  bucket = os.environ["bucket"]
  s3Helper = AccessS3()
  formatter = HTMLformatter()

  if event in ("create", "update", "review"):
    metaKey = os.environ["meta"]
    htmlKey = os.environ['html']
    if event=="review":
      htmls = s3Helper.scanFolder(bucket, htmlKey)
      metas = s3Helper.scanFolder(bucket, metaKey)
      print('There are {} htmls and {} entries'.format(len(htmls),len(metas)))
    else:
      count = createAllHTML(event, bucket, metaKey, htmlKey, s3Helper, formatter)
      if count > 0:
        print("Successfully created {} HTML file(s)".format(count))
      else:
        print("No new HTML files were created")

  else:
    # Handle every record in the notification, not only the first one
    for record in event['Records']:
      # Object keys in S3 event notifications are URL-encoded
      # ("red flower.json" arrives as "red+flower.json"); decode before use
      metaKey = unquote_plus(record['s3']['object']['key'])
      # Drop the leading prefix segment to get the key StockData expects
      baseKey = metaKey.split("/",1)[1]
      stockObj = StockData(bucket, baseKey)
      createSingleHTML(stockObj, s3Helper, formatter)
      print("Successfully created HTML file for file id: {}".format(stockObj.id))

  return {
    'statusCode': 200,
  }

In [24]:
# Manual run from Colab. The first argument to main() selects the action:
#   "create"        - create all html files
#   "update"        - only create html files that don't already exist
#   "review"        - view the html files that already exist
#   s3 upload event - creates html file for uploaded s3 file
if os.environ.get('COLAB_GPU') is not None:
  result = main("review","")
  print(result)

There are 2145 htmls and 2145 entries


{'statusCode': 200}