<a href="https://colab.research.google.com/github/iyoo2018/findatalake/blob/master/createHTML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Code Exclusive to Colab

In [2]:
import os
# Colab-only setup: the COLAB_GPU environment variable is used as the marker
# for "running inside Google Colab". Outside Colab this cell does nothing.
if os.environ.get('COLAB_GPU') is not None:
  import sys
  from google.colab import drive

  # Mount Google Drive, then make the notebooks folder on it importable.
  drive.mount('/content/gdrive')
  sys.path.append('/content/gdrive/My Drive/Colab Notebooks')

Mounted at /content/gdrive


In [3]:
# Colab-only configuration: point AWS at the config file stored on the mounted
# drive and set the bucket / prefix names that main() reads from os.environ.
if 'COLAB_GPU' in os.environ:
  os.environ.update({
      'AWS_CONFIG_FILE': "/content/gdrive/My Drive/cred-stockdata.txt",
      'bucket': "026090555438-stockdata",
      'html': "htmldata",
      'text': "textdata",
  })

# Import Packages

In [4]:
import boto3
import json
import re

# HTML Formatter

In [15]:
# Support class for working with HTML
# Formats tags and generates an HTML file
class HTMLformatter:
  """Build an HTML document as a string and upload it to S3.

  The tag helpers (head, heading, a, table, body) only return markup. The
  document itself is accumulated in ``self.text`` by openHTML / addHTML /
  closeHTML (or fullWrite in one step) and uploaded by writeHTML.

  Text content (titles, headings, table cells, paragraphs) is inserted
  verbatim, so callers may pass pre-built markup such as the result of ``a``
  as a table cell. Attribute values are double-quoted and HTML-escaped.
  """

  def __init__(self):
    # Markup of the document built so far.
    self.text = ""
    session = boto3.Session()
    self.s3 = session.client('s3')

  @staticmethod
  def _attr(value):
    """Return value as a double-quoted, HTML-escaped attribute value."""
    # Local import so the class does not depend on a module-level import.
    from html import escape
    return '"{}"'.format(escape(str(value), quote=True))

  # Format head tag
  def head(self, title, meta):
    """Return a complete ``<head>`` element.

    Arg: title **tab name** [str], meta **metadata name -> content** [dict]
    Returns: formatted head tag [str]
    """
    parts = ["<head>", "<title>{}</title>".format(title)]
    for key, value in meta.items():
      # Values such as "data for article id: 1" contain spaces, so the
      # attributes must be quoted or the tag is parsed incorrectly.
      parts.append("<meta name={} content={}>".format(
          HTMLformatter._attr(key), HTMLformatter._attr(value)))
    parts.append("</head>")
    return "\n".join(parts)

  # Format heading tag
  def heading(self, heading, size):
    """Return an ``<hN>`` element.

    Arg: heading **heading/section name** [str], size **heading level N**
    Returns: formatted heading tag [str]
    """
    return "<h{}>{}</h{}>".format(size, heading, size)

  # Format a tag
  def a(self, url, text):
    """Return an ``<a>`` element linking to url.

    Arg: url **link to website** [str], text **displayed text** [str]
    Returns: formatted a tag [str]
    """
    return "<a href={}>{}</a>".format(HTMLformatter._attr(url), text)

  # Format table tag
  def table(self, heading, column_names, entries):
    """Return an ``<h2>`` heading followed by a ``<table>`` element.

    Arg: heading **table heading name** [str],
         column_names **column headings** [list of str],
         entries **one dict per row, keyed by column name** [list of dict]
    Returns: formatted table tag [str]
    Raises: KeyError if an entry lacks one of column_names
    """
    lines = ["<table border=\"1\" cellpadding=\"10\">", "  <tr>"]
    for column_name in column_names:
      lines.append("    <th>{}</th>".format(column_name))
    lines.append("  </tr>")

    for entry in entries:
      # Each entry gets its own opening and closing row tag.
      lines.append("  <tr>")
      for column_name in column_names:
        lines.append("    <td>{}</td>".format(entry[column_name]))
      lines.append("  </tr>")
    lines.append("</table>")
    return self.heading(heading, 2) + "\n" + "\n".join(lines)

  # Format body tag
  def body(self, heading, text):
    """Return an ``<h2>`` heading followed by a ``<body>`` of paragraphs.

    Arg: heading **body heading name** [str],
         text **paragraphs to be displayed** [list of str, or a single str]
    Returns: formatted body tag [str]

    NOTE: the heading is emitted before the opening ``<body>`` tag; this
    layout is kept as-is so existing pages keep the same structure.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(text, str):
      text = [text]
    body_text = "<body>"
    for para in text:
      body_text += "\n  <p>{}</p>".format(para)
    body_text += "\n</body>"
    return self.heading(heading, 2) + "\n" + body_text

  # Begin the HTML file
  def openHTML(self):
    """Append the doctype and opening ``<html>`` tag. Returns 0."""
    self.text += "<!DOCTYPE html>\n<html>"
    return 0

  # Add element(s) to HTML file
  def addHTML(self, *elements):
    """Append each element on its own line. Returns 0.

    Arg: *elements **HTML elements produced by other functions** [str]
    """
    for element in elements:
      self.text += "\n"+element
    return 0

  # End the HTML file
  def closeHTML(self):
    """Append the closing ``</html>`` tag. Returns 0."""
    self.text += "\n</html>"
    return 0

  # Write the entire HTML file at once
  def fullWrite(self, *elements):
    """Replace the current document with one made of elements. Returns 0.

    Arg: *elements **HTML elements produced by other functions** [str]
    """
    self.clearHTML()
    self.openHTML()
    self.addHTML(*elements)
    self.closeHTML()
    return 0

  # Clear the HTML file
  def clearHTML(self):
    """Discard the accumulated document. Returns 0."""
    self.text = ""
    return 0

  # Display the HTML file
  def showHTML(self):
    """Print the accumulated document. Returns 0."""
    print(self.text)
    return 0

  # Write the HTML file to S3
  def writeHTML(self, bucket, path):
    """Upload the accumulated document to S3. Returns 0.

    Arg: bucket **bucket name** [str], path **object key** [str]
    """
    # Encode explicitly and declare the charset so non-ASCII text is
    # decoded correctly when the object is served.
    self.s3.put_object(
        Body=self.text.encode('utf-8'),
        Bucket=bucket,
        Key=path,
        ContentType='text/html; charset=utf-8'
    )
    return 0

# S3 Helper

In [6]:
class AccessS3:
  """Small convenience wrapper around a boto3 S3 client.

  Holds the client in ``self.s3`` and a ``list_objects_v2`` paginator in
  ``self.paginator``.
  """

  def __init__(self):
    self.s3 = boto3.Session().client('s3')
    self.paginator = self.s3.get_paginator('list_objects_v2')

  # Get an object
  def getObj(self, bucket, key):
    """Fetch one object.

    Arg: bucket **bucket name** [str],
         key **object key** [str]
    Returns: the client's get_object response
    """
    response = self.s3.get_object(Bucket=bucket, Key=key)
    return response

  # Delete an object
  def deleteObj(self, bucket, key):
    """Delete one object and print a confirmation. Returns 0.

    Arg: bucket **bucket name** [str],
         key **object key** [str]
    """
    self.s3.delete_object(Bucket=bucket, Key=key)
    message = "Deleted object at {}".format(key)
    print(message)
    return 0

  # Save an object
  def saveObj(self, data, bucket, key):
    """Store data under key and print a confirmation. Returns 0.

    Arg: data **data to be saved**
         bucket **bucket name** [str],
         key **object key** [str]
    """
    request = {'Body': data, 'Bucket': bucket, 'Key': key}
    self.s3.put_object(**request)
    message = "Saved object at {}".format(key)
    print(message)
    return 0

  # Look at objects contained in a key
  def scanFolder(self, bucket, key):
    """List the objects whose keys start with key.

    Arg: bucket **bucket name** [str],
         key **key prefix** [str]
    Returns: the paginator's result for that bucket and prefix
    """
    return self.paginator.paginate(Bucket=bucket, Prefix=key)

# Create All Missing HTML Files

In [7]:
# Check for all files that need an HTML file
def allHTML(bucket, filekey):
  """Collect every text file that should have an HTML page.

  Arg: bucket **bucket name** [str], filekey **prefix to list** [str]
  Returns: file ids [list of str] and their paths relative to "textdata/"
           [list of str], in listing order

  filekey is only used as the listing prefix; a key is accepted when it
  starts with "textdata/" and ends with a nine-digit id plus ".json".
  """
  s3Helper = AccessS3()
  files = []
  paths = []
  # Raw string: same pattern as before, without invalid escape sequences.
  key_pattern = re.compile(r"^textdata\/(.*\/*(\d{9})\.json$)")

  for textPage in s3Helper.scanFolder(bucket, filekey):
    # A listing page has no 'Contents' entry when nothing matches the prefix.
    for text in textPage.get('Contents', []):
      text_search = key_pattern.search(text["Key"])
      if text_search:
        files.append(text_search.group(2))
        paths.append(text_search.group(1))
  return files, paths

In [8]:
# Check for new files that need an HTML file
def newHTML(bucket, filekey, htmlkey):
  """Collect the text files that do not have an HTML page yet.

  Arg: bucket **bucket name** [str], filekey **text prefix to list** [str],
       htmlkey **html prefix to list** [str]
  Returns: new file ids [list of str] and their paths relative to
           "textdata/" [list of str], in listing order

  A text file counts as already rendered when its nine-digit id appears
  anywhere in the key of an existing object under htmlkey.
  """
  s3Helper = AccessS3()

  htmls = []
  for htmlPage in s3Helper.scanFolder(bucket, htmlkey):
    # A listing page has no 'Contents' entry when the prefix is empty,
    # e.g. before the first HTML file has been written.
    htmls.extend(
        obj["Key"] for obj in htmlPage.get("Contents", [])
        if not obj["Key"].endswith("/"))

  new_files = []
  new_paths = []
  # Raw string: same pattern as before, without invalid escape sequences.
  key_pattern = re.compile(r"^textdata\/(.*\/*(\d{9})\.json$)")
  for textPage in s3Helper.scanFolder(bucket, filekey):
    for text in textPage.get('Contents', []):
      text_search = key_pattern.search(text["Key"])
      if not text_search:
        continue
      file_id = text_search.group(2)
      # Generator lets any() stop at the first matching html key.
      if not any(file_id in html for html in htmls):
        new_files.append(file_id)
        new_paths.append(text_search.group(1))
  return new_files, new_paths

In [9]:
# Get metadata for file
def getMeta(bucket, filepath):
  """Load the JSON metadata stored at "metadata/<filepath>".

  Arg: bucket **bucket name** [str], filepath **path below "metadata/"** [str]
  Returns: the decoded JSON document (a dict, per the callers in this file)
  """
  response = AccessS3().getObj(bucket, "metadata/{}".format(filepath))
  raw = response['Body'].read()
  return json.loads(raw.decode())

In [10]:
# Get text data for file
def getText(bucket, filepath):
  """Load the JSON text stored at "textdata/<filepath>" and split it by line.

  Arg: bucket **bucket name** [str], filepath **path below "textdata/"** [str]
  Returns: the decoded JSON value split on "\\n" [list of str]
  """
  response = AccessS3().getObj(bucket, "textdata/{}".format(filepath))
  document = json.loads(response['Body'].read().decode())
  return document.split("\n")

In [11]:
# Create an HTML file for a single file
def createSingleHTML(bucket, filename, filepath):
  """Render one article as HTML and upload it to "htmldata/<filename>.html".

  Arg: bucket **bucket name** [str], filename **article id** [str],
       filepath **path of the article below metadata/ and textdata/** [str]
  Returns: 0
  """
  formatter = HTMLformatter()
  meta_content = getMeta(bucket, filepath)

  # Head and page heading
  page_title = "{}.html".format(filename)
  description = "data for article id: {}".format(filename)
  head = formatter.head(page_title, {"description": description})
  heading = formatter.heading(meta_content['title'], 1)

  # Metadata table: add the article id and show the raw link as an a tag
  meta_content['id'] = filename
  meta_content['external-link'] = formatter.a(meta_content.pop('link'), 'website')
  table = formatter.table("Metadata", list(meta_content), [meta_content])

  # Article text
  body = formatter.body("Text", getText(bucket, filepath))

  # Link back to index.html
  index_url = "https://{}.s3.us-east-1.amazonaws.com/index.html".format(bucket)
  return_link = formatter.a(index_url, "Click to return to index.html")

  # Assemble the document and write it to S3
  formatter.fullWrite(head, heading, table, body, return_link)
  formatter.writeHTML(bucket, "htmldata/{}.html".format(filename))

  return 0

In [21]:
def _listObjectKeys(s3Helper, bucket, prefix):
  """Return the keys under prefix, skipping keys that end with "/"."""
  keys = []
  for page in s3Helper.scanFolder(bucket, prefix):
    # A listing page has no 'Contents' entry when the prefix is empty.
    keys.extend(
        obj["Key"] for obj in page.get("Contents", [])
        if not obj["Key"].endswith("/"))
  return keys


def createAllHTML(bucket, textkey, htmlKey, event):
  """Run one of the HTML maintenance modes selected by event.

  Arg: bucket **bucket name** [str], textkey **text prefix** [str],
       htmlKey **html prefix** [str], event **mode** [str]
  Returns:
    "create" - number of HTML files written, one per file from allHTML [int]
    "update" - number of HTML files written, one per file from newHTML [int]
    "review" - keys under htmlKey and keys under textkey [two lists of str]
    any other event - None, nothing is done
  """
  if event in ("create", "update"):
    # "create" rebuilds every page; "update" only builds the missing ones.
    if event == "create":
      objs, paths = allHTML(bucket, textkey)
    else:
      objs, paths = newHTML(bucket, textkey, htmlKey)
    for obj, path in zip(objs, paths):
      createSingleHTML(bucket, obj, path)
    return len(objs)

  if event == "review":
    s3Helper = AccessS3()
    htmls = _listObjectKeys(s3Helper, bucket, htmlKey)
    texts = _listObjectKeys(s3Helper, bucket, textkey)
    return htmls, texts

  # Unrecognised mode: keep the historical behaviour of returning None.
  return None

# main

In [13]:
def main(event, context):
  """Entry point: run createAllHTML in the mode named by event and report.

  Arg: event **"create", "update" or "review"** [str],
       context **not used by this function**
  Returns: {'statusCode': 200} in every case, including unknown events

  The bucket name and the text / html prefixes are read from the
  environment variables "bucket", "text" and "html".
  """
  bucket = os.environ["bucket"]
  fileKey = os.environ["text"]
  htmlKey = os.environ['html']

  if event == "review":
    htmls, texts = createAllHTML(bucket, fileKey, htmlKey, event)
    print('There are {} htmls and {} texts'.format(len(htmls),len(texts)))
  elif event in ("create", "update"):
    count = createAllHTML(bucket, fileKey, htmlKey, event)
    if event == "create":
      print("Successfully created {} HTML file(s)".format(count))
    elif count > 0:
      print("Successfully created {} new HTML file(s)".format(count))
    else:
      print("No new HTML files were created")

  response = {'statusCode': 200}
  return response

In [22]:
# Run the handler once. The first argument (event) selects the mode:
#   create - create an HTML file for every matching text file
#   update - only create HTML files for text files that don't have one yet
#   review - print how many html and text objects currently exist
# The second argument (context) is not used by main.
main("update","")

No new HTML files were created


{'statusCode': 200}