<a href="https://colab.research.google.com/github/iyoo2018/findatalake/blob/master/createIndex.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Code Exclusive to Colab

In [1]:
import os
if 'COLAB_GPU' in os.environ:
  # On Colab: mount Google Drive and make the notebooks folder importable
  from google.colab import drive
  drive.mount('/content/gdrive')
  import sys
  notebooks_dir = '/content/gdrive/My Drive/Colab Notebooks'
  # Re-running this cell must not keep adding the same entry to sys.path
  if notebooks_dir not in sys.path:
    sys.path.append(notebooks_dir)

Mounted at /content/gdrive


In [2]:
if 'COLAB_GPU' in os.environ:
  import boto3

  # The config file location has to be in the environment before the session is created
  os.environ.update({'AWS_CONFIG_FILE': "/content/gdrive/My Drive/cred-stockdata.txt"})
  session = boto3.Session()
  s3 = session.client("s3")

  # Bucket and key that main() later reads back from the environment
  for env_name, env_value in (('bucket', "026090555438-stockdata"), ('key', "htmldata")):
    os.environ[env_name] = env_value

# Import Packages

In [3]:
import datetime
import html
import json
import re

import boto3

# HTML Formatter

In [4]:
class HTMLformatter:
  """Support class for working with HTML.

  Formats individual tags as strings and accumulates them into a single
  HTML document held in ``self.text``, which can be printed or uploaded to S3.
  """

  def __init__(self):
    # Accumulated document text; built up by openHTML/addHTML/closeHTML
    self.text = ""

  def head(self, title, meta):
    """Format a head tag.

    Arg: title **tab name** [str], meta **metadata** [dict]
    Returns: formatted head tag [str]
    """
    lines = ["<head>", "<title>{}</title>".format(title)]
    for key, value in meta.items():
      # Attribute values are quoted and escaped so that values containing
      # spaces or quotes stay inside a single attribute
      lines.append('<meta name="{}" content="{}">'.format(
          html.escape(str(key), quote=True),
          html.escape(str(value), quote=True)))
    lines.append("</head>")
    return "\n".join(lines)

  def heading(self, heading, size):
    """Format a heading tag.

    Arg: heading **heading/section name** [str], size **heading level** [str or int]
    Returns: formatted heading tag [str]
    """
    return "<h{}>{}</h{}>".format(size, heading, size)

  def a(self, url, text):
    """Format an a tag.

    Arg: url **link to website** [str], text **displayed text** [str]
    Returns: formatted a tag [str]
    """
    # Quote and escape the href so URLs containing spaces, quotes or '&' are kept whole
    return '<a href="{}">{}</a>'.format(html.escape(str(url), quote=True), text)

  def table(self, heading, column_names, entries):
    """Format a table tag preceded by a level-2 heading.

    Arg: heading **table heading name** [str],
         column_names **column headings** [list of str],
         entries **objects to be put into the table** [list of dictionaries]
    Returns: formatted table tag [str]

    Cell values are inserted as-is (not escaped), so they may contain markup
    such as links produced by ``a``. A key missing from an entry yields an
    empty cell.
    """
    rows = ['<table border="1" cellpadding="10">', "  <tr>"]
    rows.extend("    <th>{}</th>".format(name) for name in column_names)
    rows.append("  </tr>")
    for entry in entries:
      # Every entry gets its own opening and closing row tag
      rows.append("  <tr>")
      rows.extend("    <td>{}</td>".format(entry.get(name, "")) for name in column_names)
      rows.append("  </tr>")
    rows.append("</table>")
    return self.heading(heading, 2) + "\n" + "\n".join(rows)

  def body(self, heading, text):
    """Format a body tag containing a level-2 heading and paragraphs.

    Arg: heading **body heading name** [str],
         text **text to be displayed** [str or list of str]
    Returns: formatted body tag [str]
    """
    # A single string is one paragraph; iterating it would emit one <p> per character
    paragraphs = [text] if isinstance(text, str) else text
    lines = ["<body>", "  " + self.heading(heading, 2)]
    lines.extend("  <p>{}</p>".format(para) for para in paragraphs)
    lines.append("</body>")
    return "\n".join(lines)

  def openHTML(self):
    """Begin the HTML file."""
    self.text += "<!DOCTYPE html>\n<html>"
    return 0

  def addHTML(self, *elements):
    """Add element(s) to the HTML file.

    Arg: *elements **HTML elements produced by other functions** [str]
    """
    for element in elements:
      self.text += "\n" + element
    return 0

  def closeHTML(self):
    """End the HTML file."""
    self.text += "\n</html>"
    return 0

  def fullWrite(self, *elements):
    """Write the entire HTML file at once, discarding any previous content.

    Arg: *elements **HTML elements produced by other functions** [str]
    """
    self.clearHTML()
    self.openHTML()
    self.addHTML(*elements)
    self.closeHTML()
    return 0

  def clearHTML(self):
    """Clear the HTML file."""
    self.text = ""
    return 0

  def showHTML(self):
    """Display the HTML file."""
    print(self.text)
    return 0

  def writeHTML(self, bucket, path, client=None):
    """Write the HTML file to S3 with content type text/html.

    Arg: bucket **bucket name** [str], path **object key** [str],
         client **S3 client to use; a new boto3 client is created when omitted**
    """
    if client is None:
      # Do not depend on a notebook-global client that only exists on Colab
      client = boto3.Session().client("s3")
    client.put_object(
        Body=self.text,
        Bucket=bucket,
        Key=path,
        ContentType='text/html'
    )
    return 0

# S3 Helper

In [5]:
class AccessS3:
  """Thin helper around an S3 client for getting, saving, deleting and listing objects."""

  def __init__(self, client=None):
    """Create the helper.

    Arg: client **S3 client to use; a new boto3 client is created when omitted**
    """
    if client is None:
      client = boto3.Session().client('s3')
    self.s3 = client
    # Paginator used by scanFolder to walk listings page by page
    self.paginator = self.s3.get_paginator('list_objects_v2')

  def getObj(self, bucket, key):
    """Get an object.

    Arg: bucket **bucket name** [str],
         key **object key** [str]
    Returns: the client's get_object response
    """
    return self.s3.get_object(Bucket=bucket, Key=key)

  def deleteObj(self, bucket, key):
    """Delete an object.

    Arg: bucket **bucket name** [str],
         key **object key** [str]
    """
    self.s3.delete_object(Bucket=bucket, Key=key)
    print("Deleted object at {}".format(key))
    return 0

  def saveObj(self, data, bucket, key):
    """Save an object.

    Arg: data **data to be saved**,
         bucket **bucket name** [str],
         key **object key** [str]
    """
    self.s3.put_object(
      Body=data,
      Bucket=bucket,
      Key=key
    )
    print("Saved object at {}".format(key))
    return 0

  def scanFolder(self, bucket, key):
    """Look at objects whose keys start with a prefix.

    Arg: bucket **bucket name** [str],
         key **key prefix to list** [str]
    Returns: pages **iterable of listing pages from the paginator**
    """
    return self.paginator.paginate(Bucket=bucket, Prefix=key)

# Create index.html

In [6]:
def getIDs(bucket, key, query, group, s3Helper=None):
  """Get IDs of objects within a bucket (and folder).

  Arg: bucket **bucket name** [str],
       key **key/folder name, used as the listing prefix** [str],
       query **regex query searched against each object key** [str],
       group **group number to return from query** [int],
       s3Helper **object with a scanFolder(bucket, key) method; a new AccessS3 is created when omitted**
  Returns: list of IDs in bucket, one per matching key, in listing order [list of str]
  """
  if s3Helper is None:
    s3Helper = AccessS3()
  pattern = re.compile(query)
  objs = []
  for page in s3Helper.scanFolder(bucket, key):
    # A listing page with no objects has no "Contents" entry at all
    for content in page.get("Contents", []):
      match = pattern.search(content["Key"])
      if match:
        objs.append(match.group(group))
  return objs

In [7]:
def getPaths(bucket, key, query, group, s3Helper=None):
  """Get file paths for objects within a bucket (and folder).

  Arg: bucket **bucket name** [str],
       key **key/folder name, used as the listing prefix** [str],
       query **regex query searched against each object key** [str],
       group **group number to use as the dictionary key** [int],
       s3Helper **object with a scanFolder(bucket, key) method; a new AccessS3 is created when omitted**
  Returns: dictionary mapping the matched group to the full object key;
           when several keys yield the same group the last one listed wins [dict of str]
  """
  if s3Helper is None:
    s3Helper = AccessS3()
  pattern = re.compile(query)
  objs = {}
  for page in s3Helper.scanFolder(bucket, key):
    # A listing page with no objects has no "Contents" entry at all
    for content in page.get("Contents", []):
      match = pattern.search(content["Key"])
      if match:
        objs[match.group(group)] = content["Key"]
  return objs

In [8]:
def createIndex(bucket, key):
  """Create an index.html file listing the saved files and upload it to the bucket.

  Arg: bucket **bucket name** [str], key **key/folder name holding the HTML files** [str]

  Each ID found under `key` is looked up among the files under "metadata";
  the metadata JSON becomes one table row, extended with the ID and links.
  IDs without a metadata file are reported and skipped.
  """
  formatter = HTMLformatter()
  s3Helper = AccessS3()

  # Create head and heading
  head = formatter.head("index.html", {"description":"basic webpage for accessing saved stock data"})
  heading = formatter.heading("index.html", 1)

  # The same pattern serves both scans: a 9-digit name followed by a
  # dot and four non-digit characters at the end of the key.
  # Raw string: the pattern contains backslash escapes meant for the regex engine.
  group = 1
  query = r".*\/([\d]{9})\.[\D]{4}$"
  # Scan for HTML files to put into index.html
  objs = getIDs(bucket, key, query, group)
  # Scan for metadata files
  meta_files = getPaths(bucket, "metadata", query, group)

  # Get the metadata for each file
  meta_contents = []
  for obj in objs:
    meta_key = meta_files.get(obj)
    if meta_key is None:
      # An HTML file without metadata should not abort the whole index
      print("No metadata found for {}; skipping".format(obj))
      continue
    meta_content = json.loads(s3Helper.getObj(bucket, meta_key)['Body'].read().decode())
    # Add article id
    meta_content['id'] = obj
    # Replace the raw link with a formatted a tag (blank when the metadata has no link)
    link = meta_content.pop('link', None)
    meta_content['external-link'] = formatter.a(link, 'website') if link else ""
    # Add link to S3 HTML as formatted a tag
    # NOTE(review): the URL hard-codes the "htmldata" prefix and the us-east-1
    # region instead of using `key` -- confirm this matches the bucket layout
    url = "https://{}.s3.us-east-1.amazonaws.com/htmldata/{}.html".format(bucket, obj)
    meta_content['internal-link'] = formatter.a(url, 's3')
    # Append metadata for this file to list for all files
    meta_contents.append(meta_content)

  if meta_contents:
    # Columns are the union of all keys in first-seen order, and every row is
    # padded to that set, so one file with different metadata keys cannot break the table
    columns = []
    for meta_content in meta_contents:
      for column in meta_content:
        if column not in columns:
          columns.append(column)
    rows = [{column: meta_content.get(column, "") for column in columns}
            for meta_content in meta_contents]
    table = formatter.table("Stock Data Files", columns, rows)
  else:
    # Nothing to list: still publish a valid page instead of failing
    table = formatter.heading("Stock Data Files", 2) + "\n<p>No stock data files found.</p>"

  # Create the full HTML file
  formatter.fullWrite(head, heading, table)
  # Write the HTML file to S3
  path = "index.html"
  formatter.writeHTML(bucket, path)

  return 0

# main

In [9]:
def main(event, context):
  """Handler-style entry point: build index.html for the configured bucket.

  The bucket name and key are read from the "bucket" and "key" environment
  variables; `event` and `context` are accepted but not used.
  Returns: a dictionary with a 200 status code [dict]
  """
  createIndex(os.environ["bucket"], os.environ["key"])
  print("Successfully created index HTML file")
  response = {'statusCode': 200}
  return response

In [10]:
# Run the handler once; main ignores both arguments, so empty placeholders are passed
main("","")

106094284
metadata/CNBC/2024/12/18/106094284.json
106367010
metadata/CNBC/2024/12/13/106367010.json
106401083
metadata/CNBC/2024/12/10/106401083.json
106477203
metadata/CNBC/2024/12/12/106477203.json
106535706
metadata/CNBC/2024/12/5/106535706.json
106599767
metadata/CNBC/2024/12/16/106599767.json
107093512
metadata/CNBC/2024/12/11/107093512.json
107131833
metadata/CNBC/2024/12/2/107131833.json
107144383
metadata/CNBC/2024/12/6/107144383.json
107149002
metadata/CNBC/2024/12/2/107149002.json
107161362
metadata/CNBC/2024/12/17/107161362.json
107294378
metadata/CNBC/2024/12/4/107294378.json
107296865
metadata/CNBC/2024/12/8/107296865.json
107306299
metadata/CNBC/2024/12/4/107306299.json
107312522
metadata/CNBC/2024/12/3/107312522.json
107314912
metadata/CNBC/2024/12/3/107314912.json
107327888
metadata/CNBC/2024/12/4/107327888.json
107335229
metadata/CNBC/2024/12/5/107335229.json
107394318
metadata/CNBC/2024/12/17/107394318.json
108046174
metadata/CNBC/2024/12/15/108046174.json
108046333
m

{'statusCode': 200}