<a href="https://colab.research.google.com/github/iyoo2018/findatalake/blob/master/deleteDup.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Code Exclusive to Colab

In [None]:
import os
# Colab-only setup: the presence of the COLAB_GPU environment variable is used
# as the signal that this notebook is running inside Google Colab.
if 'COLAB_GPU' in os.environ:
  from google.colab import drive
  # Mount Google Drive so files stored there are reachable under /content/gdrive.
  drive.mount('/content/gdrive')
  import sys
  # Make modules saved in the Drive "Colab Notebooks" folder importable.
  sys.path.append('/content/gdrive/My Drive/Colab Notebooks')

Mounted at /content/gdrive


In [None]:
# Colab-only configuration, supplied through environment variables.
if 'COLAB_GPU' in os.environ:
  # AWS_CONFIG_FILE is the variable boto3 consults for its config file location.
  # NOTE(review): the file name suggests it holds credentials for the stock-data
  # account -- confirm, and keep it out of version control.
  os.environ['AWS_CONFIG_FILE']="/content/gdrive/My Drive/cred-stockdata.txt"
  # Name of the S3 bucket; read back by main() via os.environ["bucket"].
  os.environ["bucket"] = "026090555438-stockdata"

# Import Packages

In [None]:
import json
import re
import boto3
import datetime
from collections import Counter

# Get publication dates

In [None]:
def getDates(bucket, keys):
  """Read the publication date stored in each of the given S3 objects.

  Every object is fetched through AccessS3.getObj, its body is decoded and
  parsed as JSON, and the value found under the "date" field is collected.

  Args:
    bucket (str): bucket name.
    keys (list of str): keys of the objects whose dates are required.

  Returns:
    list: the "date" value of each object, in the same order as `keys`.
  """
  client = AccessS3()

  def readDate(key):
    # Download the object and pull the "date" field out of its JSON body.
    payload = client.getObj(bucket, key)['Body'].read().decode()
    return json.loads(payload)["date"]

  return [readDate(key) for key in keys]

# Convert publication dates

In [None]:
def convertDates(unstrDates):
  """Parse publication-date strings into datetime objects.

  Args:
    unstrDates (list of str): date strings shaped like
      "Mon, 06 Jan 2020 10:00:00 GMT".

  Returns:
    list of datetime.datetime: the parsed dates, in input order.

  Raises:
    ValueError: if a string does not match the expected format.
  """
  pattern = "%a, %d %b %Y %H:%M:%S %Z"
  parse = datetime.datetime.strptime
  return [parse(raw, pattern) for raw in unstrDates]

# Lookup ID in a list of IDs

In [None]:
def lookupID(listIDs, queryID, group=None):
  """Find the keys that contain an ID and return them without their first path component.

  Matching is a plain substring test (`queryID in ID`), so a short query can
  also match longer IDs that merely contain it.

  Args:
    listIDs (list of str): keys to search; every matching key must contain
      at least one "/".
    queryID (str): ID to look for.
    group: unused. Kept only so existing three-argument calls keep working;
      it now defaults to None so two-argument calls no longer raise TypeError.

  Returns:
    list of str: for each matching key, everything after its first "/".
    Empty when nothing matches.
  """
  # split("/", 1)[1] drops the leading folder (e.g. "metadata/") from the key.
  return [ID.split("/", 1)[1] for ID in listIDs if queryID in ID]

# AccessS3 Class

In [None]:
class AccessS3:
  """Small convenience wrapper around a boto3 S3 client."""

  def __init__(self):
    # Session() is created without arguments, so region and credentials come
    # from whatever boto3 finds in the environment.
    session = boto3.Session()
    self.s3 = session.client('s3')
    # Paginator reused by scanFolder to walk listings page by page.
    self.paginator = self.s3.get_paginator('list_objects_v2')

  def getObj(self, bucket, key):
    """Fetch an object.

    Args:
      bucket (str): bucket name.
      key (str): object key.

    Returns:
      The get_object response as returned by the S3 client.
    """
    return self.s3.get_object(Bucket=bucket, Key=key)

  def deleteObj(self, bucket, key):
    """Delete an object and print a confirmation line.

    Args:
      bucket (str): bucket name.
      key (str): object key.

    Returns:
      int: always 0.
    """
    self.s3.delete_object(Bucket=bucket, Key=key)
    print("Deleted object at {}".format(key))
    return 0

  def saveObj(self, data, bucket, key):
    """Store `data` under `key` and print a confirmation line.

    Args:
      data: payload passed as the Body of put_object.
      bucket (str): bucket name.
      key (str): object key.

    Returns:
      int: always 0.
    """
    self.s3.put_object(
      Body=data,
      Bucket=bucket,
      Key=key
    )
    print("Saved object at {}".format(key))
    return 0

  def scanFolder(self, bucket, key):
    """List the keys stored under a prefix.

    Keys ending in "/" (folder placeholders) are skipped.

    Args:
      bucket (str): bucket name.
      key (str): key prefix to list.

    Returns:
      list of str: matching object keys; empty when nothing matches.
    """
    objs = []
    pages = self.paginator.paginate(Bucket=bucket, Prefix=key)
    for page in pages:
      # A listing page carries no 'Contents' entry when nothing matches the
      # prefix, so indexing it directly would raise KeyError.
      for content in page.get('Contents', []):
        if not content['Key'].endswith("/"):
          objs.append(content['Key'])
    return objs

  def lookupObj(self, bucket, key, query, group):
    """Search the keys under a prefix with a regular expression.

    Args:
      bucket (str): bucket name.
      key (str): key prefix to scan.
      query (str): regular expression applied to each key with re.search.
      group (int or str): capture group to extract from each match.

    Returns:
      list of str: the requested group of every key that matched.
    """
    keys = []
    objs = self.scanFolder(bucket, key)
    for obj in objs:
      lookup = re.search(query, obj)
      if lookup:
        keys.append(lookup.group(group))
    return keys

# Delete Duplicates

In [None]:
def deleteDup(bucket):
  """Delete the older copies of documents stored more than once.

  Metadata objects under "metadata/" are grouped by document id, i.e. the file
  name up to its first ".". For every id that occurs more than once, the
  publication date of each copy is read from its metadata, and every copy
  whose date differs from the newest one is deleted together with its
  counterpart under "textdata/". Copies that tie for the newest date are all
  kept.

  Args:
    bucket (str): S3 bucket where the data is stored.

  Returns:
    int: always 0. The number of deleted metadata/text pairs is printed.
  """
  metaPrefix = "metadata/"
  textPrefix = "textdata/"
  s3Helper = AccessS3()
  count = 0

  # Group keys by exact id in a single pass. Matching ids by substring could
  # pull unrelated objects into a group (e.g. id "12" also matching "123.json"),
  # which is not acceptable for a routine that deletes data.
  keysById = {}
  for obj in s3Helper.scanFolder(bucket, metaPrefix):
    docId = obj.rsplit('/', 1)[-1].split('.')[0]
    keysById.setdefault(docId, []).append(obj)

  for metaKeys in keysById.values():
    if len(metaKeys) < 2:
      continue

    # The text object lives at the same path, under the text prefix instead.
    textKeys = [textPrefix + metaKey[len(metaPrefix):] for metaKey in metaKeys]

    matchDates = convertDates(getDates(bucket, metaKeys))
    newDate = max(matchDates)

    for metakey, textkey, date in zip(metaKeys, textKeys, matchDates):
      if date != newDate:
        s3Helper.deleteObj(bucket, metakey)
        s3Helper.deleteObj(bucket, textkey)
        count += 1
  print("{} have been deleted".format(count))
  return 0

# main

In [None]:
def main(event, context):
  """Entry point: remove duplicates from the bucket named in the environment.

  The bucket name is read from the "bucket" environment variable.

  Args:
    event: unused. NOTE(review): the signature looks like an AWS Lambda
      handler -- confirm.
    context: unused.

  Returns:
    dict: {'statusCode': 200} once deleteDup has finished.

  Raises:
    KeyError: if the "bucket" environment variable is not set.
  """
  deleteDup(os.environ["bucket"])
  response = {'statusCode': 200}
  return response

# Test

In [None]:
# Colab-only smoke test. This runs against the bucket named in
# os.environ["bucket"], so any duplicates found there are really deleted.
if 'COLAB_GPU' in os.environ:
  # Invoke the handler with placeholder arguments (main ignores both) and show its response.
  result = main(None, None)
  print(result)

0 have been deleted
{'statusCode': 200}
