<a href="https://colab.research.google.com/github/iyoo2018/findatalake/blob/master/deleteDup.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Code Exclusive to Colab

In [2]:
import os
# Colab-only setup; the whole cell is skipped when COLAB_GPU is not set.
if os.environ.get('COLAB_GPU') is not None:
  import sys
  from google.colab import drive

  # Mount Google Drive, then expose the notebook folder to the import system.
  drive.mount('/content/gdrive')
  sys.path.append('/content/gdrive/My Drive/Colab Notebooks')

Mounted at /content/gdrive


In [3]:
# Colab-only configuration; skipped when COLAB_GPU is not set.
if os.environ.get('COLAB_GPU') is not None:
  # AWS_CONFIG_FILE names the AWS config file kept on the mounted Drive;
  # "bucket" is the variable main() reads to find the target S3 bucket.
  os.environ.update({
    'AWS_CONFIG_FILE': "/content/gdrive/My Drive/cred-stockdata.txt",
    "bucket": "026090555438-stockdata",
  })

# Import Packages

In [4]:
import json
import re
import boto3
import datetime
from collections import Counter

# S3 Helper

In [5]:
class AccessS3:
  # Thin wrapper around a boto3 S3 client and its `list_objects_v2` paginator.
  def __init__(self):
    # Session() is created with no arguments, so boto3 resolves credentials itself.
    self.s3 = boto3.Session().client('s3')
    self.paginator = self.s3.get_paginator('list_objects_v2')

  # Fetch a single object
  # Arg: bucket **bucket name** [str],
  #      key **object key** [str]
  # Returns: the client's `get_object` response
  def getObj(self, bucket, key):
    response = self.s3.get_object(Bucket=bucket, Key=key)
    return response

  # Remove a single object and report it on stdout
  # Arg: bucket **bucket name** [str],
  #      key **object key** [str]
  # Returns: 0
  def deleteObj(self, bucket, key):
    self.s3.delete_object(Bucket=bucket, Key=key)
    message = "Deleted object at {}".format(key)
    print(message)
    return 0

  # Store data under a key and report it on stdout
  # Arg: data **payload passed as the object body**
  #      bucket **bucket name** [str],
  #      key **object key** [str]
  # Returns: 0
  def saveObj(self, data, bucket, key):
    self.s3.put_object(Body=data, Bucket=bucket, Key=key)
    message = "Saved object at {}".format(key)
    print(message)
    return 0

  # List the objects whose keys start with a prefix
  # Arg: bucket **bucket name** [str],
  #      key **key prefix to list** [str]
  # Returns: the paginator's page iterator for that prefix
  def scanFolder(self, bucket, key):
    return self.paginator.paginate(Bucket=bucket, Prefix=key)

# Delete Duplicates

In [6]:
# Get IDs of objects within a bucket (and folder)
# An ID is the 9-digit file stem of a key shaped like
# ".../<9 digits>.<4 non-digit characters>"; other keys are ignored.
# Arg: bucket **bucket name** [str],
#      key **key/folder prefix to list** [str],
# Returns: list of IDs in bucket, one entry per matching object, so an ID
#          stored under several keys appears several times [list of str]
def getIDs(bucket, key):
  s3Helper = AccessS3()
  # Raw string: same pattern as before, without relying on invalid escape sequences.
  idPattern = re.compile(r".*\/([\d]{9})\.[\D]{4}$")
  objs = []
  for page in s3Helper.scanFolder(bucket, key):
    # A listing page has no "Contents" entry when the prefix matches nothing.
    for content in page.get("Contents", []):
      obj = idPattern.search(content["Key"])
      if obj:
        objs.append(obj.group(1))
  return objs

In [7]:
# Get the listing entries of objects within a bucket (and folder)
# Arg: bucket **bucket name** [str],
#      key **key/folder prefix to list** [str],
# Returns: list of listing entries, each a dict whose "Key" item holds the
#          object key (the shape lookupID reads) [list of dict]
def getKeys(bucket, key):
  s3Helper = AccessS3()
  allKeys = []
  for page in s3Helper.scanFolder(bucket, key):
    # A listing page has no "Contents" entry when the prefix matches nothing.
    allKeys.extend(page.get("Contents", []))
  return allKeys

In [8]:
# Read the "date" field out of each metadata object
# Arg: bucket **bucket name** [str],
#      matchMetaKeys **keys of the JSON metadata objects to read** [list of str]
# Returns: the "date" values, in the same order as matchMetaKeys [list]
def getDates(bucket, matchMetaKeys):
  s3Helper = AccessS3()

  # Download one object, decode its body and parse it as JSON.
  def readDate(metaKey):
    body = s3Helper.getObj(bucket, metaKey)['Body']
    parsed = json.loads(body.read().decode())
    return parsed["date"]

  return [readDate(metaKey) for metaKey in matchMetaKeys]

In [9]:
# Convert publication dates of files
# Arg: unstr_dates **date strings shaped like "Tue, 10 Dec 2024 14:30:00 GMT"** [list of str]
# Returns: the parsed dates, in input order [list of datetime.datetime]
# Raises: ValueError (from strptime) when a string does not match the format
def convertDates(unstr_dates):
  # Pure string parsing: no S3 helper is needed here, so none is created.
  date_format = "%a, %d %b %Y %H:%M:%S %Z"
  return [
    datetime.datetime.strptime(unstr_date, date_format)
    for unstr_date in unstr_dates
  ]

In [10]:
# Look up an object
# A key matches when its last path component is "<id>.<4 non-digit characters>".
# Arg: allKeys [list of dict] **listing entries to search; each has a "Key" item**
#      id [str] **object id to search for**
# Returns: matchKeys [list of str] **key(s) that contain search id, in input order**
def lookupID(allKeys, id):
  # Require "/" (or the start of the key) right before the ID so a longer
  # number that merely ends in the same digits is not treated as a match;
  # the ID itself is escaped so it is always compared literally.
  query = re.compile(r"(?:^|/)" + re.escape(id) + r"\.[\D]{4}$")
  return [key["Key"] for key in allKeys if query.search(key["Key"])]

In [17]:
# Find duplicates and delete the older one(s)
# IDs are collected from the "metadata" folder; for every ID stored more than
# once, each metadata object whose date differs from the newest date is
# deleted together with the "textdata" object at the same list position.
# Copies whose date equals the newest date are all kept.
# Arg: bucket [string] **S3 bucket where data is stored**
# Returns: 0
def deleteDup(bucket):
  s3Helper = AccessS3()
  count = 0
  ids = getIDs(bucket, "metadata")
  occurences = Counter(ids)
  allMetaKeys = getKeys(bucket, "metadata")
  allTextKeys = getKeys(bucket, "textdata")

  for docId, occurence in occurences.items():
    if occurence <= 1:
      continue

    matchMetaKeys = lookupID(allMetaKeys, docId)
    matchTextKeys = lookupID(allTextKeys, docId)

    # Metadata and textdata keys are paired by position. If the counts differ
    # the pairing is unreliable (zip would silently truncate and could pair a
    # stale metadata object with the newest text), so leave this ID untouched.
    if not matchMetaKeys or len(matchMetaKeys) != len(matchTextKeys):
      print("Skipped {}: {} metadata vs {} textdata objects".format(
        docId, len(matchMetaKeys), len(matchTextKeys)))
      continue

    matchDates = convertDates(getDates(bucket, matchMetaKeys))
    newDate = max(matchDates)

    for metakey, textkey, date in zip(matchMetaKeys, matchTextKeys, matchDates):
      if date != newDate:
        s3Helper.deleteObj(bucket, metakey)
        s3Helper.deleteObj(bucket, textkey)
        count += 1
  print("{} have been deleted".format(count))
  return 0

# main

In [12]:
# Entry point: remove duplicates from the bucket named by the "bucket"
# environment variable.
# Arg: event, context **accepted but not used**
# Returns: dict with a 200 status code
def main(event, context):
  deleteDup(os.environ["bucket"])
  response = {'statusCode': 200}
  return response

# Test

In [18]:
# Colab-only manual run; skipped when COLAB_GPU is not set.
if os.environ.get('COLAB_GPU') is not None:
  # main() ignores both of its arguments, so None stands in for each.
  result = main(None, None)
  print(result)

2
2
2
Deleted metadata/CNBC/2024/12/9/108073242.json
Deleted textdata/CNBC/2024/12/9/108073242.json
3
3
3
Deleted metadata/CNBC/2024/12/10/108073292.json
Deleted textdata/CNBC/2024/12/10/108073292.json
Deleted metadata/CNBC/2024/12/11/108073292.json
Deleted textdata/CNBC/2024/12/11/108073292.json
2
2
2
Deleted metadata/CNBC/2024/12/10/108073874.json
Deleted textdata/CNBC/2024/12/10/108073874.json
2
2
2
Deleted metadata/CNBC/2024/12/10/108074131.json
Deleted textdata/CNBC/2024/12/10/108074131.json
2
2
2
Deleted metadata/CNBC/2024/12/12/108072014.json
Deleted textdata/CNBC/2024/12/12/108072014.json
2
2
2
Deleted metadata/CNBC/2024/12/12/108074836.json
Deleted textdata/CNBC/2024/12/12/108074836.json
2
2
2
Deleted metadata/CNBC/2024/12/12/108075249.json
Deleted textdata/CNBC/2024/12/12/108075249.json


KeyboardInterrupt: 