In [1]:
import os
import boto3
import json
import gzip
from tqdm.notebook import tqdm
# Retrieve the S3 credentials from the environment; the bucket name is fixed.
key_id = os.environ.get('S3_ACCESS_KEY_ID')
secret_key = os.environ.get('S3_SECRET_ACCESS_KEY')
bucket_name = "rowdy"

# Fail fast, naming exactly which variables are missing, instead of letting
# the S3 client fail later with a less obvious authentication error.
# bucket_name is a literal and can never be None, so it is not checked.
missing_env_vars = [
    env_name
    for env_name, env_value in (
        ('S3_ACCESS_KEY_ID', key_id),
        ('S3_SECRET_ACCESS_KEY', secret_key),
    )
    if env_value is None
]
if missing_env_vars:
    # TypeError is kept (rather than a more specific exception) so the
    # exception type raised by this cell does not change.
    raise TypeError(
        "Required environment variable(s) not set: "
        + ", ".join(missing_env_vars)
    )

    

# Build the S3-compatible client; the endpoint points at DigitalOcean Spaces
# in the nyc3 region and authenticates with the credentials loaded above.
spaces_client_options = {
    'region_name': 'nyc3',
    'endpoint_url': 'https://nyc3.digitaloceanspaces.com',
    'aws_access_key_id': key_id,
    'aws_secret_access_key': secret_key,
}
s3 = boto3.client('s3', **spaces_client_options)

# A single list call returns a limited number of keys, so iterate the
# listing page by page through a paginator.
dataset_prefix = "datasets/labeled/token-classification/products"
paginator = s3.get_paginator('list_objects_v2')
pages = paginator.paginate(Bucket=bucket_name, Prefix=dataset_prefix)

# Function to extract and transform JSON data
def transform_json_data(json_data):
    """Reshape one stored annotation record into the exported layout.

    Args:
        json_data: Mapping with an ``"id"`` key, a ``"task"`` mapping that
            holds ``"data"``, and a ``"result"`` key.

    Returns:
        A new dict with keys ``"id"``, ``"data"`` and ``"annotations"``,
        where ``"annotations"`` is a one-element list wrapping the
        record's ``"result"``. Values are referenced, not copied.

    Raises:
        KeyError: If any of the required keys is missing.
    """
    record_id = json_data["id"]
    task_payload = json_data["task"]["data"]
    annotation_entry = {"result": json_data["result"]}
    return {
        "id": record_id,
        "data": task_payload,
        "annotations": [annotation_entry],
    }

# Collect every transformed annotation, then write them out as one JSON array.
json_array = []

# NOTE(review): the very first listed object is skipped by position, exactly
# as before. Presumably it is the folder placeholder key for the prefix
# (which has no JSON body) -- confirm against the bucket listing.
is_first_object = True

# Process each JSON file in the bucket
for page in pages:
    # 'Contents' is omitted from a listing page that holds no objects, so
    # default to an empty list instead of raising KeyError.
    for obj in tqdm(page.get('Contents', []), desc='Loading annotations'):
        if is_first_object:
            # Skip before downloading: the body was never used anyway.
            is_first_object = False
            continue
        data = s3.get_object(Bucket=bucket_name, Key=obj['Key'])
        content = data['Body'].read().decode("utf-8")
        annotation = json.loads(content)
        json_array.append(transform_json_data(annotation))

# Save the JSON array to a file
with open('dataset.json', 'w', encoding='utf-8') as outfile:
    json.dump(json_array, outfile, indent=2)

print("dataset.json file created successfully")

Loading annotations:   0%|          | 0/1000 [00:00<?, ?it/s]

Loading annotations:   0%|          | 0/264 [00:00<?, ?it/s]

dataset.json file created successfully
