In [19]:
import os
os.environ['AWS_CONFIG_FILE'] = '../.aws/config'

import json
import random
from pathlib import Path

import boto3
import s3fs
import polars as pl
from PIL import Image
from io import BytesIO

## Configure S3

In [None]:
# S3 connection settings for the bucket that holds the dataset.
aws_region = 'gra'
aws_endpoint_url = 'https://s3.gra.io.cloud.ovh.net'
bucket_name = 'fc-gra-alejandria'
ds_path = f'{bucket_name}/ds/public/PD12M'

# Resolve credentials through a boto3 session using the 'default' profile.
session = boto3.session.Session(profile_name='default')
resolved_credentials = session.get_credentials()
if resolved_credentials is None:
    # get_credentials() returns None when nothing could be resolved; fail
    # with a clear message instead of an AttributeError on the next line.
    raise RuntimeError(
        "No AWS credentials found for profile 'default' "
        f"(AWS_CONFIG_FILE={os.environ.get('AWS_CONFIG_FILE')!r})"
    )
credentials = resolved_credentials.get_frozen_credentials()

# Build the s3fs filesystem against the custom endpoint.
# NOTE: aws_region is not passed to s3fs here; only the endpoint URL is used.
s3 = s3fs.S3FileSystem(
    key=credentials.access_key,
    secret=credentials.secret_key,
    # Forward the session token too (None for long-lived keys) so that
    # temporary credentials keep working.
    token=credentials.token,
    endpoint_url=aws_endpoint_url,
)

# Smoke test: listing the dataset prefix proves that the endpoint and the
# credentials are usable. Failures are reported, not raised, on purpose.
try:
    files_list = s3.ls(ds_path)
    print("S3FS works")
except Exception as e:
    print("S3FS does not work: ", e)

## Test sample parquet file

In [None]:
# Load a single parquet file of the dataset into a polars DataFrame.
parquet_id = '00891'
parquet_path = f's3://{ds_path}/{parquet_id}.parquet'

# Read through the s3fs handle so the configured endpoint/credentials are used.
with s3.open(parquet_path, mode='rb') as parquet_file:
    df = pl.read_parquet(parquet_file)

# Show only the first rows as a quick sanity check.
display(df.head())

In [None]:
# Pick a random item from the parquet metadata and check that its image and
# JSON files exist on S3.
# NOTE: random.choice is unseeded, so every run inspects a different item.
item_id = random.choice(df['key'].to_list())
folder_id = item_id[:5]  # the folder name is the first five characters of the key
print(f"Item ID: {item_id}. Folder ID: {folder_id}")

images_folder = f'{ds_path}/{folder_id}'
if not s3.exists(images_folder):
    raise ValueError(f"Images folder does not exist: {images_folder}")
print(f"Images folder: {images_folder}")

# Metadata row for the selected item, as a dict keyed by column name.
item = df.filter(pl.col('key') == item_id).row(0, named=True)
display(item)

image_path = f"{images_folder}/{item['key']}.jpg"
if not s3.exists(image_path):
    raise ValueError(f"Image does not exist: {image_path}")
print(f"Image path: {image_path}")

# S3 keys always use '/' separators, so build the JSON path as a plain
# string. pathlib.Path is OS-specific (backslashes on Windows) and would hand
# s3fs a Path object instead of a str.
json_path = f"{images_folder}/{item['key']}.json"
if not s3.exists(json_path):
    raise ValueError(f"JSON file does not exist: {json_path}")
print(f"JSON path: {json_path}")

# Load the selected image; convert() forces the pixel data to be read while
# the S3 file handle is still open.
with s3.open(image_path, 'rb') as f:
    image = Image.open(f).convert('RGB')

# Load the JSON file that sits next to the image.
with s3.open(json_path, 'rb') as f:
    json_data = json.load(f)

display(json_data)
display(image)