In [None]:
import os
from pathlib import Path
import time
from dotenv import load_dotenv

import polars as pl
import requests
from PIL import Image
from io import BytesIO
import boto3
import s3fs

from fc_ai_data_tools.utils import sign_url

load_dotenv()

## Configure S3

In [2]:
def s3_auth(profile_name='default', endpoint_url='https://s3.gra.io.cloud.ovh.net'):
    """
    Build an s3fs filesystem from the credentials of a boto3 profile.

    Args:
        profile_name: Name of the boto3 profile whose credentials are used
            (default 'default').
        endpoint_url: URL of the S3-compatible endpoint the filesystem talks to
            (defaults to the OVH GRA endpoint).

    Returns:
        tuple: (s3fs.S3FileSystem, frozen credentials). The frozen credentials
        expose `access_key` and `secret_key`.

    Raises:
        RuntimeError: If boto3 cannot resolve any credentials for the profile.
    """
    session = boto3.session.Session(profile_name=profile_name)

    # get_credentials() returns None when nothing could be resolved; fail with
    # an actionable message instead of an AttributeError on None.
    resolved = session.get_credentials()
    if resolved is None:
        raise RuntimeError(
            f"No credentials found for boto3 profile '{profile_name}'. "
            "Configure the profile or export AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY."
        )

    # Freeze once so the key and secret come from the same snapshot.
    credentials = resolved.get_frozen_credentials()

    s3_fs = s3fs.S3FileSystem(
        key=credentials.access_key,
        secret=credentials.secret_key,
        endpoint_url=endpoint_url,
    )
    return s3_fs, credentials

In [3]:
# Object-storage settings for the OVH S3-compatible endpoint
aws_region = 'gra'
aws_endpoint_url = 'https://s3.gra.io.cloud.ovh.net'
bucket_name = 'fc-gra-alejandria'
# Dataset root inside the bucket, written without the "s3://" scheme
ds_path = '/'.join((bucket_name, 'ds', 'public', 'PD12M'))

# Build the s3fs filesystem handle and keep the credentials it was created with
s3_fs, credentials = s3_auth()

# Same credentials and endpoint as keyword options. Not used in the cells shown
# here; presumably passed as storage options to a reader -- TODO confirm.
s3_storage_options = dict(
    aws_access_key_id=credentials.access_key,
    aws_secret_access_key=credentials.secret_key,
    endpoint_url=aws_endpoint_url,
    aws_region=aws_region,
)

## Utils

In [4]:
def measure_image_loading_time(df, url_column, num_samples=10, signed_url=False,
                               timeout=30.0, seed=None):
    """
    Measure the average time needed to load images from a given URL column.

    Each sampled image is fetched, opened with PIL and resized to 100x100 so
    that the pixel data is actually decoded. Failures are printed and excluded
    from the average.

    Args:
        df: polars DataFrame containing image URLs
        url_column: Name of the column containing URLs to test
        num_samples: Number of images to test (default 10). Capped at the
            number of rows available, so small frames do not raise.
        signed_url: Only affects "s3://" URLs. If True, the URL is converted
            with `sign_url` and fetched over HTTP; if False, the object is read
            through the module-level `s3_fs` filesystem. The signing call is
            part of the measured time.
        timeout: Timeout in seconds passed to every HTTP request, so a stalled
            server cannot block the benchmark indefinitely (default 30.0).
        seed: Optional seed forwarded to the sampler for reproducible runs.

    Returns:
        float: Average loading time in seconds over the successful loads, or
        float('inf') if no image could be loaded.
    """
    url_series = df[url_column]
    # Sampling without replacement fails when n exceeds the population size.
    sample_size = min(num_samples, len(url_series))
    urls = url_series.sample(n=sample_size, seed=seed)

    total_time = 0.0
    successful_loads = 0

    for url in urls:
        try:
            # perf_counter is monotonic, unlike the wall clock.
            start_time = time.perf_counter()
            _load_and_resize_image(url, signed_url=signed_url, timeout=timeout)
            total_time += time.perf_counter() - start_time
            successful_loads += 1
        except Exception as e:
            # Best-effort benchmark: report the failure and keep going. `url`
            # is still the original value, so signed URLs are never printed.
            print(f"Failed to load {url}: {str(e)}")

    if successful_loads == 0:
        return float('inf')

    return total_time / successful_loads


def _load_and_resize_image(url, signed_url=False, timeout=30.0, size=(100, 100)):
    """Fetch one image (via s3fs or HTTP) and resize it to force a full decode."""
    is_s3 = url.startswith("s3://")

    if is_s3 and not signed_url:
        # Direct read through the module-level s3fs filesystem.
        with s3_fs.open(url, "rb") as f:
            return Image.open(f).resize(size)

    if is_s3:
        # sign_url receives the path with the leading "s3:/" removed,
        # i.e. "/<bucket>/<key>".
        url = sign_url(url[len("s3:/"):])

    response = requests.get(url, timeout=timeout)
    # Surface HTTP errors explicitly instead of decoding an error page.
    response.raise_for_status()
    return Image.open(BytesIO(response.content)).resize(size)

## Read data

In [None]:
# Location of the consolidated PD12M metadata (Arrow IPC / feather file)
feather_path = Path("../output/PD12M/global_pd12m_data.feather")

# Fail with an explicit message when the expected input file is not there
if feather_path.exists():
    df = pl.read_ipc(feather_path)
else:
    raise FileNotFoundError(f"File {feather_path} does not exist")

print(f"Number of rows: {df.height}")

# Preview the first rows
display(df.head())

In [None]:
# Build the benchmark table in one chain:
#   1. derive the OVH object URL by prefixing image_path with the dataset root
#      (ds_path from the S3 configuration cell, so the location is defined once),
#   2. keep only the columns needed downstream,
#   3. rename the original source URL to public_url.
target_df = (
    df.with_columns(
        (f"s3://{ds_path}/" + pl.col("image_path")).alias("ovh_url")
    )
    .select(["caption", "url", "ovh_url", "image_width", "image_height"])
    .rename({"url": "public_url"})
)

# Save a small, reproducible sample of the dataframe to a csv file next to the input
sampled_df = target_df.sample(n=10, seed=42)
sampled_df.write_csv(feather_path.parent / "sampled_pd12m_data.csv")

display(target_df.head())


### Display an image from OVH

In [7]:
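# NOTE: this cell is disabled (all code is commented out). The requests variant
# below would be given an "s3://" path, which is not an HTTP(S) URL; use the
# s3_fs variant, or sign the path first as measure_image_loading_time does.

# item = target_df.row(0, named=True)
# ovh_url = item["ovh_url"]

# print(f"Displaying image from {ovh_url}")

# # with s3_fs.open(item["ovh_url"], "rb") as f:
# #     img = Image.open(f)
# #     display(img)

# response = requests.get(item["ovh_url"])
# img = Image.open(BytesIO(response.content))
# display(img)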

## Benchmark loading times

In [None]:
# Test loading times for both URL columns
public_time = measure_image_loading_time(target_df, "public_url")
print(f"Average loading time for public URLs: {public_time:.2f} seconds")

ovh_time = measure_image_loading_time(target_df, "ovh_url")
print(f"Average loading time for OVH URLs: {ovh_time:.2f} seconds")

ovh_signed_time = measure_image_loading_time(target_df, "ovh_url", signed_url=True)
print(f"Average loading time for signed OVH URLs: {ovh_signed_time:.2f} seconds")