In [105]:
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
import pyarrow.compute as pc
import h5py
from PIL import Image
from src.benchmarkers import *
from src.benchmarkersV2 import *
from tqdm import tqdm
import os
import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils
from io import BytesIO

np.random.seed(0)

In [110]:
def make_offset(vector):
    return [vector[i] - vector[i - 1] for i in range(1, len(vector))]


def group_objects(objects, cardinality_list):
    result = []
    index = 0
    for count in cardinality_list:
        if count == 0:
            result.append(None)
        if count > 0:  # Only create a group if count > 0
            result.append(objects[index:index+count])
            index += count
        if index > len(objects):
            result.append(None)
    return result

def area(ctx,array):
    format = pc.list_element(array, 4)
    result = []
    for i in range(40):
        if i == 0 :
            mask = pc.equal(format,i)
            result.append(pc.if_else(mask,pc.multiply(pc.list_element(array,2),pc.list_element(array,3)),None))
        elif i == 1 :
            mask = pc.equal(format,i)
            result.append(pc.if_else(mask,pc.multiply(pc.subtract(pc.list_element(array,3),pc.list_element(array,1)),pc.subtract(pc.list_element(array,2),pc.list_element(array,0))),None))
    final_result = 0
    for i, item in enumerate(result):
        if i+1 < len(result):
            final_result = item.fill_null(result[i+1])
    
    return final_result

func_doc = {}
func_doc["summary"] = "calculate area of bboxes based on various formats"
func_doc["description"] = "calculate bbox area"
func_name = "area"
in_types = {"array": pa.list_(pa.float64())}
out_type = pa.float64()
pc.register_scalar_function(area, func_name, func_doc, in_types, out_type)

In [125]:
#table = pq.read_table(r"C:\Users\Cristiano Lavoro\Downloads\ds_10.parquet")
table = pq.read_table(r"C:\Users\Cristiano Lavoro\Desktop\benchmarks\imagenette\imagenette2\output\ds.parquet")

In [109]:
field1 = table.column("image_feature").chunk(0).values.field("boundingbox_feature").values.field("bbox")
field2 = table.column("image_feature").chunk(0).values.field("boundingbox_feature").values.field("image_1_feature")
field3 = table.column("image_feature").chunk(0).values.field("boundingbox_feature").values.field("label_feature")
new_field = pa.array([73 for i in range(50)])

new_struct_no_cardinality = pa.StructArray.from_arrays(
    [field1,field2, field3, new_field ],  # Both fields
    names=['bbox', 'image_1_feature',"label_feature","area"]  # New child field added
)

cardinality_list = make_offset(table.column("image_feature").chunk(0).values.field("boundingbox_feature").offsets.to_pylist())
new_struct = pa.array(group_objects(new_struct_no_cardinality.to_pylist(), cardinality_list))

# Step 5: Replace the 'age' struct in the 'info' struct
field4 = table.column("image_feature").chunk(0).values.field("image")
field5 = table.column("image_feature").chunk(0).values.field("shape")
#field6 = table.column("image_feature").values.field("boundingbox_feature")
field6 = table.column("image_feature").chunk(0).values.field("text_1_feature")

new_image_feature_column_no_cardinality = pa.StructArray.from_arrays(
    [new_struct,field4,field5,field6],  # Attach the updated 'age' struct
    names=["boundingbox_feature", "image","shape","text_1_feature"]
)

cardinality_list = make_offset(table.column("image_feature").chunk(0).offsets.to_pylist())
new_image_feature_column = pa.array(group_objects(new_image_feature_column_no_cardinality.to_pylist(), cardinality_list))

# Step 6: Replace the original 'info' column in the table with the updated one
new_table = table.set_column(0, 'image_feature', new_image_feature_column)

In [126]:
field1 = table.column("image_feature").chunk(0).values.field("boundingbox_feature").values.field("bbox")
field2 = table.column("image_feature").chunk(0).values.field("boundingbox_feature").values.field("image_1_feature")
field3 = table.column("image_feature").chunk(0).values.field("boundingbox_feature").values.field("label_feature")
new_field = pc.call_function(func_name, [table.column("image_feature").chunk(0).values.field("boundingbox_feature").values.field("bbox")])

new_struct_no_cardinality = pa.StructArray.from_arrays(
    [field1,field2, field3, new_field ],  
    names=['bbox', 'image_1_feature',"label_feature","area"]  
)

cardinality_list = make_offset(table.column("image_feature").chunk(0).values.field("boundingbox_feature").offsets.to_pylist())
new_struct = pa.array(group_objects(new_struct_no_cardinality.to_pylist(), cardinality_list))

field4 = table.column("image_feature").chunk(0).values.field("image")
field5 = table.column("image_feature").chunk(0).values.field("shape")
field6 = table.column("image_feature").chunk(0).values.field("text_1_feature")

new_image_feature_column_no_cardinality = pa.StructArray.from_arrays(
    [new_struct,field4,field5,field6],  
    names=["boundingbox_feature", "image","shape","text_1_feature"]
)

cardinality_list = make_offset(table.column("image_feature").chunk(0).offsets.to_pylist())
new_image_feature_column = pa.array(group_objects(new_image_feature_column_no_cardinality.to_pylist(), cardinality_list))

new_table = table.set_column(0, 'image_feature', new_image_feature_column)

KeyError: 'boundingbox_feature'

In [162]:
field1 = table.column("image_feature").chunk(0).values.field("image")
#field2 = table.column("image_feature").chunk(0).values.field("class_feature")
field2 = table.column("image_feature").chunk(0).values.field("text_feature")

field_tmp = table.column("image_feature").chunk(0).values.field("class_feature").values.to_pylist()
new_field = [[item,{"label":80}] for item in field_tmp]


new_struct_no_cardinality = pa.StructArray.from_arrays(
    [field1,field2, new_field ],  
    names=['image', 'text_feature',"class_feature"]  
)

cardinality_list = make_offset(table.column("image_feature").chunk(0).values.field("class_feature").offsets.to_pylist())
new_struct = pa.array(group_objects(new_struct_no_cardinality.to_pylist(), cardinality_list))



new_table = table.set_column(0, 'image_feature', new_struct)

In [166]:
new_table.column("image_feature").chunk(0).values.field("class_feature")

<pyarrow.lib.ListArray object at 0x000001FAAE757640>
[
  -- is_valid: all not null
  -- child 0 type: int64
    [
      0,
      80
    ],
  -- is_valid: all not null
  -- child 0 type: int64
    [
      0,
      80
    ],
  ...
  -- is_valid: all not null
  -- child 0 type: int64
    [
      9,
      80
    ],
  -- is_valid: all not null
  -- child 0 type: int64
    [
      9,
      80
    ]
]

In [None]:
add_field(["image_feature","boundingbox_feature"],"bbox","area","area")

In [124]:
[x.name for x in list(table.column("image_feature").chunk(0).values.field("boundingbox_feature").values.type)]

['bbox', 'image_1_feature', 'label_feature']

In [None]:
def add_field(table : pa.Table,
              feature_list_path: List[str],
              feature_to_manipulate: str,
              new_feature_name : str,
              function_name :str):
    
    obj = None

    for indx, feature_name in enumerate(feature_list_path):
        if indx == 0:
            obj = table.column(feature_name).chunk(0).values