In [None]:
%load_ext autoreload
%autoreload 2

## Library Imports 

In [None]:
# Import the datasets library
import fiftyone as fo
from datasets import load_dataset

### Load the datasets 

- `old_ds` is the first dataset.
- `hf_ds` is the Hugging Face dataset, which we converted into a FiftyOne dataset.
- We translated the German column names and the ingredient names into English, as shown in the demo notebook.
- We cleaned `hf_ds` before combining it with the old dataset.
- All three datasets are loaded below.

In [None]:
# Load the three previously exported FiftyOne datasets from disk.
old_ds = fo.Dataset.from_dir(
    dataset_dir="../food_waste_part_1",
    dataset_type=fo.types.FiftyOneDataset,
)
hf_ds = fo.Dataset.from_dir(
    dataset_dir="../food_waste_part_2",
    dataset_type=fo.types.FiftyOneDataset,
)
# Presumably the already-merged old + Hugging Face data (per the directory name).
hf_old_ds = fo.Dataset.from_dir(
    dataset_dir="../old_ds_hf_combined",
    dataset_type=fo.types.FiftyOneDataset,
)

### Create FO from collected_data

- We created a FiftyOne dataset from the collected data.
- The required fields were filled in by parsing the file names:
    - First, we renamed every image to `<folder name>_<index><extension>`, so that each file name carries the name of its folder.
    - We then parsed these file names to fill in the ingredient names and return quantities.


In [None]:
import os


def rename_files_with_folder_name(folder_path):
    """Rename every regular file in ``folder_path`` to ``<folder>_<n><ext>``.

    ``<folder>`` is the last component of ``folder_path``, ``<n>`` is the
    1-based position of the file in the lexicographically sorted listing and
    ``<ext>`` is the file's original extension. Sub-directories are ignored.

    The renaming is done in two phases (original name -> staging name ->
    final name). Renaming directly is unsafe: when the folder already contains
    files that follow the target pattern (e.g. the cell is run a second time),
    a target name can belong to a file that has not been processed yet, and
    ``os.rename`` silently replaces existing files on POSIX systems.

    Args:
        folder_path: Path of the directory whose files should be renamed.

    Returns:
        None. Problems are reported with ``print`` instead of being raised.
    """
    if not os.path.isdir(folder_path):
        print(f"Error: '{folder_path}' is not a valid directory.")
        return

    # Get the folder name (normpath strips a trailing separator first)
    folder_name = os.path.basename(os.path.normpath(folder_path))

    # Sorted listing of regular files so the numbering is deterministic
    files = sorted(
        entry
        for entry in os.listdir(folder_path)
        if os.path.isfile(os.path.join(folder_path, entry))
    )

    # Work out the target name of every file; files that already carry
    # their target name are left untouched.
    plan = []
    for index, old_file_name in enumerate(files, 1):
        _, file_extension = os.path.splitext(old_file_name)
        new_file_name = f"{folder_name}_{index}{file_extension}"
        if new_file_name != old_file_name:
            plan.append((old_file_name, new_file_name))

    # Phase 1: move every file out of the way, so that no final name is
    # still occupied by a file that is waiting to be renamed.
    staged = []
    for position, (old_file_name, new_file_name) in enumerate(plan):
        staging_name = f".renaming_{position}_{old_file_name}"
        staging_path = os.path.join(folder_path, staging_name)
        try:
            if os.path.lexists(staging_path):
                raise FileExistsError(
                    f"staging name '{staging_name}' is already taken"
                )
            os.rename(os.path.join(folder_path, old_file_name), staging_path)
        except OSError as e:
            print(f"Error renaming file {old_file_name}: {e}")
            continue
        staged.append((old_file_name, staging_path, new_file_name))

    # Phase 2: move the staged files to their final names.
    for old_file_name, staging_path, new_file_name in staged:
        new_file_path = os.path.join(folder_path, new_file_name)
        try:
            # The target can still exist if it is a directory or a file
            # that could not be staged; never overwrite it.
            if os.path.lexists(new_file_path):
                raise FileExistsError(f"'{new_file_name}' already exists")
            os.rename(staging_path, new_file_path)
            # print(f"Renamed '{old_file_name}' to '{new_file_name}'")
        except OSError as e:
            print(f"Error renaming file {old_file_name}: {e}")
            # Best effort: give the file its original name back, unless
            # another file has taken that name in the meantime.
            old_file_path = os.path.join(folder_path, old_file_name)
            try:
                if os.path.lexists(old_file_path):
                    raise FileExistsError(f"'{old_file_name}' already exists")
                os.rename(staging_path, old_file_path)
            except OSError as restore_error:
                print(
                    f"Error restoring file {old_file_name} "
                    f"(left at '{staging_path}'): {restore_error}"
                )


# Folders whose images get renamed after their folder. The folder names appear
# to encode "<ingredient>_<quantity>" pairs joined by "-" (the same layout that
# the file-name parser later in this notebook reads).
folder_list = [
    # group_1
    "../group_1/goulash_0-rice_36-potatoes_0",
    "../group_1/goulash_0-rice_36-potatoes_0-chickpeas_62",
    "../group_1/goulash_0-rice_36-potatoes_26-chickpeas_62",
    # group_4 (capitalised ingredient names)
    "../group_4/Goulash_129",
    "../group_4/Rice_48",
    "../group_4/Potatoes_101",
    "../group_4/Rice_48-Potatoes_101-Goulash_129",
    # group_3
    "../group_3/goulash_25",
    "../group_3/goulash_25-rice_31-potatoes_21",
    "../group_3/goulash_92",
    "../group_3/goulash_92-rice_66-potatoes_103",
    "../group_3/potatoes_21",
    "../group_3/potatoes_103",
    "../group_3/rice_31",
    "../group_3/rice_66",
]
# Rename the files of every folder in place; folders that do not exist are
# reported by the helper and skipped.
for folder in folder_list:
    rename_files_with_folder_name(folder)

In [None]:
import fiftyone as fo
import os


# --- Define the filename parsing logic --- #
def parse_filename(filepath):
    filename = os.path.basename(filepath)
    name, _ = os.path.splitext(filename)

    parts = name.split("-")

    ingredient_names = []
    quantities = []

    for part in parts:
        tokens = part.split("_")
        if len(tokens) < 2:
            continue

        ing_name = tokens[0].lower()  # ensure lowercase
        try:
            qty = float(tokens[1])
        except ValueError:
            continue

        ingredient_names.append(ing_name)
        quantities.append(qty)

    return ingredient_names, quantities


# --- Create the dataset --- #
# overwrite=True replaces a dataset called "kool" left over from an earlier
# run of this cell; without it, re-running the cell fails because the name is
# already taken.
dataset = fo.Dataset("kool", overwrite=True)

# Replace with your actual image directory path
image_dir = "../new_data"
# recursive=True also picks up images in sub-directories of image_dir
dataset.add_images_dir(image_dir, recursive=True)

# --- Add custom fields --- #
# Both fields are lists with one entry per ingredient found in the file name.
dataset.add_sample_field("ingredient_name", fo.ListField, subfield=fo.StringField)
dataset.add_sample_field("return_quantity", fo.ListField, subfield=fo.FloatField)

# --- Populate fields --- #
for sample in dataset:
    ingredient_names, quantities = parse_filename(sample.filepath)
    sample["ingredient_name"] = ingredient_names
    sample["return_quantity"] = quantities
    sample.save()  # write the edited sample back to the dataset

# --- Show dataset summary --- #
print(dataset)


In [None]:
# Write the collected-data dataset to disk in FiftyOne's own format.
dataset.export(
    export_dir="../new_data_ds",
    dataset_type=fo.types.FiftyOneDataset,
)

In [None]:
# Load the collected-data export from disk as its own dataset.
collected_ds = fo.Dataset.from_dir(
    dataset_dir="../new_data_ds",
    dataset_type=fo.types.FiftyOneDataset,
)

### Combine the old_and_hf_ds with new data 

- Lastly, before doing any training, we combined `hf_old_ds` (the old and Hugging Face data) with the newly collected data.
- We exported the combined data as a FiftyOne dataset.

In [None]:
# Element type that the entries of each list-valued field are coerced to.
_LISTFIELD_CONVERSIONS = {
    # Nutritional data should be floats
    "kcal_per_plate": float,
    "kj_per_plate": float,
    "fat_per_plate": float,
    "saturated_fat_per_plate": float,
    "carbohydrates_per_plate": float,
    "sugar_per_plate": float,
    "protein_per_plate": float,
    "salt_per_plate": float,
    "weight_per_portion": float,
    "weight_per_plate": float,
    "return_quantity": float,
    "return_percentage": float,
    # Portions should be integers
    "number_of_portions": int,
    # IDs and names should be strings
    "bon_id": str,
    "article_number": str,
    "ingredient_name": str,
    "piece_article": str,
}

# Type that each scalar field is coerced to.
_SINGLE_FIELD_CONVERSIONS = {
    # Before measurements (floats)
    "salt_before": float,
    "carbohydrates_before": float,
    "kcal_before": float,
    "saturated_fat_before": float,
    "kj_before": float,
    "fat_before": float,
    "sugar_before": float,
    "protein_before": float,
    # After measurements (integers)
    "saturated_fat_after": int,
    "fat_after": int,
    "kcal_after": int,
    "kj_after": int,
    "protein_after": int,
    "sugar_after": int,
    "salt_after": int,
    "carbohydrates_after": int,
}

# Number of samples added first, before the remaining ones are added in batches.
_SCHEMA_SEED_SIZE = 10
# Number of samples handed to add_samples() at once.
_BATCH_SIZE = 100


def _convert_value(value, target_type):
    """Convert a single value to ``target_type``; return None if impossible.

    ``None``, ``""`` and the string ``"None"`` count as missing. Integers are
    converted via ``float`` so that strings such as ``"3.0"`` are accepted
    (the fractional part is truncated).
    """
    if value is None or value == "" or value == "None":
        return None

    try:
        if target_type is float:
            return float(value)
        if target_type is int:
            return int(float(value))
        if target_type is str:
            return str(value)
        return value
    except (ValueError, TypeError, OverflowError):
        # OverflowError: int(float("inf")). Without it, one infinite value
        # would make the caller discard the whole sample.
        return None


def _convert_list_values(value_list, target_type):
    """Convert all values of a list to ``target_type``, dropping failures.

    Returns None for a None input or when no entry survives; a non-list input
    is treated as a single value and wrapped in a list.

    NOTE(review): dropping unconvertible entries shortens the list, so fields
    that are meant to line up index by index (e.g. ingredient names and
    quantities) can get out of step -- confirm this is acceptable.
    """
    if value_list is None:
        return None

    if not isinstance(value_list, list):
        # If it's not a list, try to convert the single value and make it a list
        converted = _convert_value(value_list, target_type)
        return [converted] if converted is not None else []

    converted_list = []
    for item in value_list:
        converted = _convert_value(item, target_type)
        if converted is not None:  # Only add non-None values
            converted_list.append(converted)

    # Return None if list is empty, otherwise return the converted list
    return converted_list if converted_list else None


def _create_clean_sample(original_sample):
    """Build a new sample for the same file with type-normalised fields."""
    new_sample = fo.Sample(filepath=original_sample.filepath)

    for field_name in original_sample.field_names:
        if field_name in ("id", "filepath"):
            continue

        value = original_sample[field_name]

        # Skip None values entirely to avoid schema inference issues
        if value is None:
            continue

        if field_name in _LISTFIELD_CONVERSIONS:
            converted_value = _convert_list_values(
                value, _LISTFIELD_CONVERSIONS[field_name]
            )
            # Only set the field if we have valid data (not None, not empty)
            if converted_value:
                new_sample[field_name] = converted_value

        elif field_name in _SINGLE_FIELD_CONVERSIONS:
            converted_value = _convert_value(
                value, _SINGLE_FIELD_CONVERSIONS[field_name]
            )
            # Only set the field if we have valid data
            if converted_value is not None:
                new_sample[field_name] = converted_value

        else:
            # Any other field is copied as-is; a field that cannot be set is
            # reported and left out.
            try:
                new_sample[field_name] = value
            except Exception as e:
                print(f"Warning: Could not set field {field_name}: {e}")

    return new_sample


def _collect_clean_samples(dataset, label):
    """Clean every sample of ``dataset``; samples that fail are reported and skipped."""
    cleaned = []
    for sample in dataset.iter_samples(progress=True):
        try:
            cleaned.append(_create_clean_sample(sample))
        except Exception as e:
            print(f"Error processing {label} sample {sample.id}: {e}")
    return cleaned


def safe_resolve_conflicts_fixed(dataset1, dataset2, target_name):
    """Merge two datasets into a new persistent dataset with consistent field types.

    Every sample of both inputs is copied into a fresh sample whose known
    fields are coerced to fixed types (see ``_LISTFIELD_CONVERSIONS`` and
    ``_SINGLE_FIELD_CONVERSIONS``); fields that are None or cannot be
    converted are omitted. The samples of ``dataset2`` are added first, then
    those of ``dataset1``.

    Args:
        dataset1: Dataset whose samples are added second.
        dataset2: Dataset whose samples are added first.
        target_name: Name of the dataset to create. An existing dataset with
            this name is deleted.

    Returns:
        The newly created, persistent dataset.

    Raises:
        ValueError: If ``target_name`` is the name of one of the inputs,
            because that input would be deleted before it is read.
    """
    for source in (dataset1, dataset2):
        if getattr(source, "name", None) == target_name:
            raise ValueError(
                f"target_name '{target_name}' is the name of an input dataset"
            )

    # Delete existing dataset if it exists. An explicit existence check is
    # used instead of a bare `except`, so real errors are no longer hidden.
    if fo.dataset_exists(target_name):
        fo.load_dataset(target_name).delete()
        print(f"Deleted existing dataset: {target_name}")

    # Create new unified dataset
    unified = fo.Dataset(target_name)

    # Process Dataset2 first (it has cleaner schema)
    print("Processing Dataset2 samples first...")
    dataset2_samples = _collect_clean_samples(dataset2, "Dataset2")

    # Add Dataset2 samples first to establish schema
    print("Adding Dataset2 samples to establish schema...")
    if dataset2_samples:
        unified.add_samples(dataset2_samples[:_SCHEMA_SEED_SIZE])

        # Add remaining Dataset2 samples
        remaining_samples = dataset2_samples[_SCHEMA_SEED_SIZE:]
        for start in range(0, len(remaining_samples), _BATCH_SIZE):
            unified.add_samples(remaining_samples[start : start + _BATCH_SIZE])

    # Now process Dataset1 samples
    print("Processing Dataset1 samples...")
    dataset1_samples = _collect_clean_samples(dataset1, "Dataset1")

    # Add Dataset1 samples in batches
    print("Adding Dataset1 samples...")
    for start in range(0, len(dataset1_samples), _BATCH_SIZE):
        batch = dataset1_samples[start : start + _BATCH_SIZE]
        try:
            unified.add_samples(batch)
        except Exception as e:
            print(f"Error adding batch {start // _BATCH_SIZE + 1}: {e}")
            # Try adding samples one by one for this batch
            for offset, sample in enumerate(batch):
                try:
                    unified.add_sample(sample)
                except Exception as sample_error:
                    print(f"  Failed to add sample {start + offset}: {sample_error}")

    unified.persistent = True

    print(f"Successfully created unified dataset with {len(unified)} samples")

    # Print final schema for verification
    print("\nFinal schema for key fields:")
    schema = unified.get_field_schema()
    key_fields = ["kcal_per_plate", "kj_per_plate", "kcal_before", "kcal_after"]
    for field_name in key_fields:
        if field_name in schema:
            field_type = type(schema[field_name]).__name__
            print(f"  {field_name}: {field_type}")

    return unified

In [None]:
# Usage
# Merge the combined old + Hugging Face data with the newly collected data.
unified_dataset = safe_resolve_conflicts_fixed(
    dataset1=hf_old_ds,
    dataset2=collected_ds,
    target_name="food_waste_all",
)

In [None]:
# Write the merged dataset to disk in FiftyOne's own format.
unified_dataset.export(
    export_dir="all_data",
    dataset_type=fo.types.FiftyOneDataset,
)

In [None]:
# fo.launch_app(unified_dataset, port=3000)