### Library Installation

In [None]:
#!pip install fiftyone

### Library Imports 

In [None]:
from pathlib import Path
import os

import fiftyone as fo, types
import fiftyone.core.fields as fof
from datasets import load_dataset
from PIL import Image
from tqdm import tqdm
import uuid

# Load the food_waste_part_1 FiftyOne dataset

In [None]:
# Mount Google Drive at /content/drive (Google Colab helper).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Location of the part-1 dataset on the mounted Drive. The path is absolute
# (Drive is mounted at /content/drive) so the lookup does not depend on the
# notebook's current working directory.
general_path = Path("/content/drive/MyDrive/food_waste_part_1/")
print(f"All folders: {os.listdir(general_path)}")

In [None]:
# Show the names of the datasets FiftyOne currently knows about.
fo.list_datasets()

In [None]:

# Import the dataset stored under `general_path` (expected to be in
# FiftyOneDataset format) and register it as "food_waste_part_1".
dataset = fo.Dataset.from_dir(
    dataset_dir=general_path,
    dataset_type=fo.types.FiftyOneDataset,
    name="food_waste_part_1"
)

print(f"Created dataset with {len(dataset)} samples")

In [None]:
# Open the FiftyOne App on the imported part-1 dataset and print its URL.
session = fo.launch_app(dataset)
print(session.url)

# Transform Food Waste Part 2 Dataset 

- Load the second dataset from the Hugging Face Hub
- Apply transformations to the data

In [None]:
# `load_dataset` is also imported in the imports cell; repeating the import
# lets this cell run on its own.
from datasets import load_dataset

# Login using e.g. `huggingface-cli login` to access this dataset
ds = load_dataset("Dldermann/food-waste-dataset-v2")

In [None]:
# Column rename table: original column name -> English snake_case name.
# 'bonid' and 'image' map to themselves, i.e. they keep their names.
feature_mapping = {
    'bonid': 'bonid',
    'image': 'image',
    'Bon_ID': 'bon_id',
    'Artikelnummer': 'article_number',
    'Artikel': 'ingredient_name',
    'Stückartikel': 'piece_article',
    'Anzahl_Kellen': 'number_of_portions',
    'Gewicht_Kelle': 'weight_per_portion',
    'Gewicht_Teller': 'weight_per_plate',
    'kcal_Teller': 'kcal_per_plate',
    'kj_Teller': 'kj_per_plate',
    'Fett_Teller': 'fat_per_plate',
    'ges_Fettsäuren_Teller': 'saturated_fat_per_plate',
    'Kohlenhydrate_Teller': 'carbohydrates_per_plate',
    'Zucker_Teller': 'sugar_per_plate',
    'Eiweiß_Teller': 'protein_per_plate',
    'Salz_Teller': 'salt_per_plate',
    'Menge_Rückläufer': 'return_quantity',
    'Prozent_Rückläufer': 'return_percentage',
    'Gericht': 'dish',
    'Portionsgröße': 'portion_size',
    'Gewicht_vorher': 'weight_before',
    'kcal_vorher': 'kcal_before',
    'kj_vorher': 'kj_before',
    'Fett_vorher': 'fat_before',
    'ges_Fettsäuren_vorher': 'saturated_fat_before',
    'Kohlenhydrate_vorher': 'carbohydrates_before',
    'Zucker_vorher': 'sugar_before',
    'Eiweiß_vorher': 'protein_before',
    'Salz_vorher': 'salt_before',
    'Gewicht_nachher': 'weight_after',
    'kcal_nachher': 'kcal_after',
    'kj_nachher': 'kj_after',
    'Fett_nachher': 'fat_after',
    # NOTE(review): lowercase "f" here, unlike the other "ges_Fettsäuren_*"
    # keys — confirm this matches the actual source column name.
    'ges_fettsäuren_nachher': 'saturated_fat_after',
    'Kohlenhydrate_nachher': 'carbohydrates_after',
    'Zucker_nachher': 'sugar_after',
    'Eiweiß_nachher': 'protein_after',
    'Salz_nachher': 'salt_after'
}

# German ingredient name -> hyphenated English label. Where more than one
# translation is given, the alternatives are joined with "-or-".
# NOTE(review): this table is not referenced by the code visible in this
# notebook — confirm whether it should be applied to `ingredient_name`.
german_to_english_ingredients_hyphenated = {
    'Fleischbällchen gebrüht': 'poached-meatballs',
    'Reis': 'rice',
    'Paniertes Fischfilet': 'breaded-fish-fillet',
    'Linseneintopf': 'lentil-stew',
    'Apfelmus': 'applesauce',
    'Helle Sauce': 'light-sauce-or-white-sauce',
    'Kartoffelpüree': 'mashed-potatoes',
    'Rinderbraten': 'roast-beef',
    'Semmelknödel': 'bread-dumplings',
    'Grüne Bohnen': 'green-beans',
    'Möhre': 'carrot',
    'Pflanzencreme': 'vegetable-based-cream',
    'Schinken Mettwurst': 'ham-sausage',
    'Paprika': 'paprika-or-bell-pepper',
    'Seelachs': 'pollock-or-coalfish',
    'Bratenjus': 'gravy',
    'Hähnchenstreifen': 'chicken-strips',
    'Eisbergsalat': 'iceberg-lettuce',
    'Rotkohl': 'red-cabbage',
    'Sauerkraut': 'sauerkraut',
    'Reibekuchen': 'potato-pancakes-or-potato-fritters',
    'Krautsalat': 'coleslaw',
    'Schnitzel': 'schnitzel-or-cutlet',
    'Blumenkohl': 'cauliflower',
    'Rostbratwurst': 'grilled-sausage',
    'Braune Sauce': 'brown-sauce',
    'Kartoffeln': 'potatoes',
    'Kartoffelwürfel': 'diced-potatoes',
    'Sahne': 'cream',
    'Zucchini': 'zucchini-or-courgette',
    'Eierspätzle': 'egg-spaetzle',
    'Pilze': 'mushrooms',
    'Erbsen': 'peas',
    'Wirsing': 'savoy-cabbage',
    'Malzbier-Senf-Sauce': 'malt-beer-mustard-sauce',
    'Dressing Portion': 'dressing-portion',
    'Linsen': 'lentils',
    'Zwiebel': 'onion',
    'Schweinenackenbraten': 'pork-neck-roast',
    'Hähnchen': 'chicken',
    'Tomaten-Curry-Sauce': 'tomato-curry-sauce'
}

In [None]:
# Rename the columns of `ds` according to `feature_mapping`.
hf_dataset = ds.rename_columns(feature_mapping)

In [None]:
# Create a persistent directory for images. FiftyOne samples reference their
# image by file path, so the files have to stay on disk after this cell.
persistent_dir = "food_waste_dataset-2"
os.makedirs(persistent_dir, exist_ok=True)

# Get the FiftyOne dataset: reuse it when it already exists, otherwise create
# it. `fo.load_dataset` on its own fails on a first run, before any dataset
# with this name has been created.
fiftyone_dataset_name = "food_waste_dataset-2"
if fo.dataset_exists(fiftyone_dataset_name):
    fiftyone_dataset = fo.load_dataset(name=fiftyone_dataset_name)
else:
    fiftyone_dataset = fo.Dataset(name=fiftyone_dataset_name)

sample_count = 0
for split in ['train']:
    split_count = 0
    for item in hf_dataset[split]:
        # One file per row, named "<split>_<6-digit index>.jpg"
        image_filename = f"{split}_{split_count:06d}.jpg"
        image_path = os.path.join(persistent_dir, image_filename)

        # Save the PIL Image to the persistent file
        pil_image = item['image']
        try:
            pil_image.save(image_path)
        except OSError:
            # Pillow cannot write some modes (e.g. RGBA, P) as JPEG; store an
            # RGB copy instead of aborting the whole import.
            pil_image.convert("RGB").save(image_path)

        # Create FiftyOne sample
        sample = fo.Sample(filepath=image_path)
        sample['split'] = split

        # Copy every other column of the row onto the sample; the image
        # itself is already represented by `filepath`.
        for key, value in item.items():
            if key != 'image':
                sample[key] = value

        fiftyone_dataset.add_sample(sample)
        split_count += 1
        sample_count += 1

print(f"FiftyOne dataset created with {len(fiftyone_dataset)} samples.")
print(f"Images saved to: {os.path.abspath(persistent_dir)}")



In [None]:
# Compute metadata for the samples of the newly built dataset.
fiftyone_dataset.compute_metadata()

In [None]:
# Open the FiftyOne App on the part-2 dataset and print its URL.
session1 = fo.launch_app(fiftyone_dataset)
print(session1.url)

In [None]:
# List the available dataset names again, now that part 2 has been converted.
fo.list_datasets()

In [None]:
# Load both datasets by name.
# NOTE(review): the dataset populated earlier in this notebook is named
# "food_waste_dataset-2", not "food_waste_dataset" — confirm the intended name.
dataset1 = fo.load_dataset('food_waste_part_1')
dataset2 = fo.load_dataset('food_waste_dataset')


# Combine Food Waste Part 2 Dataset with Part 1 

In [None]:
# Load the two datasets that are compared and merged in the cells below.
# NOTE(review): the dataset populated earlier in this notebook is named
# "food_waste_dataset-2", not "food_waste_dataset" — confirm the intended name.
dataset1 = fo.load_dataset('food_waste_part_1')
dataset2 = fo.load_dataset('food_waste_dataset')

def diagnose_schema_conflicts(dataset1, dataset2):
    """Report top-level fields whose type differs between two datasets.

    Fields are compared by the class name of their schema entry. A field that
    exists in only one dataset is reported with ``"Missing"`` on the other
    side.

    Args:
        dataset1: Object whose ``get_field_schema(ftype=None,
            embedded_doc_type=None)`` returns a mapping of field name to
            field object.
        dataset2: Second object with the same interface.

    Returns:
        dict: ``{field_name: (type_in_dataset1, type_in_dataset2)}`` for every
        field whose two descriptions differ; empty when the schemas agree.
    """
    schema1 = dataset1.get_field_schema(ftype=None, embedded_doc_type=None)
    schema2 = dataset2.get_field_schema(ftype=None, embedded_doc_type=None)

    def type_name(field):
        # Test against None explicitly: relying on truthiness would silently
        # drop any field object that happens to evaluate as falsy.
        return "Missing" if field is None else type(field).__name__

    conflicts = {}
    for field_name in set(schema1) | set(schema2):
        name1 = type_name(schema1.get(field_name))
        name2 = type_name(schema2.get(field_name))
        if name1 != name2:
            conflicts[field_name] = (name1, name2)

    return conflicts

# Run the top-level comparison and print every differing or missing field
# as "<field>: <type in dataset1> vs <type in dataset2>".
conflicts = diagnose_schema_conflicts(dataset1, dataset2)
print("Schema conflicts:")
for field, (type1, type2) in conflicts.items():
    print(f"  {field}: {type1} vs {type2}")

# Identify differences between nested fields of the same type (e.g. List[str] vs. List[int])

In [None]:

def diagnose_detailed_schema_conflicts(dataset1, dataset2):
    """Compare two dataset schemas, including the inner types of container fields.

    Each field is reduced to a descriptive string and the two strings are
    compared. Beyond the field class name, the description includes:

    * ``ListField`` / ``DictField``: class name of the element field,
    * ``VectorField``: the ``dim`` attribute,
    * ``EmbeddedDocumentField``: name of the embedded document type.

    When that detail is unavailable it is rendered as ``Unknown``; a field
    absent from a schema is described as ``"Missing"``.

    Args:
        dataset1: First dataset (anything exposing ``get_field_schema``).
        dataset2: Second dataset.

    Returns:
        dict: ``{field_name: (description1, description2)}`` for every field
        whose descriptions differ.
    """
    schema1 = dataset1.get_field_schema(ftype=None, embedded_doc_type=None)
    schema2 = dataset2.get_field_schema(ftype=None, embedded_doc_type=None)

    def describe(field):
        """Return the comparison string for one schema entry."""
        if field is None:
            return "Missing"

        type_name = type(field).__name__

        if isinstance(field, fof.ListField):
            # Element type of the list
            inner = getattr(field, 'field', None)
            detail = "Unknown" if inner is None else type(inner).__name__
        elif isinstance(field, fof.VectorField):
            # Vector dimension
            dim = getattr(field, 'dim', None)
            detail = "dim=Unknown" if dim is None else f"dim={dim}"
        elif isinstance(field, fof.EmbeddedDocumentField):
            # Embedded document class
            doc_type = getattr(field, 'document_type', None)
            detail = "Unknown" if doc_type is None else doc_type.__name__
        elif isinstance(field, fof.DictField):
            # Value type of the dict
            inner = getattr(field, 'field', None)
            detail = "Unknown" if inner is None else type(inner).__name__
        else:
            # Plain field: the class name is the whole description
            return type_name

        return f"{type_name}({detail})"

    conflicts = {}
    for field_name in set(schema1.keys()) | set(schema2.keys()):
        description1 = describe(schema1.get(field_name))
        description2 = describe(schema2.get(field_name))
        if description1 != description2:
            conflicts[field_name] = (description1, description2)

    return conflicts

def print_detailed_conflicts(conflicts):
    """Print a grouped, human-readable report of schema conflicts.

    Conflicts are split into three sections, tested in this order:

    1. either side mentions ``ListField``,
    2. either side is ``Missing``,
    3. everything else ("regular" type conflicts).

    Sections are printed in the order ListField, regular, missing, and empty
    sections are skipped.

    Args:
        conflicts: Mapping ``{field_name: (description1, description2)}``.
    """
    if not conflicts:
        print("No schema conflicts found!")
        return

    print("Detailed Schema Conflicts:")
    print("=" * 50)

    list_bucket, regular_bucket, missing_bucket = {}, {}, {}
    for name, (left, right) in conflicts.items():
        if "ListField" in left or "ListField" in right:
            bucket = list_bucket
        elif "Missing" in left or "Missing" in right:
            bucket = missing_bucket
        else:
            bucket = regular_bucket
        bucket[name] = (left, right)

    # (section title, width of the dashed rule, entries)
    sections = (
        ("\nListField Element Type Conflicts:", 35, list_bucket),
        ("\nRegular Field Type Conflicts:", 30, regular_bucket),
        ("\nMissing Fields:", 15, missing_bucket),
    )

    for title, rule_width, entries in sections:
        if not entries:
            continue
        print(title)
        print("-" * rule_width)
        for name, (left, right) in entries.items():
            print(f"  {name}:")
            print(f"    Dataset1: {left}")
            print(f"    Dataset2: {right}")

# Run the detailed comparison on the two loaded datasets and print the report.
detailed_conflicts = diagnose_detailed_schema_conflicts(dataset1, dataset2)
print_detailed_conflicts(detailed_conflicts)

# Also get some sample data to understand the actual content
print("\nSample Data Analysis:")
print("=" * 25)

def analyze_sample_data(dataset, dataset_name, max_samples=3):
    """Print a preview of the list-valued fields of a few samples.

    For up to ``max_samples`` samples obtained from ``dataset.take``, the
    first five ``ListField`` fields of the schema are inspected. For every
    such field that is present and non-empty on the sample, the first three
    values, the list length and the Python type names of those three values
    are printed.

    Args:
        dataset: Dataset to inspect; must provide ``take`` and
            ``get_field_schema``.
        dataset_name: Label used in the printed header.
        max_samples: Number of samples to request from ``dataset.take``.
    """
    print(f"\n{dataset_name} Sample Data:")

    # The schema does not depend on the sample, so scan it once up front
    # instead of once per sample.
    listfield_names = [
        field_name
        for field_name, field in dataset.get_field_schema().items()
        if isinstance(field, fof.ListField)
    ]
    previewed_fields = listfield_names[:5]  # Show first 5 ListFields

    for i, sample in enumerate(dataset.take(max_samples)):
        print(f"\nSample {i+1}:")

        for field_name in previewed_fields:
            if not hasattr(sample, field_name):
                continue
            value = getattr(sample, field_name)
            if value is not None and len(value) > 0:
                print(f"  {field_name}: {value[:3]}... (len={len(value)}, types={[type(x).__name__ for x in value[:3]]})")

# Preview list-field contents for both datasets (default: 3 samples each).
analyze_sample_data(dataset1, "Dataset1")
analyze_sample_data(dataset2, "Dataset2")

# Create a new unified schema and convert all entries to their respective types

In [None]:

def safe_resolve_conflicts_fixed(dataset1, dataset2, target_name):
    """Merge two FiftyOne datasets into a new dataset with coerced field types.

    Every sample of ``dataset2`` and then of ``dataset1`` is rebuilt as a
    fresh ``fo.Sample`` (same ``filepath``) whose known fields are converted
    to one fixed Python type, so both sources end up with the same schema.
    ``dataset2`` is added first so that its samples establish the schema of
    the new dataset.

    An existing dataset called ``target_name`` is deleted first. The new
    dataset is marked persistent once all samples have been added.

    Conversion rules:
        * fields in ``listfield_conversions`` become lists of the target
          type; entries that cannot be converted are dropped, and a field
          left with no entries is not set at all;
        * fields in ``single_field_conversions`` become a scalar of the
          target type, or are not set when conversion fails;
        * all other non-``None`` fields are copied unchanged.

    NOTE(review): the ``*_after`` fields are converted with
    ``int(float(value))``, which truncates fractional values — confirm that
    integer precision is intended for them.
    NOTE(review): dropping unconvertible list entries shortens that list; if
    the per-plate lists are meant to be index-aligned with each other, this
    can shift entries — confirm.

    Args:
        dataset1: First source dataset (added second).
        dataset2: Second source dataset (added first).
        target_name: Name of the dataset to create.

    Returns:
        The newly created, persistent unified dataset.
    """

    # Replace any previous dataset of this name. Checking for existence
    # explicitly (rather than swallowing every exception) lets real failures
    # while loading or deleting surface immediately.
    if fo.dataset_exists(target_name):
        fo.load_dataset(target_name).delete()
        print(f"Deleted existing dataset: {target_name}")

    # Create new unified dataset
    unified = fo.Dataset(target_name)

    # Target element type for list-valued fields
    listfield_conversions = {
        # Nutritional data should be floats
        'kcal_per_plate': float,
        'kj_per_plate': float,
        'fat_per_plate': float,
        'saturated_fat_per_plate': float,
        'carbohydrates_per_plate': float,
        'sugar_per_plate': float,
        'protein_per_plate': float,
        'salt_per_plate': float,
        'weight_per_portion': float,
        'weight_per_plate': float,
        'return_quantity': float,
        'return_percentage': float,

        # Portions should be integers
        'number_of_portions': int,

        # IDs and names should be strings
        'bon_id': str,
        'article_number': str,
        'ingredient_name': str,
        'piece_article': str,
    }

    # Target type for scalar fields
    single_field_conversions = {
        # Before measurements (floats)
        'salt_before': float,
        'carbohydrates_before': float,
        'kcal_before': float,
        'saturated_fat_before': float,
        'kj_before': float,
        'fat_before': float,
        'sugar_before': float,
        'protein_before': float,

        # After measurements (integers)
        'saturated_fat_after': int,
        'fat_after': int,
        'kcal_after': int,
        'kj_after': int,
        'protein_after': int,
        'sugar_after': int,
        'salt_after': int,
        'carbohydrates_after': int,
    }

    def convert_value(value, target_type):
        """Convert one value to ``target_type``; return None when impossible."""
        if value is None or value == '' or value == 'None':
            return None

        try:
            if target_type == float:
                return float(value)
            elif target_type == int:
                # Go through float so strings such as "3.0" are accepted
                return int(float(value))
            elif target_type == str:
                return str(value)
            else:
                return value
        except (ValueError, TypeError, OverflowError):
            # OverflowError: int(float("inf")); treat it like any other
            # unconvertible value instead of losing the whole sample.
            return None

    def convert_list_values(value_list, target_type):
        """Convert a list element-wise, dropping unconvertible entries.

        Returns None for a None input or when no entry survives. A non-list
        input is converted as a single value and wrapped in a list (an empty
        list when that conversion fails).
        """
        if value_list is None:
            return None

        if not isinstance(value_list, list):
            converted = convert_value(value_list, target_type)
            return [converted] if converted is not None else []

        converted_list = []
        for item in value_list:
            converted = convert_value(item, target_type)
            if converted is not None:
                converted_list.append(converted)

        return converted_list if converted_list else None

    def create_clean_sample(original_sample):
        """Build a new sample from ``original_sample`` with converted fields."""
        new_sample = fo.Sample(filepath=original_sample.filepath)

        for field_name in original_sample.field_names:
            # `filepath` is already set; `id` is not copied to the new sample
            if field_name in ['id', 'filepath']:
                continue

            value = original_sample[field_name]

            # Skip None values entirely to avoid schema inference issues
            if value is None:
                continue

            if field_name in listfield_conversions:
                converted_value = convert_list_values(
                    value, listfield_conversions[field_name]
                )
                # Only set the field if we have valid data
                if converted_value is not None and len(converted_value) > 0:
                    new_sample[field_name] = converted_value

            elif field_name in single_field_conversions:
                converted_value = convert_value(
                    value, single_field_conversions[field_name]
                )
                # Only set the field if we have valid data
                if converted_value is not None:
                    new_sample[field_name] = converted_value

            else:
                # Any other field is copied as-is; a field that cannot be
                # set is reported and skipped, the sample itself is kept.
                try:
                    new_sample[field_name] = value
                except Exception as e:
                    print(f"Warning: Could not set field {field_name}: {e}")
                    continue

        return new_sample

    batch_size = 100

    # Process Dataset2 first (it has cleaner schema)
    print("Processing Dataset2 samples first...")
    dataset2_samples = []
    for sample in dataset2.iter_samples(progress=True):
        try:
            dataset2_samples.append(create_clean_sample(sample))
        except Exception as e:
            print(f"Error processing Dataset2 sample {sample.id}: {e}")
            continue

    # Add Dataset2 samples first to establish schema
    print("Adding Dataset2 samples to establish schema...")
    if dataset2_samples:
        unified.add_samples(dataset2_samples[:10])  # Add first 10 to establish schema

        # Add remaining Dataset2 samples in batches
        remaining_samples = dataset2_samples[10:]
        for i in range(0, len(remaining_samples), batch_size):
            unified.add_samples(remaining_samples[i:i + batch_size])

    # Now process Dataset1 samples
    print("Processing Dataset1 samples...")
    dataset1_samples = []
    for sample in dataset1.iter_samples(progress=True):
        try:
            dataset1_samples.append(create_clean_sample(sample))
        except Exception as e:
            print(f"Error processing Dataset1 sample {sample.id}: {e}")
            continue

    # Add Dataset1 samples in batches
    print("Adding Dataset1 samples...")
    for i in range(0, len(dataset1_samples), batch_size):
        batch = dataset1_samples[i:i + batch_size]
        try:
            unified.add_samples(batch)
        except Exception as e:
            print(f"Error adding batch {i//batch_size + 1}: {e}")
            # Retry one by one so a single bad sample does not cost the
            # whole batch
            for j, sample in enumerate(batch):
                try:
                    unified.add_sample(sample)
                except Exception as sample_error:
                    print(f"  Failed to add sample {i+j}: {sample_error}")
                    continue

    unified.persistent = True

    print(f"Successfully created unified dataset with {len(unified)} samples")

    # Print final schema for verification
    print("\nFinal schema for key fields:")
    schema = unified.get_field_schema()
    key_fields = ['kcal_per_plate', 'kj_per_plate', 'kcal_before', 'kcal_after']
    for field_name in key_fields:
        if field_name in schema:
            field_type = type(schema[field_name]).__name__
            print(f"  {field_name}: {field_type}")

    return unified

# Usage
# Build the merged dataset under the name "food_waste_final_v11".
unified_dataset = safe_resolve_conflicts_fixed(dataset1, dataset2, "food_waste_final_v11")

In [None]:
# Reload the merged dataset by name and check that its size equals the sum of
# the two source datasets; False means the sample counts do not add up.
unified_ds = fo.load_dataset("food_waste_final_v11")
print(f"All entries are merged: {len(unified_ds) == len(dataset2)+len(dataset1)}")

In [None]:
# Open the FiftyOne App on the merged dataset and print its URL.
# This rebinds `session`, which earlier referred to the part-1 App session.
session = fo.launch_app(unified_ds)
print(session.url)

# Save the dataset externally to Google Drive

In [None]:
# Mount Google Drive (`drive` comes from the `google.colab` import in the
# earlier mount cell).
drive.mount('/content/drive')

# Export your dataset to Drive
export_path = "/content/drive/MyDrive/food_waste_merged_v11"

# Export as FiftyOne dataset (preserves all metadata, embeddings, etc.)
unified_dataset.export(
    export_dir=export_path,
    dataset_type=fo.types.FiftyOneDataset
)

print(f"Dataset exported to: {export_path}")

In [None]:
# Display the merged dataset as the cell output.
unified_ds