In [4]:
import pandas as pd
import json

# Read the three input tables: the training rows, the per-category attribute
# lists (parquet), and the (taskName, labelName) -> labelId mapping.
df_train = pd.read_csv(
    '/Users/guptatilak/Documents/visual-taxonomy-meesho/VisualFashionAttributePrediction/Data/iMaterialist/train.csv'
)
df_parquet = pd.read_parquet(
    '/Users/guptatilak/Documents/visual-taxonomy-meesho/VisualFashionAttributePrediction/Data/iMaterialist/category_attributes.parquet'
)
df_mapping = pd.read_csv(
    '/Users/guptatilak/Documents/visual-taxonomy-meesho/VisualFashionAttributePrediction/Data/iMaterialist/attribute_mapping.csv'
)

# Trim and lower-case both key columns of the mapping table so that lookups
# are not defeated by case differences or stray whitespace.
for key_column in ('taskName', 'labelName'):
    df_mapping[key_column] = df_mapping[key_column].str.strip().str.lower()

# Lookup table keyed by the (taskName, labelName) pair, giving the labelId.
label_id_series = df_mapping.set_index(['taskName', 'labelName'])['labelId']
task_label_mapping = label_id_series.to_dict()


def _normalize_attribute_names(raw_names):
    """Return the attribute names of one category, trimmed and lower-cased."""
    return [name.strip().lower() for name in raw_names]


# Apply the same normalisation to the attribute names stored in the parquet
# file so they can be matched against the normalised taskName values.
df_parquet['Attribute_list'] = df_parquet['Attribute_list'].apply(_normalize_attribute_names)

# Lookup table from Category to its (normalised) attribute-name list.
attribute_list_series = df_parquet.set_index('Category')['Attribute_list']
category_attributes = attribute_list_series.to_dict()

# Columns of train.csv that hold attribute values. The i-th column is paired
# (via zip) with the i-th attribute name of the row's category, so categories
# with fewer attributes simply ignore the trailing columns.
ATTRIBUTE_COLUMNS = [f'attr_{position}' for position in range(1, 11)]


def _normalize_attr_value(raw_value):
    """Return *raw_value* trimmed and lower-cased, or None when it is missing.

    The cell is converted with str() first: pandas may have parsed a column
    as numeric, and calling .strip() directly on such a value would raise
    AttributeError.
    """
    if pd.isna(raw_value):
        return None
    return str(raw_value).strip().lower()


# Accumulators for the "images" and "annotations" sections of the output.
images_list = []
annotations_list = []

# Process each row in the train.csv
for _row_index, row in df_train.iterrows():
    category = row['Category']

    # Attribute names for this category; an unknown category yields no
    # attributes and therefore an empty labelId list for the row.
    attribute_list = category_attributes.get(category, [])

    # Zero-padded id, computed once and shared by the image and annotation
    # entries so the two always match.
    image_id = str(row['id']).zfill(6)

    images_list.append({
        "url": "https://example.com/random_image.jpg",  # Placeholder URL
        "imageId": image_id,
    })

    # Translate each present attribute value into its labelId.
    label_ids = []
    for attr_col, task_name in zip(ATTRIBUTE_COLUMNS, attribute_list):
        attr_value = _normalize_attr_value(row[attr_col])
        if not attr_value:
            # Missing or blank cell: nothing to annotate for this attribute.
            continue

        lookup_key = (task_name, attr_value)
        if lookup_key in task_label_mapping:
            label_ids.append(str(task_label_mapping[lookup_key]))
        else:
            print(f"Warning: TaskName '{task_name}' with value '{attr_value}' not found in mapping!")

    annotations_list.append({
        "labelId": label_ids,
        "imageId": image_id,
    })

# Combine both accumulators into the final document: one "images" section
# and one "annotations" section.
output_data = dict(images=images_list, annotations=annotations_list)

# Serialise with 4-space indentation and write the result to disk.
serialized_output = json.dumps(output_data, indent=4)
with open('/Users/guptatilak/Documents/visual-taxonomy-meesho/VisualFashionAttributePrediction/Data/iMaterialist/images_data_with_annotations.json', 'w') as output_handle:
    output_handle.write(serialized_output)

print("JSON file with annotations created successfully!")


JSON file with annotations created successfully!
