In [None]:
import pandas as pd
import json
import os
import ast

# Define Paths
v33_path = 'v33_Dome-Recommendations-With_Provenance_Verified.tsv'
# JSON export whose first record is used as the structural template for the output
template_path = 'dome_review_raw_human_20260204.json'

# Load Dataset
df_v33 = pd.read_csv(v33_path, sep='\t')
print(f"Loaded {len(df_v33)} rows from {v33_path}")

# Load Template - either a list of objects (the first one is taken as the schema)
# or a single object. JSON is UTF-8 by specification, so the encoding is stated
# explicitly instead of relying on the platform default.
with open(template_path, 'r', encoding='utf-8') as f:
    template_data = json.load(f)

if isinstance(template_data, list):
    if not template_data:
        # An empty list has no record to use as a schema; fail with a clear message
        # instead of an AttributeError on `.keys()` further down.
        raise ValueError(f"Template file {template_path} contains an empty list; no schema record available")
    template_item = template_data[0]
else:
    template_item = template_data

if not isinstance(template_item, dict):
    raise TypeError(
        f"Template record in {template_path} must be a JSON object, got {type(template_item).__name__}"
    )

# Use the keys from the template as target keys
target_keys = list(template_item.keys())
print("Template keys loaded from first item in dome_review_raw_human_20260204.json:", target_keys)

: 

In [None]:
def transform_row_to_json(row, template):
    """Map one flat TSV row onto the nested structure of ``template``.

    Parameters
    ----------
    row : pandas.Series or dict
        Any object exposing ``.get(column)``. Top-level fields are looked up
        under their own name; nested fields under ``"<key>/<sub_key>"``
        (for example ``"_id/$oid"`` or ``"created/$date"``).
    template : dict
        Example record. Its keys (in order) define the keys of the result and
        the Python type of each template value selects the coercion applied.

    Returns
    -------
    dict
        New record with the same keys as ``template``. Missing values become
        ``""`` (strings), ``0`` (ints), ``False`` (bools) or ``[]`` (lists).
    """

    def clean_val(val):
        """Return ``val`` as a string, or ``""`` when it is missing."""
        if pd.isna(val):
            return ""
        return str(val)

    def clean_id(val):
        """Stringify an identifier, dropping a spurious ``.0`` float suffix.

        A numeric column that contains gaps is read by pandas as float, which
        would otherwise turn an id such as 32344344 into ``"32344344.0"``.
        """
        if pd.isna(val):
            return ""
        if isinstance(val, float) and val.is_integer():
            return str(int(val))
        return str(val)

    def clean_int(val):
        """Coerce ``val`` to int; missing or unparsable values become 0."""
        if pd.isna(val):
            return 0
        try:
            return int(float(val))  # float cast first so "1.0" is accepted
        except (ValueError, TypeError, OverflowError):
            return 0

    def clean_bool(val):
        """Coerce ``val`` to bool; only true/1/1.0/yes (case-insensitive) are True."""
        if pd.isna(val):
            return False
        if isinstance(val, bool):
            return val
        return str(val).lower() in ['true', '1', '1.0', 'yes']

    def clean_list(val):
        """Parse a Python-literal list string; fall back to a one-element list."""
        if pd.isna(val) or val == "":
            return []
        try:
            parsed = ast.literal_eval(str(val))
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Not a Python literal (e.g. free text): keep the raw text as one item.
            return [str(val)]
        return parsed if isinstance(parsed, list) else [str(val)]

    def coerce(val, example):
        """Coerce ``val`` according to the type of the template value ``example``."""
        # bool must be tested before int: bool is a subclass of int, so the
        # reverse order would route every boolean field through clean_int.
        if isinstance(example, bool):
            return clean_bool(val)
        if isinstance(example, int):
            return clean_int(val)
        if isinstance(example, list):
            return clean_list(val)
        return clean_val(val)

    # Keys stored as a single-entry wrapper object: key -> wrapper marker.
    # The TSV column is named "<key>/<marker>".
    wrapped_keys = {'_id': '$oid', 'user': '$oid', 'created': '$date', 'updated': '$date'}

    new_entry = {}
    for key, template_val in template.items():

        # 1. Wrapped ids / dates: always emit the wrapper, empty string if missing
        if key in wrapped_keys:
            marker = wrapped_keys[key]
            raw = row.get(f"{key}/{marker}")
            new_entry[key] = {marker: str(raw) if pd.notna(raw) else ""}

        # 2. Nested objects: one TSV column per sub-key
        elif isinstance(template_val, dict):
            nested = {}
            for sub_key, sub_template in template_val.items():
                val = row.get(f"{key}/{sub_key}")
                if sub_key == 'pmid':
                    # Always a string, regardless of the template value's type
                    nested[sub_key] = clean_id(val)
                elif sub_key == 'tags':
                    # Always a list, regardless of the template value's type
                    nested[sub_key] = clean_list(val)
                else:
                    nested[sub_key] = coerce(val, sub_template)
            new_entry[key] = nested

        # 3. Simple top-level fields
        else:
            new_entry[key] = coerce(row.get(key), template_val)

    return new_entry

# Build one template-shaped record per TSV row
new_json_data = [
    transform_row_to_json(record, template_item)
    for _, record in df_v33.iterrows()
]
# Number of rows processed (kept as a module-level name)
count = len(new_json_data)

print(f"Transformed {len(new_json_data)} entries.")

# Write all records to disk as a single pretty-printed JSON array
output_filename = 'v33_Dome-Recommendations-With_Provenance_reformatted.json'
with open(output_filename, 'w') as out_file:
    json.dump(new_json_data, out_file, indent=4)

print(f"\nSaved reformatted JSON to {output_filename}")

Transformed 270 entries.

--- Report ---
Original V32 count: 270
New JSON count: 270
Fields dropped from TSV (not in JSON schema): {'provenance_source'}
Fields added to JSON (not in TSV): set()

Saved reformatted JSON to v32_Dome-Recommendations-With_Provenance_reformatted.json
