In [5]:
# Top-level keys stored as a single-key wrapper object, mapped to
# (flattened TSV column, wrapper key).
_WRAPPED_FIELDS = {
    '_id': ('_id/$oid', '$oid'),
    'user': ('user/$oid', '$oid'),
    'created': ('created/$date', '$date'),
    'updated': ('updated/$date', '$date'),
}

# Top-level keys whose value is a nested object flattened in the TSV as "key/sub_key".
_NESTED_SECTIONS = ('dataset', 'evaluation', 'model', 'optimization', 'publication')

# Sub-fields / top-level fields that are emitted as integers.
_INT_SUB_FIELDS = ('done', 'skip', 'update', '__v', 'score')
_INT_TOP_FIELDS = ('update', '__v', 'score')


def _clean_val(val):
    """Return "" for missing values, otherwise the value as a native Python object."""
    if pd.isna(val):
        return ""
    # numpy scalars expose `.item()`; unwrapping keeps the result JSON-serialisable.
    return val.item() if hasattr(val, "item") else val


def _clean_int(val):
    """Coerce a value to int, returning 0 when it is missing or not numeric."""
    if pd.isna(val):
        return 0
    try:
        return int(val)
    except (TypeError, ValueError, OverflowError):
        pass
    try:
        # int() rejects numeric strings with a decimal part such as "2.0".
        return int(float(val))
    except (TypeError, ValueError, OverflowError):
        return 0


def _clean_bool(val):
    """Coerce a value to bool, returning False when it is missing or unrecognised."""
    if pd.isna(val):
        return False
    # Covers Python and numpy booleans/numbers (e.g. 1.0 / 0.0, numpy.bool_).
    if pd.api.types.is_bool(val) or pd.api.types.is_number(val):
        return bool(val)
    if isinstance(val, str):
        return val.strip().lower() in ("true", "yes", "1")
    return False


def _ordered_sub_keys(section, sub_template):
    """List the sub-keys to emit for a nested section, in template order.

    For 'publication', 'pmcid' is always included: directly after 'pmid' when
    the template does not already list it earlier, or at the end when there is
    no 'pmid'.
    """
    sub_keys = list(sub_template.keys()) if isinstance(sub_template, dict) else []
    if section != 'publication':
        return sub_keys

    ordered = []
    for name in sub_keys:
        ordered.append(name)
        if name == 'pmid':
            ordered.append('pmcid')
    if 'pmcid' not in ordered:
        ordered.append('pmcid')
    # dict.fromkeys drops repeats while keeping first-occurrence order.
    return list(dict.fromkeys(ordered))


def _convert_sub_field(sub_key, val):
    """Convert one nested-section value according to its sub-key."""
    if sub_key in _INT_SUB_FIELDS:
        return _clean_int(val)
    if sub_key == 'tags':
        # Missing/empty becomes []; anything else is wrapped as a one-element list.
        if pd.isna(val) or val == "":
            return []
        return [str(val)]
    if sub_key == 'public':
        return _clean_bool(val)
    if sub_key == 'year':
        # Year is emitted as a string (e.g. "2020"), or "" when missing.
        return str(_clean_int(val)) if pd.notna(val) else ""
    return _clean_val(val)


def transform_row_to_json(row, template_keys, template=None):
    """Convert one flattened TSV row into a nested JSON-style entry.

    Args:
        row: Object with a ``.get(column)`` method returning the cell value or
            None (a pandas Series from ``iterrows()`` or a plain dict).
        template_keys: Iterable of top-level keys to emit; output keys follow
            this order.
        template: Optional dict whose nested sections define which sub-keys are
            emitted. When omitted, the notebook-level ``template_item`` is used,
            so existing two-argument calls keep working.

    Returns:
        dict: A new entry with one item per key in ``template_keys``.
    """
    if template is None:
        # Backward-compatible fallback to the template loaded at notebook level.
        template = template_item  # noqa: F821

    new_entry = {}
    for key in template_keys:
        if key in _WRAPPED_FIELDS:
            column, wrapper = _WRAPPED_FIELDS[key]
            raw = row.get(column)
            # A missing id/date still gets its wrapper, holding an empty string.
            new_entry[key] = {wrapper: str(raw) if pd.notna(raw) else ""}

        elif key in _NESTED_SECTIONS:
            sub_template = template.get(key, {})
            new_entry[key] = {
                sub_key: _convert_sub_field(sub_key, row.get(f"{key}/{sub_key}"))
                for sub_key in _ordered_sub_keys(key, sub_template)
            }

        elif key == 'public':
            new_entry[key] = _clean_bool(row.get(key))

        elif key in _INT_TOP_FIELDS:
            new_entry[key] = _clean_int(row.get(key))

        elif key == 'Duplicate_shortid':
            # No TSV column is read for this key; it is emitted as an empty list.
            new_entry[key] = []

        else:
            # Remaining fields (e.g. uuid, reviewState, shortid) are passed through.
            new_entry[key] = _clean_val(row.get(key))

    return new_entry

# Build one JSON entry per TSV row. `target_keys` decides which top-level keys
# each entry carries and in which order.
new_json_data = [
    transform_row_to_json(tsv_row, target_keys)
    for _, tsv_row in df_v32.iterrows()
]

print(f"Transformed {len(new_json_data)} entries.")

# Reporting: compare the TSV column names with the flattened JSON key names.
tsv_cols = set(df_v32.columns)

# Wrapper objects map onto exactly one flattened column name each.
_wrapped_columns = {
    '_id': '_id/$oid',
    'user': 'user/$oid',
    'created': 'created/$date',
    'updated': 'updated/$date',
}
_nested_sections = ('dataset', 'evaluation', 'model', 'optimization', 'publication')

json_keys_flat = set()
for top_key in target_keys:
    if top_key in _nested_sections:
        # Prefer the first transformed entry over the template so that sub-keys
        # introduced during transformation show up in the report.
        if new_json_data and top_key in new_json_data[0]:
            sub_source = new_json_data[0][top_key]
        else:
            sub_source = template_item[top_key]
        json_keys_flat.update(f"{top_key}/{sub_name}" for sub_name in sub_source.keys())
    else:
        # Plain fields are compared under their own name.
        json_keys_flat.add(_wrapped_columns.get(top_key, top_key))

dropped_cols = tsv_cols.difference(json_keys_flat)
added_keys = json_keys_flat.difference(tsv_cols)

print("\n--- Report ---")
print(f"Original V32 count: {len(df_v32)}")
print(f"New JSON count: {len(new_json_data)}")
print(f"Fields dropped from TSV (not in JSON schema): {dropped_cols}")
print(f"Fields added to JSON (not in TSV): {added_keys}")

# Write every transformed entry to disk as one indented JSON array.
output_filename = 'v32_Dome-Recommendations-With_Provenance_reformatted.json'
with open(output_filename, 'w') as out_file:
    json.dump(new_json_data, out_file, indent=4)

print(f"\nSaved reformatted JSON to {output_filename}")

Transformed 270 entries.

--- Report ---
Original V32 count: 270
New JSON count: 270
Fields dropped from TSV (not in JSON schema): {'provenance_source'}
Fields added to JSON (not in TSV): set()

Saved reformatted JSON to v32_Dome-Recommendations-With_Provenance_reformatted.json


In [3]:
import json
import pandas as pd
import numpy as np

# Load source data (tab-separated; one row per entry, nested fields flattened
# into "section/field" column names).
v32_path = 'v32_Dome-Recommendations-With_Provenance.tsv'
df_v32 = pd.read_csv(v32_path, sep='\t')

# Load template JSON to understand schema.
# The encoding is given explicitly: open() otherwise falls back to the platform's
# locale encoding, which can mis-decode non-ASCII text in a UTF-8 JSON file.
json_template_path = 'dome_review_raw_human_20260204.json'
with open(json_template_path, 'r', encoding='utf-8') as f:
    json_data = json.load(f)

# Use the first entry as the schema template. This assumes every entry shares
# the same keys -- TODO confirm if the schema can vary between entries.
# Its top-level keys (in order) become the target structure for the reformatted
# v32 data.
template_item = json_data[0]
target_keys = list(template_item.keys())

print(f"Loaded {len(df_v32)} entries from TSV.")
print(f"Loaded {len(json_data)} entries from JSON template.")
print(f"Target Schema Keys: {target_keys}")

Loaded 270 entries from TSV.
Loaded 354 entries from JSON template.
Target Schema Keys: ['_id', 'dataset', 'evaluation', 'model', 'optimization', 'user', 'publication', 'public', 'created', 'updated', 'uuid', 'reviewState', 'shortid', 'update', '__v', 'score']
