In [None]:
# Block 5.0: Filter IDs based on Copilot Processed folder and update JSONs
import os
import pandas as pd
import json

# ---------------------------------------------------------------------------
# Helpers for Block 5.0. They live at module level so they are defined (and
# reusable from later cells) regardless of whether the input paths exist.
# ---------------------------------------------------------------------------

# Keys this block owns inside each JSON file; any existing values under these
# keys are discarded and re-inserted from the TSV.
PUBLICATION_ID_KEYS = ('publication/pmid', 'publication/pmcid')


def clean_pmid(val):
    """Normalise a raw PMID cell to a plain string.

    Args:
        val: Raw cell value; may be a float (e.g. ``12345.0``), an int, a
            string, ``None`` or NaN.

    Returns:
        ``None`` for missing, empty or whitespace-only values; the integer
        rendered as a string when the value is numeric (``12345.0`` ->
        ``"12345"``); otherwise the value's (stripped) string form.
    """
    if isinstance(val, str):
        val = val.strip()
    if pd.isna(val) or val == '':
        return None
    try:
        # Go through float first so "12345.0" loses its decimal part.
        return str(int(float(val)))
    except (TypeError, ValueError, OverflowError):
        # Not numeric (or inf): keep whatever text was there.
        return str(val)


def clean_pmcid(val):
    """Return the PMCID as a stripped string, or ``None`` when it is missing."""
    return str(val).strip() if pd.notna(val) else None


def json_stems(filenames):
    """Return the set of names in *filenames* ending in ``.json``, extension removed.

    Only the trailing ``.json`` is removed, so ``"PMC.json2.json"`` yields
    ``"PMC.json2"``. Names that do not end in ``.json`` are ignored.
    """
    suffix = '.json'
    return {name[:-len(suffix)] for name in filenames if name.endswith(suffix)}


def insert_publication_ids(data, pmid_val, pmcid_val):
    """Return a copy of *data* with the publication ID keys (re)inserted.

    Target order: ..., publication/year, publication/pmid, publication/pmcid, ...
    Existing ``publication/pmid`` / ``publication/pmcid`` entries are dropped
    and the new values are placed immediately after ``publication/year``; when
    that key is absent they are appended at the end.

    Raises:
        TypeError: if *data* is not a dict (the top-level JSON value must be
            an object for keys to be inserted).
    """
    if not isinstance(data, dict):
        raise TypeError(
            f"expected a JSON object at top level, got {type(data).__name__}"
        )

    new_data = {}
    inserted = False
    for key, value in data.items():
        if key in PUBLICATION_ID_KEYS:
            continue
        new_data[key] = value
        if key == 'publication/year':
            new_data['publication/pmid'] = pmid_val
            new_data['publication/pmcid'] = pmcid_val
            inserted = True

    # Fallback: 'publication/year' was not found, add the IDs at the end.
    if not inserted:
        new_data['publication/pmid'] = pmid_val
        new_data['publication/pmcid'] = pmcid_val
    return new_data


def update_json_file(json_file_path, pmid, pmcid):
    """Rewrite one JSON file in place with its PMID/PMCID inserted.

    Args:
        json_file_path: Path of the JSON file to read and overwrite.
        pmid: Cleaned PMID string; ``None``, NaN or empty is stored as ``""``.
        pmcid: PMCID string, stored as given.

    Raises:
        OSError: the file cannot be read or written.
        ValueError: the file is not valid JSON / not decodable as UTF-8.
        TypeError: the top-level JSON value is not an object.
    """
    # Explicit UTF-8: JSON metadata must not depend on the platform locale.
    with open(json_file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # NaN is truthy, so test for it explicitly; otherwise json.dump would
    # emit a bare NaN token, which is not valid JSON.
    pmid_val = "" if pd.isna(pmid) or not pmid else pmid
    new_data = insert_publication_ids(data, pmid_val, pmcid)

    with open(json_file_path, 'w', encoding='utf-8') as f:
        json.dump(new_data, f, indent=2)


# ---------------------------------------------------------------------------
# Block 5.0 main flow. Kept at top level so that df, df_ids, json_pmcids,
# df_filtered, count and updated_count remain available to later cells.
# ---------------------------------------------------------------------------
print("=" * 60)
print("FILTERING AND UPDATING JSON METADATA (Block 5.0)")
print("=" * 60)

# 1. Define paths
input_tsv_path = 'Positive_PMC_TSV_Files/positive_entries_status.tsv'
json_folder_path = 'Copilot_1000_v0_Processed_2026-01-15'
filtered_tsv_output = 'Positive_PMC_TSV_Files/positive_entries_pmid_pmcid_filtered.tsv'

try:
    tsv_exists = os.path.exists(input_tsv_path)
    folder_exists = os.path.exists(json_folder_path)

    if tsv_exists and folder_exists:
        # 2. Read TSV
        print(f"Reading TSV: {input_tsv_path}")
        df = pd.read_csv(input_tsv_path, sep='\t')

        # 3. Extract and clean IDs (PMID may have been read as float)
        df_ids = df[['PMID', 'PMCID']].copy()
        df_ids['PMID'] = df_ids['PMID'].apply(clean_pmid)
        df_ids['PMCID'] = df_ids['PMCID'].apply(clean_pmcid)

        # 4. Collect the PMCIDs present in the folder; filenames are assumed
        #    to look like "PMC12345.json".
        json_files = [f for f in os.listdir(json_folder_path) if f.endswith('.json')]
        json_pmcids = json_stems(json_files)
        print(f"Found {len(json_pmcids)} JSON files in {json_folder_path}")

        # 5. Keep only rows whose PMCID has a JSON file in the folder
        df_filtered = df_ids[df_ids['PMCID'].isin(json_pmcids)].copy()
        count = len(df_filtered)
        print(f"Entries matching JSON files: {count}")

        # 6. Save the filtered TSV
        df_filtered.to_csv(filtered_tsv_output, sep='\t', index=False)
        print(f"Saved filtered TSV to: {filtered_tsv_output}")

        # 7. Update JSON files
        print("Updating JSON files with publication IDs (checking order)...")
        updated_count = 0

        for pmcid, pmid in zip(df_filtered['PMCID'], df_filtered['PMID']):
            if not pmcid:
                continue

            json_file_path = os.path.join(json_folder_path, f"{pmcid}.json")
            # The file may have disappeared since the folder was listed;
            # such entries are skipped silently.
            if not os.path.exists(json_file_path):
                continue

            try:
                update_json_file(json_file_path, pmid, pmcid)
            except (OSError, ValueError, TypeError) as e:
                # Best effort: report the failing file and carry on.
                print(f"Error updating {pmcid}.json: {e}")
            else:
                updated_count += 1

        print(f"Successfully updated {updated_count} JSON files.")

    else:
        print("Error: Input file or folder not found.")
        print(f"TSV: {tsv_exists}")
        print(f"Folder: {folder_exists}")

except Exception as e:
    # Top-level boundary of the notebook cell: report instead of raising.
    print(f"An error occurred: {str(e)}")


FILTERING AND UPDATING JSON METADATA (Block 5.0)
Reading TSV: Positive_PMC_TSV_Files/positive_entries_status.tsv
Found 1012 JSON files in Copilot_1000_v0_Processed_2026-01-15
Entries matching JSON files: 1012
Saved filtered TSV to: Positive_PMC_TSV_Files/positive_entries_pmid_pmcid_filtered.tsv
Updating JSON files with publication IDs...
Successfully updated 1012 JSON files.
