In [3]:
# Block 5.0: Filter IDs based on Copilot Processed folder and update JSONs
import os
import pandas as pd
import json

print("="*60)
print("FILTERING AND UPDATING JSON METADATA (Block 5.0)")
print("="*60)

# 1. Define paths
input_tsv_path = 'Positive_PMC_TSV_Files/positive_entries_status.tsv'
json_folder_path = 'Copilot_1000_v0_Processed_2026-01-15'
filtered_tsv_output = 'Positive_PMC_TSV_Files/positive_entries_pmid_pmcid_filtered.tsv'

try:
    if os.path.exists(input_tsv_path) and os.path.exists(json_folder_path):
        # 2. Read TSV
        print(f"Reading TSV: {input_tsv_path}")
        df = pd.read_csv(input_tsv_path, sep='\t')
        
        # 3. Extract and Clean IDs
        # We need PMCID and PMID. Note: PMID might be float from previous steps
        # Create a simplified copy
        df_ids = df[['PMID', 'PMCID']].copy()
        
        def clean_pmid(val):
            if pd.isna(val) or val == '':
                return None
            try:
                # Convert to float then int to drop decimal, then string
                return str(int(float(val)))
            except:
                return str(val)

        df_ids['PMID'] = df_ids['PMID'].apply(clean_pmid)
        df_ids['PMCID'] = df_ids['PMCID'].apply(lambda x: str(x).strip() if pd.notna(x) else None)
        
        # 4. Get list of JSON files to filter against
        json_files = [f for f in os.listdir(json_folder_path) if f.endswith('.json')]
        # Create a set of PMCIDs from filenames (remove .json extension)
        # Assuming filenames are like "PMC12345.json"
        json_pmcids = set(f.replace('.json', '') for f in json_files)
        
        print(f"Found {len(json_pmcids)} JSON files in {json_folder_path}")
        
        # 5. Filter the DataFrame
        # Keep row if its PMCID matches one in the folder
        df_filtered = df_ids[df_ids['PMCID'].isin(json_pmcids)].copy()
        
        count = len(df_filtered)
        print(f"Entries matching JSON files: {count}")
        
        # 6. Save the filtered TSV
        df_filtered.to_csv(filtered_tsv_output, sep='\t', index=False)
        print(f"Saved filtered TSV to: {filtered_tsv_output}")
        
        # 7. Update JSON files
        print("Updating JSON files with publication IDs (checking order)...")
        updated_count = 0
        
        for index, row in df_filtered.iterrows():
            pmcid = row['PMCID']
            pmid = row['PMID']
            
            if not pmcid:
                continue
                
            json_file_path = os.path.join(json_folder_path, f"{pmcid}.json")
            
            if os.path.exists(json_file_path):
                try:
                    with open(json_file_path, 'r') as f:
                        data = json.load(f)
                    
                    # Prepare new data dict to preserve/enforce order
                    # Target order: ..., publication/year, publication/pmid, publication/pmcid, publication/doi ...
                    new_data = {}
                    inserted = False
                    
                    pmid_val = pmid if pmid else ""
                    pmcid_val = pmcid
                    
                    # If pmid/pmcid keys already exist in data, skip them during iteration
                    keys_to_skip = ['publication/pmid', 'publication/pmcid']
                    
                    for key, value in data.items():
                        if key in keys_to_skip:
                            continue
                            
                        new_data[key] = value
                        
                        # Insert new keys immediately after publication/year
                        if key == 'publication/year':
                            new_data['publication/pmid'] = pmid_val
                            new_data['publication/pmcid'] = pmcid_val
                            inserted = True
                            
                    # Fallback: if 'publication/year' was not found, add them at the end
                    if not inserted:
                        new_data['publication/pmid'] = pmid_val
                        new_data['publication/pmcid'] = pmcid_val
                    
                    with open(json_file_path, 'w') as f:
                        json.dump(new_data, f, indent=2)
                        
                    updated_count += 1
                    
                except Exception as e:
                    print(f"Error updating {pmcid}.json: {e}")
            else:
                pass
                
        print(f"Successfully updated {updated_count} JSON files.")

    else:
        print(f"Error: Input file or folder not found.")
        print(f"TSV: {os.path.exists(input_tsv_path)}")
        print(f"Folder: {os.path.exists(json_folder_path)}")

except Exception as e:
    print(f"An error occurred: {str(e)}")


FILTERING AND UPDATING JSON METADATA (Block 5.0)
Reading TSV: Positive_PMC_TSV_Files/positive_entries_status.tsv
Found 1012 JSON files in Copilot_1000_v0_Processed_2026-01-15
Entries matching JSON files: 1012
Saved filtered TSV to: Positive_PMC_TSV_Files/positive_entries_pmid_pmcid_filtered.tsv
Updating JSON files with publication IDs (checking order)...
Successfully updated 1012 JSON files.


In [4]:
# Block 6.0: Analyze Metadata Differences (TSV vs JSON)
import os
import pandas as pd
import json
import difflib
import numpy as np

print("="*60)
print("ANALYZING METADATA DIFFERENCES (Block 6.0)")
print("="*60)

# Paths
tsv_path = 'Positive_PMC_TSV_Files/positive_entries_status.tsv'
json_folder = 'Copilot_1000_v0_Processed_2026-01-15'

# Field mapping: JSON key -> TSV column
field_map = {
    'publication/title': 'Title',
    'publication/authors': 'Authors',
    'publication/journal': 'Journal',
    'publication/year': 'Year',
    'publication/doi': 'DOI'
}

try:
    if os.path.exists(tsv_path) and os.path.exists(json_folder):
        print("Loading TSV...")
        df = pd.read_csv(tsv_path, sep='\t')
        
        # Helper to normalize TSV PMCID for matching
        df['PMCID_clean'] = df['PMCID'].apply(lambda x: str(x).strip() if pd.notna(x) else None)
        
        # Metrics storage
        total_jsons = 0
        matched_jsons = 0
        diff_counts = {k: 0 for k in field_map.keys()}
        title_diff_types = {
            'exact': 0,
            'case_only': 0,
            'minor_diffs': 0, # High similarity
            'major_diffs': 0  # Low similarity / completely different
        }
        major_title_diffs = []

        print("Comparing files...")
        
        json_files = [f for f in os.listdir(json_folder) if f.endswith('.json')]
        total_jsons = len(json_files)
        
        for json_file in json_files:
            pmcid = json_file.replace('.json', '')
            
            # Find row
            row = df[df['PMCID_clean'] == pmcid]
            
            if len(row) == 0:
                continue
            
            matched_jsons += 1
            row = row.iloc[0]
            
            # Load JSON
            with open(os.path.join(json_folder, json_file), 'r') as f:
                data = json.load(f)
                
            # Compare fields
            for json_key, tsv_col in field_map.items():
                json_val = str(data.get(json_key, "")).strip()
                
                # Get TSV val and clean
                tsv_val = row[tsv_col]
                if pd.isna(tsv_val):
                    tsv_val = ""
                
                # Handle Year specialized cleaning (float -> int -> str)
                if tsv_col == 'Year' and tsv_val != "":
                    try:
                        tsv_val = str(int(float(tsv_val)))
                    except:
                        tsv_val = str(tsv_val)
                else:
                    tsv_val = str(tsv_val).strip()
                
                # Comparison
                if json_val != tsv_val:
                    diff_counts[json_key] += 1
                    
                    # Deep dive for Title
                    if json_key == 'publication/title':
                        # Case insensitive check
                        if json_val.lower() == tsv_val.lower():
                            title_diff_types['case_only'] += 1
                        else:
                            # Sequence Matcher for similarity
                            matcher = difflib.SequenceMatcher(None, json_val, tsv_val)
                            ratio = matcher.ratio()
                            
                            if ratio > 0.8:
                                title_diff_types['minor_diffs'] += 1
                            else:
                                title_diff_types['major_diffs'] += 1
                                major_title_diffs.append({
                                    'PMCID': pmcid,
                                    'JSON_Title': json_val,
                                    'TSV_Title': tsv_val,
                                    'Similarity': f"{ratio:.2f}"
                                })
                    else:
                        # Non-title fields logic (simple mismatch counted above)
                        pass
                else:
                    if json_key == 'publication/title':
                        title_diff_types['exact'] += 1

        print("\n" + "-"*40)
        print("SUMMARY METRICS")
        print("-"*40)
        print(f"Total JSON files found: {total_jsons}")
        print(f"JSONs matched to TSV rows: {matched_jsons}")
        
        print("\nField Mismatch Counts (Values differ between JSON and TSV):")
        for key, count in diff_counts.items():
            print(f"  - {key}: {count} mismatch(es) ({(count/matched_jsons)*100:.1f}%)")

        print("\nTitle Difference Breakdown:")
        print(f"  - Exact Matches: {title_diff_types['exact']}")
        print(f"  - Case-only Differences: {title_diff_types['case_only']}")
        print(f"  - Minor Differences (>80% similarity): {title_diff_types['minor_diffs']}")
        print(f"  - Major Differences (<80% similarity): {title_diff_types['major_diffs']}")
        
        if len(major_title_diffs) > 0:
            print("\nExamples of Major Title Differences (first 5):")
            for item in major_title_diffs[:5]:
                print(f"  [{item['PMCID']}]")
                print(f"    JSON: {item['JSON_Title']}")
                print(f"    TSV : {item['TSV_Title']}")
                print(f"    Sim : {item['Similarity']}")

    else:
        print("Error: Input paths do not exist.")

except Exception as e:
    print(f"An error occurred: {str(e)}")


ANALYZING METADATA DIFFERENCES (Block 6.0)
Loading TSV...
Comparing files...

----------------------------------------
SUMMARY METRICS
----------------------------------------
Total JSON files found: 1012
JSONs matched to TSV rows: 1012

Field Mismatch Counts (Values differ between JSON and TSV):
  - publication/title: 1001 mismatch(es) (98.9%)
  - publication/authors: 1012 mismatch(es) (100.0%)
  - publication/journal: 964 mismatch(es) (95.3%)
  - publication/year: 180 mismatch(es) (17.8%)
  - publication/doi: 679 mismatch(es) (67.1%)

Title Difference Breakdown:
  - Exact Matches: 11
  - Case-only Differences: 0
  - Minor Differences (>80% similarity): 471
  - Major Differences (<80% similarity): 530

Examples of Major Title Differences (first 5):
  [PMC4407517]
    JSON: Applying Machine Learning Techniques in Detecting Bacterial Vaginosis
    TSV : APPLYING MACHINE LEARNING TECHNIQUES IN DETECTING BACTERIAL VAGINOSIS.
    Sim : 0.20
  [PMC9347213]
    JSON: Not enough information i

In [6]:
# Block 6.5: Check for "Not enough information is available" in Title
import os
import json

print("="*60)
print("CHECKING FOR MISSING TITLES IN JSON (Block 6.5)")
print("="*60)

json_folder_path = 'Copilot_1000_v0_Processed_2026-01-15'
target_string = "Not enough information is available"
missing_title_count = 0
missing_title_files = []

try:
    if os.path.exists(json_folder_path):
        json_files = [f for f in os.listdir(json_folder_path) if f.endswith('.json')]
        total_files = len(json_files)
        
        print(f"Scanning {total_files} JSON files...")
        
        for json_file in json_files:
            file_path = os.path.join(json_folder_path, json_file)
            
            try:
                with open(file_path, 'r') as f:
                    data = json.load(f)
                
                title = data.get('publication/title', '')
                
                # Check for specific phrase
                if title == target_string:
                    missing_title_count += 1
                    missing_title_files.append(json_file)
                    
            except Exception as e:
                print(f"Error reading {json_file}: {e}")
                
        print("-" * 40)
        print(f"Files with title '{target_string}': {missing_title_count}")
        print("-" * 40)
        
        if missing_title_count > 0:
            print("Files listed:")
            for f in missing_title_files:
                print(f"  - {f}")
                
    else:
        print(f"Folder not found: {json_folder_path}")

except Exception as e:
    print(f"An error occurred: {str(e)}")


CHECKING FOR MISSING TITLES IN JSON (Block 6.5)
Scanning 1012 JSON files...
----------------------------------------
Files with title 'Not enough information is available': 25
----------------------------------------
Files listed:
  - PMC8080676.json
  - PMC11148103.json
  - PMC4368063.json
  - PMC3205469.json
  - PMC9420706.json
  - PMC7988437.json
  - PMC7874964.json
  - PMC10052279.json
  - PMC10365090.json
  - PMC6992687.json
  - PMC7821214.json
  - PMC10785655.json
  - PMC9086604.json
  - PMC10046420.json
  - PMC11140654.json
  - PMC10239131.json
  - PMC10791584.json
  - PMC2846370.json
  - PMC11110913.json
  - PMC11127166.json
  - PMC10235219.json
  - PMC6924628.json
  - PMC10060474.json
  - PMC5685313.json
  - PMC9869541.json


In [5]:
# Block 7.0: Create Updated JSONs with Corrected Metadata
import os
import shutil

print("="*60)
print("CREATING UPDATED JSONS IN NEW FOLDER (Block 7.0)")
print("="*60)

# Paths
source_json_folder = 'Copilot_1000_v0_Processed_2026-01-15'
target_json_folder = 'Copilot_1000_v0_Processed_2026-01-15_Updated_Metadata'
tsv_path = 'Positive_PMC_TSV_Files/positive_entries_status.tsv'

# Field mapping: JSON key -> TSV column
field_map = {
    'publication/title': 'Title',
    'publication/authors': 'Authors',
    'publication/journal': 'Journal',
    'publication/year': 'Year',
    'publication/doi': 'DOI'
}

try:
    # 1. Create target directory
    if not os.path.exists(target_json_folder):
        os.makedirs(target_json_folder)
        print(f"Created directory: {target_json_folder}")
    else:
        print(f"Directory exists: {target_json_folder}")

    if os.path.exists(tsv_path) and os.path.exists(source_json_folder):
        print("Loading TSV reference data...")
        df = pd.read_csv(tsv_path, sep='\t')
        df['PMCID_clean'] = df['PMCID'].apply(lambda x: str(x).strip() if pd.notna(x) else None)
        
        updated_files_count = 0
        
        json_files = [f for f in os.listdir(source_json_folder) if f.endswith('.json')]
        total_files = len(json_files)
        
        print(f"Processing {total_files} files...")
        
        for json_file in json_files:
            pmcid = json_file.replace('.json', '')
            
            # Find TSV row
            row = df[df['PMCID_clean'] == pmcid]
            
            # Load original JSON
            source_path = os.path.join(source_json_folder, json_file)
            target_path = os.path.join(target_json_folder, json_file)
            
            with open(source_path, 'r') as f:
                data = json.load(f)
            
            # If we have TSV data, update fields if different
            if len(row) > 0:
                row = row.iloc[0]
                
                changes_made = False
                
                for json_key, tsv_col in field_map.items():
                    current_val = str(data.get(json_key, "")).strip()
                    
                    # Prepare new value
                    new_val_raw = row[tsv_col]
                    if pd.isna(new_val_raw):
                        new_val = ""
                    else:
                        if tsv_col == 'Year':
                            try:
                                new_val = str(int(float(new_val_raw)))
                            except:
                                new_val = str(new_val_raw)
                        else:
                            new_val = str(new_val_raw).strip()
                    
                    # Check difference
                    if current_val != new_val:
                        data[json_key] = new_val
                        changes_made = True
                
                if changes_made:
                    updated_files_count += 1
            
            # Save to new location (either updated or original copy)
            with open(target_path, 'w') as f:
                json.dump(data, f, indent=2)
                
        print(f"Completed.")
        print(f"Total files processed: {total_files}")
        print(f"Files with metadata updates: {updated_files_count}")
        print(f"All files saved to: {target_json_folder}")

    else:
        print("Error: Source data not found.")

except Exception as e:
    print(f"An error occurred: {str(e)}")


CREATING UPDATED JSONS IN NEW FOLDER (Block 7.0)
Created directory: Copilot_1000_v0_Processed_2026-01-15_Updated_Metadata
Loading TSV reference data...
Processing 1012 files...
Completed.
Total files processed: 1012
Files with metadata updates: 1012
All files saved to: Copilot_1000_v0_Processed_2026-01-15_Updated_Metadata


In [None]:
# Block 8.0: Manual Visual Inspection Interface (Alternating High/Low Sim)
# Run this cell repeatedly (Ctrl+Enter) to view entries.
# It will alternate between High Similarity mismatches (>=80%) and Low Similarity (<=20%).
import os
import pandas as pd
import json
import difflib
import random
from IPython.display import display, Markdown, HTML, clear_output

# Global toggle for alternation (persists across cell runs)
# Initialize only if not present
if 'inspection_mode_high' not in globals():
    inspection_mode_high = True # True=High, False=Low

source_json_folder = 'Copilot_1000_v0_Processed_2026-01-15'
tsv_path = 'Positive_PMC_TSV_Files/positive_entries_status.tsv'

try:
    if os.path.exists(tsv_path) and os.path.exists(source_json_folder):
        # Load Data
        df = pd.read_csv(tsv_path, sep='\t')
        df['PMCID_clean'] = df['PMCID'].apply(lambda x: str(x).strip() if pd.notna(x) else None)
        
        # Helper to clean numeric strings
        def clean_val(v):
            if pd.isna(v) or v == '': return ""
            try: return str(int(float(v)))
            except: return str(v).strip()

        high_sim = [] 
        low_sim = []
        
        json_files = [f for f in os.listdir(source_json_folder) if f.endswith('.json')]
        
        for json_file in json_files:
            pmcid = json_file.replace('.json', '')
            row = df[df['PMCID_clean'] == pmcid]
            if len(row) == 0: continue
            row = row.iloc[0]
            
            with open(os.path.join(source_json_folder, json_file), 'r') as f:
                data = json.load(f)
                
            j_title = str(data.get('publication/title', "")).strip()
            t_title = str(row['Title']).strip() if pd.notna(row['Title']) else ""
            
            if j_title != t_title:
                ratio = difflib.SequenceMatcher(None, j_title, t_title).ratio()
                
                entry = {
                    'pmcid': pmcid,
                    'json': data,
                    'tsv': row,
                    'ratio': ratio
                }
                
                if ratio >= 0.8: high_sim.append(entry)
                elif ratio <= 0.2: low_sim.append(entry)

        # Toggle Selection Logic
        target_pool = []
        mode_str = ""
        
        # Try to respect toggle, but fallback if one pool is empty
        if inspection_mode_high:
            if high_sim: 
                target_pool = high_sim
                mode_str = "High Similarity (>=80%)"
            elif low_sim:
                target_pool = low_sim
                mode_str = "Low Similarity (<=20%) [High list empty]"
        else:
            if low_sim:
                target_pool = low_sim
                mode_str = "Low Similarity (<=20%)"
            elif high_sim:
                target_pool = high_sim
                mode_str = "High Similarity (>=80%) [Low list empty]"
        
        # Flip toggle for next run
        inspection_mode_high = not inspection_mode_high
        
        if not target_pool:
            print("No mismatches found in either category.")
        else:
            item = random.choice(target_pool)
            
            # Prepare IDs for Link
            curr_pmcid = item['pmcid']
            curr_pmid = clean_val(item['tsv']['PMID'])
            
            # --- DISPLAY SECTION ---
            
            # Using display() ensures rich output isn't hidden/truncated easily by text buffer limits
            display(Markdown(f"### {mode_str} | Similarity: {item['ratio']:.2f}"))
            
            # Create HTML links to Europe PMC Search
            url_pmcid = f"https://europepmc.org/search?query={curr_pmcid}"
            url_pmid = f"https://europepmc.org/search?query={curr_pmid}" if curr_pmid else "#"
            
            # Render Links
            display(HTML(f"""
            <div style="background-color: #e8e8e8; padding: 12px; border-radius: 4px; border-left: 5px solid #007acc; font-family: sans-serif;">
                <span style="font-weight: bold; margin-right: 10px;">IDS:</span>
                <a href="{url_pmcid}" target="_blank" style="text-decoration: none; font-weight: bold; color: #0066cc; margin-right: 20px; font-size: 1.1em;">{curr_pmcid} ↗</a>
                <a href="{url_pmid}" target="_blank" style="text-decoration: none; font-weight: bold; color: #0066cc; font-size: 1.1em;">PMID:{curr_pmid} ↗</a>
            </div>
            <br>
            """))
            
            # Comparison Loop
            fields = [
                ('Title', 'publication/title', 'Title'),
                ('Authors', 'publication/authors', 'Authors'),
                ('Journal', 'publication/journal', 'Journal'),
                ('Year', 'publication/year', 'Year'),
                ('DOI', 'publication/doi', 'DOI')
            ]
            
            for label, k_json, k_tsv in fields:
                v_json = str(item['json'].get(k_json, "")).strip()
                v_tsv = item['tsv'][k_tsv]
                
                # Special clean for display
                if k_tsv == 'Year': v_tsv = clean_val(v_tsv)
                else: v_tsv = str(v_tsv).strip() if pd.notna(v_tsv) else ""
                
                match = v_json == v_tsv
                symbol = "✅" if match else "❌"
                
                # Use print for content to avoid HTML rendering issues with weird chars
                print(f"{symbol} [{label}]")
                if not match:
                    print(f"  JSON: {v_json}")
                    print(f"  TSV : {v_tsv}")
                else:
                    print(f"  {v_json}")
                print("-" * 60)
            
            print("\n(Run cell again [Ctrl+Enter] to flip category next time)")

    else:
        print("Error: Files not found.")

except Exception as e:
    print(f"Error: {e}")


MANUAL VISUAL INSPECTION - SINGLE RANDOM ENTRY (Block 8.0)
Pool Size: 570 mismatches (High Sim: 471, Low Sim: 99)

################################################################################
PMCID: PMC11404989
Category: High Similarity (>=80%) | Title Similarity: 0.99
################################################################################

✅ [PMID]
  39292083

✅ [PMCID]
  PMC11404989

❌ [Title]
  JSON: Investigating artificial intelligence models for predicting joint pain from serum biochemistry
  TSV : Investigating artificial intelligence models for predicting joint pain from serum biochemistry.

❌ [Authors]
  JSON: The authors who contributed to this article are:

- Shahid S, who was involved in conceptualization, formal analysis, investigation, software development, supervision, and writing the original draft.
- AJ, who contributed to data curation, investigation, and reviewing and editing the manuscript.
- UA, who was responsible for data curation, resources, and val