In [3]:
# Block 5.0: Filter IDs based on Copilot Processed folder and update JSONs
import os
import pandas as pd
import json

print("="*60)
print("FILTERING AND UPDATING JSON METADATA (Block 5.0)")
print("="*60)

# 1. Define paths
input_tsv_path = 'Positive_PMC_TSV_Files/positive_entries_status.tsv'
json_folder_path = 'Copilot_1000_v0_Processed_2026-01-15'
filtered_tsv_output = 'Positive_PMC_TSV_Files/positive_entries_pmid_pmcid_filtered.tsv'

try:
    if os.path.exists(input_tsv_path) and os.path.exists(json_folder_path):
        # 2. Read TSV
        print(f"Reading TSV: {input_tsv_path}")
        df = pd.read_csv(input_tsv_path, sep='\t')
        
        # 3. Extract and Clean IDs
        # We need PMCID and PMID. Note: PMID might be float from previous steps
        # Create a simplified copy
        df_ids = df[['PMID', 'PMCID']].copy()
        
        def clean_pmid(val):
            if pd.isna(val) or val == '':
                return None
            try:
                # Convert to float then int to drop decimal, then string
                return str(int(float(val)))
            except:
                return str(val)

        df_ids['PMID'] = df_ids['PMID'].apply(clean_pmid)
        df_ids['PMCID'] = df_ids['PMCID'].apply(lambda x: str(x).strip() if pd.notna(x) else None)
        
        # 4. Get list of JSON files to filter against
        json_files = [f for f in os.listdir(json_folder_path) if f.endswith('.json')]
        # Create a set of PMCIDs from filenames (remove .json extension)
        # Assuming filenames are like "PMC12345.json"
        json_pmcids = set(f.replace('.json', '') for f in json_files)
        
        print(f"Found {len(json_pmcids)} JSON files in {json_folder_path}")
        
        # 5. Filter the DataFrame
        # Keep row if its PMCID matches one in the folder
        df_filtered = df_ids[df_ids['PMCID'].isin(json_pmcids)].copy()
        
        count = len(df_filtered)
        print(f"Entries matching JSON files: {count}")
        
        # 6. Save the filtered TSV
        df_filtered.to_csv(filtered_tsv_output, sep='\t', index=False)
        print(f"Saved filtered TSV to: {filtered_tsv_output}")
        
        # 7. Update JSON files
        print("Updating JSON files with publication IDs (checking order)...")
        updated_count = 0
        
        for index, row in df_filtered.iterrows():
            pmcid = row['PMCID']
            pmid = row['PMID']
            
            if not pmcid:
                continue
                
            json_file_path = os.path.join(json_folder_path, f"{pmcid}.json")
            
            if os.path.exists(json_file_path):
                try:
                    with open(json_file_path, 'r') as f:
                        data = json.load(f)
                    
                    # Prepare new data dict to preserve/enforce order
                    # Target order: ..., publication/year, publication/pmid, publication/pmcid, publication/doi ...
                    new_data = {}
                    inserted = False
                    
                    pmid_val = pmid if pmid else ""
                    pmcid_val = pmcid
                    
                    # If pmid/pmcid keys already exist in data, skip them during iteration
                    keys_to_skip = ['publication/pmid', 'publication/pmcid']
                    
                    for key, value in data.items():
                        if key in keys_to_skip:
                            continue
                            
                        new_data[key] = value
                        
                        # Insert new keys immediately after publication/year
                        if key == 'publication/year':
                            new_data['publication/pmid'] = pmid_val
                            new_data['publication/pmcid'] = pmcid_val
                            inserted = True
                            
                    # Fallback: if 'publication/year' was not found, add them at the end
                    if not inserted:
                        new_data['publication/pmid'] = pmid_val
                        new_data['publication/pmcid'] = pmcid_val
                    
                    with open(json_file_path, 'w') as f:
                        json.dump(new_data, f, indent=2)
                        
                    updated_count += 1
                    
                except Exception as e:
                    print(f"Error updating {pmcid}.json: {e}")
            else:
                pass
                
        print(f"Successfully updated {updated_count} JSON files.")

    else:
        print(f"Error: Input file or folder not found.")
        print(f"TSV: {os.path.exists(input_tsv_path)}")
        print(f"Folder: {os.path.exists(json_folder_path)}")

except Exception as e:
    print(f"An error occurred: {str(e)}")


FILTERING AND UPDATING JSON METADATA (Block 5.0)
Reading TSV: Positive_PMC_TSV_Files/positive_entries_status.tsv
Found 1012 JSON files in Copilot_1000_v0_Processed_2026-01-15
Entries matching JSON files: 1012
Saved filtered TSV to: Positive_PMC_TSV_Files/positive_entries_pmid_pmcid_filtered.tsv
Updating JSON files with publication IDs (checking order)...
Successfully updated 1012 JSON files.


In [12]:
# Block 6.0: Analyze Metadata Differences (TSV vs JSON) - Enhanced Reporting
import os
import pandas as pd
import json
import difflib
import matplotlib.pyplot as plt
import datetime

print("="*60)
print("ANALYZING METADATA DIFFERENCES & GENERATING REPORT (Block 6.0)")
print("="*60)

# Paths
tsv_path = 'Positive_PMC_TSV_Files/positive_entries_status.tsv'
json_folder = 'Copilot_1000_v0_Processed_2026-01-15'

# Create Report Folder
timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
report_folder = f"Metadata_Analysis_Reports/Report_{timestamp}"

# Field mapping
field_map = {
    'publication/title': 'Title',
    'publication/authors': 'Authors',
    'publication/journal': 'Journal',
    'publication/year': 'Year',
    'publication/doi': 'DOI'
}

try:
    if not os.path.exists(report_folder):
        os.makedirs(report_folder)
        print(f"Created report directory: {report_folder}")

    if os.path.exists(tsv_path) and os.path.exists(json_folder):
        print("Loading TSV...")
        df = pd.read_csv(tsv_path, sep='\t')
        df['PMCID_clean'] = df['PMCID'].apply(lambda x: str(x).strip() if pd.notna(x) else None)
        
        # Metrics
        total_jsons = 0
        matched_jsons = 0
        diff_counts = {k: 0 for k in field_map.keys()}
        
        # Title specific
        title_stats = {
            'exact': 0,
            'case_only': 0,
            'minor_diffs': 0, # > 0.8
            'major_diffs': 0  # < 0.8
        }
        
        # Detailed mismatch logs
        major_title_diffs = []
        all_field_mismatches = [] # store dicts of {pmcid, field, json_val, tsv_val}

        print("Comparing files...")
        json_files = [f for f in os.listdir(json_folder) if f.endswith('.json')]
        total_jsons = len(json_files)
        
        for json_file in json_files:
            pmcid = json_file.replace('.json', '')
            row = df[df['PMCID_clean'] == pmcid]
            
            if len(row) == 0: continue
            matched_jsons += 1
            row = row.iloc[0]
            
            with open(os.path.join(json_folder, json_file), 'r') as f:
                data = json.load(f)
                
            for json_key, tsv_col in field_map.items():
                j_val = str(data.get(json_key, "")).strip()
                t_val = row[tsv_col]
                
                # Check TSV nan
                if pd.isna(t_val): t_val = ""
                
                # Clean Year
                if tsv_col == 'Year' and t_val != "":
                    try: t_val = str(int(float(t_val)))
                    except: t_val = str(t_val)
                else:
                    t_val = str(t_val).strip()
                
                if j_val != t_val:
                    diff_counts[json_key] += 1
                    
                    # Log mismatch
                    all_field_mismatches.append({
                        'PMCID': pmcid,
                        'Field': json_key,
                        'JSON_Value': j_val,
                        'TSV_Value': t_val
                    })
                    
                    if json_key == 'publication/title':
                         if j_val.lower() == t_val.lower():
                            title_stats['case_only'] += 1
                         else:
                            ratio = difflib.SequenceMatcher(None, j_val, t_val).ratio()
                            if ratio > 0.8:
                                title_stats['minor_diffs'] += 1
                            else:
                                title_stats['major_diffs'] += 1
                                major_title_diffs.append({
                                    'PMCID': pmcid, 
                                    'JSON': j_val, 
                                    'TSV': t_val, 
                                    'Sim': f"{ratio:.2f}"
                                })
                else:
                     if json_key == 'publication/title':
                        title_stats['exact'] += 1
                        
        # --- GENERATE OUTPUTS ---
        
        # 1. Visualization
        # Mismatch Counts Bar Chart
        plt.figure(figsize=(10, 6))
        keys = list(diff_counts.keys())
        vals = list(diff_counts.values())
        colors = ['#ff9999','#66b3ff','#99ff99','#ffcc99', '#c2c2f0']
        
        plt.bar(keys, vals, color=colors[:len(keys)])
        plt.title(f'Metadata Mismatches per Field (N={matched_jsons})')
        plt.xlabel('Field')
        plt.ylabel('Count of Mismatches')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.savefig(os.path.join(report_folder, 'mismatch_counts.png'))
        plt.close()
        
        # Title Breakdown Pie Chart
        plt.figure(figsize=(8, 8))
        labels = [f"Exact ({title_stats['exact']})", 
                  f"Case Only ({title_stats['case_only']})", 
                  f"Minor >0.8 ({title_stats['minor_diffs']})",
                  f"Major <0.8 ({title_stats['major_diffs']})"]
        sizes = [title_stats['exact'], title_stats['case_only'], title_stats['minor_diffs'], title_stats['major_diffs']]
        
        # Filter zero slices to avoid cluttered legend
        pie_data = [(l, s) for l, s in zip(labels, sizes) if s > 0]
        if pie_data:
            p_labels, p_sizes = zip(*pie_data)
            plt.pie(p_sizes, labels=p_labels, autopct='%1.1f%%', startangle=140)
            plt.title('Title Similarity Breakdown')
            plt.tight_layout()
            plt.savefig(os.path.join(report_folder, 'title_similarity.png'))
        plt.close()
        
        # 2. Detailed Report File (Markdown)
        report_path = os.path.join(report_folder, 'Analysis_Report.md')
        with open(report_path, 'w') as f:
            f.write(f"# Metadata Analysis Report\n")
            f.write(f"**Date:** {timestamp}\n\n")
            f.write(f"**Source JSON Folder:** `{json_folder}`\n")
            f.write(f"**Source TSV:** `{tsv_path}`\n")
            f.write(f"**Total Files Scanned:** {total_jsons}\n")
            f.write(f"**Files Matched to TSV:** {matched_jsons}\n\n")
            
            f.write("## 1. Field Mismatch Summary\n")
            f.write("| Field | Mismatches | %\n")
            f.write("|---|---|---|\n")
            for k, v in diff_counts.items():
                pct = (v/matched_jsons)*100 if matched_jsons else 0
                f.write(f"| {k} | {v} | {pct:.1f}% |\n")
            
            f.write("\n![Mismatch Counts](mismatch_counts.png)\n\n")
            
            f.write("## 2. Title Similarity Breakdown\n")
            f.write(f"- **Exact Matches:** {title_stats['exact']}\n")
            f.write(f"- **Case-only Differences:** {title_stats['case_only']}\n")
            f.write(f"- **Minor Differences (>80%):** {title_stats['minor_diffs']}\n")
            f.write(f"- **Major Differences (<80%):** {title_stats['major_diffs']}\n\n")
            
            f.write("\n![Title Breakdown](title_similarity.png)\n\n")
            
            f.write("## 3. Major Title Differences (Low Similarity)\n")
            if major_title_diffs:
                f.write("| PMCID | JSON Title | TSV Title | Similarity |\n")
                f.write("|---|---|---|---|\n")
                for m in major_title_diffs:
                    # Escape pipes for markdown table
                    j_t = m['JSON'].replace('|', '\|')
                    t_t = m['TSV'].replace('|', '\|')
                    f.write(f"| {m['PMCID']} | {j_t} | {t_t} | {m['Sim']} |\n")
            else:
                f.write("No major title differences found.\n")
                
        # 3. CSV export of all mismatches
        if all_field_mismatches:
            csv_path = os.path.join(report_folder, 'all_mismatches.csv')
            pd.DataFrame(all_field_mismatches).to_csv(csv_path, index=False)
            print(f"Detailed mismatch CSV saved to: {csv_path}")

        print(f"\nAnalysis Complete!")
        print(f"Report and Visualizations generated in folder: {report_folder}")
        print(f"  - {os.path.join(report_folder, 'Analysis_Report.md')}")
        print(f"  - {os.path.join(report_folder, 'mismatch_counts.png')}")
        print(f"  - {os.path.join(report_folder, 'title_similarity.png')}")
        
    else:
        print("Error: Input files not found.")

except Exception as e:
    print(f"Error: {e}")

  j_t = m['JSON'].replace('|', '\|')
  t_t = m['TSV'].replace('|', '\|')


ANALYZING METADATA DIFFERENCES & GENERATING REPORT (Block 6.0)
Created report directory: Metadata_Analysis_Reports/Report_2026-01-15_15-18-53
Loading TSV...
Comparing files...
Detailed mismatch CSV saved to: Metadata_Analysis_Reports/Report_2026-01-15_15-18-53/all_mismatches.csv

Analysis Complete!
Report and Visualizations generated in folder: Metadata_Analysis_Reports/Report_2026-01-15_15-18-53
  - Metadata_Analysis_Reports/Report_2026-01-15_15-18-53/Analysis_Report.md
  - Metadata_Analysis_Reports/Report_2026-01-15_15-18-53/mismatch_counts.png
  - Metadata_Analysis_Reports/Report_2026-01-15_15-18-53/title_similarity.png


In [24]:

# Block 9.0: "Not Enough Information" Coverage Analysis
import os
import json
import pandas as pd
import matplotlib.pyplot as plt
import datetime
import numpy as np

print("="*60)
print("RUNNING COVERAGE ANALYSIS (Block 9.0)")
print("="*60)

# 1. Setup
json_folder = 'Copilot_1000_v0_Processed_2026-01-15'
target_phrase = "Not enough information is available" # We'll match this loosely (starts with)

# Create Output Folder
timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
output_dir = f"Coverage_Analysis_Reports/Report_{timestamp}"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
    print(f"Created report directory: {output_dir}")

# Categories Mapping
def get_category(key):
    if key.startswith('publication'): return 'Publication'
    if key.startswith('dataset'): return 'Data'
    if key.startswith('optimization'): return 'Optimisation'
    if key.startswith('model'): return 'Model'
    if key.startswith('evaluation'): return 'Evaluation'
    return 'Other'

try:
    if os.path.exists(json_folder):
        json_files = [f for f in os.listdir(json_folder) if f.endswith('.json')]
        total_files = len(json_files)
        print(f"Scanning {total_files} files for missing information markers...")
        
        # Structure to hold counts
        # counts[field] = num_missing
        field_counts = {}
        all_fields = set()
        
        # 2. Scan Files
        for json_file in json_files:
            with open(os.path.join(json_folder, json_file), 'r') as f:
                data = json.load(f)
                
            for key, val in data.items():
                all_fields.add(key)
                if key not in field_counts:
                    field_counts[key] = 0
                
                # Check for target phrase
                val_str = str(val).strip()
                if val_str.startswith(target_phrase):
                    field_counts[key] += 1

        print("Scan complete. Generating analysis...")

        # 3. Process Data into DataFrame
        data_list = []
        for key in all_fields:
            missing_count = field_counts.get(key, 0)
            category = get_category(key)
            pct_missing = (missing_count / total_files) * 100
            
            data_list.append({
                'Field': key,
                'Category': category,
                'Missing_Count': missing_count,
                'Total_Files': total_files,
                'Missing_Percentage': pct_missing
            })
            
        df_stats = pd.DataFrame(data_list)
        
        # Sort for consistency
        df_stats = df_stats.sort_values(by=['Category', 'Field'])
        
        # 4. Generate Visualizations
        
        # A. Main Category Aggregation (Mean % Missing per category)
        category_stats = df_stats.groupby('Category')['Missing_Percentage'].mean().reset_index()
        # Custom sort order
        cat_order = ['Publication', 'Data', 'Optimisation', 'Model', 'Evaluation']
        category_stats['Category'] = pd.Categorical(category_stats['Category'], categories=cat_order, ordered=True)
        category_stats = category_stats.sort_values('Category')
        
        plt.figure(figsize=(10, 6))
        bars = plt.bar(category_stats['Category'], category_stats['Missing_Percentage'], color='#4c72b0')
        plt.title('Average Information Gap by Main Category\n(% of fields marked "Not enough information is available")', fontsize=12)
        plt.ylabel('Avg. Missing %', fontsize=10)
        plt.xlabel('Category', fontsize=10)
        plt.ylim(0, 100)
        
        # Add labels on bars
        for bar in bars:
            height = bar.get_height()
            plt.text(bar.get_x() + bar.get_width()/2., height + 1,
                     f'{height:.1f}%', ha='center', va='bottom')
                     
        plt.tight_layout()
        plt.savefig(os.path.join(output_dir, 'Category_Summary.png'), dpi=150)
        plt.close()
        
        # B. Granular Subfields (Grouped Bar Chart)
        # We'll make subplots for each category to ensure readability
        unique_cats = [c for c in cat_order if c in df_stats['Category'].unique()]
        
        # Figure size depends on number of fields
        fig, axes = plt.subplots(nrows=len(unique_cats), ncols=1, figsize=(12, 4 * len(unique_cats)), constrained_layout=True)
        if len(unique_cats) == 1: axes = [axes] # Handle single category case
        
        for i, cat in enumerate(unique_cats):
            ax = axes[i]
            subset = df_stats[df_stats['Category'] == cat].sort_values('Missing_Percentage', ascending=False)
            
            # Simple barh
            y_pos = np.arange(len(subset))
            ax.barh(y_pos, subset['Missing_Percentage'], align='center', color='#55a868')
            ax.set_yticks(y_pos)
            ax.set_yticklabels(subset['Field'])
            ax.invert_yaxis() # labels read top-to-bottom
            ax.set_xlabel('% Coverage Gap')
            ax.set_title(f'Category: {cat}')
            ax.set_xlim(0, 100)
            
            # Add text labels
            for j, v in enumerate(subset['Missing_Percentage']):
                ax.text(v + 1, j, f"{v:.1f}%", va='center', fontsize=9)

        plt.suptitle(f'Detailed Gap Analysis by Field (Total Files: {total_files})', fontsize=16)
        plt.savefig(os.path.join(output_dir, 'Field_Level_Analysis.png'), dpi=150)
        plt.close()

        # 5. Generate Report Document
        report_file = os.path.join(output_dir, 'Coverage_Analysis_Report.md')
        
        with open(report_file, 'w') as r:
            r.write(f"# Metadata Coverage Analysis Report\n")
            r.write(f"**Date:** {timestamp}\n")
            r.write(f"**Dataset:** {total_files} JSON files from `{json_folder}`\n")
            r.write(f"**Target Phrase:** \"{target_phrase}...\"\n\n")
            
            r.write("## 1. Executive Summary\n")
            r.write("The following chart shows the average percentage of fields marked as 'Not enough information is available' within each main category.\n\n")
            r.write("![Category Summary](Category_Summary.png)\n\n")
            
            r.write("## 2. Category Statistics\n")
            r.write("| Category | Avg Missing % | Max Missing Field | Min Missing Field |\n")
            r.write("|---|---|---|---|\n")
            
            for cat in unique_cats:
                subset = df_stats[df_stats['Category'] == cat]
                avg = subset['Missing_Percentage'].mean()
                max_f = subset.loc[subset['Missing_Percentage'].idxmax()]
                min_f = subset.loc[subset['Missing_Percentage'].idxmin()]
                
                r.write(f"| **{cat}** | {avg:.1f}% | {max_f['Field']} ({max_f['Missing_Percentage']:.1f}%) | {min_f['Field']} ({min_f['Missing_Percentage']:.1f}%) |\n")
                
            r.write("\n## 3. Detailed Field Breakdown\n")
            r.write("![Field Level Analysis](Field_Level_Analysis.png)\n\n")
            
            r.write("### Full Data Table\n")
            r.write("| Category | Field | Missing Count | Missing % |\n")
            r.write("|---|---|---|---|\n")
            
            # Re-sort nicely for table
            df_table = df_stats.sort_values(by=['Category', 'Missing_Percentage'], ascending=[True, False])
            
            for _, row in df_table.iterrows():
                r.write(f"| {row['Category']} | {row['Field']} | {row['Missing_Count']} | {row['Missing_Percentage']:.1f}% |\n")
        
        # 6. Save CSV Data
        df_stats.to_csv(os.path.join(output_dir, 'coverage_stats.csv'), index=False)
        
        print(f"\nAnalysis Successfully Completed.")
        print(f"Outputs saved to: {output_dir}/")
        print(f"  - Report: Coverage_Analysis_Report.md")
        print(f"  - Plots: Category_Summary.png, Field_Level_Analysis.png")
        print(f"  - Data: coverage_stats.csv")

    else:
        print(f"Error: Folder {json_folder} not found.")

except Exception as e:
    print(f"An error occurred: {e}")


RUNNING COVERAGE ANALYSIS (Block 9.0)
Created report directory: Coverage_Analysis_Reports/Report_2026-01-15_15-27-46
Scanning 1012 files for missing information markers...
Scan complete. Generating analysis...

Analysis Successfully Completed.
Outputs saved to: Coverage_Analysis_Reports/Report_2026-01-15_15-27-46/
  - Report: Coverage_Analysis_Report.md
  - Plots: Category_Summary.png, Field_Level_Analysis.png
  - Data: coverage_stats.csv


In [6]:
# Block 6.5: Check for "Not enough information is available" in Title
import os
import json

print("="*60)
print("CHECKING FOR MISSING TITLES IN JSON (Block 6.5)")
print("="*60)

json_folder_path = 'Copilot_1000_v0_Processed_2026-01-15'
target_string = "Not enough information is available"
missing_title_count = 0
missing_title_files = []

try:
    if os.path.exists(json_folder_path):
        json_files = [f for f in os.listdir(json_folder_path) if f.endswith('.json')]
        total_files = len(json_files)
        
        print(f"Scanning {total_files} JSON files...")
        
        for json_file in json_files:
            file_path = os.path.join(json_folder_path, json_file)
            
            try:
                with open(file_path, 'r') as f:
                    data = json.load(f)
                
                title = data.get('publication/title', '')
                
                # Check for specific phrase
                if title == target_string:
                    missing_title_count += 1
                    missing_title_files.append(json_file)
                    
            except Exception as e:
                print(f"Error reading {json_file}: {e}")
                
        print("-" * 40)
        print(f"Files with title '{target_string}': {missing_title_count}")
        print("-" * 40)
        
        if missing_title_count > 0:
            print("Files listed:")
            for f in missing_title_files:
                print(f"  - {f}")
                
    else:
        print(f"Folder not found: {json_folder_path}")

except Exception as e:
    print(f"An error occurred: {str(e)}")


CHECKING FOR MISSING TITLES IN JSON (Block 6.5)
Scanning 1012 JSON files...
----------------------------------------
Files with title 'Not enough information is available': 25
----------------------------------------
Files listed:
  - PMC8080676.json
  - PMC11148103.json
  - PMC4368063.json
  - PMC3205469.json
  - PMC9420706.json
  - PMC7988437.json
  - PMC7874964.json
  - PMC10052279.json
  - PMC10365090.json
  - PMC6992687.json
  - PMC7821214.json
  - PMC10785655.json
  - PMC9086604.json
  - PMC10046420.json
  - PMC11140654.json
  - PMC10239131.json
  - PMC10791584.json
  - PMC2846370.json
  - PMC11110913.json
  - PMC11127166.json
  - PMC10235219.json
  - PMC6924628.json
  - PMC10060474.json
  - PMC5685313.json
  - PMC9869541.json


In [5]:
# Block 7.0: Create Updated JSONs with Corrected Metadata
import os
import shutil

print("="*60)
print("CREATING UPDATED JSONS IN NEW FOLDER (Block 7.0)")
print("="*60)

# Paths
source_json_folder = 'Copilot_1000_v0_Processed_2026-01-15'
target_json_folder = 'Copilot_1000_v0_Processed_2026-01-15_Updated_Metadata'
tsv_path = 'Positive_PMC_TSV_Files/positive_entries_status.tsv'

# Field mapping: JSON key -> TSV column
field_map = {
    'publication/title': 'Title',
    'publication/authors': 'Authors',
    'publication/journal': 'Journal',
    'publication/year': 'Year',
    'publication/doi': 'DOI'
}

try:
    # 1. Create target directory
    if not os.path.exists(target_json_folder):
        os.makedirs(target_json_folder)
        print(f"Created directory: {target_json_folder}")
    else:
        print(f"Directory exists: {target_json_folder}")

    if os.path.exists(tsv_path) and os.path.exists(source_json_folder):
        print("Loading TSV reference data...")
        df = pd.read_csv(tsv_path, sep='\t')
        df['PMCID_clean'] = df['PMCID'].apply(lambda x: str(x).strip() if pd.notna(x) else None)
        
        updated_files_count = 0
        
        json_files = [f for f in os.listdir(source_json_folder) if f.endswith('.json')]
        total_files = len(json_files)
        
        print(f"Processing {total_files} files...")
        
        for json_file in json_files:
            pmcid = json_file.replace('.json', '')
            
            # Find TSV row
            row = df[df['PMCID_clean'] == pmcid]
            
            # Load original JSON
            source_path = os.path.join(source_json_folder, json_file)
            target_path = os.path.join(target_json_folder, json_file)
            
            with open(source_path, 'r') as f:
                data = json.load(f)
            
            # If we have TSV data, update fields if different
            if len(row) > 0:
                row = row.iloc[0]
                
                changes_made = False
                
                for json_key, tsv_col in field_map.items():
                    current_val = str(data.get(json_key, "")).strip()
                    
                    # Prepare new value
                    new_val_raw = row[tsv_col]
                    if pd.isna(new_val_raw):
                        new_val = ""
                    else:
                        if tsv_col == 'Year':
                            try:
                                new_val = str(int(float(new_val_raw)))
                            except:
                                new_val = str(new_val_raw)
                        else:
                            new_val = str(new_val_raw).strip()
                    
                    # Check difference
                    if current_val != new_val:
                        data[json_key] = new_val
                        changes_made = True
                
                if changes_made:
                    updated_files_count += 1
            
            # Save to new location (either updated or original copy)
            with open(target_path, 'w') as f:
                json.dump(data, f, indent=2)
                
        print(f"Completed.")
        print(f"Total files processed: {total_files}")
        print(f"Files with metadata updates: {updated_files_count}")
        print(f"All files saved to: {target_json_folder}")

    else:
        print("Error: Source data not found.")

except Exception as e:
    print(f"An error occurred: {str(e)}")


CREATING UPDATED JSONS IN NEW FOLDER (Block 7.0)
Created directory: Copilot_1000_v0_Processed_2026-01-15_Updated_Metadata
Loading TSV reference data...
Processing 1012 files...
Completed.
Total files processed: 1012
Files with metadata updates: 1012
All files saved to: Copilot_1000_v0_Processed_2026-01-15_Updated_Metadata


In [23]:
# Block 8.0: Manual Visual Inspection Interface (Alternating High/Low Sim)
# Run this cell repeatedly (Ctrl+Enter) to view entries.
# It will alternate between High Similarity mismatches (>=80%) and Low Similarity (<=20%).
import os
import pandas as pd
import json
import difflib
import random
import html
from IPython.display import display, Markdown, HTML, clear_output

# Global toggle for alternation (persists across cell runs)
# Initialize only if not present
if 'inspection_mode_high' not in globals():
    inspection_mode_high = True # True=High, False=Low

source_json_folder = 'Copilot_1000_v0_Processed_2026-01-15'
tsv_path = 'Positive_PMC_TSV_Files/positive_entries_status.tsv'

try:
    if os.path.exists(tsv_path) and os.path.exists(source_json_folder):
        # Load Data
        df = pd.read_csv(tsv_path, sep='\t')
        df['PMCID_clean'] = df['PMCID'].apply(lambda x: str(x).strip() if pd.notna(x) else None)
        
        # Helper to clean numeric strings
        def clean_val(v):
            if pd.isna(v) or v == '': return ""
            try: return str(int(float(v)))
            except: return str(v).strip()

        high_sim = [] 
        low_sim = []
        
        json_files = [f for f in os.listdir(source_json_folder) if f.endswith('.json')]
        
        for json_file in json_files:
            pmcid = json_file.replace('.json', '')
            row = df[df['PMCID_clean'] == pmcid]
            if len(row) == 0: continue
            row = row.iloc[0]
            
            with open(os.path.join(source_json_folder, json_file), 'r') as f:
                data = json.load(f)
                
            j_title = str(data.get('publication/title', "")).strip()
            t_title = str(row['Title']).strip() if pd.notna(row['Title']) else ""
            
            if j_title != t_title:
                ratio = difflib.SequenceMatcher(None, j_title, t_title).ratio()
                
                entry = {
                    'pmcid': pmcid,
                    'json': data,
                    'tsv': row,
                    'ratio': ratio
                }
                
                if ratio >= 0.8: high_sim.append(entry)
                elif ratio <= 0.2: low_sim.append(entry)

        # Toggle Selection Logic
        target_pool = []
        mode_str = ""
        
        # Try to respect toggle, but fallback if one pool is empty
        if inspection_mode_high:
            if high_sim: 
                target_pool = high_sim
                mode_str = "High Similarity (>=80%)"
            elif low_sim:
                target_pool = low_sim
                mode_str = "Low Similarity (<=20%) [High list empty]"
        else:
            if low_sim:
                target_pool = low_sim
                mode_str = "Low Similarity (<=20%)"
            elif high_sim:
                target_pool = high_sim
                mode_str = "High Similarity (>=80%) [Low list empty]"
        
        # Flip toggle for next run
        inspection_mode_high = not inspection_mode_high
        
        if not target_pool:
            print("No mismatches found in either category.")
        else:
            item = random.choice(target_pool)
            
            # Prepare IDs for Link
            curr_pmcid = item['pmcid']
            curr_pmid = clean_val(item['tsv']['PMID'])
            
            # --- DISPLAY SECTION ---
            
            # Using display() ensures rich output isn't hidden/truncated easily by text buffer limits
            display(Markdown(f"### {mode_str} | Similarity: {item['ratio']:.2f}"))
            
            # Create HTML links to Europe PMC Search
            url_pmcid = f"https://europepmc.org/search?query={curr_pmcid}"
            url_pmid = f"https://europepmc.org/search?query={curr_pmid}" if curr_pmid else "#"
            
            # Render Links
            display(HTML(f"""
            <div style="background-color: #e8e8e8; padding: 12px; border-radius: 4px; border-left: 5px solid #007acc; font-family: sans-serif;">
                <span style="font-weight: bold; margin-right: 10px;">IDS:</span>
                <a href="{url_pmcid}" target="_blank" style="text-decoration: none; font-weight: bold; color: #0066cc; margin-right: 20px; font-size: 1.1em;">{curr_pmcid} ↗</a>
                <a href="{url_pmid}" target="_blank" style="text-decoration: none; font-weight: bold; color: #0066cc; font-size: 1.1em;">PMID:{curr_pmid} ↗</a>
            </div>
            <br>
            """))
            
            # Comparison Loop
            fields = [
                ('Title', 'publication/title', 'Title'),
                ('Authors', 'publication/authors', 'Authors'),
                ('Journal', 'publication/journal', 'Journal'),
                ('Year', 'publication/year', 'Year'),
                ('DOI', 'publication/doi', 'DOI')
            ]
            
            for label, k_json, k_tsv in fields:
                v_json = str(item['json'].get(k_json, "")).strip()
                v_tsv = item['tsv'][k_tsv]
                
                # Special clean for display
                if k_tsv == 'Year': v_tsv = clean_val(v_tsv)
                else: v_tsv = str(v_tsv).strip() if pd.notna(v_tsv) else ""
                
                match = v_json == v_tsv
                symbol = "✅" if match else "❌"
                
                if label == 'DOI':
                    # Special HTML handling for clickable DOI
                    def make_doi_link(v):
                        if not v: return "<em>(empty)</em>"
                        # Simple cleanup if formatted strangely, but usually just the DOI string
                        return f'<a href="https://doi.org/{v}" target="_blank" style="text-decoration: underline; color: #0066cc;">{v} ↗</a>'
                    
                    display(HTML(f"<strong>{symbol} [{label}]</strong>"))
                    if not match:
                        display(HTML(f"&nbsp;&nbsp;JSON: {make_doi_link(v_json)}"))
                        display(HTML(f"&nbsp;&nbsp;TSV : {make_doi_link(v_tsv)}"))
                    else:
                        display(HTML(f"&nbsp;&nbsp;{make_doi_link(v_json)}"))
                    print("-" * 60)
                else:
                    # Use print for content to avoid HTML rendering issues with weird chars
                    print(f"{symbol} [{label}]")
                    if not match:
                        print(f"  JSON: {v_json}")
                        print(f"  TSV : {v_tsv}")
                    else:
                        print(f"  {v_json}")
                    print("-" * 60)
            
            # --- EXPANDABLE REST OF DATA ---
            # Identify keys already shown
            shown_keys = [f[1] for f in fields]
            
            # Collect remaining data
            remaining_data = {k: v for k, v in item['json'].items() if k not in shown_keys}
            
            if remaining_data:
                # Format to JSON string and escape for HTML
                json_str = json.dumps(remaining_data, indent=2)
                safe_json_str = html.escape(json_str)
                
                display(HTML(f"""
                <br>
                <details style="border: 1px solid #ddd; border-radius: 4px; padding: 10px; background-color: #fafafa;">
                    <summary style="cursor: pointer; color: #555; font-weight: bold; padding: 5px;">
                        ▶ Show Remaining JSON Data ({len(remaining_data)} fields)
                    </summary>
                    <pre style="margin-top: 10px; background-color: #fff; padding: 10px; border: 1px solid #eee; border-radius: 4px; overflow-x: auto;">{safe_json_str}</pre>
                </details>
                """))
            
            print("\n(Run cell again [Ctrl+Enter] to flip category next time)")

    else:
        print("Error: Files not found.")

except Exception as e:
    print(f"Error: {e}")

### Low Similarity (<=20%) | Similarity: 0.19

❌ [Title]
  JSON: Internet Interventions 25 (2021) 100424
  TSV : Predicting acute suicidal ideation on Instagram using ensemble machine learning models.
------------------------------------------------------------
❌ [Authors]
  JSON: The authors who contributed to this article are Damien Lekkas, Robert J. Klein, and Nicholas C. Jacobson. Damien Lekkas was involved in conceptualization, methodology, software, formal analysis, writing the original draft, reviewing and editing, and visualization. Robert J. Klein contributed to writing the original draft and reviewing and editing. Nicholas C. Jacobson assisted with methodology and formal analysis, as well as reviewing and editing the paper.
  TSV : Lekkas D, Klein RJ, Jacobson NC
------------------------------------------------------------
❌ [Journal]
  JSON: Internet Interventions
  TSV : Internet interventions
------------------------------------------------------------
✅ [Year]
  2021
---------------------------------------------------

------------------------------------------------------------



(Run cell again [Ctrl+Enter] to flip category next time)


In [29]:
# Block 10.0: Unified Analysis on Copilot_v0 (Older Batch) with Registry Metadata
# This block repeats:
# 1. Title Mismatch Analysis
# 2. "No Information" Coverage Analysis & Visualisation
# 3. Uses 'PMCIDs_DOME_Registry_Contents_2026-01-09.tsv' because it contains 'mapped_pmcid'

import os
import json
import pandas as pd
import matplotlib.pyplot as plt
import datetime
import numpy as np
import difflib

print("="*80)
print("RUNNING UNIFIED ANALYSIS ON REGISTRY DATASET (Copilot_v0_Processed_2025-12-04)")
print("="*80)

# --- Configuration ---
# Updated path to point to the inner directory containing JSONs
json_folder = 'Copilot_v0_Processed_2025-12-04/registry_v0'
# User requested 'flattened_...' but that file lacks 'mapped_pmcid'. 
# Using 'PMCIDs_...' which contains the mapping column.
tsv_path = 'DOME_Registry_TSV_Files/PMCIDs_DOME_Registry_Contents_2026-01-09.tsv'

timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
output_dir = f"Registry_Analysis_Reports/Report_{timestamp}"
target_phrase = "Not enough information is available"

# Create output directory
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
print(f"Output Directory: {output_dir}")

# --- Helper Functions ---
def get_category(key):
    if key.startswith('publication'): return 'Publication'
    if key.startswith('dataset'): return 'Data'
    if key.startswith('optimization'): return 'Optimisation'
    if key.startswith('model'): return 'Model'
    if key.startswith('evaluation'): return 'Evaluation'
    return 'Other'

try:
    # 1. Load Metadata
    print(f"Loading metadata from: {tsv_path}")
    df = pd.read_csv(tsv_path, sep='\t')
    
    # Filter for rows that have a mapped PMCID
    df_mapped = df[df['mapped_pmcid'].notna()].copy()
    df_mapped['clean_pmcid'] = df_mapped['mapped_pmcid'].apply(lambda x: str(x).strip())
    
    # Handle duplicates by taking the first occurrence
    initial_len = len(df_mapped)
    df_mapped = df_mapped.drop_duplicates(subset=['clean_pmcid'], keep='first')
    if len(df_mapped) < initial_len:
        print(f"Dropped {initial_len - len(df_mapped)} duplicate PMCIDs from metadata.")
    
    # Create lookup dictionary
    meta_lookup = df_mapped.set_index('clean_pmcid').to_dict('index')
    print(f"Loaded {len(df)} rows. Found {len(df_mapped)} unique mapped PMCIDs.")

    # 2. Scan JSON Files
    if os.path.exists(json_folder):
        json_files = [f for f in os.listdir(json_folder) if f.endswith('.json')]
        total_files = len(json_files)
        print(f"Found {total_files} JSON files in {json_folder}")
        
        if total_files == 0:
            print("Warning: No JSON files found. Checking path...")
            print(f"Path contents: {os.listdir(json_folder)}")
        
        # Stats Containers
        field_counts = {}
        all_fields = set()
        
        # Mismatch Containers
        mismatches = []
        matched_count = 0
        missing_title_in_json_count = 0
        
        print("Processing files...")
        
        for json_file in json_files:
            pmcid = json_file.replace('.json', '')
            file_path = os.path.join(json_folder, json_file)
            
            with open(file_path, 'r') as f:
                data = json.load(f)
            
            # --- Analysis A: Coverage ("Not enough info") ---
            for key, val in data.items():
                all_fields.add(key)
                if key not in field_counts: field_counts[key] = 0
                
                if str(val).strip().startswith(target_phrase):
                    field_counts[key] += 1
            
            # --- Analysis B: Title Mismatches & Metadata Validation ---
            # Check if JSON title says "Not enough info"
            json_title = str(data.get('publication/title', '')).strip()
            if json_title == target_phrase:
                missing_title_in_json_count += 1
            
            # Compare with Metadata
            if pmcid in meta_lookup:
                matched_count += 1
                row = meta_lookup[pmcid]
                
                # Metadata Title
                tsv_title = str(row.get('publication_title', '')).strip()
                
                # Calculate Similarity
                # Ignore if JSON title is missing/not-enough-info
                if json_title and json_title != target_phrase:
                    if json_title != tsv_title:
                        ratio = difflib.SequenceMatcher(None, json_title, tsv_title).ratio() 
                        if ratio < 1.0: # Keep all diffs for report
                            mismatches.append({
                                'pmcid': pmcid,
                                'json_title': json_title,
                                'tsv_title': tsv_title,
                                'ratio': ratio
                            })

        print("Processing complete.")
        
        if total_files > 0:
            # --- 3. Generate Coverage Report (Same as Block 9.0) ---
            data_list = []
            for key in all_fields:
                missing_count = field_counts.get(key, 0)
                category = get_category(key)
                pct_missing = (missing_count / total_files) * 100 if total_files > 0 else 0
                
                data_list.append({
                    'Field': key,
                    'Category': category,
                    'Missing_Count': missing_count,
                    'Total_Files': total_files,
                    'Missing_Percentage': pct_missing
                })
                
            df_stats = pd.DataFrame(data_list)
            df_stats = df_stats.sort_values(by=['Category', 'Field'])
            
            # -- Plotting --
            # 1. Category Summary
            category_stats = df_stats.groupby('Category')['Missing_Percentage'].mean().reset_index()
            cat_order = ['Publication', 'Data', 'Optimisation', 'Model', 'Evaluation']
            category_stats['Category'] = pd.Categorical(category_stats['Category'], categories=cat_order, ordered=True)
            category_stats = category_stats.sort_values('Category')
            
            plt.figure(figsize=(10, 6))
            bars = plt.bar(category_stats['Category'], category_stats['Missing_Percentage'], color='#4c72b0')
            plt.title('Average Information Gap by Category (Registry Dataset)', fontsize=12)
            plt.ylabel('Avg. Missing %')
            plt.ylim(0, 100)
            for bar in bars:
                height = bar.get_height()
                plt.text(bar.get_x() + bar.get_width()/2., height + 1, f'{height:.1f}%', ha='center', va='bottom')
            plt.tight_layout()
            plt.savefig(os.path.join(output_dir, 'Registry_Category_Summary.png'), dpi=150)
            plt.close()
            
            # 2. Field Level
            unique_cats = [c for c in cat_order if c in df_stats['Category'].unique()]
            if not unique_cats:
                print("No categories found to plot.")
            else:
                fig, axes = plt.subplots(nrows=len(unique_cats), ncols=1, figsize=(12, 4 * len(unique_cats)), constrained_layout=True)
                if len(unique_cats) == 1: axes = [axes]
                
                for i, cat in enumerate(unique_cats):
                    if i < len(axes): # Safety check
                        ax = axes[i]
                        subset = df_stats[df_stats['Category'] == cat].sort_values('Missing_Percentage', ascending=False)
                        y_pos = np.arange(len(subset))
                        ax.barh(y_pos, subset['Missing_Percentage'], align='center', color='#55a868')
                        ax.set_yticks(y_pos)
                        ax.set_yticklabels(subset['Field'])
                        ax.invert_yaxis()
                        ax.set_xlabel('% Coverage Gap')
                        ax.set_title(f'Category: {cat}')
                        ax.set_xlim(0, 100)
                        for j, v in enumerate(subset['Missing_Percentage']):
                            ax.text(v + 1, j, f"{v:.1f}%", va='center', fontsize=9)
                    
                plt.suptitle(f'Detailed Gap Analysis (Registry Data, n={total_files})', fontsize=16)
                plt.savefig(os.path.join(output_dir, 'Registry_Field_Analysis.png'), dpi=150)
                plt.close()
            
            # --- 4. Validation Report ---
            
            report_file = os.path.join(output_dir, 'Registry_Analysis_Report.md')
            with open(report_file, 'w') as r:
                r.write(f"# DOME Registry Data Analysis Report\n")
                r.write(f"**Date:** {timestamp}\n")
                r.write(f"**JSON Dataset:** `{json_folder}` ({total_files} files)\n")
                r.write(f"**Metadata:** `{tsv_path}`\n\n")
                
                r.write("## 1. Metadata Linking\n")
                r.write(f"- Total JSON Files: {total_files}\n")
                r.write(f"- Matched to Registry Metadata: {matched_count} ({(matched_count/total_files)*100:.1f}%)\n\n")
                
                r.write("## 2. Title Analysis\n")
                r.write(f"- JSONs with '{target_phrase}' as Title: {missing_title_in_json_count}\n")
                r.write(f"- Title Mismatches (vs Metadata): {len(mismatches)}\n\n")
                
                if mismatches:
                    r.write("### Low Similarity Title Mismatches (Ratio < 0.5)\n")
                    r.write("| PMCID | JSON Title | Metadata Title | Similarity |\n")
                    r.write("|---|---|---|---|\n")
                    severe_mismatches = [m for m in mismatches if m['ratio'] < 0.5]
                    for m in severe_mismatches[:20]: # Show top 20
                        r.write(f"| {m['pmcid']} | {m['json_title'][:50]}... | {m['tsv_title'][:50]}... | {m['ratio']:.2f} |\n")
                    if len(severe_mismatches) > 20:
                        r.write(f"| ... | ... | ... | ... |\n")
                
                r.write("\n## 3. Information Coverage\n")
                r.write("![Category Summary](Registry_Category_Summary.png)\n\n")
                
                r.write("| Category | Avg Missing % |\n")
                r.write("|---|---|\n")
                for _, row in category_stats.iterrows():
                    r.write(f"| {row['Category']} | {row['Missing_Percentage']:.1f}% |\n")
                    
            print(f"Analysis Complete.")
            print(f"Report saved to: {report_file}")
        
    else:
        print(f"Error: JSON folder {json_folder} not found.")

except Exception as e:
    import traceback
    traceback.print_exc()
    print(f"Critical Error: {e}")


RUNNING UNIFIED ANALYSIS ON REGISTRY DATASET (Copilot_v0_Processed_2025-12-04)
Output Directory: Registry_Analysis_Reports/Report_2026-01-15_16-42-24
Loading metadata from: DOME_Registry_TSV_Files/PMCIDs_DOME_Registry_Contents_2026-01-09.tsv
Dropped 10 duplicate PMCIDs from metadata.
Loaded 280 rows. Found 237 unique mapped PMCIDs.
Found 231 JSON files in Copilot_v0_Processed_2025-12-04/registry_v0
Processing files...
Processing complete.
Analysis Complete.
Report saved to: Registry_Analysis_Reports/Report_2026-01-15_16-42-24/Registry_Analysis_Report.md


In [30]:
# Block 11.0: Update Registry JSONs via Europe PMC API (Fetch & Apply)
# 1. Scans JSONs in 'Copilot_v0_Processed_2025-12-04/registry_v0' to get PMCIDs.
# 2. Queries Europe PMC API for metadata (Title, Authors, Journal, Year, DOI).
# 3. Saves fetched metadata to 'DOME_Registry_Remediation/registry_metadata_remediation.tsv'.
# 4. Updates JSONs with this new metadata and saves them to 'Copilot_v0_Processed_2025-12-04_Updated_Metadata'.

import os
import json
import requests
import pandas as pd
import time
import shutil

print("="*80)
print("RUNNING METADATA UPDATE VIA EUROPE PMC API (Block 11.0)")
print("="*80)

# --- Configuration ---
source_folder = 'Copilot_v0_Processed_2025-12-04/registry_v0'
target_folder = 'Copilot_v0_Processed_2025-12-04_Updated_Metadata'
remediation_tsv = 'DOME_Registry_Remediation/registry_metadata_remediation.tsv'
api_url = "https://www.ebi.ac.uk/europepmc/webservices/rest/search"

# Ensure directories exist
os.makedirs(os.path.dirname(remediation_tsv), exist_ok=True)
if not os.path.exists(target_folder):
    os.makedirs(target_folder)
    print(f"Created target directory: {target_folder}")

# --- Helper Functions ---
def fetch_metadata(pmcids):
    """
    Fetches metadata for a list of PMCIDs in batches.
    Returns a dictionary keyed by PMCID.
    """
    results = {}
    batch_size = 50 # Europe PMC handles largish queries, but keep it safe
    
    # Chunk the PMCIDs
    for i in range(0, len(pmcids), batch_size):
        batch = pmcids[i:i + batch_size]
        query = " OR ".join([f'PMCID:{pid}' for pid in batch])
        
        params = {
            'query': query,
            'format': 'json',
            'resultType': 'core',
            'pageSize': batch_size
        }
        
        try:
            response = requests.get(api_url, params=params)
            response.raise_for_status()
            data = response.json()
            
            for item in data.get('resultList', {}).get('result', []):
                pmcid = item.get('pmcid')
                if pmcid:
                    # Extract fields
                    # Authors list to string
                    author_list = item.get('authorList', {}).get('author', [])
                    authors_str = ", ".join([f"{a.get('lastName', '')} {a.get('firstName', '')}".strip() for a in author_list])
                    
                    results[pmcid] = {
                        'PMCID': pmcid,
                        'Title': item.get('title', ''),
                        'Authors': authors_str,
                        'Journal': item.get('journalInfo', {}).get('journal', {}).get('title', ''),
                        'Year': item.get('pubYear', ''),
                        'DOI': item.get('doi', '')
                    }
            
            # Be polite to the API
            time.sleep(0.5)
            
        except Exception as e:
            print(f"Error fetching batch starting with {batch[0]}: {e}")
            
    return results

try:
    # 1. Identify PMCIDs from files
    if os.path.exists(source_folder):
        json_files = [f for f in os.listdir(source_folder) if f.endswith('.json')]
        pmcids_to_fetch = [f.replace('.json', '') for f in json_files]
        total_files = len(pmcids_to_fetch)
        
        print(f"Found {total_files} files. Starting metadata fetch...")
        
        # 2. Fetch Metadata
        metadata_map = fetch_metadata(pmcids_to_fetch)
        print(f"Successfully fetched metadata for {len(metadata_map)} entries.")
        
        # 3. Save to TSV
        df_rem = pd.DataFrame(list(metadata_map.values()))
        # Ensure column order
        cols = ['PMCID', 'Title', 'Authors', 'Journal', 'Year', 'DOI']
        df_rem = df_rem[cols] if not df_rem.empty else pd.DataFrame(columns=cols)
        
        df_rem.to_csv(remediation_tsv, sep='\t', index=False)
        print(f"Saved remediation TSV to: {remediation_tsv}")
        
        # 4. Update JSONs
        print(f"Updating JSONs in {target_folder}...")
        updated_count = 0
        
        for json_file in json_files:
            pmcid = json_file.replace('.json', '')
            source_path = os.path.join(source_folder, json_file)
            target_path = os.path.join(target_folder, json_file)
            
            # Read Source
            with open(source_path, 'r') as f:
                data = json.load(f)
            
            # Check if we have new data
            if pmcid in metadata_map:
                meta = metadata_map[pmcid]
                
                # Apply updates (always overwrite with fresh API data)
                data['publication/title'] = meta['Title']
                data['publication/authors'] = meta['Authors']
                data['publication/journal'] = meta['Journal']
                data['publication/year'] = meta['Year']
                data['publication/doi'] = meta['DOI']
                
                updated_count += 1
            
            # Write to Target
            with open(target_path, 'w') as f:
                json.dump(data, f, indent=2)
                
        print(f"Update Process Complete.")
        print(f"Total files written: {total_files}")
        print(f"Files updated with API data: {updated_count}")
        
    else:
        print(f"Error: Source folder {source_folder} not found.")

except Exception as e:
    import traceback
    traceback.print_exc()
    print(f"Critical Error: {e}")


RUNNING METADATA UPDATE VIA EUROPE PMC API (Block 11.0)
Created target directory: Copilot_v0_Processed_2025-12-04_Updated_Metadata
Found 231 files. Starting metadata fetch...
Successfully fetched metadata for 231 entries.
Saved remediation TSV to: DOME_Registry_Remediation/registry_metadata_remediation.tsv
Updating JSONs in Copilot_v0_Processed_2025-12-04_Updated_Metadata...
Update Process Complete.
Total files written: 231
Files updated with API data: 231


In [31]:
# Block 12.0: Unified Analysis on UPDATED Registry JSONs vs Registry Metadata
# This performs the same cross-check as Block 10.0, but on the newly remediated JSONs
# folder: 'Copilot_v0_Processed_2025-12-04_Updated_Metadata'

import os
import json
import pandas as pd
import matplotlib.pyplot as plt
import datetime
import numpy as np
import difflib

print("="*80)
print("RUNNING ANALYSIS ON UPDATED REGISTRY DATASET")
print("="*80)

# --- Configuration ---
# Pointing to the new folder created in Block 11.0
json_folder = 'Copilot_v0_Processed_2025-12-04_Updated_Metadata'
tsv_path = 'DOME_Registry_TSV_Files/PMCIDs_DOME_Registry_Contents_2026-01-09.tsv'

timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
output_dir = f"Registry_Analysis_Reports/Report_Updated_{timestamp}"
target_phrase = "Not enough information is available"

# Create output directory
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
print(f"Output Directory: {output_dir}")

# --- Helper Functions ---
def get_category(key):
    if key.startswith('publication'): return 'Publication'
    if key.startswith('dataset'): return 'Data'
    if key.startswith('optimization'): return 'Optimisation'
    if key.startswith('model'): return 'Model'
    if key.startswith('evaluation'): return 'Evaluation'
    return 'Other'

try:
    # 1. Load Metadata
    print(f"Loading metadata from: {tsv_path}")
    df = pd.read_csv(tsv_path, sep='\t')
    
    # Filter for rows that have a mapped PMCID
    df_mapped = df[df['mapped_pmcid'].notna()].copy()
    df_mapped['clean_pmcid'] = df_mapped['mapped_pmcid'].apply(lambda x: str(x).strip())
    
    # Handle duplicates by taking the first occurrence
    initial_len = len(df_mapped)
    df_mapped = df_mapped.drop_duplicates(subset=['clean_pmcid'], keep='first')
    if len(df_mapped) < initial_len:
        print(f"Dropped {initial_len - len(df_mapped)} duplicate PMCIDs from metadata.")
    
    # Create lookup dictionary
    meta_lookup = df_mapped.set_index('clean_pmcid').to_dict('index')
    print(f"Loaded {len(df)} rows. Found {len(df_mapped)} unique mapped PMCIDs.")

    # 2. Scan JSON Files
    if os.path.exists(json_folder):
        json_files = [f for f in os.listdir(json_folder) if f.endswith('.json')]
        total_files = len(json_files)
        print(f"Found {total_files} JSON files in {json_folder}")
        
        if total_files == 0:
            print("Warning: No JSON files found.")
        
        # Stats Containers
        field_counts = {}
        all_fields = set()
        
        # Mismatch Containers
        mismatches = []
        matched_count = 0
        missing_title_in_json_count = 0
        
        print("Processing files...")
        
        for json_file in json_files:
            pmcid = json_file.replace('.json', '')
            file_path = os.path.join(json_folder, json_file)
            
            with open(file_path, 'r') as f:
                data = json.load(f)
            
            # --- Analysis A: Coverage ("Not enough info") ---
            for key, val in data.items():
                all_fields.add(key)
                if key not in field_counts: field_counts[key] = 0
                
                if str(val).strip().startswith(target_phrase):
                    field_counts[key] += 1
            
            # --- Analysis B: Title Mismatches & Metadata Validation ---
            # Check if JSON title says "Not enough info"
            json_title = str(data.get('publication/title', '')).strip()
            if json_title == target_phrase:
                missing_title_in_json_count += 1
            
            # Compare with Metadata
            if pmcid in meta_lookup:
                matched_count += 1
                row = meta_lookup[pmcid]
                
                # Metadata Title
                tsv_title = str(row.get('publication_title', '')).strip()
                
                # Calculate Similarity
                # Ignore if JSON title is missing/not-enough-info
                if json_title and json_title != target_phrase:
                    if json_title != tsv_title:
                        ratio = difflib.SequenceMatcher(None, json_title, tsv_title).ratio() 
                        if ratio < 1.0: # Keep all diffs for report
                            mismatches.append({
                                'pmcid': pmcid,
                                'json_title': json_title,
                                'tsv_title': tsv_title,
                                'ratio': ratio
                            })

        print("Processing complete.")
        
        if total_files > 0:
            # --- 3. Generate Coverage Report ---
            data_list = []
            for key in all_fields:
                missing_count = field_counts.get(key, 0)
                category = get_category(key)
                pct_missing = (missing_count / total_files) * 100 if total_files > 0 else 0
                
                data_list.append({
                    'Field': key,
                    'Category': category,
                    'Missing_Count': missing_count,
                    'Total_Files': total_files,
                    'Missing_Percentage': pct_missing
                })
                
            df_stats = pd.DataFrame(data_list)
            df_stats = df_stats.sort_values(by=['Category', 'Field'])
            
            # -- Plotting --
            # 1. Category Summary
            category_stats = df_stats.groupby('Category')['Missing_Percentage'].mean().reset_index()
            cat_order = ['Publication', 'Data', 'Optimisation', 'Model', 'Evaluation']
            category_stats['Category'] = pd.Categorical(category_stats['Category'], categories=cat_order, ordered=True)
            category_stats = category_stats.sort_values('Category')
            
            plt.figure(figsize=(10, 6))
            bars = plt.bar(category_stats['Category'], category_stats['Missing_Percentage'], color='#4c72b0')
            plt.title('Average Information Gap by Category (Updated Dataset)', fontsize=12)
            plt.ylabel('Avg. Missing %')
            plt.ylim(0, 100)
            for bar in bars:
                height = bar.get_height()
                plt.text(bar.get_x() + bar.get_width()/2., height + 1, f'{height:.1f}%', ha='center', va='bottom')
            plt.tight_layout()
            plt.savefig(os.path.join(output_dir, 'Updated_Registry_Category_Summary.png'), dpi=150)
            plt.close()
            
            # 2. Field Level
            unique_cats = [c for c in cat_order if c in df_stats['Category'].unique()]
            if not unique_cats:
                 print("No categories found to plot.")
            else:
                fig, axes = plt.subplots(nrows=len(unique_cats), ncols=1, figsize=(12, 4 * len(unique_cats)), constrained_layout=True)
                if len(unique_cats) == 1: axes = [axes]
                
                for i, cat in enumerate(unique_cats):
                    if i < len(axes):
                        ax = axes[i]
                        subset = df_stats[df_stats['Category'] == cat].sort_values('Missing_Percentage', ascending=False)
                        y_pos = np.arange(len(subset))
                        ax.barh(y_pos, subset['Missing_Percentage'], align='center', color='#55a868')
                        ax.set_yticks(y_pos)
                        ax.set_yticklabels(subset['Field'])
                        ax.invert_yaxis()
                        ax.set_xlabel('% Coverage Gap')
                        ax.set_title(f'Category: {cat}')
                        ax.set_xlim(0, 100)
                        for j, v in enumerate(subset['Missing_Percentage']):
                            ax.text(v + 1, j, f"{v:.1f}%", va='center', fontsize=9)
                        
                plt.suptitle(f'Detailed Gap Analysis (Updated Registry Data, n={total_files})', fontsize=16)
                plt.savefig(os.path.join(output_dir, 'Updated_Registry_Field_Analysis.png'), dpi=150)
                plt.close()
            
            # --- 4. Validation Report ---
            
            report_file = os.path.join(output_dir, 'Updated_Registry_Analysis_Report.md')
            with open(report_file, 'w') as r:
                r.write(f"# Updated DOME Registry Data Analysis Report\n")
                r.write(f"**Date:** {timestamp}\n")
                r.write(f"**JSON Dataset:** `{json_folder}` ({total_files} files)\n")
                r.write(f"**Metadata:** `{tsv_path}`\n\n")
                
                r.write("## 1. Metadata Linking\n")
                r.write(f"- Total JSON Files: {total_files}\n")
                r.write(f"- Matched to Registry Metadata: {matched_count} ({(matched_count/total_files)*100:.1f}%)\n\n")
                
                r.write("## 2. Title Analysis\n")
                r.write(f"- JSONs with '{target_phrase}' as Title: {missing_title_in_json_count}\n")
                r.write(f"- Title Mismatches (vs Metadata): {len(mismatches)}\n\n")
                
                if mismatches:
                    r.write("### Low Similarity Title Mismatches (Ratio < 0.5)\n")
                    r.write("| PMCID | JSON Title | Metadata Title | Similarity |\n")
                    r.write("|---|---|---|---|\n")
                    severe_mismatches = [m for m in mismatches if m['ratio'] < 0.5]
                    for m in severe_mismatches[:20]: # Show top 20
                        r.write(f"| {m['pmcid']} | {m['json_title'][:50]}... | {m['tsv_title'][:50]}... | {m['ratio']:.2f} |\n")
                    if len(severe_mismatches) > 20:
                        r.write(f"| ... | ... | ... | ... |\n")
                
                r.write("\n## 3. Information Coverage\n")
                r.write("![Category Summary](Updated_Registry_Category_Summary.png)\n\n")
                
                r.write("| Category | Avg Missing % |\n")
                r.write("|---|---|\n")
                for _, row in category_stats.iterrows():
                    r.write(f"| {row['Category']} | {row['Missing_Percentage']:.1f}% |\n")
                    
            print(f"Analysis Complete.")
            print(f"Report saved to: {report_file}")
            
    else:
        print(f"Error: JSON folder {json_folder} not found.")

except Exception as e:
    import traceback
    traceback.print_exc()
    print(f"Critical Error: {e}")


RUNNING ANALYSIS ON UPDATED REGISTRY DATASET
Output Directory: Registry_Analysis_Reports/Report_Updated_2026-01-15_16-53-53
Loading metadata from: DOME_Registry_TSV_Files/PMCIDs_DOME_Registry_Contents_2026-01-09.tsv
Dropped 10 duplicate PMCIDs from metadata.
Loaded 280 rows. Found 237 unique mapped PMCIDs.
Found 231 JSON files in Copilot_v0_Processed_2025-12-04_Updated_Metadata
Processing files...
Processing complete.
Analysis Complete.
Report saved to: Registry_Analysis_Reports/Report_Updated_2026-01-15_16-53-53/Updated_Registry_Analysis_Report.md


In [69]:
# Block 13.0: Manual Visual Inspection Interface for Remediated Registry JSONs
# Comparison:
# - JSONs: Copilot_v0_Processed_2025-12-04_Updated_Metadata (Updated via API)
# - TSV: DOME_Registry_Remediation/registry_metadata_remediation.tsv (Source of truth from API)

import os
import pandas as pd
import json
import difflib
import random
import html
from IPython.display import display, Markdown, HTML, clear_output

# Global toggle for alternation (persists across cell runs)
if 'remediation_inspection_mode_high' not in globals():
    remediation_inspection_mode_high = True # True=High, False=Low

source_json_folder = 'Copilot_v0_Processed_2025-12-04_Updated_Metadata'
tsv_path = 'DOME_Registry_Remediation/registry_metadata_remediation.tsv'

try:
    if os.path.exists(tsv_path) and os.path.exists(source_json_folder):
        # Load Data
        df = pd.read_csv(tsv_path, sep='\t')
        
        # Prepare TSV data 
        # Remediation TSV uses 'PMCID' as key
        df['PMCID_clean'] = df['PMCID'].apply(lambda x: str(x).strip() if pd.notna(x) else None)
        df_mapped = df.drop_duplicates(subset=['PMCID_clean'], keep='first')
        
        # Helper to clean numeric strings
        def clean_val(v):
            if pd.isna(v) or v == '': return ""
            try: return str(int(float(v)))
            except: return str(v).strip()

        high_sim = [] 
        low_sim = []
        
        json_files = [f for f in os.listdir(source_json_folder) if f.endswith('.json')]
        
        for json_file in json_files:
            pmcid = json_file.replace('.json', '')
            
            # Find matching TSV row
            row = df_mapped[df_mapped['PMCID_clean'] == pmcid]
            if len(row) == 0: continue
            row = row.iloc[0]
            
            with open(os.path.join(source_json_folder, json_file), 'r') as f:
                data = json.load(f)
                
            j_title = str(data.get('publication/title', "")).strip()
            # TSV column is 'Title' in the remediation file
            t_title = str(row['Title']).strip() if pd.notna(row['Title']) else ""
            
            # We expect them to be identical if update worked, but let's check
            if j_title != t_title:
                ratio = difflib.SequenceMatcher(None, j_title, t_title).ratio()
            else:
                ratio = 1.0
                
            entry = {
                'pmcid': pmcid,
                'json': data,
                'tsv': row,
                'ratio': ratio
            }
            
            if ratio >= 0.8: high_sim.append(entry)
            elif ratio <= 0.8: low_sim.append(entry)

        # Toggle Selection Logic
        target_pool = []
        mode_str = ""
        
        if remediation_inspection_mode_high:
            if high_sim: 
                target_pool = high_sim
                mode_str = "Exact Match / High Similarity (Sampling verified entries)"
            elif low_sim:
                target_pool = low_sim
                mode_str = "Mismatches (Sampling despite preference)"
        else:
            if low_sim:
                target_pool = low_sim
                mode_str = "Mismatches / Differences"
            elif high_sim:
                target_pool = high_sim
                mode_str = "Exact Matches (No mismatches found)"
        
        remediation_inspection_mode_high = not remediation_inspection_mode_high
        
        if not target_pool:
            print("No entries found.")
        else:
            item = random.choice(target_pool)
            
            # Prepare IDs for Link
            curr_pmcid = item['pmcid']
            curr_pmid = '' # Not in this TSV usually, unless we want to fetch it? The remediation TSV in Block 11 didn't save PMID.
            
            # --- DISPLAY SECTION ---
            
            display(Markdown(f"### {mode_str} | Similarity: {item['ratio']:.2f}"))
            
            # HTML Links
            url_pmcid = f"https://europepmc.org/search?query={curr_pmcid}"
            
            display(HTML(f"""
            <div style="background-color: #e8e8e8; padding: 12px; border-radius: 4px; border-left: 5px solid #6f42c1; font-family: sans-serif;">
                <span style="font-weight: bold; margin-right: 10px;">ID:</span>
                <a href="{url_pmcid}" target="_blank" style="text-decoration: none; font-weight: bold; color: #6610f2; margin-right: 20px; font-size: 1.1em;">{curr_pmcid} ↗</a>
            </div>
            <br>
            """))
            
            # Comparison Loop for Remediation Fields ('Title', 'Authors', 'Journal', 'Year', 'DOI')
            fields = [
                ('Title', 'publication/title', 'Title'),
                ('Authors', 'publication/authors', 'Authors'),
                ('Journal', 'publication/journal', 'Journal'),
                ('Year', 'publication/year', 'Year'),
                ('DOI', 'publication/doi', 'DOI')
            ]
            
            for label, k_json, k_tsv in fields:
                v_json = str(item['json'].get(k_json, "")).strip()
                v_tsv = item['tsv'].get(k_tsv, "")
                
                if k_tsv == 'Year': v_tsv = clean_val(v_tsv)
                else: v_tsv = str(v_tsv).strip() if pd.notna(v_tsv) else ""
                
                match = v_json == v_tsv
                symbol = "✅" if match else "❌"
                
                if label == 'DOI':
                    def make_doi_link(v):
                        if not v: return "<em>(empty)</em>"
                        return f'<a href="https://doi.org/{v}" target="_blank" style="text-decoration: underline; color: #0066cc;">{v} ↗</a>'
                    
                    display(HTML(f"<strong>{symbol} [{label}]</strong>"))
                    if not match:
                        display(HTML(f"&nbsp;&nbsp;JSON: {make_doi_link(v_json)}"))
                        display(HTML(f"&nbsp;&nbsp;TSV : {make_doi_link(v_tsv)}"))
                    else:
                        display(HTML(f"&nbsp;&nbsp;{make_doi_link(v_json)}"))
                    print("-" * 60)
                else:
                    print(f"{symbol} [{label}]")
                    if not match:
                        print(f"  JSON: {v_json}")
                        print(f"  TSV : {v_tsv}")
                    else:
                        if len(v_json) > 100:
                            print(f"  {v_json[:100]}...")
                        else:
                            print(f"  {v_json}")
                    print("-" * 60)
            
            # --- EXPANDABLE REST OF DATA ---
            shown_keys = [f[1] for f in fields]
            remaining_data = {k: v for k, v in item['json'].items() if k not in shown_keys}
            
            if remaining_data:
                json_str = json.dumps(remaining_data, indent=2)
                safe_json_str = html.escape(json_str)
                
                display(HTML(f"""
                <br>
                <details style="border: 1px solid #ddd; border-radius: 4px; padding: 10px; background-color: #fafafa;">
                    <summary style="cursor: pointer; color: #555; font-weight: bold; padding: 5px;">
                        ▶ Show Remaining JSON Data ({len(remaining_data)} fields)
                    </summary>
                    <pre style="margin-top: 10px; background-color: #fff; padding: 10px; border: 1px solid #eee; border-radius: 4px; overflow-x: auto;">{safe_json_str}</pre>
                </details>
                """))
            
            print("\n(Run cell again [Ctrl+Enter] to flip between Matches and Mismatches)")

    else:
        print("Error: Files not found.")
        print(f"TSV Exists: {os.path.exists(tsv_path)}")
        print(f"JSON Folder Exists: {os.path.exists(source_json_folder)}")

except Exception as e:
    import traceback
    traceback.print_exc()
    print(f"Error: {e}")


### Exact Match / High Similarity (Sampling verified entries) | Similarity: 1.00

✅ [Title]
  Cross-modal contrastive learning decodes developmental regulatory features through chromatin potenti...
------------------------------------------------------------
✅ [Authors]
  Yang Yueyuxiao, Xie Chenxi, He Qiushun, Yang Meng
------------------------------------------------------------
✅ [Journal]
  GigaScience
------------------------------------------------------------
✅ [Year]
  2025
------------------------------------------------------------


------------------------------------------------------------



(Run cell again [Ctrl+Enter] to flip between Matches and Mismatches)
