# **Manual DOI Remediation Interface**

This notebook provides an interactive interface to manually remediate DOME Registry entries that failed DOI-to-PMCID mapping.

## Instructions:
1. Run all cells in order
2. For each failed entry, click the DOI link to verify the article
3. Search PubMed if needed
4. Enter the PMID and/or PMCID you found
5. Click "Save & Next" to move to the next entry
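6. The remediated TSV file (`DOME_Registry_TSV_Files/remediated_Failed_DOI_Mappings_[date].tsv`) is created on the first save and rewritten after every subsequent save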


In [1]:
# Setup: Import required libraries
import html
import os
from datetime import datetime
from urllib.parse import quote

import ipywidgets as widgets
import pandas as pd
from IPython.display import display, HTML, clear_output
from IPython.display import Javascript

# Confirm that the setup cell ran to completion (two lines of output).
print(
    "Libraries imported successfully!",
    "Ready to start remediation process.",
    sep="\n",
)

Libraries imported successfully!
Ready to start remediation process.


In [2]:
# Load the failed mappings TSV file
#
# Produces two module-level frames that the later cells rely on:
#   df_failed        - every row of the most recent Failed_DOI_Mappings_*.tsv
#   df_to_remediate  - the subset whose Remediation_Status is 'NEEDS_REVIEW'
# Both are defined (as empty frames) even when no file is found, so running the
# remaining cells reports "nothing to do" instead of raising NameError.
import glob

# Find the most recent failed mappings file
remediation_folder = 'DOME_Registry_Remediation'
tsv_pattern = os.path.join(remediation_folder, 'Failed_DOI_Mappings_*.tsv')
tsv_files = glob.glob(tsv_pattern)

if not tsv_files:
    df_failed = pd.DataFrame()
    df_to_remediate = pd.DataFrame()
    print("ERROR: No failed mappings TSV file found!")
    print(f"Looking in: {remediation_folder}/")
    print("Please run Block 9 of the main notebook first to generate the failed mappings file.")
else:
    # Get the most recent file (by modification time)
    latest_tsv = max(tsv_files, key=os.path.getmtime)
    print(f"Found failed mappings file: {latest_tsv}")

    # Load the data. The manual identifier columns are forced to text: with
    # type inference, a PMID column that also contains blanks is parsed as
    # float64 and an identifier such as 12345678 turns into 12345678.0.
    df_failed = pd.read_csv(
        latest_tsv,
        sep='\t',
        dtype={'Manual_PMID': str, 'Manual_PMCID': str},
    )

    # Filter only entries that still need review
    df_to_remediate = df_failed[df_failed['Remediation_Status'] == 'NEEDS_REVIEW'].copy()

    print(f"\nTotal failed mappings: {len(df_failed)}")
    print(f"Already remediated: {len(df_failed) - len(df_to_remediate)}")
    print(f"Still need review: {len(df_to_remediate)}")

    if len(df_to_remediate) == 0:
        print("\n✓ All entries have been remediated!")
    else:
        print(f"\nReady to remediate {len(df_to_remediate)} entries.")

Found failed mappings file: DOME_Registry_Remediation/Failed_DOI_Mappings_2025-11-25.tsv

Total failed mappings: 31
Already remediated: 0
Still need review: 31

Ready to remediate 31 entries.


In [4]:
# Interactive Remediation Interface
#
# Builds an ipywidgets form that walks through the rows of `df_to_remediate`
# (created by the loading cell), lets the reviewer record a PMID / PMCID, a
# status and free-text notes, and rewrites the remediated TSV after each save.

# Output location. Defined outside the `if` below so that the statistics and
# merge cells can reference `remediated_tsv_path` even when nothing is left to
# review.
current_date = datetime.now().strftime('%Y-%m-%d')
output_tsv_folder = 'DOME_Registry_TSV_Files'
remediated_tsv_path = os.path.join(output_tsv_folder, f'remediated_Failed_DOI_Mappings_{current_date}.tsv')

# Statuses the reviewer can assign, and the columns the form writes to.
STATUS_OPTIONS = ['RESOLVED', 'NOT_IN_PUBMED', 'DOI_ERROR', 'SKIP_FOR_NOW']
MANUAL_TEXT_COLUMNS = ['Manual_PMID', 'Manual_PMCID', 'Remediation_Status', 'Remediation_Notes']

if len(df_to_remediate) > 0:

    # Current entry index (position within df_to_remediate)
    current_index = 0

    # to_csv does not create missing directories
    os.makedirs(output_tsv_folder, exist_ok=True)

    # Initialize remediated dataframe: resume from today's file when it exists,
    # otherwise start from a copy of the failed-mappings data.
    if os.path.exists(remediated_tsv_path):
        df_remediated = pd.read_csv(
            remediated_tsv_path,
            sep='\t',
            dtype={'Manual_PMID': str, 'Manual_PMCID': str},  # keep identifiers as text, not floats
        )
        print(f"Loading existing remediated file: {remediated_tsv_path}")
    else:
        df_remediated = df_failed.copy()
        print(f"Creating new remediated file: {remediated_tsv_path}")

    # The form stores strings in these columns. A column that is entirely blank
    # is loaded as float64, and assigning text into it is deprecated in recent
    # pandas, so make them object columns up front (creating any that are missing).
    for text_column in MANUAL_TEXT_COLUMNS:
        if text_column in df_remediated.columns:
            df_remediated[text_column] = df_remediated[text_column].astype(object)
        else:
            df_remediated[text_column] = None

    def _as_text(value):
        """Return `value` as stripped text, or '' when it is missing (None/NaN)."""
        return '' if pd.isna(value) else str(value).strip()

    def _field_html(row, column, default='N/A'):
        """Return row[column] as HTML-escaped text, or `default` when absent/blank."""
        return html.escape(_as_text(row.get(column, default)) or default)

    # Create widgets
    output = widgets.Output()

    # Input fields
    pmid_input = widgets.Text(
        value='',
        placeholder='Enter PMID (e.g., 12345678)',
        description='PMID:',
        style={'description_width': '100px'}
    )

    pmcid_input = widgets.Text(
        value='',
        placeholder='Enter PMCID (e.g., PMC1234567)',
        description='PMCID:',
        style={'description_width': '100px'}
    )

    status_dropdown = widgets.Dropdown(
        options=STATUS_OPTIONS,
        value='RESOLVED',
        description='Status:',
        style={'description_width': '100px'}
    )

    notes_input = widgets.Textarea(
        value='',
        placeholder='Add any notes about this entry',
        description='Notes:',
        rows=3,
        style={'description_width': '100px'}
    )

    # Buttons
    save_next_btn = widgets.Button(
        description='Save & Next',
        button_style='success',
        icon='check'
    )

    skip_btn = widgets.Button(
        description='Skip',
        button_style='warning',
        icon='forward'
    )

    prev_btn = widgets.Button(
        description='Previous',
        button_style='info',
        icon='backward'
    )

    # Progress indicator
    progress_label = widgets.HTML()

    def display_entry(index):
        """Render entry `index` of `df_to_remediate` and pre-fill the form.

        An `index` at or beyond the end of `df_to_remediate` shows the
        completion banner instead. Form fields are pre-filled from
        `df_remediated` (the data that is actually saved), so values entered
        earlier reappear when navigating back or resuming from an existing file.
        """
        with output:
            clear_output(wait=True)

            if index >= len(df_to_remediate):
                display(HTML("""
                    <div style='padding: 20px; background-color: #d4edda; border: 2px solid #28a745; border-radius: 5px;'>
                        <h2 style='color: #155724;'>✓ All Entries Remediated!</h2>
                        <p>You have reviewed all failed mappings.</p>
                        <p>Remediated file saved to: <strong>{}</strong></p>
                    </div>
                """.format(html.escape(remediated_tsv_path))))
                return

            row = df_to_remediate.iloc[index]

            # Update progress
            progress_label.value = f"<b>Entry {index + 1} of {len(df_to_remediate)}</b>"

            # Entry information. Everything is HTML-escaped because titles and
            # journal names may contain characters such as '<' or '&'.
            shortid = _field_html(row, 'shortid')
            title = _field_html(row, 'publication_title')
            year = _field_html(row, 'publication_year')
            journal = _field_html(row, 'publication_journal')
            doi_cleaned = _field_html(row, 'DOI_Cleaned')
            doi_link = _field_html(row, 'DOI_Link', default='')
            # DOIs may contain '&', '#', '(' ... which would corrupt the query string
            pubmed_term = quote(_as_text(row.get('DOI_Cleaned', '')), safe='')

            html_content = f"""
            <div style='padding: 15px; background-color: #f8f9fa; border: 1px solid #dee2e6; border-radius: 5px; margin-bottom: 15px;'>
                <h3 style='color: #495057; margin-top: 0;'>Entry Details</h3>
                <table style='width: 100%; border-collapse: collapse;'>
                    <tr>
                        <td style='padding: 8px; font-weight: bold; width: 150px;'>DOME ID:</td>
                        <td style='padding: 8px;'>{shortid}</td>
                    </tr>
                    <tr>
                        <td style='padding: 8px; font-weight: bold;'>Title:</td>
                        <td style='padding: 8px;'>{title}</td>
                    </tr>
                    <tr>
                        <td style='padding: 8px; font-weight: bold;'>Year:</td>
                        <td style='padding: 8px;'>{year}</td>
                    </tr>
                    <tr>
                        <td style='padding: 8px; font-weight: bold;'>Journal:</td>
                        <td style='padding: 8px;'>{journal}</td>
                    </tr>
                    <tr>
                        <td style='padding: 8px; font-weight: bold;'>DOI (Cleaned):</td>
                        <td style='padding: 8px;'>{doi_cleaned}</td>
                    </tr>
                    <tr>
                        <td style='padding: 8px; font-weight: bold;'>Links:</td>
                        <td style='padding: 8px;'>
                            <a href='{doi_link}' target='_blank' style='color: #007bff; text-decoration: none; margin-right: 15px;'>
                                🔗 Open DOI
                            </a>
                            <a href='https://pubmed.ncbi.nlm.nih.gov/?term={pubmed_term}' target='_blank' style='color: #28a745; text-decoration: none;'>
                                🔍 Search PubMed
                            </a>
                        </td>
                    </tr>
                </table>
            </div>
            """

            display(HTML(html_content))

            # Pre-fill from the saved data; fall back to the queue row when the
            # entry is not present in df_remediated.
            saved_rows = df_remediated[df_remediated['shortid'] == row.get('shortid')]
            saved = saved_rows.iloc[0] if len(saved_rows) else row

            pmid_input.value = _as_text(saved.get('Manual_PMID', ''))
            pmcid_input.value = _as_text(saved.get('Manual_PMCID', ''))

            # Anything that is not a selectable option (NEEDS_REVIEW, NaN, an
            # unknown label) would make the Dropdown raise; default to RESOLVED.
            existing_status = saved.get('Remediation_Status', 'RESOLVED')
            status_dropdown.value = existing_status if existing_status in STATUS_OPTIONS else 'RESOLVED'

            existing_notes = saved.get('Remediation_Notes', '')
            notes_input.value = '' if pd.isna(existing_notes) else str(existing_notes)

    def save_entry(index):
        """Write the form values for entry `index` into `df_remediated` and to disk.

        Returns True when a row was updated and the TSV rewritten, False when
        `index` is past the end or the entry's shortid is not in `df_remediated`.
        """
        if index >= len(df_to_remediate):
            return False

        shortid = df_to_remediate.iloc[index].get('shortid')

        # Find the row in the main remediated dataframe
        mask = df_remediated['shortid'] == shortid
        if not mask.any():
            return False

        # Update the values
        df_remediated.loc[mask, 'Manual_PMID'] = pmid_input.value.strip()
        df_remediated.loc[mask, 'Manual_PMCID'] = pmcid_input.value.strip()
        df_remediated.loc[mask, 'Remediation_Status'] = status_dropdown.value
        df_remediated.loc[mask, 'Remediation_Notes'] = notes_input.value.strip()

        # Save to TSV
        df_remediated.to_csv(remediated_tsv_path, sep='\t', index=False)
        return True

    # Button click handlers. The index is clamped to len(df_to_remediate) (the
    # "all done" position) so repeated clicks at the end do not run past it.
    def on_save_next_clicked(b):
        """Save the current entry, advance, and report the result below the new entry."""
        global current_index
        saved_index = current_index
        was_saved = save_entry(saved_index)
        current_index = min(current_index + 1, len(df_to_remediate))
        display_entry(current_index)
        # Printed after the redraw: display_entry clears the output area, so a
        # message printed before it would never be visible.
        with output:
            if was_saved:
                print(f"✓ Saved entry {saved_index + 1} to {remediated_tsv_path}")
            elif saved_index < len(df_to_remediate):
                print(f"WARNING: entry {saved_index + 1} was NOT saved (its shortid was not found in the remediated data)")

    def on_skip_clicked(b):
        """Advance to the next entry without saving."""
        global current_index
        current_index = min(current_index + 1, len(df_to_remediate))
        display_entry(current_index)

    def on_prev_clicked(b):
        """Go back one entry (no-op on the first entry)."""
        global current_index
        if current_index > 0:
            current_index -= 1
            display_entry(current_index)

    save_next_btn.on_click(on_save_next_clicked)
    skip_btn.on_click(on_skip_clicked)
    prev_btn.on_click(on_prev_clicked)

    # Layout
    input_box = widgets.VBox([
        widgets.HTML("<h3>Enter Remediation Data:</h3>"),
        pmid_input,
        pmcid_input,
        status_dropdown,
        notes_input,
        widgets.HBox([prev_btn, save_next_btn, skip_btn]),
        progress_label
    ], layout=widgets.Layout(padding='10px'))

    # Display interface
    display(widgets.VBox([input_box, output]))

    # Display first entry
    display_entry(current_index)

else:
    print("No entries to remediate!")

Creating new remediated file: DOME_Registry_TSV_Files/remediated_Failed_DOI_Mappings_2025-11-25.tsv


VBox(children=(VBox(children=(HTML(value='<h3>Enter Remediation Data:</h3>'), Text(value='', description='PMID…

## Summary Statistics

Run this cell after completing remediation to see statistics about your progress.

In [None]:
# Display remediation statistics
#
# Reads the remediated TSV back from disk and prints a per-status breakdown
# plus the number of rows that carry a manual PMID / PMCID.

def pct_of_total(count, total):
    """Return `count` as a percentage of `total`; 0.0 when `total` is 0."""
    return count / total * 100 if total else 0.0

# `remediated_tsv_path` is set by the remediation-interface cell. Look it up
# defensively so this cell reports "no file" instead of raising NameError when
# that cell has not defined it.
stats_tsv_path = globals().get('remediated_tsv_path')

if stats_tsv_path and os.path.exists(stats_tsv_path):
    df_stats = pd.read_csv(stats_tsv_path, sep='\t')

    total = len(df_stats)
    status_counts = df_stats['Remediation_Status'].value_counts()
    resolved = int(status_counts.get('RESOLVED', 0))
    not_in_pubmed = int(status_counts.get('NOT_IN_PUBMED', 0))
    doi_error = int(status_counts.get('DOI_ERROR', 0))
    needs_review = int(status_counts.get('NEEDS_REVIEW', 0))
    skip_for_now = int(status_counts.get('SKIP_FOR_NOW', 0))

    # Count entries with PMID/PMCID (blank cells are read back as NaN)
    with_pmid = int(df_stats['Manual_PMID'].notna().sum())
    with_pmcid = int(df_stats['Manual_PMCID'].notna().sum())

    print("="*60)
    print("REMEDIATION SUMMARY STATISTICS")
    print("="*60)
    print(f"Total failed mappings: {total}")
    print(f"\nStatus breakdown:")
    print(f"  ✓ Resolved: {resolved} ({pct_of_total(resolved, total):.1f}%)")
    print(f"  ✗ Not in PubMed: {not_in_pubmed} ({pct_of_total(not_in_pubmed, total):.1f}%)")
    print(f"  ⚠ DOI Error: {doi_error} ({pct_of_total(doi_error, total):.1f}%)")
    print(f"  ⏭ Skipped for now: {skip_for_now} ({pct_of_total(skip_for_now, total):.1f}%)")
    print(f"  ⏳ Still needs review: {needs_review} ({pct_of_total(needs_review, total):.1f}%)")
    print(f"\nIdentifiers found:")
    print(f"  PMID entries: {with_pmid}")
    print(f"  PMCID entries: {with_pmcid}")
    print(f"\nRemediated file: {stats_tsv_path}")
    print("="*60)
else:
    print("No remediated file found yet. Start remediation process above.")

## Next Steps

After completing remediation:

1. **Review the remediated TSV file** in `DOME_Registry_TSV_Files/remediated_Failed_DOI_Mappings_[date].tsv`
2. **Re-run download blocks** (Blocks 5-7) for newly identified PMCIDs
3. **Update the main DOME Registry TSV** by merging the remediated data
4. **Re-run Block 8** to update metadata statistics

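### Optional: Merge remediated data into a copy of the main TSV
Run the cell below to merge manual PMID/PMCID values from entries marked `RESOLVED` into the most recent main DOME Registry TSV. The original file is left untouched; the merged result is saved alongside it with a `_merged` suffix.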

In [None]:
# Optional: Merge remediated data back to main TSV
# NOTE: the main DOME Registry TSV is NOT modified in place. The merged result
# is written next to it as a new file with a `_merged` suffix.

merge_confirmed = input("Do you want to merge remediated data back to main TSV? (yes/no): ")

if merge_confirmed.strip().lower() == 'yes':
    # `remediated_tsv_path` is set by the remediation-interface cell; look it up
    # defensively so a missing definition is reported instead of raising NameError.
    merge_source_path = globals().get('remediated_tsv_path')

    if merge_source_path and os.path.exists(merge_source_path):
        # Read into a separate name: rebinding `df_remediated` here would swap
        # the frame that the widget callbacks above write to.
        # Identifiers are read as text; with type inference a PMID column that
        # contains blanks becomes float64 and 12345678 is merged as 12345678.0.
        df_manual = pd.read_csv(
            merge_source_path,
            sep='\t',
            dtype={'Manual_PMID': str, 'Manual_PMCID': str},
        )

        # Find the most recent main TSV file
        main_tsv_pattern = 'DOME_Registry_TSV_Files/PMCIDs_DOME_Registry_Contents_*.tsv'
        main_tsv_files = glob.glob(main_tsv_pattern)

        if main_tsv_files:
            latest_main_tsv = max(main_tsv_files, key=os.path.getmtime)
            print(f"Loading main TSV: {latest_main_tsv}")

            df_main = pd.read_csv(latest_main_tsv, sep='\t')

            # Text identifiers are written into these columns; make existing
            # numeric columns object-typed first, since assigning a string into
            # a numeric column is deprecated in recent pandas.
            for target_column in ('mapped_pmid', 'mapped_pmcid', 'mapped_europepmc_id'):
                if target_column in df_main.columns:
                    df_main[target_column] = df_main[target_column].astype(object)

            # Merge only resolved entries with PMID/PMCID
            resolved_entries = df_manual[
                (df_manual['Remediation_Status'] == 'RESOLVED') &
                (df_manual['Manual_PMID'].notna() | df_manual['Manual_PMCID'].notna())
            ]

            merge_count = 0
            for _, row in resolved_entries.iterrows():
                mask = df_main['shortid'] == row['shortid']
                if not mask.any():
                    continue

                manual_pmid = '' if pd.isna(row['Manual_PMID']) else str(row['Manual_PMID']).strip()
                manual_pmcid = '' if pd.isna(row['Manual_PMCID']) else str(row['Manual_PMCID']).strip()

                if manual_pmid:
                    df_main.loc[mask, 'mapped_pmid'] = manual_pmid
                if manual_pmcid:
                    df_main.loc[mask, 'mapped_pmcid'] = manual_pmcid
                    # Also update europepmc_id
                    df_main.loc[mask, 'mapped_europepmc_id'] = manual_pmcid

                # Count only rows where an identifier was actually written
                if manual_pmid or manual_pmcid:
                    merge_count += 1

            # Save merged TSV with new filename. splitext touches only the final
            # extension; str.replace would rewrite every '.tsv' in the path.
            main_root, main_ext = os.path.splitext(latest_main_tsv)
            merged_tsv_path = f"{main_root}_merged{main_ext}"
            df_main.to_csv(merged_tsv_path, sep='\t', index=False)

            print(f"\n✓ Merged {merge_count} remediated entries")
            print(f"✓ Saved to: {merged_tsv_path}")
            print(f"\nYou can now use this merged file for downloading PDFs and supplementary files!")
        else:
            print("ERROR: Could not find main DOME Registry TSV file!")
    else:
        print("ERROR: No remediated file found!")
else:
    print("Merge cancelled.")