# Data Curation Interface

This notebook provides an interactive interface for curating data entries from a CSV file.

**Features:**
- Read CSV row by row
- Display title, abstract, and name for easy viewing
- Click Y (positive), N (negative), or Skip to classify entries
- Automatic saving to separate positive/negative/skipped CSV files
- Resume from specific row
- Automatic backups before saving
- Progress tracking

In [2]:
# Import required libraries
import html
import json
import os
import shutil
from datetime import datetime

import ipywidgets as widgets
import markdown
import pandas as pd
from IPython.display import display, clear_output

## Configuration

Set your file paths and column names here.

In [3]:
# Configuration: files read and written by the main curation session
INPUT_CSV = 'Dataset/random_top20_entries_across_journals.csv'  # Change this to your input CSV file
POSITIVE_CSV = 'positive_entries.csv'  # Passed as positive_csv: rows classified with YES
NEGATIVE_CSV = 'negative_entries.csv'  # Passed as negative_csv: rows classified with NO
SKIPPED_CSV = 'skipped_entries.csv'  # Passed as skipped_csv: rows the curator skipped
PROGRESS_FILE = 'curation_progress.json'  # JSON file holding the last row index and timestamp

# Column names (adjust these to match your CSV)
TITLE_COL = 'Title'  # Change to your title column name
ABSTRACT_COL = 'Abstract'  # Change to your abstract column name
NAME_COL = 'Journal'  # Column shown under the "Name:" label (here: the journal name)
YEAR_COL = 'Year'  # Change to your year column name

## Curation Class

Main class that handles all curation functionality.

In [4]:
class CurationInterface:
    """Widget-based interface for classifying CSV entries one row at a time.

    Rows of ``input_csv`` are shown one by one. Each decision (positive,
    negative or skipped) appends the row to the matching output CSV. Rows
    whose DOI already appears in any of the three output files are removed
    when the interface is created, so every session starts at the first
    remaining row.
    """

    def __init__(self, input_csv, positive_csv, negative_csv, skipped_csv, progress_file,
                 title_col='title', abstract_col='abstract', name_col='name', year_col='Year'):
        """Load the entries that still need curation and build the widgets.

        Args:
            input_csv: Path of the CSV file to curate.
            positive_csv: Output CSV for entries classified as positive.
            negative_csv: Output CSV for entries classified as negative.
            skipped_csv: Output CSV for skipped entries.
            progress_file: JSON file in which the current row index is stored.
            title_col: Column displayed as the title.
            abstract_col: Column rendered (as markdown) in the abstract box.
            name_col: Column displayed under the "Name:" label.
            year_col: Column displayed under the "Year:" label.
        """
        self.input_csv = input_csv
        self.positive_csv = positive_csv
        self.negative_csv = negative_csv
        self.skipped_csv = skipped_csv
        self.progress_file = progress_file
        self.title_col = title_col
        self.abstract_col = abstract_col
        self.name_col = name_col
        self.year_col = year_col

        self.df = self._load_pending_entries()
        self.current_idx = 0
        self.total_rows = len(self.df)

        # The stored progress index is deliberately NOT restored: curated
        # entries were just removed from the frame, so a saved index would
        # point at the wrong row. load_progress() is kept for manual use.
        print(f"Starting curation session with {self.total_rows} entries remaining.")

        # Create widgets
        self.create_widgets()

    def _load_pending_entries(self):
        """Read the input CSV and return only the rows that still need curation.

        Rows without a PMCID (missing, empty or whitespace-only) are dropped
        when the file has a ``PMCID`` column; rows whose DOI was already
        written to an output file are dropped when it has a ``DOI`` column.
        """
        pending = pd.read_csv(self.input_csv)

        if 'PMCID' in pending.columns:
            pmcid = pending['PMCID']
            # Strip before comparing so whitespace-only cells count as empty,
            # matching the PMCID checks used elsewhere in this notebook.
            has_pmcid = pmcid.notna() & (pmcid.astype(str).str.strip() != '')
            pending = pending[has_pmcid]
            print(f"Filtered to {len(pending)} entries with PMCID.")

        curated_dois = self._read_curated_dois()
        if 'DOI' in pending.columns and curated_dois:
            initial_count = len(pending)
            # NOTE: rows with a missing DOI become the string 'nan' here and
            # never match, so they are shown again in every session.
            pending = pending[~pending['DOI'].astype(str).isin(curated_dois)]
            print(f"Removed {initial_count - len(pending)} already curated/skipped entries.")

        return pending.reset_index(drop=True)

    def _read_curated_dois(self):
        """Return the set of DOIs found in the positive/negative/skipped files."""
        curated_dois = set()
        for path in (self.positive_csv, self.negative_csv, self.skipped_csv):
            if not os.path.exists(path):
                continue
            try:
                curated_df = pd.read_csv(path)
                if 'DOI' in curated_df.columns:
                    curated_dois.update(curated_df['DOI'].dropna().astype(str).tolist())
            except Exception as exc:
                # Still best effort, but no longer silent: an unreadable
                # output file means its entries will be offered again.
                print(f"Warning: could not read {path} ({exc}); "
                      f"its entries will not be filtered out.")
        return curated_dois

    def load_progress(self):
        """Load progress from previous session"""
        if os.path.exists(self.progress_file):
            with open(self.progress_file, 'r') as f:
                progress = json.load(f)
                self.current_idx = progress.get('current_idx', 0)
                print(f"✓ Resuming from row {self.current_idx + 1} of {self.total_rows}")
        else:
            print(f"Starting new curation session. Total rows: {self.total_rows}")

    def save_progress(self):
        """Save current progress (row index and timestamp) to the progress file"""
        progress = {
            'current_idx': self.current_idx,
            'last_updated': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        }
        with open(self.progress_file, 'w') as f:
            json.dump(progress, f, indent=2)

    def backup_file(self, filepath):
        """Create backup of existing file (single backup, overwrites previous)

        Returns:
            The backup path (``<name>_backup<ext>`` next to the original), or
            ``None`` when ``filepath`` does not exist.
        """
        if not os.path.exists(filepath):
            return None

        directory, filename = os.path.split(filepath)
        name, ext = os.path.splitext(filename)
        # os.path.join leaves the name untouched when directory is ''.
        backup_path = os.path.join(directory, f"{name}_backup{ext}")
        shutil.copy2(filepath, backup_path)
        return backup_path

    def save_entry(self, entry, status):
        """Save entry to appropriate CSV file

        Args:
            entry: Single-row DataFrame to append.
            status: 'positive', 'negative', or 'skipped'. Any other value is
                treated as 'skipped'.

        Returns:
            A human-readable confirmation message.
        """
        if status == 'positive':
            filepath = self.positive_csv
            label = "POSITIVE"
        elif status == 'negative':
            filepath = self.negative_csv
            label = "NEGATIVE"
        else:
            filepath = self.skipped_csv
            label = "SKIPPED"

        # Backup existing file before it is rewritten
        self.backup_file(filepath)

        existing_df = None
        if os.path.exists(filepath):
            try:
                existing_df = pd.read_csv(filepath)
            except pd.errors.EmptyDataError:
                # A 0-byte file holds nothing to keep; start it afresh.
                existing_df = None

        if existing_df is None:
            updated_df = entry
        else:
            updated_df = pd.concat([existing_df, entry], ignore_index=True)

        updated_df.to_csv(filepath, index=False)

        return f"Saved to {label} ({filepath})"

    def create_widgets(self):
        """Create UI widgets and attach the button handlers"""
        # Title widget
        self.title_widget = widgets.HTML(value="")

        # Name widget
        self.name_widget = widgets.HTML(value="")

        # Year widget
        self.year_widget = widgets.HTML(value="")

        # Abstract widget (HTML for markdown rendering)
        self.abstract_widget = widgets.HTML(
            value='',
            layout=widgets.Layout(
                width='100%',
                height='300px',
                overflow_y='auto',
                border='1px solid #ccc',
                padding='10px'
            )
        )

        # Progress widget
        self.progress_widget = widgets.HTML(value="")

        # Status widget
        self.status_widget = widgets.HTML(value="")

        # Buttons
        self.yes_button = widgets.Button(
            description='✓ YES (Positive)',
            button_style='success',
            layout=widgets.Layout(width='200px', height='50px'),
            style={'font_weight': 'bold'}
        )
        self.no_button = widgets.Button(
            description='✗ NO (Negative)',
            button_style='danger',
            layout=widgets.Layout(width='200px', height='50px'),
            style={'font_weight': 'bold'}
        )
        self.skip_button = widgets.Button(
            description='Skip',
            button_style='warning',
            layout=widgets.Layout(width='100px', height='50px')
        )
        self.goto_button = widgets.Button(
            description='Go to Row',
            button_style='info',
            layout=widgets.Layout(width='100px', height='30px')
        )

        # Row input for jumping to specific row. IntText has no min/max
        # traits (only BoundedIntText does), so the range is validated in
        # on_goto instead of being passed here.
        self.row_input = widgets.IntText(
            value=self.current_idx + 1,
            layout=widgets.Layout(width='100px')
        )

        # Attach event handlers
        self.yes_button.on_click(self.on_yes)
        self.no_button.on_click(self.on_no)
        self.skip_button.on_click(self.on_skip)
        self.goto_button.on_click(self.on_goto)

    @staticmethod
    def _cell_text(row, column):
        """Return the cell as text, or 'N/A' when the column or value is missing."""
        value = row.get(column, 'N/A')
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return 'N/A'
        return str(value)

    def display_current_entry(self, status_message=''):
        """Display the current entry

        Args:
            status_message: HTML shown in the status area after the entry has
                been rendered. Defaults to an empty status.
        """
        finished = self.current_idx >= self.total_rows

        # Set the state on every call so that jumping back to a row after
        # the end was reached makes the decision buttons usable again.
        for button in (self.yes_button, self.no_button, self.skip_button):
            button.disabled = finished

        if finished:
            self.progress_widget.value = (
                f'<h3>Progress: {self.total_rows}/{self.total_rows} (100.0%)</h3>'
            )
            self.status_widget.value = (
                '<h2 style="color: green;">✓ All entries curated!</h2>' + status_message
            )
            return

        row = self.df.iloc[self.current_idx]

        # Update progress
        progress_pct = (self.current_idx / self.total_rows) * 100
        self.progress_widget.value = f'<h3>Progress: {self.current_idx}/{self.total_rows} ({progress_pct:.1f}%)</h3>'

        # Title, name and year are plain text from the CSV: escape them so
        # characters such as '<' or '&' cannot break the markup.
        title = html.escape(self._cell_text(row, self.title_col))
        self.title_widget.value = f'<h2 style="color: #2c3e50;">Title: {title}</h2>'

        name = html.escape(self._cell_text(row, self.name_col))
        self.name_widget.value = f'<p style="font-size: 14px; color: #7f8c8d;"><b>Name:</b> {name}</p>'

        year = html.escape(self._cell_text(row, self.year_col))
        self.year_widget.value = f'<p style="font-size: 14px; color: #7f8c8d;"><b>Year:</b> {year}</p>'

        # Update abstract - convert markdown to HTML
        abstract = self._cell_text(row, self.abstract_col)
        try:
            # Convert markdown to HTML
            html_abstract = markdown.markdown(
                abstract,
                extensions=['markdown.extensions.extra', 'markdown.extensions.codehilite']
            )
            # Add some styling to the rendered HTML
            styled_abstract = f'''
            <div style="
                font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
                line-height: 1.6;
                color: #333;
                background-color: #f9f9f9;
                padding: 15px;
                border-radius: 5px;
                max-height: 250px;
                overflow-y: auto;
            ">
                {html_abstract}
            </div>
            '''
            self.abstract_widget.value = styled_abstract
        except Exception:
            # Deliberate catch-all: whatever goes wrong while rendering, the
            # curator still gets the abstract as escaped plain text.
            self.abstract_widget.value = f'<div style="padding: 15px; background-color: #f9f9f9; border-radius: 5px;"><pre>{html.escape(abstract)}</pre></div>'

        # Update row input
        self.row_input.value = self.current_idx + 1

        # Show the caller's message last; setting it before rendering would
        # let the rendering wipe it out.
        self.status_widget.value = status_message

    def _record_decision(self, status, css_style, symbol):
        """Save the current row under ``status``, advance and show a confirmation."""
        if self.current_idx >= self.total_rows:
            # Nothing left to classify (e.g. handler invoked programmatically).
            return
        entry = self.df.iloc[[self.current_idx]]
        message = self.save_entry(entry, status=status)
        self.current_idx += 1
        self.save_progress()
        self.display_current_entry(
            status_message=f'<p style="{css_style}">{symbol} {message}</p>'
        )

    def on_yes(self, button):
        """Handle YES button click"""
        self._record_decision('positive', 'color: green; font-weight: bold;', '✓')

    def on_no(self, button):
        """Handle NO button click"""
        self._record_decision('negative', 'color: red; font-weight: bold;', '✗')

    def on_skip(self, button):
        """Handle SKIP button click"""
        self._record_decision('skipped', 'color: orange;', '⊘')

    def on_goto(self, button):
        """Handle Go to Row button click"""
        target_row = self.row_input.value - 1  # Convert to 0-indexed
        if 0 <= target_row < self.total_rows:
            self.current_idx = target_row
            self.save_progress()
            self.display_current_entry(
                status_message=f'<p style="color: blue;">Jumped to row {target_row + 1}</p>'
            )
        else:
            self.status_widget.value = f'<p style="color: red;">Invalid row number. Must be between 1 and {self.total_rows}</p>'

    def display(self):
        """Display the interface"""
        # Create layout
        button_box = widgets.HBox([self.yes_button, self.no_button, self.skip_button],
                                   layout=widgets.Layout(justify_content='center', margin='20px 0'))

        navigation_box = widgets.HBox([widgets.Label('Jump to row:'), self.row_input, self.goto_button],
                                       layout=widgets.Layout(justify_content='center', margin='10px 0'))

        main_box = widgets.VBox([
            self.progress_widget,
            widgets.HTML('<hr>'),
            self.title_widget,
            self.name_widget,
            self.year_widget,
            widgets.HTML('<label style="font-weight: bold; font-size: 16px; margin: 10px 0; display: block;">Abstract:</label>'),
            self.abstract_widget,
            button_box,
            self.status_widget,
            widgets.HTML('<hr>'),
            navigation_box
        ])

        # Display initial entry
        self.display_current_entry()

        # Show interface
        display(main_box)

## Start Curation

Run the cell below to start the curation interface.

In [5]:
# Build the curation interface from the configuration above and render it.
# The five file paths are passed positionally, in the constructor's order.
curator = CurationInterface(
    INPUT_CSV, POSITIVE_CSV, NEGATIVE_CSV, SKIPPED_CSV, PROGRESS_FILE,
    title_col=TITLE_COL,
    abstract_col=ABSTRACT_COL,
    name_col=NAME_COL,
    year_col=YEAR_COL,
)
curator.display()

Filtered to 71981 entries with PMCID.
Removed 2106 already curated/skipped entries.
Starting curation session with 69875 entries remaining.


VBox(children=(HTML(value='<h3>Progress: 0/69875 (0.0%)</h3>'), HTML(value='<hr>'), HTML(value='<h2 style="col…

## Utilities

Helper functions for managing your curation data.

In [6]:
# View statistics
def show_statistics(positive_csv=None, negative_csv=None, skipped_csv=None, progress_file=None):
    """Display curation statistics

    Prints the number of rows in the positive, negative and skipped CSV
    files, their total, and (when the progress file exists) the stored row
    index and timestamp.

    Args:
        positive_csv: CSV of positive entries; defaults to ``POSITIVE_CSV``.
        negative_csv: CSV of negative entries; defaults to ``NEGATIVE_CSV``.
        skipped_csv: CSV of skipped entries; defaults to ``SKIPPED_CSV``.
        progress_file: Progress JSON file; defaults to ``PROGRESS_FILE``.
    """
    # Resolve defaults at call time so the configuration cell may be re-run
    # with other paths after this function has been defined.
    positive_csv = POSITIVE_CSV if positive_csv is None else positive_csv
    negative_csv = NEGATIVE_CSV if negative_csv is None else negative_csv
    skipped_csv = SKIPPED_CSV if skipped_csv is None else skipped_csv
    progress_file = PROGRESS_FILE if progress_file is None else progress_file

    def count_rows(csv_path):
        """Number of data rows in csv_path; 0 when it is missing or empty."""
        if not os.path.exists(csv_path):
            return 0
        try:
            return len(pd.read_csv(csv_path))
        except pd.errors.EmptyDataError:
            # A 0-byte file has no header and therefore no entries.
            return 0

    stats = {
        'Positive entries': count_rows(positive_csv),
        'Negative entries': count_rows(negative_csv),
        'Skipped entries': count_rows(skipped_csv),
    }
    stats['Total curated'] = stats['Positive entries'] + stats['Negative entries'] + stats['Skipped entries']

    if os.path.exists(progress_file):
        with open(progress_file, 'r') as f:
            progress = json.load(f)
        # The stored index is 0-based; report it as a 1-based row number.
        stats['Current row'] = progress.get('current_idx', 0) + 1
        stats['Last updated'] = progress.get('last_updated', 'N/A')

    print("=" * 50)
    print("CURATION STATISTICS")
    print("=" * 50)
    for key, value in stats.items():
        print(f"{key:.<40} {value}")
    print("=" * 50)

# Print the statistics for the main curation session
show_statistics()

CURATION STATISTICS
Positive entries........................ 952
Negative entries........................ 1720
Skipped entries......................... 302
Total curated........................... 2974
Current row............................. 101
Last updated............................ 2025-12-22 14:02:59


In [7]:
# Reset progress (use with caution!)
#def reset_progress():
#    """Reset progress to start from the beginning"""
#    if os.path.exists(PROGRESS_FILE):
#        os.remove(PROGRESS_FILE)
#        print("Progress reset. You can now restart from row 1.")
#    else:
#        print("No progress file found.")

# Uncomment to reset progress
# reset_progress()

# Top Positive Curation

This section implements a specialized curation workflow to identify strong positive samples.
It selects the top entry from each journal (with at least 20 matches) that has a valid PMCID.
The results are saved to `top_pos.csv`.

In [8]:
import pandas as pd
import os

# Configuration
SOURCE_CSV = 'Dataset/top20_entries_across_journals.csv'
CANDIDATES_CSV = 'Dataset/top_positive_candidates.csv'
TOP_POS_CSV = 'top_pos.csv'
TOP_NEG_CSV = 'top_neg.csv'
TOP_SKIPPED_CSV = 'top_skipped.csv'
TOP_PROGRESS_FILE = 'top_curation_progress.json'

print("Loading and preparing data...")
df = pd.read_csv(SOURCE_CSV)

# Filter for journals with >= 20 entries
journal_counts = df['Journal'].value_counts()
valid_journals = journal_counts[journal_counts >= 20].index
print(f"Found {len(valid_journals)} journals with >= 20 entries.")

df_filtered = df[df['Journal'].isin(valid_journals)]

# Group once instead of re-masking the whole frame for every journal.
# sort=False plus iterating valid_journals keeps the value_counts order, and
# each group keeps its rows in file order.
entries_by_journal = df_filtered.groupby('Journal', sort=False)
has_match_percentage = 'match_percentage' in df_filtered.columns

# Select top entry per journal with PMCID
top_entries = []
journals_without_pmcid = []

for journal in valid_journals:
    journal_entries = entries_by_journal.get_group(journal)

    # Sort by match_percentage if available to ensure we get the "top" one
    if has_match_percentage:
        journal_entries = journal_entries.sort_values('match_percentage', ascending=False)

    # Keep only rows whose PMCID is neither NaN nor blank, then take the first
    pmcid = journal_entries['PMCID']
    entries_with_pmcid = journal_entries[pmcid.notna() & (pmcid.astype(str).str.strip() != '')]

    if entries_with_pmcid.empty:
        journals_without_pmcid.append(journal)
    else:
        top_entries.append(entries_with_pmcid.iloc[0])

top_df = pd.DataFrame(top_entries)
print(f"Selected {len(top_df)} entries (one per journal with PMCID).")
if journals_without_pmcid:
    # Accounts for the gap between the journals found and the entries selected.
    print(f"Skipped {len(journals_without_pmcid)} journals without any PMCID entry.")

# Save to a temporary file for the curation interface
top_df.to_csv(CANDIDATES_CSV, index=False)
print(f"Saved candidates to {CANDIDATES_CSV}")

Loading and preparing data...
Found 3076 journals with >= 20 entries.
Found 3076 journals with >= 20 entries.
Selected 2899 entries (one per journal with PMCID).
Selected 2899 entries (one per journal with PMCID).
Saved candidates to Dataset/top_positive_candidates.csv
Saved candidates to Dataset/top_positive_candidates.csv


In [9]:
# Define specialized Curation Interface
class TopCurationInterface(CurationInterface):
    """Curation interface used for the per-journal top-candidate file.

    Loading the CSV, keeping only rows with a PMCID and removing rows whose
    DOI is already in an output file are the same steps the parent
    constructor performs, so construction is delegated to it rather than
    repeated here. The class is kept as a distinct type for the top-positive
    workflow.
    """

    def __init__(self, input_csv, positive_csv, negative_csv, skipped_csv, progress_file,
                 title_col='title', abstract_col='abstract', name_col='name', year_col='Year'):
        """Create the interface; arguments are identical to ``CurationInterface``."""
        super().__init__(
            input_csv=input_csv,
            positive_csv=positive_csv,
            negative_csv=negative_csv,
            skipped_csv=skipped_csv,
            progress_file=progress_file,
            title_col=title_col,
            abstract_col=abstract_col,
            name_col=name_col,
            year_col=year_col,
        )

# Build the interface for the top-candidate file and render it.
# The five file paths are passed positionally, in the constructor's order.
top_curator = TopCurationInterface(
    CANDIDATES_CSV, TOP_POS_CSV, TOP_NEG_CSV, TOP_SKIPPED_CSV, TOP_PROGRESS_FILE,
    title_col='Title',
    abstract_col='Abstract',
    name_col='Journal',
    year_col='Year',
)
top_curator.display()

Filtered to 2899 entries with PMCID.
Removed 512 already curated/skipped entries.
Starting curation session with 2387 entries remaining.


VBox(children=(HTML(value='<h3>Progress: 0/2387 (0.0%)</h3>'), HTML(value='<hr>'), HTML(value='<h2 style="colo…

## Top Curation Statistics

View progress and statistics for the top positive curation session.

In [35]:
# View statistics for Top Positive Curation
def show_top_statistics(positive_csv=None, negative_csv=None, skipped_csv=None, progress_file=None):
    """Display top curation statistics

    Prints the number of rows in the top positive, negative and skipped CSV
    files, their total, and (when the progress file exists) the stored row
    index and timestamp.

    Args:
        positive_csv: CSV of top positive entries; defaults to ``TOP_POS_CSV``.
        negative_csv: CSV of top negative entries; defaults to ``TOP_NEG_CSV``.
        skipped_csv: CSV of top skipped entries; defaults to ``TOP_SKIPPED_CSV``.
        progress_file: Progress JSON file; defaults to ``TOP_PROGRESS_FILE``.
    """
    # Resolve defaults at call time so the configuration cell may be re-run
    # with other paths after this function has been defined.
    positive_csv = TOP_POS_CSV if positive_csv is None else positive_csv
    negative_csv = TOP_NEG_CSV if negative_csv is None else negative_csv
    skipped_csv = TOP_SKIPPED_CSV if skipped_csv is None else skipped_csv
    progress_file = TOP_PROGRESS_FILE if progress_file is None else progress_file

    def count_rows(csv_path):
        """Number of data rows in csv_path; 0 when it is missing or empty."""
        if not os.path.exists(csv_path):
            return 0
        try:
            return len(pd.read_csv(csv_path))
        except pd.errors.EmptyDataError:
            # A 0-byte file has no header and therefore no entries.
            return 0

    stats = {
        'Top Positive entries': count_rows(positive_csv),
        'Top Negative entries': count_rows(negative_csv),
        'Top Skipped entries': count_rows(skipped_csv),
    }
    stats['Total Top curated'] = stats['Top Positive entries'] + stats['Top Negative entries'] + stats['Top Skipped entries']

    if os.path.exists(progress_file):
        with open(progress_file, 'r') as f:
            progress = json.load(f)
        # The stored index is 0-based; report it as a 1-based row number.
        stats['Current row'] = progress.get('current_idx', 0) + 1
        stats['Last updated'] = progress.get('last_updated', 'N/A')

    print("=" * 50)
    print("TOP CURATION STATISTICS")
    print("=" * 50)
    for key, value in stats.items():
        print(f"{key:.<40} {value}")
    print("=" * 50)

# Print the statistics for the top positive curation session
show_top_statistics()

TOP CURATION STATISTICS
Top Positive entries.................... 507
Top Negative entries.................... 200
Top Skipped entries..................... 239
Total Top curated....................... 946
Current row............................. 404
Last updated............................ 2025-12-23 11:50:41


In [31]:
import pandas as pd
import os

def get_pmcid_count(csv_path):
    """Count the rows of a CSV file that carry a usable PMCID.

    Returns:
        The number of rows whose ``PMCID`` is neither missing nor blank, or a
        descriptive string when the file is absent, lacks the column, or
        cannot be processed.
    """
    if os.path.exists(csv_path):
        try:
            frame = pd.read_csv(csv_path)
            if 'PMCID' in frame.columns:
                pmcid = frame['PMCID']
                # Compare as text so whitespace-only cells count as blank.
                is_blank = pmcid.astype(str).str.strip() == ''
                return len(frame[pmcid.notna() & ~is_blank])
            return "Column 'PMCID' not found"
        except Exception as exc:
            return f"Error: {str(exc)}"
    return "File not found"

# File paths
top_pos_path = 'top_pos.csv'
pos_entries_path = 'positive_entries.csv'


def _count_csv_entries(csv_path):
    """Return the number of data rows in csv_path (0 when missing or empty).

    The file is parsed as CSV rather than counted line by line: a quoted
    field may contain line breaks, so one entry can span several physical
    lines, and a 0-byte file would otherwise be reported as -1.
    """
    if not os.path.exists(csv_path):
        return 0
    try:
        return len(pd.read_csv(csv_path))
    except pd.errors.EmptyDataError:
        return 0


print("=" * 50)
print("PMCID COUNT SUMMARY")
print("=" * 50)

# Check Top Positive CSV
count_top = get_pmcid_count(top_pos_path)
print(f"Top Positive CSV ({top_pos_path}): {count_top}")

# Check Positive Entries CSV
count_pos = get_pmcid_count(pos_entries_path)
print(f"Positive Entries CSV ({pos_entries_path}): {count_pos}")

print("=" * 50)

# Add both counts for a total; get_pmcid_count returns a string on failure,
# so the total is only printed when both results are integers.
if isinstance(count_top, int) and isinstance(count_pos, int):
    total_count = count_top + count_pos
    print(f"Total PMCIDs across both files: {total_count}")

# Count the entries (data rows) in both CSVs to see how many there are in total
total_lines_top = _count_csv_entries(top_pos_path)
total_lines_pos = _count_csv_entries(pos_entries_path)
total_lines = total_lines_top + total_lines_pos
print(f"Total entries across both files: {total_lines}")
print("=" * 50)


PMCID COUNT SUMMARY
Top Positive CSV (top_pos.csv): 436
Positive Entries CSV (positive_entries.csv): 565
Total PMCIDs across both files: 1001
Total entries across both files: 1419
