# Europe PMC Links Generator

Here is the Python notebook version of the Perl script you provided. It replicates the functionality exactly: it validates your input against the official Europe PMC Schema (labslink.xsd), handles the Provider Details, and converts your TSV file into the required XML format.

## Prerequisites
To run this in a local Jupyter environment or Google Colab, you will need the lxml, pandas, and requests libraries. You can install them by running this cell first:

In [None]:
!pip install lxml pandas requests

## Step 1: Imports and Setup
This cell sets up the necessary libraries and defines the location of the Europe PMC schema.

In [None]:
import csv
import os
import sys
import time

import pandas as pd
import requests
from lxml import etree

# Location of the official Europe PMC LabsLink schema (XSD).
# get_schema() downloads this URL each time it is called.
# NOTE(review): this is a plain-HTTP URL — confirm whether the https:// form
# should be used instead.
SCHEMA_URL = 'http://europepmc.org/docs/labslink.xsd'

def get_schema(timeout=30):
    """Fetch the Europe PMC LabsLink XSD and compile it into a validator.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        An ``lxml.etree.XMLSchema`` built from the document at ``SCHEMA_URL``.

    Raises:
        SystemExit: With status 1, after printing the error, if the download
            fails, the server returns an error status, or the response is not
            a well-formed, compilable XML schema.
    """
    try:
        response = requests.get(SCHEMA_URL, timeout=timeout)
        response.raise_for_status()
        # Parse from bytes so lxml honours the encoding declared in the XSD.
        schema_root = etree.XML(response.content)
        return etree.XMLSchema(schema_root)
    except (requests.RequestException, etree.LxmlError) as e:
        # Network/HTTP failures and XML/XSD parse failures are the expected
        # ways this can go wrong; anything else is a bug and should surface.
        print(f"Error loading schema from {SCHEMA_URL}: {e}")
        sys.exit(1)

# Initialize schema
# The schema is fetched and compiled once here; the later cells reuse this
# module-level `schema` object for every validation call.
schema = get_schema()
print("Schema loaded successfully.")

## Step 2: Configuration
Define your file names here. In a notebook, this replaces the command-line arguments.

In [None]:
# --- USER CONFIGURATION ---
TAB_DELIMITED_FILE = 'input_links.tsv'       # Input: tab-separated links table
PROVIDER_DETAILS_FILE = 'provider_info.xml'  # Provider details, read if present, else created
OUTPUT_LINKS_FILE = 'output_links.xml'       # Output: generated links XML
# --------------------------

# Report up front whether the input table is present. This only prints a
# message; it does not stop the notebook.
input_present = os.path.exists(TAB_DELIMITED_FILE)
if input_present:
    print(f"Input file '{TAB_DELIMITED_FILE}' found.")
else:
    print(f"Error: Cannot find input file '{TAB_DELIMITED_FILE}'.")
    print("Please upload your TSV file before proceeding.")

## Step 3: Handle Provider Details
This section checks if your provider XML exists. If not, it prompts you for the details (Provider ID, Resource Name, etc.), creates the file, and validates it.

In [None]:
# Starts empty; set further down once the provider details are read or created.
provider_id = ''

def prompt_user(message):
    """Ask the user for a value and return it with surrounding whitespace removed.

    The prompt shown is ``message`` followed by a colon and a space.

    Raises:
        ValueError: If the answer is empty or whitespace only.
    """
    answer = input(f"{message}: ")
    answer = answer.strip()
    if answer:
        return answer
    raise ValueError("You must enter a value.")

# Load the provider ID from an existing provider file, or interactively create
# and save a new one. Either way `provider_id` is set on success; on failure
# the error is printed and the process exits with status 1.
if os.path.exists(PROVIDER_DETAILS_FILE):
    # File exists: read and validate it, then extract the provider ID.
    try:
        parser = etree.XMLParser(remove_blank_text=True)
        xml_doc = etree.parse(PROVIDER_DETAILS_FILE, parser)
        schema.assertValid(xml_doc)

        # Take the first un-namespaced <id> element anywhere in the tree.
        # NOTE(review): if the schema places elements in a namespace, this
        # lookup needs a namespace-qualified path — confirm against the XSD.
        found_id = xml_doc.getroot().find('.//id')
        if found_id is None:
            raise ValueError("Could not find <id> element in provider file.")

        # An empty <id/> has text None; reject it here rather than carrying
        # a None provider ID into the link-generation step.
        provider_id = (found_id.text or '').strip()
        if not provider_id:
            raise ValueError("The <id> element in provider file is empty.")
        print(f"Loaded Provider ID: {provider_id}")

    except etree.DocumentInvalid as e:
        print(f"Validation Error in {PROVIDER_DETAILS_FILE}:")
        print(e)
        sys.exit(1)
    except (etree.XMLSyntaxError, OSError, ValueError) as e:
        # Malformed XML, an unreadable file, or a missing/empty <id> are
        # reported the same way as a validation failure instead of escaping
        # as a raw traceback.
        print(f"Error reading {PROVIDER_DETAILS_FILE}: {e}")
        sys.exit(1)
else:
    # File does not exist: collect the details and create it.
    print(f"'{PROVIDER_DETAILS_FILE}' not found. Let's create it.")

    try:
        p_id = prompt_user("Enter your provider ID")
        r_name = prompt_user("Enter your resource name (heading for links)")
        desc = prompt_user("Enter a description")
        email = prompt_user("Enter your email address")

        # Build <providers><provider>...</provider></providers>
        providers = etree.Element('providers')
        provider = etree.SubElement(providers, 'provider')

        etree.SubElement(provider, 'id').text = p_id
        etree.SubElement(provider, 'resourceName').text = r_name
        etree.SubElement(provider, 'description').text = desc
        etree.SubElement(provider, 'email').text = email

        xml_doc = etree.ElementTree(providers)

        # Validate before saving so an invalid file is never written to disk.
        schema.assertValid(xml_doc)

        xml_doc.write(PROVIDER_DETAILS_FILE, pretty_print=True, xml_declaration=True, encoding='UTF-8')
        provider_id = p_id
        print(f"Created and validated {PROVIDER_DETAILS_FILE}")

    except Exception as e:
        # Deliberately broad: this is the cell's top-level boundary, and an
        # empty answer (ValueError from prompt_user), a schema violation, or
        # a write failure should all end with one readable message.
        print(f"Error creating provider file: {e}")
        sys.exit(1)

## Step 4: Convert TSV to Links XML
This is the main logic. It reads your TSV using pandas (which handles line endings better than raw Perl), constructs the XML DOM using lxml, validates the final structure, and saves it.

In [None]:
# Convert the tab-delimited input into a LabsLink <links> document, validate
# it against the schema, and write it to OUTPUT_LINKS_FILE. Rows lacking a
# source, id or url are reported and skipped. A schema failure is printed and
# written to a timestamped validation_errors_*.txt file.
print("Processing links...")

try:
    # Read the TSV file.
    # The first row is treated as a header and replaced by the fixed column
    # names below. If your file has NO header, change 'header=0' to
    # 'header=None'.
    df = pd.read_csv(
        TAB_DELIMITED_FILE,
        sep='\t',
        header=0,
        names=['source', 'id', 'url', 'title'],
        dtype=str,
        # Keep literal values such as "NA", "null" or "None" as text; by
        # default pandas would turn them into missing values.
        keep_default_na=False,
        # Tabs are the only field delimiter. A double quote inside a title is
        # data, not the start of a quoted field that may span tabs or lines.
        quoting=csv.QUOTE_NONE,
        # Rows with extra fields must not promote the first column to an
        # index, which would shift every value one column to the left.
        index_col=False,
    )

    # Fields absent from short rows may still come back as NaN; normalise
    # them to empty strings so every cell is a str.
    df = df.fillna('')

    # Create root XML element
    root_element = etree.Element('links')

    valid_rows = 0

    for row in df.itertuples():
        source = row.source.strip()
        record_id = row.id.strip()
        url = row.url.strip()
        title = row.title.strip()

        # Skip rows with missing essential data (empty or whitespace only).
        # +2 turns the 0-based row index into a 1-based file line number,
        # allowing for the header line.
        if not (source and record_id and url):
            print(f"Skipping line {row.Index + 2}: Missing source, id, or url.")
            continue

        # Create hierarchy: link -> resource/record
        link_element = etree.SubElement(root_element, 'link', providerId=provider_id)

        # Resource block: the target URL plus an optional title
        resource_element = etree.SubElement(link_element, 'resource')
        etree.SubElement(resource_element, 'url').text = url
        if title:
            etree.SubElement(resource_element, 'title').text = title

        # Record block: the record the link is attached to
        record_element = etree.SubElement(link_element, 'record')
        etree.SubElement(record_element, 'source').text = source
        etree.SubElement(record_element, 'id').text = record_id

        valid_rows += 1

    # Create the ElementTree
    links_doc = etree.ElementTree(root_element)

    # Validate the generated XML before anything is written to disk
    schema.assertValid(links_doc)
    print("Generated XML is valid against labslink.xsd")

    # Write to file
    links_doc.write(OUTPUT_LINKS_FILE, pretty_print=True, xml_declaration=True, encoding='UTF-8')

    print("------------------------------------------------")
    print("Program finished.")
    print(f"Processed {valid_rows} links.")
    print(f"Files created: {OUTPUT_LINKS_FILE}, {PROVIDER_DETAILS_FILE}")
    print("Please upload both files to the Europe PMC FTP site.")

except etree.DocumentInvalid as e:
    print(f"XML Validation Failed: {e}")
    # Save error log. str(e) carries only the first problem, so the full
    # error log attached to the exception is appended as well.
    with open(f"validation_errors_{int(time.time())}.txt", "w", encoding="utf-8") as f:
        f.write(f"{e}\n")
        for entry in e.error_log:
            f.write(f"{entry}\n")
    print("See validation_errors file for details.")

except Exception as e:
    # Top-level boundary of the cell: report anything else (unreadable TSV,
    # characters not allowed in XML, write failure) without a traceback.
    print(f"An unexpected error occurred: {e}")

## How this Python version improves on the Perl script:
* **Pandas for Input:** It uses pandas to read the TSV, which is much more robust at handling different line endings (\r, \n) and encoding issues than the manual line-splitting used in the Perl script.
* **LXML for XML:** It uses lxml, a widely used third-party library (not part of Python's standard library) for high-performance XML parsing and XSD validation.
* **Modern Networking:** It uses requests to fetch the schema, which is generally more reliable than the older Perl networking modules.