In [1]:
from networkx.algorithms.minors.contraction import identified_nodes

"""
Author: Carlos S Reyna-Blanco
Email: carlos.reynablanco@meduniwien.ac.at
Date: 2025-06-26
Description: Script to retrieve organism taxonomy
Dependencies: requests, xml.etree.ElementTree, pandas, tqdm
"""

import requests
import xml.etree.ElementTree as ET
import pandas as pd
import time
import re
from tqdm import tqdm
import numpy as np
from urllib.parse import quote
import csv


In [320]:
def get_organism_from_uniprot(rep_id):
    """
    Fetch the organism recorded for a UniProt entry from its XML record.

    Parameters
    ----------
    rep_id : str
        Identifier inserted into the UniProt entry URL.

    Returns
    -------
    dict or None
        ``{'common': ..., 'scientific': ..., 'tax_id': ...}`` when the record
        contains a scientific organism name; ``None`` on any HTTP, network or
        parsing failure, or when no scientific name is present.
    """
    url = f"https://www.uniprot.org/uniprot/{rep_id}.xml"
    try:
        # A timeout keeps one stalled connection from hanging a long batch run.
        response = requests.get(url, timeout=10)
        if response.status_code != 200:
            return None
        root = ET.fromstring(response.content)
    except Exception:
        # Deliberate best-effort: callers treat None as "no organism found".
        return None

    ns = {'up': 'http://uniprot.org/uniprot'}
    result = {'common': None, 'scientific': None, 'tax_id': None}

    for name in root.findall('.//up:organism/up:name', namespaces=ns):
        name_type = name.attrib.get('type', '').lower()
        if name_type == 'common':
            result['common'] = name.text
        elif name_type == 'scientific':
            result['scientific'] = name.text

    taxon = root.find('.//up:organism/up:dbReference[@type="NCBI Taxonomy"]', namespaces=ns)
    if taxon is not None:
        result['tax_id'] = taxon.attrib.get('id')

    # An entry without a scientific name is treated as a failed lookup.
    return result if result['scientific'] else None

def lookup_organism(rep):
    """
    Resolve organism information for a representative ID.

    The full ID is tried first. If that fails, the ID is split on '_' (when it
    contains one) or otherwise on ', ', and each non-empty piece is tried in
    order. Pieces equal to an ID that was already queried are skipped, so an ID
    without any separator costs one request instead of two.

    Parameters
    ----------
    rep : str
        Representative ID, or several IDs joined with ', '.

    Returns
    -------
    dict
        Keys 'common', 'scientific' and 'tax_id'; all values are None when the
        input is not a non-empty string or nothing could be resolved.
    """
    not_found = {'common': None, 'scientific': None, 'tax_id': None}

    # Type check first: truth-testing pd.NA would raise.
    if not isinstance(rep, str) or not rep:
        return not_found

    result = get_organism_from_uniprot(rep)
    if result:
        return result

    parts = rep.split("_") if "_" in rep else rep.split(", ")

    already_tried = {rep}
    for candidate in map(str.strip, parts):
        if not candidate or candidate in already_tried:
            continue
        already_tried.add(candidate)
        result = get_organism_from_uniprot(candidate)
        if result:
            return result

    return not_found

# def search_taxid_by_name(scientific_name):
#     """
#     Query NCBI E-utilities for the given scientific name,
#     return the first matching tax_id (or None).
#     """
#     term = quote(str(scientific_name))  # Ensure it's a string
#     url  = f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=taxonomy&retmode=json&term={term}[SCIN]"
#     try:
#         r = requests.get(url, timeout=5)
#         r.raise_for_status()
#         data = r.json()
#         ids  = data.get("esearchresult", {}).get("idlist", [])
#         return ids[0] if ids else None
#     except Exception:
#         return None
def search_taxid_by_name(scientific_name):
    """
    Resolve a scientific name to an NCBI taxonomy ID.

    Strategy:
      1. Query NCBI E-utilities (esearch, taxonomy db, [SCIN] field) with the
         given name.
      2. If that finds nothing, run a UniProt taxonomy search, take the first
         hit, and retry NCBI with that hit's scientificName, then with each of
         its synonyms / otherNames.

    Missing or empty names are never sent to NCBI, and a name that was already
    tried is not queried again.

    Parameters
    ----------
    scientific_name : str
        Organism name to resolve (converted with ``str``).

    Returns
    -------
    str or None
        The first taxonomy ID found, or None when nothing matched or a request
        failed.
    """
    def ncbi_lookup(name):
        term = quote(str(name))
        url = (
            "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
            f"esearch.fcgi?db=taxonomy&retmode=json&term={term}[SCIN]"
        )
        try:
            r = requests.get(url, timeout=5)
            r.raise_for_status()
            ids = r.json().get("esearchresult", {}).get("idlist", [])
            return ids[0] if ids else None
        except Exception:
            # Best-effort: a failed request counts as "no match".
            return None

    if scientific_name is None:
        return None
    query = str(scientific_name).strip()
    if not query:
        return None

    # 1) Try NCBI on the given name
    taxid = ncbi_lookup(query)
    if taxid:
        return taxid
    tried = {query}

    # 2) Fallback to UniProt taxonomy search
    up_url = (
        "https://rest.uniprot.org/taxonomy/search"
        f"?query={quote(query)}&format=json"
    )
    try:
        r = requests.get(up_url, timeout=5)
        r.raise_for_status()
        results = r.json().get("results", [])
    except Exception:
        return None
    if not results:
        return None

    entry = results[0]
    # Primary name first, then synonyms and other names, in that order.
    alternatives = [entry.get("scientificName")]
    alternatives += entry.get("synonyms") or []
    alternatives += entry.get("otherNames") or []

    for alt in alternatives:
        # Without this guard a missing name would be searched as the text "None".
        if not isinstance(alt, str) or not alt.strip() or alt in tried:
            continue
        tried.add(alt)
        taxid = ncbi_lookup(alt)
        if taxid:
            return taxid

    return None


def get_lineage_from_taxid(tax_id):
    """
    Fetch the lineage of a taxon from the UniProt taxonomy endpoint.

    Parameters
    ----------
    tax_id : str or int
        Taxonomy identifier inserted into the request URL.

    Returns
    -------
    dict
        Lower-cased rank -> scientific name for every entry of the response's
        'lineage' list, plus the taxon's own rank, plus a 'common' key
        (the response's commonName, falling back to the 'no rank' lineage
        entry, else ''). Returns {} on a non-200 response or any error.
    """
    url = f"https://rest.uniprot.org/taxonomy/{tax_id}"
    try:
        # Timeout so a single stalled request cannot block a batch run.
        response = requests.get(url, timeout=10)
        if response.status_code != 200:
            return {}

        data = response.json()

        lineage = {}
        for node in data.get('lineage') or []:
            rank = node.get('rank')
            if rank:
                # Later entries with the same rank (e.g. 'no rank') overwrite earlier ones.
                lineage[rank.lower()] = node.get('scientificName')

        # Add the most specific name; a record without its own rank must not
        # discard the lineage that was already collected.
        own_rank = data.get('rank')
        if own_rank:
            lineage[own_rank.lower()] = data.get('scientificName')

        lineage['common'] = data.get('commonName', '') or lineage.get('no rank', '')
        return lineage
    except Exception:
        # Best-effort: callers treat {} as "no lineage available".
        return {}


def extract_rep_id(description):
    """
    Pull the value following 'RepID=' out of a description string.

    The value may contain word characters (letters, digits, underscore),
    hyphens and colons. Returns None for an empty/None description or when no
    'RepID=' token is present.
    """
    if not description:
        return None

    found = re.search(r'RepID=([\w\-:]+)', description)
    return found.group(1) if found else None


def clean_iedb_organism_name(name):
    """
    Reduce an IEDB organism annotation to a single clean organism name.

    Takes the text before the first '&', reads the content of its first
    [...] group, keeps the part before the first ';', drops any parenthesised
    text and strips surrounding quotes/spaces. Returns None for missing input
    or when the first '&' group has no bracketed content.
    """
    if pd.isna(name):
        return None

    leading_group = name.split('&')[0].strip()

    bracketed = re.search(r"\[([^\]]+)\]", leading_group)
    if bracketed is None:
        return None

    first_entry = bracketed.group(1).split(';')[0].strip()
    without_parens = re.sub(r"\s*\(.*?\)", "", first_entry)
    return without_parens.strip("'\" ")

# Apply to each row with fallback to phage_name
def get_organism_complete_name(row):
    """
    Pick the organism name for one row.

    Uses the cleaned 'IEDB_organism_name' when that yields a non-empty value;
    otherwise falls back to the stripped 'phage_name' unless it is missing or
    one of the placeholders 'false', '' or 'nan' (case-insensitive).
    Returns None when neither source is usable.
    """
    cleaned = clean_iedb_organism_name(row['IEDB_organism_name'])
    if cleaned:
        return cleaned

    fallback = row.get('phage_name', None)
    if pd.isna(fallback):
        return None

    fallback_text = str(fallback).strip()
    if fallback_text.lower() in ['false', '', 'nan']:
        return None
    return fallback_text

def collect_valid_ids(row):
    """
    Join the usable IDs found in the UniProt ID columns of a row.

    Columns are read in a fixed order. Strings are stripped and dropped when
    they equal 'false', 'nan' or '' (case-insensitive); other values are kept
    (as stripped text) unless they are missing or the literal False.
    Returns the IDs joined with ', ', or None when none is usable.
    """
    source_columns = ['allergome_uniprot', 'allergen_uniprot', 'iedb_uniprot', 'fummy_uniprot', 'gened_uniprot']
    placeholders = ['false', 'nan', '']

    kept = []
    for column in source_columns:
        raw = row[column]
        if isinstance(raw, str):
            text = raw.strip()
            if text.lower() in placeholders:
                continue
            kept.append(text)
        elif pd.notna(raw) and raw is not False:
            kept.append(str(raw).strip())

    if not kept:
        return None
    return ", ".join(kept)


def get_first_peptide_id_valid(row):
    """
    Return the first usable value in a row.

    Strings are stripped and skipped when they equal '', 'false' or 'nan'
    (case-insensitive). Non-string values are returned as-is unless they are
    missing or the literal False. Returns None when nothing is usable.

    Placeholder strings used to fall through to the non-string branch and were
    returned as if they were valid IDs; they are now skipped, matching
    collect_valid_ids.
    """
    for val in row:
        if isinstance(val, str):
            cleaned = val.strip()
            if cleaned.lower() not in ('', 'false', 'nan'):
                return cleaned
        elif pd.notna(val) and val is not False:
            return val
    return None

def extract_taxid(desc):
    """Return the digits following 'TaxID=' in str(desc), or None if absent."""
    hit = re.search(r'TaxID=(\d+)', str(desc))
    if hit is None:
        return None
    return hit.group(1)

In [59]:
def match_taxid_by_approx_name(name):
    """
    Look up a TaxID by case-insensitive substring match against the labels of
    the module-level ``manual_taxids_agilent`` mapping.

    Returns the TaxID (as str) of the first label contained in ``name``, or
    None when ``name`` is missing or no label matches.
    """
    if pd.isna(name):
        return None
    matches = (
        str(taxid)
        for org_label, taxid in manual_taxids_agilent.items()
        if org_label.lower() in name.lower()
    )
    return next(matches, None)

def clean_prot_name_from_vfg(prot_name):
    """
    Extract the protein ID from a 'VFG...' protein name.

    When prot_name is a string starting with 'VFG' and containing a
    '(gb|...)' section, the text inside that section is returned,
    e.g. "VFG002196(gb|NP_816637) ..." -> "NP_816637".
    Any other input is returned unchanged.
    """
    if not (isinstance(prot_name, str) and prot_name.startswith("VFG")):
        return prot_name

    gb_section = re.search(r'\(gb\|([^\)]+)\)', prot_name)
    return gb_section.group(1) if gb_section else prot_name

def get_taxid_from_protein_id(protein_id):
    """
    Fetch the TaxID of a protein accession (e.g. YP_009321702.1) from the
    NCBI E-utilities esummary endpoint (protein database).

    Parameters
    ----------
    protein_id : str
        Protein accession; values that are not strings, are empty, or contain
        a space are rejected without a request.

    Returns
    -------
    str or None
        The TaxID as a string, or None when the input is rejected, the request
        fails, or the summary carries no taxid. (Previously a missing taxid
        came back as the string "None", which later broke ``int(x)``
        conversion of the TaxID column.)
    """
    if not isinstance(protein_id, str) or not protein_id or " " in protein_id:
        return None

    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
    params = {
        "db": "protein",
        "id": protein_id,
        "retmode": "json"
    }

    try:
        response = requests.get(base_url, params=params, timeout=10)
        response.raise_for_status()
        data = response.json()
        # The summary for the first returned uid holds the taxid.
        uid = next(iter(data["result"]["uids"]))
        taxid = data["result"][uid].get("taxid")
    except Exception:
        # Best-effort: any failure means "no TaxID available".
        return None

    return str(taxid) if taxid else None

def fill_taxid_from_protein(row):
    """
    Return the row's TaxID, looking it up from 'prot_name' when it is missing.

    Missing means None or NaN, the same notion used by
    fill_taxid_from_virus_pattern; the previous ``is None`` check skipped NaN
    values.
    """
    if pd.isna(row["TaxID"]):
        return get_taxid_from_protein_id(row.get("prot_name"))
    return row["TaxID"]

def update_taxid_if_default_and_valid_protein(row):
    """
    Replace a placeholder TaxID ("1", "2" or "131567") with the TaxID looked
    up from 'prot_name'.

    The lookup only happens when prot_name is a non-empty string without
    spaces; if the lookup returns nothing, the original TaxID is kept.
    Rows with any other TaxID are returned untouched.
    """
    current = row["TaxID"]
    if current not in ["1", "2", "131567"]:
        return current

    prot = row.get("prot_name")
    if not (isinstance(prot, str) and prot and " " not in prot):
        return current

    looked_up = get_taxid_from_protein_id(prot)
    return looked_up if looked_up else current


def fill_taxid_from_virus_pattern(row, virus_dict):
    """
    Return the row's TaxID, or derive one from a virus name found in prot_name.

    An existing (non-missing) TaxID is returned unchanged. Otherwise the TaxID
    of the first ``virus_dict`` key that occurs as a substring of 'prot_name'
    is returned; None when prot_name is not a string or nothing matches.
    """
    existing = row["TaxID"]
    if pd.notna(existing):
        return existing

    prot = row.get("prot_name")
    if not isinstance(prot, str):
        return None

    hits = (taxid for virus_name, taxid in virus_dict.items() if virus_name in prot)
    return next(hits, None)

def extract_mnemonic_from_uniref(uniref_func, return_full=False):
    """
    Extract the RepID, or its mnemonic part (e.g. 9FIRM), from a uniref_func
    string. The RepID value may optionally be wrapped in double quotes.

    - RepID without an underscore: the full RepID is always returned.
    - RepID with an underscore: the full RepID when return_full=True,
      otherwise only the text after the first '_'.

    Returns None when the input is not a string or has no RepID.
    """
    if not isinstance(uniref_func, str):
        return None

    found = re.search(r'RepID=("?)([^\s"]+)\1', uniref_func)
    if found is None:
        return None

    rep_id = found.group(2)
    if return_full or "_" not in rep_id:
        return rep_id
    return rep_id.split("_", 1)[1]

def get_taxid_from_mnemonic_uniprot(mnemonic):
    """
    Look up a taxonomic ID from a UniProt mnemonic (e.g. '9FIRM') using the
    UniProt taxonomy search endpoint.

    Parameters
    ----------
    mnemonic : str
        Organism mnemonic; non-string or blank values are rejected without a
        request (previously None was sent as the query 'mnemonic:None').

    Returns
    -------
    str or None
        The 'taxonId' of the first search result as a string, or None when
        there is no result, the result has no taxonId, or the request fails
        (failures are printed). Previously a missing taxonId was returned as
        the string "None".
    """
    if not isinstance(mnemonic, str) or not mnemonic.strip():
        return None

    url = "https://rest.uniprot.org/taxonomy/search"
    params = {
        "query": f"mnemonic:{mnemonic}",
        "format": "json"
    }

    try:
        response = requests.get(url, params=params, timeout=10)
        response.raise_for_status()
        results = response.json().get("results") or []

        if results:
            tax_id = results[0].get("taxonId")
            if tax_id is not None:
                return str(tax_id)
    except Exception as e:
        print(f"[Error] UniProt lookup failed for {mnemonic}: {e}")

    return None

In [340]:
def _normalize_taxid(value):
    """Render a TaxID as text; whole numbers such as 9606.0 or '9606.0' become '9606'."""
    text = str(value).strip()
    whole = re.fullmatch(r'(\d+)(?:\.0+)?', text)
    return whole.group(1) if whole else text


def _lineage_for_taxid(tax_id, lineage_cache):
    """Return the lineage for tax_id, fetching it once and caching the result (including empty ones)."""
    key = _normalize_taxid(tax_id)
    if key not in lineage_cache:
        lineage_cache[key] = get_lineage_from_taxid(key) or {}
    return lineage_cache[key]


def _taxid_for_value(kind, value, resolver, taxid_cache):
    """Resolve value to a TaxID via resolver; only successful lookups are cached, keyed by (kind, value)."""
    key = (kind, value)
    if key not in taxid_cache:
        found = resolver(value)
        if not found:
            return None
        taxid_cache[key] = found
    return taxid_cache[key]


def annotate_taxonomy(
    df,
    organism_col=None,
    rep_id_col=None,
    prot_id_col=None,
    taxid_col=None,
    method_priority=("organism", "rep_id", "prot_id", "tax_id"),  # explicit order
    outfile_path="taxonomy_output.csv",
    max_rows=None
):
    """
    Annotate each row of ``df`` with a taxonomic lineage and stream the result
    to a CSV file.

    For every row the methods in ``method_priority`` are tried in order until
    one yields a non-empty lineage:

    - "organism": resolve the name in ``organism_col`` with search_taxid_by_name
    - "rep_id":   resolve the ID in ``rep_id_col`` with lookup_organism
                  (also records its common name)
    - "prot_id":  resolve the accession in ``prot_id_col`` with
                  get_taxid_from_protein_id
    - "tax_id":   use the value in ``taxid_col`` directly

    Methods whose column is not configured, not present in the row, or holds a
    missing value are skipped, as are unknown method names.

    Parameters
    ----------
    df : pandas.DataFrame
        Input rows; the index value is written as 'peptide_name'.
    organism_col, rep_id_col, prot_id_col, taxid_col : str, optional
        Column names feeding the respective method.
    method_priority : sequence of str
        Order in which the methods are tried.
    outfile_path : str
        Destination CSV (overwritten).
    max_rows : int, optional
        When truthy, only the first ``max_rows`` rows are processed.

    Returns
    -------
    None
        Rows are written (and flushed) one at a time; rows for which no field
        could be filled are not written.

    Notes
    -----
    Name/accession -> TaxID lookups and TaxID -> lineage lookups use separate
    caches, so a name can never collide with a TaxID key. TaxIDs are
    normalised to digit strings before use, so a float column (9606.0) does
    not end up in the request URL as '9606.0'.
    """
    lineage_fields = ['domain', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species', 'common']
    lineage_cache = {}
    taxid_cache = {}

    # Map method name to column name
    method_cols = {
        "organism": organism_col,
        "rep_id": rep_id_col,
        "prot_id": prot_id_col,
        "tax_id": taxid_col
    }

    if max_rows:
        df = df.head(max_rows)

    with open(outfile_path, "w", newline="", encoding="utf-8") as out_f:
        writer = csv.DictWriter(
            out_f,
            fieldnames=["peptide_name"] + lineage_fields
        )
        writer.writeheader()

        for idx, row in tqdm(df.iterrows(), total=len(df), desc="Fetching taxonomy"):
            row_data = dict.fromkeys(lineage_fields, "")
            lineage = {}

            # ----- Prioritization logic -----
            for method in method_priority:
                col = method_cols.get(method)
                if not col or col not in row:
                    continue
                val = row[col]
                if pd.isna(val):
                    continue

                tax_id = None
                if method == "organism":
                    if isinstance(val, str) and val.strip():
                        tax_id = _taxid_for_value("organism", val, search_taxid_by_name, taxid_cache)
                elif method == "rep_id":
                    rep_data = lookup_organism(val)
                    row_data["common"] = rep_data.get("common") or ""
                    tax_id = rep_data.get("tax_id") or None
                elif method == "prot_id":
                    tax_id = _taxid_for_value("prot_id", val, get_taxid_from_protein_id, taxid_cache)
                elif method == "tax_id":
                    tax_id = val

                if tax_id is not None:
                    lineage = _lineage_for_taxid(tax_id, lineage_cache)

                if lineage:
                    break

            # Write output
            for key in lineage_fields:
                if key in lineage:
                    row_data[key] = lineage[key]

            if not any(row_data.values()):
                continue

            writer.writerow({"peptide_name": idx, **row_data})
            # Flush per row so partial results survive an interrupted run.
            out_f.flush()

In [None]:
# Smoke test: resolve a single UniProt accession through lookup_organism and display the result.
# NOTE(review): the original inline comment labelled R6BPJ2 as "Human hemoglobin" — unverified; confirm what this accession is.
rep_data = lookup_organism("R6BPJ2")
rep_data

# AGILENT LIBRARY

In [None]:
# Quick check on the first 10 Agilent rows: resolve the organism straight
# from the uniref IDs (one lookup_organism call per row).
df = (
    pd.read_csv("agilent_library_with_info_extended.csv", index_col=0)
    .head(10)
)
df['uniref'].apply(lookup_organism)

In [None]:
# Load the Agilent library metadata and keep only the columns used for taxonomy lookups.
df = pd.read_pickle("/home/creyna/Vogl-lab_Projects_git/HCC_MUW_analysis/Data/agilent_library_with_info.pkl")
# .copy() so the column assignment below modifies an independent frame
# instead of a slice of the loaded one (avoids SettingWithCopyWarning).
df = df[["uniref", "uniref_func", "Organism_complete_name","IEDB_organism_name"]].copy()
# Remove the prefix only if it's present at the start
df["uniref"] = df["uniref"].str.replace(r"^UniRef90_", "", regex=True)
# Show a preview rather than dumping the whole frame into the cell output.
df.head()

In [None]:
# Annotate the first 10 Agilent peptides: resolve via the uniref ID first and
# fall back to the organism name. This replaces a hand-written copy of the
# loop already implemented by annotate_taxonomy; the copy reported a progress
# total for the full frame while iterating only 10 rows and filled a name
# cache it never read.
# Differences from the old loop: if the ID resolves but its lineage comes back
# empty, the organism name is now tried as well, and rows with no filled field
# are not written.
annotate_taxonomy(
    df=df.head(10),
    rep_id_col="uniref",
    organism_col="Organism_complete_name",
    method_priority=("rep_id", "organism"),
    outfile_path="/home/creyna/Vogl-lab_Projects_git/HCC_MUW_analysis/Data/2022_agilent_library_with_info_extended_annotation.csv",
)

# TWIST Library

In [None]:
# Load the TWIST library table that already contains the cleaned organism names and the
# combined rep_id column. The commented-out lines below record how that CSV was produced
# (clean names via get_organism_complete_name, then rep_id via collect_valid_ids);
# only the final read_csv is executed.
#df = pd.read_csv("twist_library_with_info_extended.csv", index_col=0).head(10)
#df = pd.read_csv("/home/creyna/Vogl-lab_Projects_git/HCC_MUW_analysis/Data/twist_library_with_info.csv", index_col=0, low_memory=False)
#df['allergome_uniprot'].apply(lookup_organism)

# Apply cleaning function
#df['Organism_complete_name'] = df.apply(get_organism_complete_name, axis=1)
#df.to_csv("/home/creyna/Vogl-lab_Projects_git/HCC_MUW_analysis/Data/twist_library_with_info_cleanNames.csv")
#phage_name
# Columns in priority order
#rep_cols = ['allergome_uniprot', 'allergen_uniprot', 'iedb_uniprot', 'fummy_uniprot', 'gened_uniprot']

# Apply across the prioritized columns
#df['rep_id'] = df.apply(collect_valid_ids, axis=1)
#df.to_csv("/home/creyna/Vogl-lab_Projects_git/HCC_MUW_analysis/Data/twist_library_with_info_cleanNames_repIDs.csv")
df = pd.read_csv("/home/creyna/Vogl-lab_Projects_git/HCC_MUW_analysis/Data/twist_library_with_info_cleanNames_repIDs.csv", index_col=0, low_memory=False)
#df.iloc[:,[3,58,59]].to_csv("/home/creyna/Vogl-lab_Projects_git/HCC_MUW_analysis/Data/twist_library_with_important_info_cleanNames_repIDs.csv")

In [None]:
# Annotate the first 10 TWIST peptides: resolve via rep_id first and fall back
# to the organism name. This replaces a hand-written copy of the loop already
# implemented by annotate_taxonomy; the copy reported a progress total for the
# full frame while iterating only 10 rows and filled a name cache it never read.
# Differences from the old loop: if the ID resolves but its lineage comes back
# empty, the organism name is now tried as well, and rows with no filled field
# are not written.
annotate_taxonomy(
    df=df.head(10),
    rep_id_col="rep_id",
    organism_col="Organism_complete_name",
    method_priority=("rep_id", "organism"),
    outfile_path="/home/creyna/Vogl-lab_Projects_git/HCC_MUW_analysis/Data/2_twist_library_with_info_extended_annotation.csv",
)

# CORONA

In [None]:
# Load the corona library, keep the three columns needed for annotation under
# the column names the annotation cells expect, and save that reduced table.
df = (
    pd.read_csv("/home/creyna/Vogl-lab_Projects_git/HCC_MUW_analysis/Data/corona2_library_with_info.csv", index_col=0, low_memory=False)
    [['Description', 'Uniprot', 'virus_name']]
    .rename(columns={'Uniprot': 'rep_id', 'virus_name': 'Organism_complete_name'})
)
df.to_csv("/home/creyna/Vogl-lab_Projects_git/HCC_MUW_analysis/Data/corona2_library_with_important_info.csv")

In [None]:
# Annotate the first 10 corona peptides: resolve via rep_id first and fall back
# to the organism name. This replaces a hand-written copy of the loop already
# implemented by annotate_taxonomy; the copy reported a progress total for the
# full frame while iterating only 10 rows and filled a name cache it never read.
# Differences from the old loop: if the ID resolves but its lineage comes back
# empty, the organism name is now tried as well, and rows with no filled field
# are not written.
annotate_taxonomy(
    df=df.head(10),
    rep_id_col="rep_id",
    organism_col="Organism_complete_name",
    method_priority=("rep_id", "organism"),
    outfile_path="/home/creyna/Vogl-lab_Projects_git/HCC_MUW_analysis/Data/2_corona2_library_with_info_extended_annotation.csv",
)

# Automated lineage annotation

In [None]:
# Run the automated annotation on the first 500 rows: organism name first, TaxID column as fallback.
# NOTE(review): this call needs `df` to have 'IEDB_organism_name' and 'TaxID' columns. The corona
# frame loaded just above has neither, and annotate_taxonomy silently skips methods whose column is
# absent — so on a top-to-bottom run this writes only the header. The cells that build those columns
# appear further down in this notebook; confirm the intended execution order.
annotate_taxonomy(
    df=df,
    organism_col="IEDB_organism_name",
    taxid_col="TaxID",
    #rep_id_col="rep_id",
    method_priority=["organism", "tax_id"],
    #prioritize="organism",
    outfile_path="/home/creyna/Vogl-lab_Projects_git/HCC_MUW_analysis/Data/output_organism_first.csv",
    max_rows=500
)

# Curate Agilent using Protein Accession or other name fields

# Agilent missing

In [None]:
# Full Agilent library metadata; later cells merge its 'prot_name' column into `df` and add a 'Description' column to it.
# NOTE(review): read_pickle can execute arbitrary code while loading — only open pickle files from a trusted source.
df_all = pd.read_pickle("/home/creyna/Vogl-lab_Projects_git/HCC_MUW_analysis/Data/agilent_library_with_info.pkl")

## Recover TaxID from other sources

In [None]:
# Hand-curated organism name -> TaxID mapping (TaxIDs kept as strings).
# Used below to set TaxID for rows whose 'Organism_complete_name' equals one of these keys,
# and by match_taxid_by_approx_name for case-insensitive substring matching.
# NOTE(review): values are presumably NCBI taxonomy IDs — verify against the source they were taken from.
manual_taxids_agilent = {"Bacteroides dorei CL03T12C01":"997877",
 "Lactobacillus reuteri DSM 20016": "557436",
 "Bacteroides vulgatus ATCC 8482": "435590",
 "Faecalibacterium prausnitzii A2-165": "411483",
 "Roseburia inulinivorans DSM 16841":"622312",
 "Bifidobacterium longum 35624":"205913",
 "Ruminococcus gnavus AGR2154":"1384063",
 "Lactobacillus rhamnosus GG":"568703",
 "Bacteroides uniformis ATCC 8492":"411479",
 "[Eubacterium] hallii DSM 3353":"411469",
 "Lactobacillus plantarum WCFS1":"220668",
 "Bacteroides ovatus":"665954",
 "Dorea longicatena DSM 13814":"411462",
 "Bifidobacterium breve UCC2003":"326426",
 "Streptococcus lutetiensis 033":"1076934",
 "Streptococcus pyogenes MGAS8232":"186103",
 "Salmonella enterica subsp. enterica serovar Enteritidis str. P125109":"550537",
 "Escherichia coli O157:H7 str. Sakai":"386585",
 "Blautia obeum ATCC 29174":"411459",
 "Escherichia coli Nissle 1917":"316435",
 "Escherichia coli CFT073":"199310",
 "Enterococcus faecalis str. MMH594":"1351",
 "Escherichia coli O157:H7 str. EDL933":"155864",
 "Enterococcus faecalis V583":"226185",
 "Shigella flexneri 2a str. 301":"198214"}

# Hand-curated virus name -> TaxID mapping (strings); fill_taxid_from_virus_pattern
# matches these keys as case-sensitive substrings of 'prot_name'.
manual_virus_agilent = {'Enterovirus A':"138948",
 'Enterovirus B':"138949",
 'Enterovirus C':"138950",
 'Human adenovirus C':"129951",
 'Human herpesvirus 1':"10298",
 'Human herpesvirus 3':"10335",
 'Human herpesvirus 4':"10377",
 'Human herpesvirus 5':"10360",
 'Human immunodeficiency virus 1':"11676",
 'Human respiratory syncytial virus':"410078",
 'Influenza A virus':"384505",
 'Norwalk virus':"11983",
 'Rhinovirus B':"147712"}

# Peptides whose annotation has to be re-run.
df = pd.read_csv("/home/creyna/Vogl-lab_Projects_git/Annotations/rerun_missing_agilent_peptide_annotations.csv", index_col=0, low_memory=False)

# Clean the IEDB organism label (the helper falls back to phage_name).
df['IEDB_organism_name'] = df.apply(get_organism_complete_name, axis=1)

# Three names need a manual correction.
df.loc[df['Organism_complete_name'] == "Terrapene carolina triunguis", 'IEDB_organism_name'] = "Terrapene triunguis"
df.loc[df['IEDB_organism_name'] == "Human papillomavirus type 16", 'IEDB_organism_name']  = "Alphapapillomavirus 9"
df.loc[df['Organism_complete_name'] == "Orycteropus afer afer", 'IEDB_organism_name'] = "Orycteropus afer"

# TaxID embedded in the uniref_func text.
df['TaxID'] = df['uniref_func'].apply(extract_taxid)

# Rows whose organism name is in the manual mapping take the curated TaxID
# (this overwrites any TaxID extracted above for those rows).
in_manual_map = df["Organism_complete_name"].isin(manual_taxids_agilent.keys())
df.loc[in_manual_map, "TaxID"] = df.loc[in_manual_map, "Organism_complete_name"].map(manual_taxids_agilent)

# Bring in prot_name as an extra source for the TaxIDs that are still missing,
# reducing 'VFG...(gb|ID)' names to the bare protein ID.
df = df.merge(df_all.loc[:, ["prot_name"]], how="left", left_index=True, right_index=True)
df["prot_name"] = df["prot_name"].apply(clean_prot_name_from_vfg)

# Fill missing TaxIDs from virus names occurring in prot_name.
taxid_missing = df["TaxID"].isna()
df.loc[taxid_missing, "TaxID"] = df.loc[taxid_missing].apply(
    lambda row: fill_taxid_from_virus_pattern(row, manual_virus_agilent),
    axis=1,
)

In [None]:
# Rows still lacking a TaxID: query NCBI with prot_name, but only when it is a
# string without spaces (i.e. looks like a single accession).
df.loc[df["TaxID"].isna(), "TaxID"] = df.loc[df["TaxID"].isna()].apply(
    lambda row: get_taxid_from_protein_id(row["prot_name"])
    if isinstance(row.get("prot_name"), str) and " " not in row["prot_name"]
    else None,
    axis=1,
)

# Rows carrying one of the placeholder TaxIDs ("1", "2", "131567"): try to
# replace it via prot_name when that is a valid accession.
df.loc[df["TaxID"].isin(["1", "2", "131567"]), "TaxID"] = (
    df.loc[df["TaxID"].isin(["1", "2", "131567"])]
    .apply(update_taxid_if_default_and_valid_protein, axis=1)
)

## Try to get TaxID from repID

In [None]:
# Manual mnemonic -> TaxID overrides (strings).
manual_repid_agilent = {"BACVU":"702446", "9BACT":"165179"}

# Less accurate route: for rows that still carry a placeholder TaxID and a
# generic organism name, derive the TaxID from the RepID mnemonic.
# .copy() makes this an independent frame, so the column assignments below do
# not write into a slice of df (SettingWithCopyWarning / lost writes).
toUse_repID_df = df.loc[(df["TaxID"].isin(["1", "2", "131567"])) & (df['Organism_complete_name'].isin(["root", "Bacteria", "cellular organisms"]))].copy()
toUse_repID_df["repID"] = toUse_repID_df["uniref_func"].apply(lambda x: extract_mnemonic_from_uniref(x, return_full=False))
toUse_repID_df['TaxID'] = toUse_repID_df['repID'].apply(get_taxid_from_mnemonic_uniprot)
# Mnemonics present in the manual mapping take the curated TaxID.
toUse_repID_df.loc[toUse_repID_df["repID"].isin(manual_repid_agilent.keys()), "TaxID"] = (toUse_repID_df["repID"].map(manual_repid_agilent))

## Main Curated Agilent to get Lineages

In [None]:
# Fold the mnemonic-derived TaxIDs back into df (only non-missing values overwrite).
# DataFrame.update modifies df itself; the previous df["TaxID"].update(...) acted on
# a column object and does not write back when pandas Copy-on-Write is active.
df.update(toUse_repID_df[["TaxID"]])
# Normalise TaxIDs to digit strings; values that are not numeric (e.g. a stray
# "None" string from a failed lookup) become missing instead of raising in int().
df["TaxID"] = pd.to_numeric(df["TaxID"], errors="coerce").map(lambda x: str(int(x)) if pd.notna(x) else None)
df.to_csv("/home/creyna/Vogl-lab_Projects_git/Annotations/rerun_missing_agilent_peptide_annotations_cleaned_addTaxIDcol.csv")

## No TaxID for these ones

In [None]:
# No TaxID
miss_df =df.loc[df["TaxID"].isna()].merge(df_all, how="left", left_index=True, right_index=True) #df["TaxID"] = df["TaxID"].apply(lambda x: str(int(x)) if pd.notna(x) else None)
miss_df.shape

## These peptides have TaxID 1 or 2, or no lineage, based on UniProt ID

In [None]:
# Peptides previously annotated only as root/Bacteria or left empty: rebuild their TaxID.
df = pd.read_csv("/home/creyna/Vogl-lab_Projects_git/Annotations/rerun_missing_agilent_asBacteriaOrempty.csv", index_col=0, low_memory=False)
df['TaxID'] = df['uniref_func'].apply(extract_taxid)
df =df.merge(df_all.loc[:, ["prot_name"]], how="left", left_index=True, right_index=True)
df["prot_name"] = df["prot_name"].apply(clean_prot_name_from_vfg)
# Replace placeholder TaxIDs ("1", "2", "131567") via prot_name where possible.
df['TaxID'] = df.apply(update_taxid_if_default_and_valid_protein, axis=1)

# Rows still generic: derive the TaxID from the RepID mnemonic.
# .copy() makes this an independent frame, so the column assignments below do
# not write into a slice of df (SettingWithCopyWarning / lost writes).
toUse_repID_df = df.loc[(df["TaxID"].isin(["1", "2", "131567"])) & (df['Organism_complete_name'].isin(["root", "Bacteria", "cellular organisms"]))].copy()
toUse_repID_df["repID"] = toUse_repID_df["uniref_func"].apply(lambda x: extract_mnemonic_from_uniref(x, return_full=False))
toUse_repID_df['TaxID'] = toUse_repID_df['repID'].apply(get_taxid_from_mnemonic_uniprot)
# Mnemonics present in the manual mapping take the curated TaxID.
toUse_repID_df.loc[toUse_repID_df["repID"].isin(manual_repid_agilent.keys()), "TaxID"] = (toUse_repID_df["repID"].map(manual_repid_agilent))
# In case some fail due to network issues: retry the rows that are still missing.
toUse_repID_df.loc[toUse_repID_df["TaxID"].isna(), "TaxID"] = toUse_repID_df.loc[toUse_repID_df["TaxID"].isna()]['repID'].apply(get_taxid_from_mnemonic_uniprot)

# DataFrame.update modifies df itself; the previous df["TaxID"].update(...) acted on
# a column object and does not write back when pandas Copy-on-Write is active.
df.update(toUse_repID_df[["TaxID"]])
# Normalise TaxIDs to digit strings; non-numeric values become missing instead of raising in int().
df["TaxID"] = pd.to_numeric(df["TaxID"], errors="coerce").map(lambda x: str(int(x)) if pd.notna(x) else None)
df.to_csv("/home/creyna/Vogl-lab_Projects_git/Annotations/rerun_missing_agilent_asBacteriaOrempty_cleaned_addTaxIDcol.csv")

## Final Combined Curated Agilent to get Lineages

257 peptides do not have enough information to recover any lineage info.

In [None]:
# Stack the previously saved main curated table on top of the frame curated in the
# cell above, and write the combined table to disk.
pd.concat(
    [
        pd.read_csv(
            "/home/creyna/Vogl-lab_Projects_git/Annotations/rerun_missing_agilent_peptide_annotations_cleaned_addTaxIDcol.csv",
            index_col=0,
            low_memory=False,
        ),
        df,
    ],
    axis=0,
).to_csv(
    "/home/creyna/Vogl-lab_Projects_git/Annotations/rerun_missing_agilent_combined_peptide_annotations_cleaned_addTaxIDcol.csv"
)

### Clean the full Agilent metadata and add the new final Agilent annotations

In [None]:
def clean_uniref_description(val):
    """Return the free-text part of a UniRef annotation string.

    Everything from the first whitespace-preceded ``n=``, ``Tax=``, ``RepID=`` or
    ``TaxID=`` field onwards is discarded and the remainder is stripped.
    Missing values (as judged by ``pd.isna``) yield ``None``.
    """
    if not pd.isna(val):
        # Only the text before the first key=value field is of interest.
        description = re.split(r'\s+(?:n=|Tax=|RepID=|TaxID=)', val, maxsplit=1)[0]
        return description.strip()
    return None

# Derive a clean 'Description' column from the raw UniRef annotation string.
df_all["Description"] = df_all["uniref_func"].apply(clean_uniref_description)
# `.copy()` so the edits below act on an independent frame rather than on a column
# slice of df_all (avoids SettingWithCopyWarning).
tmp_all = df_all[["aa_seq", "pos", "len_seq","full name","Description","is_IEDB_or_cntrl","is_auto","is_infect","is_EBV","is_toxin","is_PNP","is_EM","is_MPA","is_patho","is_probio","is_IgA","is_flagellum","signalp6_slow","is_topgraph_new"]].copy()
# Drop everything from the first "(" onwards in the peptide sequence.
tmp_all["aa_seq"] = tmp_all["aa_seq"].str.replace(r"\(.*", "", regex=True).str.strip()
bool_cols = ["is_flagellum", "signalp6_slow", "is_topgraph_new"]
# NOTE(review): astype(bool) maps NaN and every non-empty string (including "False")
# to True — confirm these columns only hold real booleans / 0-1 values.
tmp_all[bool_cols] = tmp_all[bool_cols].astype(bool)
# Join the selected metadata with the lineage annotation table (index-aligned) and save.
tmp_annot = pd.read_csv("/home/creyna/Vogl-lab_Projects_git/Annotations/agilent_completeAnnotation_library_with_lineages.csv", index_col=0, low_memory=False)
pd.merge(tmp_all, tmp_annot, how="left", left_index=True, right_index=True).to_csv("/home/creyna/Vogl-lab_Projects_git/Annotations/agilent_library_with_lineages_important_info.csv")

# Curate Corona Library

In [None]:
# Every entry whose organism name contains "pdb|" is SARS-CoV-2 (TaxID 2697049).
# "Severe acute respiratory syndrome coronavirus 2" is SARS-CoV-2 (TaxID 2697049).

In [50]:
def clean_and_patch_df(df):
    """Patch SARS-CoV-2 organism info and tidy the protein id / name columns.

    The frame is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Organism_complete_name' and 'prot_id'. 'TaxID' is created
        (all missing) when absent; 'prot_name' is cleaned only when present.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.

    Raises
    ------
    KeyError
        If 'Organism_complete_name' or 'prot_id' is missing.
    """
    # Step 0: Ensure required columns exist
    if "TaxID" not in df.columns:
        df["TaxID"] = pd.NA

    # Step 1: Set organism name and TaxID for SARS-CoV-2 and pdb| cases
    organism = df['Organism_complete_name']
    sars_mask = organism.str.contains("Severe acute respiratory syndrome coronavirus 2", na=False)
    pdb_mask = organism.str.contains("pdb\\|", na=False)
    combined_mask = sars_mask | pdb_mask

    df.loc[combined_mask, "Organism_complete_name"] = "Severe acute respiratory syndrome coronavirus 2"
    df.loc[combined_mask, "TaxID"] = "2697049"

    # Step 2: Clean `prot_id` → keep only the first entry before `&`, then the leading
    # run of [A-Z0-9_] plus an optional ".<version>"; anything after that is dropped
    # (e.g. "YP_009724390.1_NSP13" → "YP_009724390.1").
    raw_ids = df['prot_id']
    cleaned_ids = (
        raw_ids
        .astype(str)
        .str.split('&').str[0]
        .str.extract(r"^([A-Z0-9_]+(?:\.\d+)?)")[0]
    )
    # astype(str) stringifies missing values (None → "None", which the regex would
    # reduce to "N"); keep originally-missing ids missing.
    df['prot_id'] = cleaned_ids.where(raw_ids.notna())

    # Step 3: Clean `prot_name` → keep only first part if multiple descriptions separated by '&'
    if 'prot_name' in df.columns:
        raw_names = df['prot_name']
        cleaned_names = raw_names.astype(str).str.split('&').str[0].str.strip()
        # Without this guard a missing name would be stored as the literal string "nan".
        df['prot_name'] = cleaned_names.where(raw_names.notna())

    return df

# Load the corona peptides lacking annotations together with the full corona library.
df = pd.read_csv("/home/creyna/Vogl-lab_Projects_git/Annotations/missing_corona_peptide_annotations.csv", index_col=0, low_memory=False)
df_all = pd.read_csv("/home/creyna/Vogl-lab_Projects_git/Annotations/corona2_library_with_info.csv", index_col=0, low_memory=False)
# Attach protein id / name from the library (index-aligned), patch and clean, then save.
df = clean_and_patch_df(
    df.merge(
        df_all[["prot_id", "prot_name"]],
        how="left",
        left_index=True,
        right_index=True,
    )
)
df.to_csv("/home/creyna/Vogl-lab_Projects_git/Annotations/missing_corona_peptide_annotations_cleaned_addTaxIDcol.csv")

In [92]:
# The library names its organism column 'virus_name'; align it with the column name
# that clean_and_patch_df reads.
df_all = df_all.rename(columns={"virus_name": "Organism_complete_name"})
df_all = clean_and_patch_df(df_all)
# `.copy()` so the edits below act on an independent frame rather than on a column
# slice of df_all (avoids SettingWithCopyWarning on the assignment and rename).
tmp_all = df_all[["aa_seq","pos","len_seq","full name","prot_name"]].copy()
# Drop everything from the first "(" onwards in the peptide sequence.
tmp_all["aa_seq"] = tmp_all["aa_seq"].str.replace(r"\(.*", "", regex=True).str.strip()
tmp_all = tmp_all.rename(columns={"prot_name": "Description"})
# Join the selected metadata with the lineage annotation table (index-aligned) and save.
tmp_annot = pd.read_csv("/home/creyna/Vogl-lab_Projects_git/Annotations/corona_completeAnnotation_library_with_lineages.csv", index_col=0, low_memory=False)
pd.merge(tmp_all, tmp_annot, how="left", left_index=True, right_index=True).to_csv("/home/creyna/Vogl-lab_Projects_git/Annotations/corona2_library_with_lineages_important_info.csv")

# Curate TWIST

In [415]:
# Manual name → TaxID table (TaxIDs kept as strings). Later in this notebook the keys
# are searched as regex-escaped substrings in 'full name' and, for the peptides listed
# in to_curate_twist.csv, in 'Organism_complete_name'.
# Despite the variable name, several keys are not viruses (e.g. 'Mycoplasma penetrans',
# 'Haemophilus influenzae ...', 'Entamoeba histolytica YS-27', 'SAPK4 (MAPK13)').
# NOTE(review): 'Mycoplasma penetrans' maps to "272633" here but to "28227" in
# mapping_twist_bac below — confirm which TaxID is intended.
manual_twist_virus = {'Enterovirus A':"138948",
 'Enterovirus B':"138949",
 'Enterovirus C':"138950",
 'Human adenovirus C':"129951",
 'Human herpesvirus 1':"10298",
 'Human herpesvirus 3':"10335",
 'Human herpesvirus 4':"10377",
 'Human herpesvirus 5':"10360",
 'Human immunodeficiency virus 1':"11676",
 'Human respiratory syncytial virus':"410078",
 'Influenza A virus':"384505",
 'Norwalk virus':"11983",
 'Rhinovirus B':"147712",
 'Bundibugyo ebolavirus':"565995",
 'Sudan ebolavirus':"186540",
 'SAPK4 (MAPK13)':"9986",
 'Borna disease virus Giessen strain He/80':"1714621",
 'Mycoplasma penetrans':"272633",
 'Haemophilus influenzae NTHi 1479':"375177",
 'Chikungunya virus MY/08/065':"37124",
 'Porphyromonas gingivalis OMZ 409':"242619",
 'Haemophilus influenzae Subtype 1H':"727",
 'Human rhinovirus A89':"12132",
 'Entamoeba histolytica YS-27':"5759"}

# 'full name' prefix → TaxID for influenza peptides; matched only at the start of the name.
# NOTE(review): "H3N2 B" and "B B/" share the TaxID "11520" while "H3N2 A" uses "11320" —
# confirm the "H3N2 B" entry is intended.
mapping_twist_flu={"H3N2 A":"11320", "H3N2 B":"11520", "H1N1":"260815", "B B/":"11520"}

# Organism-name substring → TaxID, applied to rows that still have no TaxID.
mapping_twist_bac={"Mycoplasma pneumoniae":"2104", "Mycoplasma penetrans":"28227"}

# Exact 'allergome_name' (after its parenthetical suffix is stripped) → TaxID.
manual_twist_allergen_name_mapping ={'Hom s Elastin':"9606", 'Dan re PGM':"117571", 'Mus a 5':"214687",
       'Sola l PME':"4081", 'Str py Streptokinase':"1314", 'Hom s TM':"9606", 'Dro pp 7':"46245",
       'Fel d 2':"9685", 'Ory la 2':"8090", 'Sola l 4':"4081", 'Hom s Iduronidase':"9606",
       'Can f Feld1-like':"9615", 'Nas vi 12':"7425", 'Mala s 4':"1230383", 'Per a 7':"6978", 'Gal d 3':"9031",
       'Tri a 44':"4565", 'Mel g 3':"9103", 'Dan re 2':"117571", 'Bra di 5':"15368", 'Asp aw 14':"1033177",
       'Sola l 7':"4081", 'Dan re CK':"117571", 'Mus a 3':"214687", 'Tri a 42':"4565", 'Can f 7':"9615",
       'Sola l TLP':"4081", 'Sola l 5':"4081", 'Tri a 45':"4565", 'Tri a 41':"4565", 'Bomb m 1':"7091",
       'Mus a 1':"214687", 'Bra di 2':"15368", 'Tak ru 2':"31033", 'Ore ni NDKB':"8128", 'Can f 6':"9615",
       'Asp fl 2':"332952", 'Dan re NDKB':"117571", 'Acy pi 7':"133076", 'Asp aw 3':"1033177", 'Nas vi AK':"7425",
       'Gly m 8':"3847", 'Gly m 5':"3847", 'Can f 1':"9615", 'Can f 8':"9615", 'Str dy Streptokinase':"370554",
       'Equ c 1':"9796", 'Mala s 6':"1230383", 'Mus a 4':"214687", 'Mus a 2':"214687", 'Can f 5':"9615", 'Hor v 37':"4513",
       'Hom s PSA':"9606", 'Bra di 7':"15368", 'Hor v 7k-LTP':"4513", 'Fel d 1':"9685", 'Mala s 8':"1230383",
       'Dan re 1':"117571", 'Hom s Glucocerebrosidase':"9606", 'Mel g 2':"9103", 'Sola l SOD':"4081"}

def first_accession(x):
    """Return the first accession of an '&'- or ','-separated string.

    The separator may be followed by whitespace. Non-string values are returned
    unchanged.
    """
    if not isinstance(x, str):
        return x
    # Only the piece before the first separator is needed.
    leading_piece = re.split(r'[&,]\s*', x, maxsplit=1)[0]
    return leading_piece

# Load the TWIST peptides lacking annotations and the full TWIST library, then attach
# the library's accession / name / comment columns (index-aligned left join).
# The TaxID assignments below are order-dependent: a later rule overwrites an earlier one.
df = pd.read_csv("/home/creyna/Vogl-lab_Projects_git/Annotations/missing_twist_peptide_annotations.csv", index_col=0, low_memory=False)
df_all = pd.read_csv("/home/creyna/Vogl-lab_Projects_git/Annotations/twist_library_with_info.csv", index_col=0, low_memory=False)
df = pd.merge(df, df_all[["allergenonline_ncbi", "allergome_name", "SDAP_name","gened_Accession","gened_comments", "IEDB_comments"]], how="left", left_index=True, right_index=True)
# Strip everything from the first "(" (and the whitespace before it) in the allergen names.
df['allergome_name'] = df['allergome_name'].apply(lambda x: re.sub(r"\s*\(.*", "", x) if isinstance(x, str) else x)
df['SDAP_name'] = df['SDAP_name'].apply(lambda x: re.sub(r"\s*\(.*", "", x) if isinstance(x, str) else x)
# Keep only the first accession when several are listed ('&' or ',' separated).
df["allergenonline_ncbi"] = df["allergenonline_ncbi"].apply(first_accession)
# Special case: this double accession gets a fixed TaxID and its accession is blanked,
# so it is not copied into the protein id below.
mask = df["gened_Accession"] == "LTRA_LACLM&LTRA_LACLC"
df.loc[mask, "TaxID"] = "416870"
df.loc[mask, "gened_Accession"] = pd.NA
# A gened accession is usable when it is non-missing, not the boolean False, and not blank.
valid_mask = (df["gened_Accession"].notna() & (df["gened_Accession"] != False) & (df["gened_Accession"].astype(str).str.strip() != ""))
df.loc[valid_mask, "allergenonline_ncbi"] = df.loc[valid_mask, "gened_Accession"] # Update 'allergenonline_ncbi' values with those from 'gened_Accession'
df.rename(columns={"allergenonline_ncbi": "prot_id"}, inplace=True)
# Exact allergome names found in the manual table get their TaxID from it.
mask = df['allergome_name'].isin(manual_twist_allergen_name_mapping)
df.loc[mask, 'TaxID'] = df.loc[mask, 'allergome_name'].map(manual_twist_allergen_name_mapping)

# Manual one-off mapping for a single representative ID
df.loc[df['rep_id'] == "A0A0A3ZKQ1", 'TaxID'] = "556"

# Case-insensitive substring rules on 'full name'
mask_deep = df['full name'].str.contains('Deep-sea thermophilic phage D6E', case=False, na=False)
df.loc[mask_deep, 'TaxID'] = '749413'
mask_deep = df['full name'].str.contains('Human serum albumin', case=False, na=False)
df.loc[mask_deep, 'TaxID'] = '9606'
# Rows whose comments mention Lactococcus lactis → TaxID = 1358
mask_lac = df['gened_comments'].str.contains('Lactococcus lactis', case=False, na=False)
df.loc[mask_lac, 'TaxID'] = '1358'
# Rows whose comments contain "From Anat" as whole words → TaxID = 9606
mask_anat = df['IEDB_comments'].str.contains(r'\bFrom Anat\b', case=False, na=False)
df.loc[mask_anat, 'TaxID'] = '9606'

# Map influenza peptides to a TaxID: the pattern is anchored, so a key only counts when
# 'full name' starts with it. The matched key is held in a temporary 'prefix' column.
pattern = r"^(" + "|".join(re.escape(k) for k in mapping_twist_flu) + r")"
mask = df["full name"].str.match(pattern, na=False)
df.loc[mask, "prefix"] = df.loc[mask, "full name"].str.extract(pattern)[0]
df.loc[mask, "TaxID"] = df.loc[mask, "prefix"].map(mapping_twist_flu)
df.drop(columns="prefix", inplace=True)

# Map the remaining manual organisms: unanchored pattern, so the first key occurring
# anywhere in 'full name' decides the TaxID.
pattern = r'(' + '|'.join(re.escape(k) for k in manual_twist_virus) + r')'
matched = df['full name'].str.extract(pattern, expand=False)
mask = matched.notna()
df.loc[mask, 'TaxID'] = matched.loc[mask].map(manual_twist_virus)

# Turn the literal string "False" in the id/name columns into a missing value, then
# represent every missing value in the frame as pd.NA.
cols = ["prot_id", "allergome_name", "SDAP_name"]
df[cols] = df[cols].replace("False", pd.NA)
df = df.where(~df.isna(), pd.NA)

In [416]:
# Allergen-name prefix → TaxID (as string). Every key ends with a space and is compared
# against the start of 'SDAP_name' by the loop that follows this table.
manual_twist_abbv_allergen_name_mapping = {'Hom s ':"9606", 'Dan re ':"117571", 'Mus a ':"214687",
       'Sola l ':"4081", 'Str py ':"1314", 'Dro pp ':"46245",
       'Fel d ':"9685", 'Ory la ':"8090",
       'Can f ':"9615", 'Nas vi ':"7425", 'Mala s ':"1230383", 'Per a ':"6978", 'Gal d ':"9031",
       'Tri a ':"4565", 'Mel g ':"9103", 'Bra di ':"15368", 'Asp aw ':"1033177", 'Bomb m ':"7091",
       'Tak ru ':"31033", 'Ore ni ':"8128",
       'Asp fl ':"332952", 'Acy pi ':"133076",
       'Gly m ':"3847", 'Str dy ':"370554",
       'Equ c ':"9796", 'Hor v ':"4513",
       'Act d ':"3627", 'Aed a ':"7159", 'Aln g ':"3517", 'Alt a ':"5599",
       'Api m ':"7460", 'Ara h ':"3818", 'Ara t ':"3702", 'Asp f ':"746128",
       'Asp o ':"5062", 'Bet v ':"3505",
       'Blo t ':"40697",
       'Bos d ':"9913", 'Bra n ':"3711",
       'Cand a ':"5476", 'Cap a ':"4072", 'Cas s ':"21020", 'Chi t ':"7155",
       'Cla h ':"29918", 'Cor a ':"13451", 'Cup a ':"257620", 'Cur l ':"5503", 'Den n ':"51109",
       'Der f ':"6954", 'Der p ':"6956",
       'Dic v ':"29172", 'Har a ':"115357", 'Hev b ':"3981",
       'Hom a ':"6706", 'Jug r ':"51240", 'Lol p ':"4522", 'Lyc e ':"357543",
       'Mal d ':"3750", 'Ole e ':"4146",
       'Ory s ':"4530", 'Pha v ':"3885", 'Phl p ':"15957", 'Pol e ':"27506",
       'Pol m ':"91422", 'Pru du ':"3755", 'Sin a ':"3728", 'Sol g ':"121131",
       'Sol s ':"176597", 'Sola t ':"4113", 'Sor h ':"4560", 'Tri t ':"34387", 'Zea m ':"4577"}

# A name column counts as "present" when it is non-missing, is not the literal
# string "False", and is not blank.
has_allergome = (
    df["allergome_name"].notna()
    & (df["allergome_name"] != "False")
    & (df["allergome_name"].str.strip() != "")
)
has_sdap = (
    df["SDAP_name"].notna()
    & (df["SDAP_name"] != "False")
    & (df["SDAP_name"].str.strip() != "")
)
# Target rows: only an SDAP name is available — no allergome name, no protein id,
# and no TaxID yet. The mask is computed once, before any TaxID is filled in.
mask = ~has_allergome & df["prot_id"].isna() & has_sdap & df['TaxID'].isna()

# Assign the TaxID of whichever prefix the SDAP name starts with.
for prefix, taxid in manual_twist_abbv_allergen_name_mapping.items():
    sel = mask & df["SDAP_name"].str.startswith(prefix, na=False)
    df.loc[sel, "TaxID"] = taxid

In [417]:
def extract_wp_id(text):
    """Find the first ``WP_<digits>`` accession (optionally with ``.<version>``) in *text*.

    Returns the accession string, or None when *text* is not a string or holds
    no such accession.
    """
    if isinstance(text, str):
        found = re.search(r'(WP_[0-9]+(?:\.[0-9]+)?)', text)
        if found:
            return found.group(1)
    return None

# Rows that have neither organism, protein id nor TaxID, but do carry gened comments:
# pull the first WP_… accession out of the comment text and use it as the protein id.
mask_need = (
    df['gened_comments'].notna()
    & df['TaxID'].isna()
    & df['prot_id'].isna()
    & df['Organism_complete_name'].isna()
)
df.loc[mask_need, 'prot_id'] = df.loc[mask_need, 'gened_comments'].map(extract_wp_id)

# Peptides flagged for manual curation because of their odd organism names.
to_curate_df = pd.read_csv("/home/creyna/Vogl-lab_Projects_git/Annotations/to_curate_twist.csv", index_col=0, low_memory=False)
# Membership in the curation list does not change inside the loop, so compute it once.
in_curation_list = df.index.isin(to_curate_df.index)
for virus, taxid in manual_twist_virus.items():
    # Keys are matched literally (escaped), case-insensitively, and only fill empty TaxIDs.
    virus = re.escape(virus)
    name_hit = df['Organism_complete_name'].str.contains(virus, case=False, na=False)
    mask = (in_curation_list & name_hit & df['TaxID'].isna())
    df.loc[mask, 'TaxID'] = taxid

# Same literal, case-insensitive match for the bacteria table, applied to every row
# that still has no TaxID.
for bac, taxid in mapping_twist_bac.items():
    bac = re.escape(bac)
    name_hit = df['Organism_complete_name'].str.contains(bac, case=False, na=False)
    mask = (name_hit & df['TaxID'].isna())
    df.loc[mask, 'TaxID'] = taxid

# Save the curated TWIST table.
df.to_csv("/home/creyna/Vogl-lab_Projects_git/Annotations/missing_twist_peptide_annotations_cleaned_addTaxIDcol.csv")

In [518]:
def clean_full_name(df):
    """Normalise the 'full name' column of *df* in place and return *df*.

    Each non-missing value is converted to ``str`` and run through these steps, in order:

    1. if the text contains ``fummy_name … <bold>``, keep only what lies in between;
    2. drop a leading ``positive|negative (from uniprot|exact):`` prefix;
    3. drop a leading whitespace-free token that ends in ``.<digits>`` (an accession);
    4. keep only the part before the first ``&``;
    5. remove numeric parentheticals such as ``(1/1)``;
    6. remove a bracketed suffix at the end (``[foo]``) or a trailing unclosed ``[…``;
    7. strip surrounding whitespace.

    Missing values stay missing; they are not turned into the literal string "nan".

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'full name' column.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    raw = df['full name']
    fn = raw.astype(str)

    # If “fummy_name … <bold>” exists, extract only the text in between
    fn = fn.str.replace(
        r'.*?fummy_name\s*(.*?)\s*<bold>.*',
        r'\1',
        flags=re.IGNORECASE | re.DOTALL,
        regex=True
    )

    # Strip leading “positive…(from uniprot):” prefix
    fn = fn.str.replace(
        r'^(?:positive|negative)\s*\((?:from uniprot|exact)\):\s*',
        "",
        flags=re.IGNORECASE,
        regex=True
    )

    # Remove *any* leading accession ending in .<digits> plus space
    fn = fn.str.replace(
        r'^\S+\.\d+\s+',
        '',
        regex=True
    )

    # Keep only up to the first ampersand (with or without spaces)
    fn = fn.str.split(r'\s*&\s*', regex=True).str[0]

    # Remove numeric parentheticals like “(1/1)”
    fn = fn.str.replace(r'\s*\(\d+/\d+\)', "", regex=True)

    # **Only** strip a bracketed suffix at end of string (e.g. `[foo]` at end)
    fn = fn.str.replace(r'\s*\[[^\]]+\]$', '', regex=True)

    # Strip any trailing unclosed “[…” at end
    fn = fn.str.replace(r'\s*\[[^\]]*$', '', regex=True)

    # Trim whitespace
    fn = fn.str.strip()

    # astype(str) above stringified missing values ("nan"); restore them so that
    # downstream missing-value checks and coalescing still see them as missing.
    df['full name'] = fn.where(raw.notna())
    return df

# Reload the full TWIST library and normalise its 'full name' column.
df_all = pd.read_csv("/home/creyna/Vogl-lab_Projects_git/Annotations/twist_library_with_info.csv", index_col=0, low_memory=False)
df_all = clean_full_name(df_all)
# Names starting with "allergen_name" are blanked out so they cannot become a Description.
mask = df_all['full name'].astype(str).str.startswith("allergen_name", na=False)
df_all.loc[mask, 'full name'] = pd.NA

# 1) Replace the literal "False"/"FALSE", "Unassigned…" and empty strings in the source
#    columns with real missing values. The list order is the coalescing priority used below.
source_cols = [
    'allergome_name',
    'allergen_name',
    'allergenonline_name',
    'SDAP_name',
    'iedb_name',
    'full name',
    'phage_name'
]

df_all[source_cols] = (
    df_all[source_cols]
      .replace("False", pd.NA)     # turn the string "False" into <NA>
      .replace("FALSE", pd.NA)     # turn the string "FALSE" into <NA>
      .replace(r"^Unassigned.*", pd.NA, regex=True) # anything starting with Unassigned
      .replace("",     pd.NA)     # turn empty strings into <NA>
)

# 2) Coalesce (take the first non-null) across those columns into Description
df_all['Description'] = (
    df_all[source_cols]
      .bfill(axis=1)              # back-fill across columns
      .iloc[:, 0]                 # then take the first column
)

# 3) Clean off any " (…)" and whitespace
df_all['Description'] = df_all['Description'].apply(
    lambda x: re.sub(r"\s*\(.*", "", x).strip() if isinstance(x, str) else x
)

# Preview of the cleaned source columns. This line was indented in the original,
# which is an IndentationError at cell top level, and it displayed the whole frame.
df_all[source_cols].head()


In [522]:
# `.copy()` so the edit below acts on an independent frame rather than on a column
# slice of df_all (avoids SettingWithCopyWarning).
tmp_all = df_all[["aa_seq","pos","len_seq","Description", 'is_auto', 'is_infect', 'is_EBV', 'is_phage', 'is_allergens', 'is_influenza']].copy()
# Drop everything from the first "(" onwards in the peptide sequence.
tmp_all["aa_seq"] = tmp_all["aa_seq"].str.replace(r"\(.*", "", regex=True).str.strip()
# Join the selected metadata with the lineage annotation table (index-aligned) and save.
tmp_annot = pd.read_csv("/home/creyna/Vogl-lab_Projects_git/Annotations/twist_completeAnnotation_library_with_lineages.csv", index_col=0, low_memory=False)
pd.merge(tmp_all, tmp_annot, how="left", left_index=True, right_index=True).to_csv("/home/creyna/Vogl-lab_Projects_git/Annotations/twist_library_with_lineages_important_info.csv")

# Concatenate

In [523]:
# Load the three per-library tables written by the sections above
# (first CSV column is used as the index).
agilent, twist, corona = (
    pd.read_csv(csv_path, index_col=0, low_memory=False)
    for csv_path in (
        "/home/creyna/Vogl-lab_Projects_git/Annotations/agilent_library_with_lineages_important_info.csv",
        "/home/creyna/Vogl-lab_Projects_git/Annotations/twist_library_with_lineages_important_info.csv",
        "/home/creyna/Vogl-lab_Projects_git/Annotations/corona2_library_with_lineages_important_info.csv",
    )
)

In [544]:
# Stack the three libraries row-wise, keeping the union of their columns.
combined = pd.concat([agilent, twist, corona], axis=0, join="outer")
cols = list(combined.columns)
# Pull the three TWIST-only flag columns out of the column order ...
for flag_col in ('is_allergens', 'is_influenza', 'is_phage'):
    cols.remove(flag_col)
# ... and put 'is_allergens' back immediately before 'domain'.
# NOTE(review): 'is_influenza' and 'is_phage' are never re-inserted, so they are
# absent from the saved table — confirm this is intended.
idx = cols.index('domain')
cols.insert(idx, 'is_allergens')
combined = combined[cols]
combined.to_csv("/home/creyna/Vogl-lab_Projects_git/Annotations/combined_libraries_with_lineages_important_info.csv")