In [289]:
import xml.etree.ElementTree as ET
from typing import NamedTuple

from collections import Counter

import pandas as pd
import numpy as np
from Bio import SeqIO

In [2]:
# Clark-notation namespace prefix that is prepended to every tag name looked up below.
namespace = "{http://uniprot.org/uniprot}"

# Parse the XML export once and keep both the tree and its root element around.
tree = ET.parse("work/sclerotiniaceae.xml")
root = tree.getroot()


In [3]:
def fmap(func, optional):
    """Apply ``func`` to ``optional``, passing ``None`` straight through.

    Returns ``None`` when ``optional`` is ``None`` (``func`` is not called);
    otherwise returns ``func(optional)``. Only ``None`` is treated as missing,
    so falsy values such as ``0`` or ``""`` are still handed to ``func``.
    """
    return None if optional is None else func(optional)

In [14]:
class UPREntry(NamedTuple):
    accession: str
    orf: str | None
    taxid: int | None
    lineage: str | None
    full_name: str | None
    checksum: str
    seq: str
    
    def as_series(self):
        from pandas import Series
        
        return Series({
            "accession": self.accession,
            "orf": self.orf,
            "taxid": self.taxid,
            "lineage": self.lineage,
            "full_name": self.full_name,
            "checksum": self.checksum,
            "seq": self.seq
        })
    
    @classmethod
    def from_xml(cls, element, namespace="{http://uniprot.org/uniprot}"):
        accession = element.find(f"{namespace}accession").text

        orf = None
        gis = element.find(f"{namespace}gene")
        if gis is not None:
            for gi in gis:
                if gi.attrib.get("type", None) == "ORF":
                    orf = gi.text

        taxid = None
        lineage = None
        organism = element.find(f"{namespace}organism")
        if organism is not None:
            dbr = organism.find(f"{namespace}dbReference")
            if (dbr is not None) and (dbr.attrib.get("type", None) == "NCBI Taxonomy"):
                taxid = int(dbr.attrib.get("id", None))

            lin = organism.find(f"{namespace}lineage")
            lin2 = []
            if lin is not None:
                lin2 = [o.text for o in lin if o.tag == f"{namespace}taxon"]
                
            for n in organism.findall(f"{namespace}name"):
                if n.attrib.get("type", None) == "scientific":
                    lin2.append(n.text)
                    break

            lineage = "|".join(lin2)

        protein = element.find(f"{namespace}protein")
        full_name = fmap(
            lambda x: x.text,
            fmap(
                lambda x: x.find(f"{namespace}fullName"),
                protein.find(f"{namespace}recommendedName")
            )
        )
        
        e = element.find(f"{namespace}sequence")
        checksum = e.attrib.get("checksum", None)
        seq = e.text
        
        return cls(accession, orf, taxid, lineage, full_name, checksum, seq)
    
# Parse every <entry> under the root into a UPREntry and stack the resulting
# Series into one table (one row per entry, one column per UPREntry field).
entries = pd.DataFrame(
    [
        UPREntry.from_xml(entry_el).as_series()
        for entry_el in root.findall(f"{namespace}entry")
    ]
)

In [16]:
# Preview the first rows of the parsed UniProt entries table.
entries.head()

Unnamed: 0,accession,orf,taxid,lineage,full_name,checksum,seq
0,A0A1D9PWC4,sscle_02g017740,665079,Eukaryota|Fungi|Dikarya|Ascomycota|Pezizomycot...,Sulfate adenylyltransferase,DD1F70C6BE0D9B8F,MANSPHGGVLKDLLARDLPRHNELSAEAETLPAIVLTERQLCDLEL...
1,A0A1D9Q7R8,sscle_07g057350,665079,Eukaryota|Fungi|Dikarya|Ascomycota|Pezizomycot...,ATP-dependent DNA helicase PIF1,837D7126F6FE4B3F,MPFGCTTSPALSSLIRGRLAQRWRAHPRFTFRITNVPTRSVLYSSI...
2,A0A1D9QCG4,sscle_09g074140,665079,Eukaryota|Fungi|Dikarya|Ascomycota|Pezizomycot...,E3 ubiquitin-protein ligase,7D2E7175B5AD853C,MASNSRVDSSLPAQPNLRLTIIAADGLYKRDVFRFPDPFAVATLSG...
3,A0A1D9QLW8,sscle_15g103530,665079,Eukaryota|Fungi|Dikarya|Ascomycota|Pezizomycot...,Pentafunctional AROM polypeptide,A24C113D2705D768,MGSTTFENPTRIEILGKEDIIVDFDIWRNFVAEDLLSDLPSSTYVL...
4,A0A384J5D0,BCIN_01g04340,332648,Eukaryota|Fungi|Dikarya|Ascomycota|Pezizomycot...,Sulfate adenylyltransferase,80E63CB177D83D54,MANSPHGGVLKDLLARDLSRHNELATEAETLPAVVLSERQLCDLEL...


In [57]:
# Load the FASTA file into a dict of sequence records; SeqIO.to_dict keys the
# dict by record id by default and raises on duplicate ids.
seqs = SeqIO.to_dict(SeqIO.parse("work/Sscl1980.faa", "fasta"))

In [58]:
# Number of records loaded from the FASTA file.
len(seqs)

11130

In [200]:
# Two-column foldseek cluster table: representative ("cluster") and member.
clusters = pd.read_csv("work/foldseek_results_cluster.tsv", sep="\t", names=["cluster", "member"])

# The second "-"-delimited field of each identifier is used as the accession
# (presumably AlphaFold-style "AF-<accession>-..." names; verify against the input file).
clusters["accession"] = clusters["member"].str.split("-").str[1]
clusters["cluster_accession"] = clusters["cluster"].str.split("-").str[1]

# Left join keeps members that have no matching UniProt entry (their annotation columns are NaN).
clusters = pd.merge(
    clusters,
    entries,
    on="accession",
    how="left"
)

# Strip the lineage prefix down to family level. The prefix is full of "|", which is
# regex alternation, so it must be matched literally: regex=False is passed explicitly
# because the default for `regex` differs between pandas versions (True before 2.0).
clusters["lineage"] = clusters["lineage"].str.replace(
    "Eukaryota|Fungi|Dikarya|Ascomycota|Pezizomycotina|Leotiomycetes|Helotiales|Sclerotiniaceae|",
    "",
    regex=False,
)

# Attach the number of distinct members of each cluster to every row of that cluster.
clusters = pd.merge(
    clusters.groupby("cluster")["member"].nunique().reset_index().rename(columns={"member": "nmembers"}),
    clusters,
    on="cluster"
)

# From here on "cluster" holds the representative's accession instead of the raw identifier.
clusters = clusters.drop(columns=["seq", "cluster", "member"]).rename(columns={"cluster_accession": "cluster"})

clusters = clusters[[
    'cluster', 'nmembers', 'accession',
    'orf', 'taxid', 'lineage', 'full_name', 'checksum'
]]

# Largest clusters first; assignment instead of inplace=True so the step chains cleanly.
clusters = clusters.sort_values(["nmembers", "cluster", "lineage", "accession"], ascending=False)
clusters.to_csv("output/foldseek_clusters.tsv", sep="\t", na_rep="-")

print(clusters.shape)
clusters.head()

(228612, 8)


Unnamed: 0,cluster,nmembers,accession,orf,taxid,lineage,full_name,checksum
225311,W9CLA0,1563,A7F9P6,SS1G_14327,665079.0,Sclerotinia|Sclerotinia sclerotiorum (strain A...,Zn(2)-C6 fungal-type domain-containing protein,B617D367831CF4EA
225850,W9CLA0,1563,A7F9K0,SS1G_14281,665079.0,Sclerotinia|Sclerotinia sclerotiorum (strain A...,,7D60D9F4CAB083FD
224569,W9CLA0,1563,A7F8H5,SS1G_13906,665079.0,Sclerotinia|Sclerotinia sclerotiorum (strain A...,Major facilitator superfamily (MFS) profile do...,5BED6BF162BE5A16
225793,W9CLA0,1563,A7F859,SS1G_13789,665079.0,Sclerotinia|Sclerotinia sclerotiorum (strain A...,Zn(2)-C6 fungal-type domain-containing protein,E2EBDF571C2F49AA
224809,W9CLA0,1563,A7F6B5,SS1G_13144,665079.0,Sclerotinia|Sclerotinia sclerotiorum (strain A...,Zn(2)-C6 fungal-type domain-containing protein,F6055999B0E34BD0


In [243]:
# Map each gene name to a "|"-joined string of the distinct effector names it matched.
# na_values="." can turn effector names into NaN, and "|".join() raises TypeError on
# NaN, so incomplete rows are dropped before joining.
effector_homologues = (
    pd.read_csv("output/predector_protein/effectordb_matches.tsv", sep="\t", na_values=".")
    .loc[:, ["gene_name", "effector_name"]]
    .dropna()
    .drop_duplicates()
    .groupby("gene_name")["effector_name"]
    .apply(lambda names: "|".join(names))
    .to_dict()
)

funcs = pd.read_csv("output/protein_functional_annotation_wide.tsv", sep="\t", na_values=".")
# dict.get returns None for genes without an effector match.
funcs["effector_name"] = funcs["name"].apply(effector_homologues.get)

# True when any of the listed predictor columns exceeds 0.5.
funcs["signal_peptide"] = (funcs[['signalp3_nn', 'signalp3_hmm', 'signalp4', 'signalp5', 'signalp6', 'deepsig', 'phobius_sp', 'targetp_secreted', 'tmbed_sp']] > 0.5).any(axis=1)

# Work on an explicit copy so the column overwrite below never touches `funcs`.
tm_counts = funcs[["tmhmm_tmcount", "phobius_tmcount", "tmbed_domains"]].copy()
# NOTE(review): this counts commas, so a value holding a single comma-free entry
# contributes 0 -- confirm that the comma count really equals the number of
# transmembrane domains in the tmbed_domains format.
tm_counts["tmbed_domains"] = tm_counts["tmbed_domains"].str.count(",").fillna(0)
# Single-pass: at least one predictor reports a count > 0 and all three report <= 1
# (a NaN count fails the <= 1 comparison). Multi-pass: any predictor reports > 1.
funcs["single_transmembrane"] = (tm_counts > 0).any(axis=1) & (tm_counts <= 1).all(axis=1)
funcs["multi_transmembrane"] = (tm_counts > 1).any(axis=1)
del tm_counts

# Collapse the two deeploc columns into one boolean (overwrites the original column).
funcs["deeploc_extracellular"] = (funcs[["deeploc_extracellular", "deeploc2_extracellular"]] > 0.5).any(axis=1)

# Final column selection and order for the report.
funcs = funcs[[
    'name',
    'effector_score',
    'effector_name',
    'phibase_genes',
    'swissprot_protein_name', 'swissprot_gene_name',
    'pdb_protein_name',
    'go_names',
    'ipr_name',
    'dbcan_description',
    'residue_number',
    'effectorp1', 'effectorp2', 'effectorp3_cytoplasmic', 'effectorp3_apoplastic',
    'deepredeff_fungi', 'deepredeff_oomycete',
    'apoplastp',
    'signal_peptide', 'deeploc_extracellular', 'single_transmembrane', 'multi_transmembrane',
]]

funcs.head()

Unnamed: 0,name,effector_score,effector_name,phibase_genes,swissprot_protein_name,swissprot_gene_name,pdb_protein_name,go_names,ipr_name,dbcan_description,...,effectorp2,effectorp3_cytoplasmic,effectorp3_apoplastic,deepredeff_fungi,deepredeff_oomycete,apoplastp,signal_peptide,deeploc_extracellular,single_transmembrane,multi_transmembrane
0,sscle_03g031910,3.816,,,,,,,,,...,0.965,0.703,0.807,0.608,0.67,0.42,True,True,False,False
1,sscle_16g107730,3.816,,,,,,,,,...,0.964,0.706,0.795,0.608,0.669,0.42,True,True,False,False
2,sscle_09g074030,3.263,,,,,,,,,...,0.964,0.652,0.803,0.513,0.669,0.59,True,True,False,False
3,sscle_05g045060,3.234,SsINE5|BcSSP2,,,,,,,,...,0.966,,0.87,0.204,0.374,0.89,True,True,False,False
4,sscle_01g003850,3.056,SsSSVP1,,,,,,,,...,0.919,,0.786,0.283,0.241,0.62,True,True,False,False


In [272]:
# Keep only cluster members whose ORF name is one of the keys of `seqs`.
sclero_clusters = clusters.loc[clusters["orf"].isin(seqs)]

# Prefix every retained row with the number of distinct retained accessions in its cluster.
sclero_clusters = (
    sclero_clusters
    .groupby("cluster")["accession"]
    .nunique()
    .reset_index()
    .rename(columns={"accession": "nsclero_members"})
    .merge(sclero_clusters, on="cluster")
)

def mapper(s):
    """Summarise a Series of "|"-delimited lineage strings as ``"name (n=count)|..."``.

    Only the last "|"-separated field of each string is tallied. Names appear
    in the output in the order in which they are first encountered.
    """
    leaf_names = s.str.split("|").str.get(-1)
    tally = Counter(leaf_names)
    return "|".join(f"{name} (n={count})" for name, count in tally.items())

sclero_clusters = (
    sclero_clusters
    # Per-cluster tally of lineage leaf names, computed over every row of `clusters`.
    .merge(
        clusters
        .groupby("cluster")["lineage"]
        .apply(mapper)
        .reset_index()
        .rename(columns={"lineage": "nmembers_other_taxa"}),
        on="cluster",
    )
    .loc[:, [
        'cluster', 'nmembers', 'nsclero_members', 'nmembers_other_taxa',
        'accession', 'orf', 'taxid', 'lineage', 'full_name',
    ]]
    # Attach the functional annotations by ORF name; rows without a match keep NaN.
    .merge(funcs, left_on="orf", right_on="name", how="left")
)

sclero_clusters.to_csv("output/foldseek_clusters_sclero.tsv", sep="\t", na_rep="-")

print(sclero_clusters.shape)
sclero_clusters.head()

(10699, 31)


Unnamed: 0,cluster,nmembers,nsclero_members,nmembers_other_taxa,accession,orf,taxid,lineage,full_name,name,...,effectorp2,effectorp3_cytoplasmic,effectorp3_apoplastic,deepredeff_fungi,deepredeff_oomycete,apoplastp,signal_peptide,deeploc_extracellular,single_transmembrane,multi_transmembrane
0,A0A023PHJ9,15,1,Sclerotinia trifoliorum (n=1)|Sclerotinia scle...,A0A1D9PX14,sscle_02g020280,665079.0,Sclerotinia|Sclerotinia sclerotiorum (strain A...,MAT1-1-5,sscle_02g020280,...,0.113,0.703,,0.387,0.245,0.0,False,False,False,False
1,A0A088CAE9,135,7,Sclerotinia sclerotiorum (strain ATCC 18683 / ...,A0A1D9QJH3,sscle_14g098670,665079.0,Sclerotinia|Sclerotinia sclerotiorum (strain A...,SET domain-containing protein,sscle_14g098670,...,0.009,,,0.512,0.419,0.08,False,False,False,False
2,A0A088CAE9,135,7,Sclerotinia sclerotiorum (strain ATCC 18683 / ...,A0A1D9Q9P9,sscle_08g064370,665079.0,Sclerotinia|Sclerotinia sclerotiorum (strain A...,SET domain-containing protein,sscle_08g064370,...,0.009,,,0.979,0.552,0.01,False,False,False,False
3,A0A088CAE9,135,7,Sclerotinia sclerotiorum (strain ATCC 18683 / ...,A0A1D9Q5W4,sscle_06g051140,665079.0,Sclerotinia|Sclerotinia sclerotiorum (strain A...,MYND-type zinc finger protein samB,sscle_06g051140,...,0.03,,,0.129,0.482,0.0,False,False,False,False
4,A0A088CAE9,135,7,Sclerotinia sclerotiorum (strain ATCC 18683 / ...,A0A1D9Q2R7,sscle_04g039290,665079.0,Sclerotinia|Sclerotinia sclerotiorum (strain A...,BZIP domain-containing protein,sscle_04g039290,...,0.028,,,0.957,0.414,0.01,False,False,False,False


In [312]:
# Clusters with at least one row whose effector_score is above 1.
tmp1 = sclero_clusters.loc[sclero_clusters["effector_score"].gt(1), "cluster"].unique()

# Clusters whose median effector_score is above 0.
tmp2 = (
    sclero_clusters
    .groupby("cluster")["effector_score"]
    .median()
    .loc[lambda medians: medians > 0]
    .index
    .values
)

# Sorted, de-duplicated union of both selections.
candidates = np.union1d(tmp1, tmp2)
candidates

array(['A0A0P0A8Z3', 'A0A1D9PRP0', 'A0A1D9PS94', 'A0A1D9PT93',
       'A0A1D9PUD4', 'A0A1D9PUX6', 'A0A1D9PVB1', 'A0A1D9PVH8',
       'A0A1D9PVL2', 'A0A1D9PYQ0', 'A0A1D9PZ96', 'A0A1D9PZQ1',
       'A0A1D9Q0S6', 'A0A1D9Q1Q5', 'A0A1D9Q1T2', 'A0A1D9Q1U2',
       'A0A1D9Q1X9', 'A0A1D9Q265', 'A0A1D9Q2F6', 'A0A1D9Q2H6',
       'A0A1D9Q2N2', 'A0A1D9Q3K4', 'A0A1D9Q400', 'A0A1D9Q401',
       'A0A1D9Q5E8', 'A0A1D9Q693', 'A0A1D9Q6G7', 'A0A1D9Q7J3',
       'A0A1D9Q902', 'A0A1D9Q944', 'A0A1D9Q958', 'A0A1D9QBF9',
       'A0A1D9QCN1', 'A0A1D9QDC8', 'A0A1D9QDS0', 'A0A1D9QEH4',
       'A0A1D9QF17', 'A0A1D9QF87', 'A0A1D9QFB4', 'A0A1D9QFY2',
       'A0A1D9QGH3', 'A0A1D9QH76', 'A0A1D9QHL5', 'A0A1D9QHZ1',
       'A0A1D9QIA7', 'A0A1D9QJ46', 'A0A1D9QJN6', 'A0A1D9QJW0',
       'A0A1D9QJZ1', 'A0A1D9QK43', 'A0A1D9QK51', 'A0A1D9QMR3',
       'A0A1D9QMV4', 'A0A1D9QN25', 'A0A384J4M9', 'A0A384J6D7',
       'A0A384J8L9', 'A0A384J8V2', 'A0A384JA22', 'A0A384JJ94',
       'A0A384JJR4', 'A0A384JK02', 'A0A384JMN2', 'A0A38

In [320]:
# Rows of candidate clusters that have more than one retained member, ordered by
# member count, then cluster, then effector score (all descending).
cand_clusters = (
    sclero_clusters
    .loc[
        sclero_clusters["cluster"].isin(candidates)
        & (sclero_clusters["nsclero_members"] > 1)
    ]
    .sort_values(["nsclero_members", "cluster", "effector_score"], ascending=False)
)

In [322]:
# Write the candidate table as TSV without the index; missing values become "-".
cand_clusters.to_csv(
    "output/cand_clusters.tsv",
    sep="\t",
    index=False,
    na_rep="-",
)