In [1]:
import re
import gzip
from typing import NamedTuple

import pandas as pd
from predectorutils.analyses import MMSeqs

In [2]:
# Parse every MMSeqs hit from the Swiss-Prot search results into memory.
with open("work/swissprot-matches.tsv", "r") as matches_file:
    sp = [hit for hit in MMSeqs.from_file(matches_file)]

In [18]:
!wget -O work/unipro_sprot.fasta.gz https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz

MMSeqs('sscle_01g000010', 'Q9US54', 172, 292, 327, 95, 183, 199, 6.235e-08, 2, 32.5, 120, 138.0, 59.0, '68M15I3M17I17M', 49, 0.367, 0.442)

In [4]:
# Collect every FASTA header line (without its leading ">") from the
# compressed Swiss-Prot file; sequence lines are skipped.
with gzip.open("work/uniprot_sprot.fasta.gz", "r") as handle:
    stripped_lines = (raw.decode().strip() for raw in handle)
    sp_headers = [
        text.lstrip(">")
        for text in stripped_lines
        if text.startswith(">")
    ]

In [5]:
class SwissProt(NamedTuple):
    db: str  # sp | tr
    accession: str
    entry_name: str
    protein_name: str
    organism_name: str
    organism_id: int
    gene_name: str | None
    protein_existence: int
    sequence_version: str

    @classmethod
    def from_str(cls, s: str):
        import re
        s = s.strip()
        if s.startswith(">"):
            s = s[1:]

        id, rem = s.split(maxsplit=1)
        db, acc, en = id.split("|")

        matches = re.finditer("((?P<ty>OS|OX|GN|PE|SV)=(?P<vs>\S+))", rem)
        gn = None
        min_ = len(rem)
        for m in matches:
            mgd = m.groupdict()
            min_ = min([min_, m.start()])

            if mgd["ty"] == "OS":
                os = mgd["vs"]
            elif mgd["ty"] == "OX":
                ox = mgd["vs"]
            elif mgd["ty"] == "GN":
                gn = mgd["vs"]
            elif mgd["ty"] == "PE":
                pe = mgd["vs"]
            elif mgd["ty"] == "SV":
                sv = mgd["vs"]
            else:
                raise ValueError("This shouldn't be possible.")

        pn = rem[:min_].strip()
        return cls(db, acc, en, pn, os, int(ox), gn, int(pe), sv)

    def as_dict(self):
        return dict(
            db=self.db,
            accession=self.accession,
            entry_name=self.entry_name,
            protein_name=self.protein_name,
            organism_name=self.organism_name,
            organism_id=self.organism_id,
            gene_name=self.gene_name,
            protein_existence=self.protein_existence,
            sequence_version=self.sequence_version
        )
    
    def as_series(self):
        return pd.Series(self.as_dict())

# Index the parsed Swiss-Prot headers by accession for constant-time lookup.
# If an accession occurs more than once, the last header wins.
spp = {
    parsed.accession: parsed
    for parsed in map(SwissProt.from_str, sp_headers)
}
spp["Q6GZX4"]

SwissProt(db='sp', accession='Q6GZX4', entry_name='001R_FRG3G', protein_name='Putative transcription factor 001R', organism_name='Frog', organism_id=654924, gene_name='FV3-001R', protein_existence=4, sequence_version='1')

In [6]:
# One row per MMSeqs hit; columns come from each hit's Series index.
spdf = pd.DataFrame([hit.as_series() for hit in sp])
spdf.head()

Unnamed: 0,query,target,qstart,qend,qlen,tstart,tend,tlen,evalue,gapopen,pident,alnlen,raw,bits,cigar,mismatch,qcov,tcov
0,sscle_01g000010,Q9US54,172,292,327,95,183,199,6.235e-08,2,32.5,120,138.0,59.0,68M15I3M17I17M,49,0.367,0.442
1,sscle_01g000010,Q99JX1,166,291,327,102,195,211,8.117e-07,2,30.4,125,128.0,55.0,74M3I3M29I16M,55,0.382,0.441
2,sscle_01g000010,Q15544,166,291,327,102,195,211,1.079e-06,2,30.4,125,128.0,55.0,74M3I2M29I17M,55,0.382,0.441
3,sscle_01g000010,Q5RA91,166,291,327,102,195,211,1.079e-06,2,30.4,125,128.0,55.0,74M3I2M29I17M,55,0.382,0.441
4,sscle_01g000010,Q5U1X0,166,291,327,102,195,211,1.079e-06,2,30.4,125,128.0,55.0,74M3I2M29I17M,55,0.382,0.441


In [7]:
# One row per Swiss-Prot entry, built from the accession lookup's values.
sppdf = pd.DataFrame([entry.as_series() for entry in spp.values()])
sppdf.head()

Unnamed: 0,db,accession,entry_name,protein_name,organism_name,organism_id,gene_name,protein_existence,sequence_version
0,sp,Q6GZX4,001R_FRG3G,Putative transcription factor 001R,Frog,654924,FV3-001R,4,1
1,sp,Q6GZX3,002L_FRG3G,Uncharacterized protein 002L,Frog,654924,FV3-002L,4,1
2,sp,Q197F8,002R_IIV3,Uncharacterized protein 002R,Invertebrate,345201,IIV3-002R,4,1
3,sp,Q197F7,003L_IIV3,Uncharacterized protein 003L,Invertebrate,345201,IIV3-003L,4,1
4,sp,Q6GZX2,003R_FRG3G,Uncharacterized protein 3R,Frog,654924,FV3-003R,3,1


In [8]:
# Attach Swiss-Prot annotation to every MMSeqs hit. The right join keeps all
# hits, including any whose target accession is absent from the header table.
merged_sp = (
    sppdf[["accession", "protein_name", "organism_name", "organism_id", "gene_name"]]
    .merge(spdf, left_on="accession", right_on="target", how="right")
    .rename(columns={"organism_name": "organism", "organism_id": "taxid"})
)
merged_sp.head()

Unnamed: 0,accession,protein_name,organism,taxid,gene_name,query,target,qstart,qend,qlen,...,evalue,gapopen,pident,alnlen,raw,bits,cigar,mismatch,qcov,tcov
0,Q9US54,Transcription initiation factor TFIID subunit 11,Schizosaccharomyces,284812,taf11,sscle_01g000010,Q9US54,172,292,327,...,6.235e-08,2,32.5,120,138.0,59.0,68M15I3M17I17M,49,0.367,0.442
1,Q99JX1,Transcription initiation factor TFIID subunit 11,Mus,10090,Taf11,sscle_01g000010,Q99JX1,166,291,327,...,8.117e-07,2,30.4,125,128.0,55.0,74M3I3M29I16M,55,0.382,0.441
2,Q15544,Transcription initiation factor TFIID subunit 11,Homo,9606,TAF11,sscle_01g000010,Q15544,166,291,327,...,1.079e-06,2,30.4,125,128.0,55.0,74M3I2M29I17M,55,0.382,0.441
3,Q5RA91,Transcription initiation factor TFIID subunit 11,Pongo,9601,TAF11,sscle_01g000010,Q5RA91,166,291,327,...,1.079e-06,2,30.4,125,128.0,55.0,74M3I2M29I17M,55,0.382,0.441
4,Q5U1X0,Transcription initiation factor TFIID subunit 11,Rattus,10116,Taf11,sscle_01g000010,Q5U1X0,166,291,327,...,1.079e-06,2,30.4,125,128.0,55.0,74M3I2M29I17M,55,0.382,0.441


In [10]:
# Make sure the output directory exists before anything is written to it
# (-p: no error if it is already there).
!mkdir -p output

In [11]:
# Keep only hits with an e-value below 1e-5, then save them as a TSV.
merged_sp = merged_sp.loc[merged_sp["evalue"] < 1e-5]
merged_sp.to_csv(
    "output/swissprot-matches.tsv",
    sep="\t",
    index=False,
)

In [12]:
# Label every hit with its analysis and database name.
# NOTE(review): presumably read by `as_gff` when building records — confirm.
for hit in sp:
    hit.analysis = "swissprot"
    hit.database = "SwissProt"

In [13]:
# Flatten the GFF records produced by each hit into a single list.
gffs = [
    record
    for hit in sp
    for record in hit.as_gff(software_version="13.45111", database_version="2023_02")
]

In [14]:
# Show the first GFF record as a quick sanity check.
gffs[0]

GFFRecord('sscle_01g000010', 'MMSeqs2:13.45111:SwissProt:2023_02', 'protein_match', 139, 291, 3.144e-15, Strand.UNSTRANDED, Phase.NOT_CDS, GFF3Attributes(target=Target('Q9M565', 76, 195), gap=Gap([GapElement(GapCode.MATCH, 25), GapElement(GapCode.INSERT, 2), GapElement(GapCode.MATCH, 41), GapElement(GapCode.DELETE, 1), GapElement(GapCode.MATCH, 37), GapElement(GapCode.INSERT, 32), GapElement(GapCode.MATCH, 15)]), is_circular=False, custom={'tlen': '210', 'evalue': '3.144e-15', 'gapopen': '3', 'pident': '21.5', 'alnlen': '153', 'raw': '194.0', 'bits': '81.0', 'mismatch': '85', 'qcov': '0.465', 'tcov': '0.567'}))

In [15]:
# Write each GFF record, decorated with Swiss-Prot annotation for its target,
# one record per line.
with open("output/swissprot.gff3", "w") as handle:
    for record in gffs:
        attrs = record.attributes
        target_acc = attrs.target.target_id
        # Raises KeyError if the target accession is not in the header lookup.
        annotation = spp[target_acc]

        attrs.dbxref = [f"UniProtKB/Swiss-Prot:{target_acc}"]
        attrs.custom["description"] = annotation.protein_name
        attrs.name = annotation.gene_name  # may be None when the header had no GN=
        attrs.custom["taxid"] = annotation.organism_id
        attrs.custom["organism"] = annotation.organism_name

        print(record.as_str(), file=handle)