In [1]:
import re
import gzip
from typing import NamedTuple

import pandas as pd
from predectorutils.analyses import MMSeqs

In [2]:
# Parse every MMseqs hit from the PDB search results into memory.
with open("./work/pdb-matches.tsv", "r") as matches_file:
    sp = [*MMSeqs.from_file(matches_file)]

In [3]:
# Download the wwPDB SEQRES sequence dump to work/pdb_seqres.txt.gz; the
# header-parsing cell further down reads it back from that same path.
!wget -O work/pdb_seqres.txt.gz https://ftp.wwpdb.org/pub/pdb/derived_data/pdb_seqres.txt.gz

--2023-06-14 21:51:46--  https://ftp.wwpdb.org/pub/pdb/derived_data/pdb_seqres.txt.gz
Resolving ftp.wwpdb.org (ftp.wwpdb.org)... 128.6.158.49
Connecting to ftp.wwpdb.org (ftp.wwpdb.org)|128.6.158.49|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 46077417 (44M) [application/x-gzip]
Saving to: ‘work/pdb_seqres.txt.gz’


2023-06-14 21:52:07 (2.22 MB/s) - ‘work/pdb_seqres.txt.gz’ saved [46077417/46077417]



In [16]:
# Collect the FASTA header lines (leading ">" removed) from the SEQRES dump.
# The file is read in binary mode, so each line is decoded before inspection.
with gzip.open("work/pdb_seqres.txt.gz", "r") as handle:
    decoded_lines = (raw_line.decode().strip() for raw_line in handle)
    headers = [text.lstrip(">") for text in decoded_lines if text.startswith(">")]

In [31]:
# Map each sequence identifier (first header field) to its free-text description
# (everything after the third whitespace-separated field).
# NOTE(review): assumes headers look like "<id> mol:<type> length:<n> <description>"
# -- confirm against the downloaded file.
mapper = {}
for header in headers:
    fields = header.strip().split(maxsplit=3)
    # Headers without a fourth field carry no description. Taking the last field
    # would record e.g. "length:154" as the description (and fail outright on a
    # blank header), so such entries are left out of the mapping instead.
    if len(fields) < 4:
        continue
    mapper[fields[0]] = fields[3]

In [11]:
# One row per MMseqs hit, built from each match's Series representation.
spdf = pd.DataFrame([match.as_series() for match in sp])
spdf.head()

Unnamed: 0,query,target,qstart,qend,qlen,tstart,tend,tlen,evalue,gapopen,pident,alnlen,raw,bits,cigar,mismatch,qcov,tcov
0,sscle_01g000010,6mzd_P,166,291,327,102,195,211,1e-06,2,30.4,125,128.0,55.0,74M3I2M29I17M,55,0.382,0.441
1,sscle_01g000010,7egf_k,166,291,327,102,195,211,1e-06,2,30.4,125,128.0,55.0,74M3I2M29I17M,55,0.382,0.441
2,sscle_01g000010,7enc_Dk,166,291,327,102,195,211,1e-06,2,30.4,125,128.0,55.0,74M3I2M29I17M,55,0.382,0.441
3,sscle_01g000010,8gxq_Dk,166,291,327,102,195,211,1e-06,2,30.4,125,128.0,55.0,74M3I2M29I17M,55,0.382,0.441
4,sscle_01g000010,7eg9_k,166,291,327,102,195,211,1e-06,2,30.4,125,128.0,55.0,74M3I2M29I17M,55,0.382,0.441


In [32]:
# Attach the description for each hit's target; targets absent from `mapper` get None.
spdf["description"] = spdf["target"].apply(lambda target_id: mapper.get(target_id))

In [35]:
# Write the annotated hit table as tab-separated text, without the row index.
spdf.to_csv(
    path_or_buf="output/pdb-matches.tsv",
    index=False,
    sep="\t",
)

In [36]:
# Label every match with the analysis and database names before GFF conversion.
for match in sp:
    match.analysis, match.database = "pdb", "PDB"

In [38]:
# Flatten the GFF records produced by each match into a single list.
gffs = [
    record
    for match in sp
    for record in match.as_gff(software_version="13.45111", database_version="2023_06_13")
]

In [39]:
# Display the first GFF record as a quick sanity check of the conversion.
gffs[0]

GFFRecord('sscle_01g001290', 'MMSeqs2:13.45111:PDB:2023_06_13', 'protein_match', 0, 462, 8.574e-162, Strand.UNSTRANDED, Phase.NOT_CDS, GFF3Attributes(target=Target('3cpi_G', 0, 447), gap=Gap([GapElement(GapCode.MATCH, 3), GapElement(GapCode.DELETE, 2), GapElement(GapCode.MATCH, 55), GapElement(GapCode.DELETE, 1), GapElement(GapCode.MATCH, 57), GapElement(GapCode.DELETE, 1), GapElement(GapCode.MATCH, 3), GapElement(GapCode.INSERT, 5), GapElement(GapCode.MATCH, 86), GapElement(GapCode.INSERT, 2), GapElement(GapCode.MATCH, 63), GapElement(GapCode.DELETE, 2), GapElement(GapCode.MATCH, 10), GapElement(GapCode.INSERT, 11), GapElement(GapCode.MATCH, 2), GapElement(GapCode.INSERT, 2), GapElement(GapCode.MATCH, 21), GapElement(GapCode.DELETE, 1), GapElement(GapCode.MATCH, 64), GapElement(GapCode.INSERT, 1), GapElement(GapCode.MATCH, 22), GapElement(GapCode.INSERT, 1), GapElement(GapCode.MATCH, 54)]), is_circular=False, custom={'tlen': '451', 'evalue': '8.574e-162', 'gapopen': '11', 'pident': '5

In [41]:
# Write one GFF3 line per record, annotating each record with a PDB
# cross-reference, a name, and (when known) the target's description.
with open("output/pdb.gff3", "w") as handle:
    for record in gffs:
        attrs = record.attributes
        target_id = attrs.target.target_id

        attrs.dbxref = [f"PDB:{target_id}"]
        # Only targets present in `mapper` receive a description attribute.
        if target_id in mapper:
            attrs.custom["description"] = mapper[target_id]
        attrs.name = target_id
        print(record.as_str(), file=handle)