# Build the depolymerase database 
***

## The cookbook for building the depolymerase database :
### A. Get the E.C associated with the depolymerase activity 
### B. Collect the IPR entries associated with the E.C of interest
### C. Scan the interproscan descriptions ; keep those that are relevant in our case
### D. Download the protein sequences, make a filtered hmm profile
### E. Make the DB
***

> A. <b>Get the relevant E.C. numbers</b>

Two classes of enzymes are associated with a depolymerase activity: lyases and hydrolases.

<div class="alert alert-block alert-info"> 
<b>
Lyases
</b>
</div>
EC 4. Lyases ; 4.2 Carbon-oxygen lyases ; 4.2.2 Acting on polysaccharides
<div class="alert alert-block alert-success">
==> All the groups with 4.2.2
</div>

<div class="alert alert-block alert-info">   
<b>
Hydrolases
</b>
</div>
EC 3. Hydrolases ; 3.2 Glycosylases ; 3.2.1 Glycosidases, i.e. enzymes hydrolyzing O- and S-glycosyl compounds 
<div class="alert alert-block alert-success">
==> All the groups with 3.2.1
</div>
 
***

> B. <b>Collect the IPR entries associated with the E.C. numbers</b>

1. Fetch the file with the information associated with each IPR entry

In [None]:
import xmltodict
import pprint
import json
from PPT_functions import *

path_PFAM = "/home/robbyce/Documents/bioinformatics/Depolymerase_DB"
# Fetch the file describing every IPR entry. The download is gzipped while the
# code below reads the plain "interpro.xml", so decompress it first:
# ! wget https://ftp.ebi.ac.uk/pub/databases/interpro/current_release/interpro.xml.gz
# Parse inside a context manager so the file handle is closed once the XML
# has been read.
with open(f"{path_PFAM}/interpro.xml") as interpro_handle:
    xml_interpro = xmltodict.parse(interpro_handle.read())


2. Generate an object with the IPR ACC and descriptions of the IPR entries associated with the relevant E.C ==> candidates IPR : n = 6

In [None]:
m = 0
# First three levels of the E.C. numbers of interest (lyases, hydrolases)
Ec_numbers = ["4.2.2", "3.2.1"]

IPR_scan = {}
IPR_s = set()
clean_IPR = {}

# Entry types that are skipped; only the keys are used (membership test)
elimininated = {
    "Active_site": [],
    "Binding_site": [],
    "Conserved_site": [],
    "PTM": [],
}

lyases = set()
hydrolases = set()

# Collect the IPR accessions cross-referenced to one of the E.C. classes above
for entry in xml_interpro["interprodb"]["interpro"]:
    if "external_doc_list" not in entry or entry["@type"] in elimininated:
        continue
    external_docs = entry["external_doc_list"]
    if not external_docs or "db_xref" not in external_docs:
        continue
    # A single cross-reference is parsed as a dict, several as a list of
    # dicts: normalise to a list instead of relying on exceptions.
    xrefs = external_docs["db_xref"]
    if isinstance(xrefs, dict):
        xrefs = [xrefs]
    for xref in xrefs:
        if not isinstance(xref, dict) or xref.get("@db") != "EC":
            continue
        # Compare whole E.C. levels rather than the first five characters,
        # so that e.g. "3.2.10.x" could never be mistaken for "3.2.1.x".
        ec_class = ".".join((xref.get("@dbkey") or "").split(".")[:3])
        if ec_class not in Ec_numbers:
            continue
        IPR_s.add(entry["@id"])
        if ec_class == "4.2.2":
            lyases.add(entry["@id"])
        else:
            hydrolases.add(entry["@id"])

def _abstract_text(abstract):
    """Return the text of a parsed ``abstract`` element as a single string.

    The abstract may be a plain string, a dict carrying ``#text``, or a dict
    whose ``p`` child is a string, a dict or a list mixing both. Paragraphs
    without usable text are skipped; an unrecognised shape yields ``""`` so a
    value from a previous entry can never be reused by mistake.
    """
    if abstract is None:
        return ""
    if isinstance(abstract, str):
        return abstract
    if "p" not in abstract:
        return abstract.get("#text") or ""
    paragraphs = abstract["p"]
    if isinstance(paragraphs, (str, dict)):
        paragraphs = [paragraphs]
    elif not isinstance(paragraphs, list):
        return ""
    pieces = []
    for paragraph in paragraphs:
        if isinstance(paragraph, str):
            pieces.append(paragraph)
        elif isinstance(paragraph, dict) and paragraph.get("#text"):
            pieces.append(paragraph["#text"])
    return "".join(pieces)


# Substrings removed from every abstract, applied in this order.
# NOTE(review): " , " and the entries after it look like residues left once
# the brackets are stripped; removing " , " also glues the neighbouring words
# together - confirm this is intended.
bad_char = ["[", "]", "\n", " , ", " , .", " , , ."]

# Build {IPR accession: {"short_name", "description"}} for the candidates
for IPR in IPR_s:
    output = get_full_entry(IPR, False)
    if output["@type"] in elimininated:
        continue
    short_name = output["@short_name"]
    full_text = _abstract_text(output.get("abstract"))
    for char in bad_char:
        full_text = full_text.replace(char, "")
    # Collapse every run of whitespace into a single space
    clean_description = " ".join(full_text.split())
    clean_IPR[IPR] = {"short_name": short_name, "description": clean_description}

***
> C. <b>Scan the descriptions</b>

1. The relevant functions 

In [None]:
def get_full_entry(IPR, pprint_o_nah):
    """Look up one entry of the parsed InterPro XML by accession.

    Args:
        IPR: accession compared with the ``@id`` of each entry.
        pprint_o_nah: when ``True`` the entry is pretty-printed (width 150)
            and ``None`` is returned; otherwise the entry itself is returned.

    Raises:
        KeyError: if no entry of ``xml_interpro`` has this accession
            (previously this surfaced as an unbound-variable error).
    """
    import pprint

    for entry in xml_interpro["interprodb"]["interpro"]:
        if entry["@id"] == IPR:
            # Stop at the match instead of scanning the remaining entries
            break
    else:
        raise KeyError(f"{IPR} not found in the InterPro XML")

    if pprint_o_nah == True:
        return pprint.PrettyPrinter(width=150).pprint(entry)
    return entry
    
def all_strings(list_p):
    """Return True when every element of ``list_p`` is a ``str``.

    An empty ``list_p`` gives True.
    """
    return all(isinstance(item, str) for item in list_p)
    
    
def any_string(list_p):
    """Return True when at least one element of ``list_p`` is a ``str``.

    An empty ``list_p`` gives False.
    """
    return any(isinstance(item, str) for item in list_p)
    
def return_string(list_p):
    """Concatenate the ``str`` elements of ``list_p``, ignoring the others.

    Returns:
        The joined string (``""`` when ``list_p`` holds no string). The
        original built this value but never returned it.
    """
    return "".join(element for element in list_p if isinstance(element, str))
            
    
def go_terms(IPR, search_list):
    """Pretty-print an entry whose classification matches ``search_list``.

    The entry with accession ``IPR`` is printed (through ``get_full_entry``)
    when one of its classification descriptions is in ``search_list`` and the
    accession is not already in ``IPR_depo``. Nothing is returned.

    Args:
        IPR: accession compared with the ``@id`` of each entry.
        search_list: collection of descriptions to look for.
    """
    for entry in xml_interpro["interprodb"]["interpro"]:
        if entry["@id"] != IPR:
            continue
        if "class_list" not in entry:
            return
        # A single classification is parsed as a dict, several as a list.
        # The dict case used to read an undefined variable.
        classification = entry["class_list"]["classification"]
        if isinstance(classification, dict):
            classification = [classification]
        elif not isinstance(classification, list):
            return
        descriptions = [go_term["description"] for go_term in classification]
        if IPR not in IPR_depo and any(
            go_description in search_list for go_description in descriptions
        ):
            get_full_entry(IPR, True)
        return

2. Some code to facilitate the classification 

In [None]:
# "hyaluronidase" was misspelled "hyluronidase", which could never match a
# description.
depo_terms = ["hyaluronidase", "pectin", "pectate", "sialidase", "levanase", "xylosidase", "dextranase", "rhamnosidase", "alginate"]

# Lower-case keywords: a description containing one of them is not printed
triaged_keywords = (
    "cbm",
    "peptidoglycan",
    "peptidase",
    "dna",
    "polysaccharide",
    "o-glycosyl hydrolases",
)

# Print only the entries that none of the filters below accounts for, so they
# can be read and classified by hand.
# NOTE(review): the [600:] slice skips the first 600 entries; it looks like a
# resume offset from a manual review session - confirm before reusing.
# NOTE(review): `good_term` and `pp` are not defined in this notebook;
# presumably they come from PPT_functions - verify.
for id__ in list(clean_IPR.keys())[600:]:
    description = clean_IPR[id__]["description"].lower()
    if id__ in lyases:
        continue
    if good_term(description) == True:
        continue
    if any(keyword in description for keyword in triaged_keywords):
        continue
    print(id__)
    pp.pprint(clean_IPR[id__])
    print("\n")

3. After reading the descriptions of the IPR, we kept this final list of entries :

In [1]:
# Final, manually curated list of IPR accessions kept for the database.
# NOTE: some accessions appear twice ('IPR000974', 'IPR023309', 'IPR011839',
# 'IPR000125', 'IPR016282'), so the list is longer than the set of entries.
IPR_depolymerase = ['IPR029411', 'IPR002152', 'IPR012480', 'IPR039513', 'IPR002925', 'IPR041624', 'IPR024200', 'IPR029456', 'IPR015220', 'IPR041111', 'IPR036962',
 'IPR016285', 'IPR005323', 'IPR016007', 'IPR044505', 'IPR015178', 'IPR011840', 'IPR024561', 'IPR006048', 'IPR009470', 'IPR004197', 'IPR012878',
 'IPR015165', 'IPR024732', 'IPR023309', 'IPR040784', 'IPR014551', 'IPR032312', 'IPR000490', 'IPR006103', 'IPR001139', 'IPR031330', 'IPR000400',
 'IPR044846', 'IPR011040', 'IPR011050', 'IPR024428', 'IPR000743', 'IPR002241', 'IPR005192', 'IPR011837', 'IPR024430', 'IPR000852', 'IPR006710',
 'IPR011839', 'IPR010905', 'IPR013775', 'IPR011838', 'IPR026071', 'IPR032792', 'IPR043534', 'IPR012939', 'IPR037110', 'IPR003469', 'IPR017736',
 'IPR015289', 'IPR016283', 'IPR016282', 'IPR039743', 'IPR005197', 'IPR016289', 'IPR018155', 'IPR035669', 'IPR023933', 'IPR036026', 'IPR023296',
 'IPR016840', 'IPR013777', 'IPR000165', 'IPR010720', 'IPR001554', 'IPR001661', 'IPR002252', 'IPR027291', 'IPR006215', 'IPR005604', 'IPR019563',
 'IPR022844', 'IPR011395', 'IPR011496', 'IPR015883', 'IPR005198', 'IPR017069', 'IPR022859', 'IPR025975', 'IPR018082', 'IPR012334', 'IPR000322',
 'IPR001860', 'IPR000805', 'IPR036434', 'IPR038964', 'IPR006046', 'IPR024745', 'IPR013319', 'IPR000334', 'IPR031335', 'IPR003476', 'IPR001382',
 'IPR037019', 'IPR005193', 'IPR037398', 'IPR013785', 'IPR010702', 'IPR026856', 'IPR023295', 'IPR039174', 'IPR034641', 'IPR036278', 'IPR045032',
 'IPR016590', 'IPR002022', 'IPR012970', 'IPR038970', 'IPR044112', 'IPR000514', 'IPR001724', 'IPR001362', 'IPR025092', 'IPR006775', 'IPR036881',
 'IPR033654', 'IPR039279', 'IPR006047', 'IPR006101', 'IPR016288', 'IPR001137', 'IPR004185', 'IPR004888', 'IPR001547', 'IPR005199', 'IPR011583',
 'IPR000933', 'IPR011683', 'IPR001722', 'IPR005201', 'IPR001439', 'IPR002037', 'IPR011100', 'IPR027260', 'IPR045857', 'IPR009860', 'IPR016455',
 'IPR014895', 'IPR027946', 'IPR008929', 'IPR000125', 'IPR024746', 'IPR001329', 'IPR000726', 'IPR000974', 'IPR011330', 'IPR006633', 'IPR008979',
 'IPR023346', 'IPR000974', 'IPR019282', 'IPR000922', 'IPR041351', 'IPR006421', 'IPR003790', 'IPR001088', 'IPR010713', 'IPR023309', 'IPR011839',
 'IPR000125', 'IPR035394', 'IPR032091', 'IPR015177', 'IPR039448', 'IPR015331', 'IPR011613', 'IPR015179', 'IPR014635', 'IPR001371', 'IPR016282',
 'IPR016714', 'IPR006065', 'IPR001223', 'IPR025706', 'IPR041542', 'IPR033452', 'IPR014718', 'IPR035396', 'IPR044914', 'IPR006425', 'IPR021016',
 'IPR008811', 'IPR001286', 'IPR005200', 'IPR024733', 'IPR000757', 'IPR032790', 'IPR008397', 'IPR004898', 'IPR006626', 'IPR003159', 'IPR008902',
 'IPR007724', 'IPR046372', 'IPR001000', 'IPR016287', 'IPR013776', 'IPR002594', 'IPR023720', 'IPR000556', 'IPR016286', 'IPR016828', 'IPR029070',
 'IPR006584', 'IPR039514', 'IPR026283', 'IPR032979', 'IPR009939', 'IPR039473', 'IPR007781', 'IPR000677', 'IPR038901', 'IPR008291', 'IPR040527',
 'IPR036439']

In [3]:
# Dump the curated accessions to disk, one accession per line
with open("/media/concha-eloko/Linux/depolymerase_building/IPR_etires.tsv", "w") as tsv_handle:
    tsv_handle.writelines(f"{accession}\n" for accession in IPR_depolymerase)

***
> D. <b>Download the protein sequences</b>

1. Download the sequences

In [None]:
# standard library modules
import sys, errno, re, json, ssl
from urllib import request
from urllib.error import HTTPError
from time import sleep
import os 
from multiprocessing import Pool

# Directory receiving one FASTA file per IPR entry
path_db = "/home/conchae/databases/depolymerase_building"

# Separator between the fields of the FASTA headers
HEADER_SEPARATOR = "|"
# Number of residues written per FASTA line
LINE_LENGTH = 80


# Curated IPR accessions (kept as-is; a few accessions appear twice)
IPR_depo = [
 'IPR029411', 'IPR002152', 'IPR012480', 'IPR039513', 'IPR002925', 'IPR041624', 'IPR024200', 'IPR029456', 'IPR015220', 'IPR041111', 'IPR036962',
 'IPR016285', 'IPR005323', 'IPR016007', 'IPR044505', 'IPR015178', 'IPR011840', 'IPR024561', 'IPR006048', 'IPR009470', 'IPR004197', 'IPR012878',
 'IPR015165', 'IPR024732', 'IPR023309', 'IPR040784', 'IPR014551', 'IPR032312', 'IPR000490', 'IPR006103', 'IPR001139', 'IPR031330', 'IPR000400',
 'IPR044846', 'IPR011040', 'IPR011050', 'IPR024428', 'IPR000743', 'IPR002241', 'IPR005192', 'IPR011837', 'IPR024430', 'IPR000852', 'IPR006710',
 'IPR011839', 'IPR010905', 'IPR013775', 'IPR011838', 'IPR026071', 'IPR032792', 'IPR043534', 'IPR012939', 'IPR037110', 'IPR003469', 'IPR017736',
 'IPR015289', 'IPR016283', 'IPR016282', 'IPR039743', 'IPR005197', 'IPR016289', 'IPR018155', 'IPR035669', 'IPR023933', 'IPR036026', 'IPR023296',
 'IPR016840', 'IPR013777', 'IPR000165', 'IPR010720', 'IPR001554', 'IPR001661', 'IPR002252', 'IPR027291', 'IPR006215', 'IPR005604', 'IPR019563',
 'IPR022844', 'IPR011395', 'IPR011496', 'IPR015883', 'IPR005198', 'IPR017069', 'IPR022859', 'IPR025975', 'IPR018082', 'IPR012334', 'IPR000322',
 'IPR001860', 'IPR000805', 'IPR036434', 'IPR038964', 'IPR006046', 'IPR024745', 'IPR013319', 'IPR000334', 'IPR031335', 'IPR003476', 'IPR001382',
 'IPR037019', 'IPR005193', 'IPR037398', 'IPR013785', 'IPR010702', 'IPR026856', 'IPR023295', 'IPR039174', 'IPR034641', 'IPR036278', 'IPR045032',
 'IPR016590', 'IPR002022', 'IPR012970', 'IPR038970', 'IPR044112', 'IPR000514', 'IPR001724', 'IPR001362', 'IPR025092', 'IPR006775', 'IPR036881',
 'IPR033654', 'IPR039279', 'IPR006047', 'IPR006101', 'IPR016288', 'IPR001137', 'IPR004185', 'IPR004888', 'IPR001547', 'IPR005199', 'IPR011583',
 'IPR000933', 'IPR011683', 'IPR001722', 'IPR005201', 'IPR001439', 'IPR002037', 'IPR011100', 'IPR027260', 'IPR045857', 'IPR009860', 'IPR016455',
 'IPR014895', 'IPR027946', 'IPR008929', 'IPR000125', 'IPR024746', 'IPR001329', 'IPR000726', 'IPR000974', 'IPR011330', 'IPR006633', 'IPR008979',
 'IPR023346', 'IPR000974', 'IPR019282', 'IPR000922', 'IPR041351', 'IPR006421', 'IPR003790', 'IPR001088', 'IPR010713', 'IPR023309', 'IPR011839',
 'IPR000125', 'IPR035394', 'IPR032091', 'IPR015177', 'IPR039448', 'IPR015331', 'IPR011613', 'IPR015179', 'IPR014635', 'IPR001371', 'IPR016282',
 'IPR016714', 'IPR006065', 'IPR001223', 'IPR025706', 'IPR041542', 'IPR033452', 'IPR014718', 'IPR035396', 'IPR044914', 'IPR006425', 'IPR021016',
 'IPR008811', 'IPR001286', 'IPR005200', 'IPR024733', 'IPR000757', 'IPR032790', 'IPR008397', 'IPR004898', 'IPR006626', 'IPR003159', 'IPR008902',
 'IPR007724', 'IPR046372', 'IPR001000', 'IPR016287', 'IPR013776', 'IPR002594', 'IPR023720', 'IPR000556', 'IPR016286', 'IPR016828', 'IPR029070',
 'IPR006584', 'IPR039514', 'IPR026283', 'IPR032979', 'IPR009939', 'IPR039473', 'IPR007781', 'IPR000677', 'IPR038901', 'IPR008291', 'IPR040527',
 'IPR036439']

# One URL per distinct accession, in first-seen order. Duplicated accessions
# would otherwise give duplicated URLs, i.e. two pool workers downloading
# into the same FASTA file at the same time.
urls = [f"https://www.ebi.ac.uk:443/interpro/api/protein/UniProt/entry/InterPro/{IPR}/?page_size=200&extra_fields=sequence" for IPR in dict.fromkeys(IPR_depo) if IPR]

def _fasta_record(item):
    """Format one API result as a FASTA record (header line + wrapped sequence)."""
    entries = None
    if "entry_subset" in item:
        entries = item["entry_subset"]
    elif "entries" in item:
        entries = item["entries"]

    accession = item["metadata"]["accession"]
    name = item["metadata"]["name"]
    if entries is not None:
        # ACC(start...end,start...end;start...end)-ACC(...): fragments are
        # joined with ",", locations with ";" and entries with "-".
        entries_header = "-".join(
            entry["accession"] + "(" + ";".join(
                ",".join(
                    str(fragment["start"]) + "..." + str(fragment["end"])
                    for fragment in locations["fragments"]
                )
                for locations in entry["entry_protein_locations"]
            ) + ")"
            for entry in entries
        )
        header_fields = [accession, entries_header, name]
    else:
        header_fields = [accession, name]

    seq = item["extra_fields"]["sequence"]
    lines = [">" + HEADER_SEPARATOR.join(header_fields)]
    lines.extend(seq[i:i + LINE_LENGTH] for i in range(0, len(seq), LINE_LENGTH))
    return "\n".join(lines) + "\n"


def get_IPR(url):
    """Download every protein of one InterPro entry into a FASTA file.

    The accession is read from ``url`` and the records are written to
    ``{path_db}/{IPR}.entry.sequences.fasta``, following the ``next`` link of
    each page. Nothing is done when that file already exists.

    The records are first written to a temporary ``.part`` file that is
    renamed only once the download ended normally, so an interrupted run can
    no longer leave a truncated FASTA that later runs would skip.

    Args:
        url: first page of the InterPro API query for the entry.

    Raises:
        HTTPError: after three failed retries on an HTTP error other than 408.
    """
    IPR = url.split("InterPro/")[1].split("/")[0]
    print(f"{IPR} in the process... \n")
    target = f"{path_db}/{IPR}.entry.sequences.fasta"
    if os.path.isfile(target):
        return
    # The process id keeps concurrent workers from sharing a temporary file
    partial = f"{target}.{os.getpid()}.part"

    # SSL verification is disabled to avoid config issues.
    # NOTE(security): this also disables protection against tampered
    # responses - re-enable verification where the certificates allow it.
    context = ssl._create_unverified_context()
    next_url = url
    attempts = 0
    try:
        with open(partial, "w") as outfile:
            while next_url:
                try:
                    req = request.Request(next_url, headers={"Accept": "application/json"})
                    with request.urlopen(req, context=context) as res:
                        # If the API times out due to a long running query
                        if res.status == 408:
                            # wait just over a minute, then retry the same URL
                            sleep(61)
                            continue
                        elif res.status == 204:
                            # no data so leave the loop
                            break
                        payload = json.loads(res.read().decode())
                    next_url = payload["next"]
                    attempts = 0
                except HTTPError as e:
                    if e.code == 408:
                        sleep(61)
                        continue
                    # Any other HTTP error is retried 3 times before failing
                    if attempts < 3:
                        attempts += 1
                        sleep(61)
                        continue
                    sys.stderr.write("LAST URL: " + next_url)
                    raise

                # The original carried an incomplete `if i > 500 :` guard here
                # (a syntax error); it is dropped because, assuming the API
                # honours the page_size=200 requested in `urls`, a page never
                # holds that many results.
                for item in payload["results"]:
                    outfile.write(_fasta_record(item))

                # Don't overload the server, give it time before asking for more
                if next_url:
                    sleep(1)
    except BaseException:
        # Never leave a partial download behind
        if os.path.exists(partial):
            os.remove(partial)
        raise
    os.replace(partial, target)
                    
if __name__ == "__main__":
    # Spread the URLs (one per InterPro entry) over 20 worker processes
    with Pool(20) as workers:
        workers.map(get_IPR, urls)

***
> E. <b>Make the database</b>

1. Detect the IPR entries grouping proteins whose signature type is a PRINT, a domain, ...


In [None]:
import os 
from Bio import SeqIO
from tqdm import tqdm
# Input directory (downloaded FASTA files) and output directory (extracted
# domain / full sequences) on the cluster
path_db = "/home/conchae/databases/depolymerase_building"
path_out = "/home/conchae/databases/depolymerase_building/IPR_domain_multi"

# Same locations on the local workstation
#path_db = "/home/robbyce/Documents/bioinformatics/depolymerase_building"
#path_out = "/home/robbyce/Documents/bioinformatics/depolymerase_building/IPR_domain_multi"
# Curated IPR accessions (a few accessions appear twice in this list)
IPR_depo = [
 'IPR029411', 'IPR002152', 'IPR012480', 'IPR039513', 'IPR002925', 'IPR041624', 'IPR024200', 'IPR029456', 'IPR015220', 'IPR041111', 'IPR036962',
 'IPR016285', 'IPR005323', 'IPR016007', 'IPR044505', 'IPR015178', 'IPR011840', 'IPR024561', 'IPR006048', 'IPR009470', 'IPR004197', 'IPR012878',
 'IPR015165', 'IPR024732', 'IPR023309', 'IPR040784', 'IPR014551', 'IPR032312', 'IPR000490', 'IPR006103', 'IPR001139', 'IPR031330', 'IPR000400',
 'IPR044846', 'IPR011040', 'IPR011050', 'IPR024428', 'IPR000743', 'IPR002241', 'IPR005192', 'IPR011837', 'IPR024430', 'IPR000852', 'IPR006710',
 'IPR011839', 'IPR010905', 'IPR013775', 'IPR011838', 'IPR026071', 'IPR032792', 'IPR043534', 'IPR012939', 'IPR037110', 'IPR003469', 'IPR017736',
 'IPR015289', 'IPR016283', 'IPR016282', 'IPR039743', 'IPR005197', 'IPR016289', 'IPR018155', 'IPR035669', 'IPR023933', 'IPR036026', 'IPR023296',
 'IPR016840', 'IPR013777', 'IPR000165', 'IPR010720', 'IPR001554', 'IPR001661', 'IPR002252', 'IPR027291', 'IPR006215', 'IPR005604', 'IPR019563',
 'IPR022844', 'IPR011395', 'IPR011496', 'IPR015883', 'IPR005198', 'IPR017069', 'IPR022859', 'IPR025975', 'IPR018082', 'IPR012334', 'IPR000322',
 'IPR001860', 'IPR000805', 'IPR036434', 'IPR038964', 'IPR006046', 'IPR024745', 'IPR013319', 'IPR000334', 'IPR031335', 'IPR003476', 'IPR001382',
 'IPR037019', 'IPR005193', 'IPR037398', 'IPR013785', 'IPR010702', 'IPR026856', 'IPR023295', 'IPR039174', 'IPR034641', 'IPR036278', 'IPR045032',
 'IPR016590', 'IPR002022', 'IPR012970', 'IPR038970', 'IPR044112', 'IPR000514', 'IPR001724', 'IPR001362', 'IPR025092', 'IPR006775', 'IPR036881',
 'IPR033654', 'IPR039279', 'IPR006047', 'IPR006101', 'IPR016288', 'IPR001137', 'IPR004185', 'IPR004888', 'IPR001547', 'IPR005199', 'IPR011583',
 'IPR000933', 'IPR011683', 'IPR001722', 'IPR005201', 'IPR001439', 'IPR002037', 'IPR011100', 'IPR027260', 'IPR045857', 'IPR009860', 'IPR016455',
 'IPR014895', 'IPR027946', 'IPR008929', 'IPR000125', 'IPR024746', 'IPR001329', 'IPR000726', 'IPR000974', 'IPR011330', 'IPR006633', 'IPR008979',
 'IPR023346', 'IPR000974', 'IPR019282', 'IPR000922', 'IPR041351', 'IPR006421', 'IPR003790', 'IPR001088', 'IPR010713', 'IPR023309', 'IPR011839',
 'IPR000125', 'IPR035394', 'IPR032091', 'IPR015177', 'IPR039448', 'IPR015331', 'IPR011613', 'IPR015179', 'IPR014635', 'IPR001371', 'IPR016282',
 'IPR016714', 'IPR006065', 'IPR001223', 'IPR025706', 'IPR041542', 'IPR033452', 'IPR014718', 'IPR035396', 'IPR044914', 'IPR006425', 'IPR021016',
 'IPR008811', 'IPR001286', 'IPR005200', 'IPR024733', 'IPR000757', 'IPR032790', 'IPR008397', 'IPR004898', 'IPR006626', 'IPR003159', 'IPR008902',
 'IPR007724', 'IPR046372', 'IPR001000', 'IPR016287', 'IPR013776', 'IPR002594', 'IPR023720', 'IPR000556', 'IPR016286', 'IPR016828', 'IPR029070',
 'IPR006584', 'IPR039514', 'IPR026283', 'IPR032979', 'IPR009939', 'IPR039473', 'IPR007781', 'IPR000677', 'IPR038901', 'IPR008291', 'IPR040527',
 'IPR036439']
# Proteins whose signature spans a single location ==> keep only that domain.
# More than one location (or none that can be read) ==> keep the full sequence.
import re

# One "start...end" location, as written in the FASTA headers
location_pattern = re.compile(r"(\d+)\.\.\.(\d+)")

for ipr_multi in tqdm(IPR_depo):
    # Only these two entries are (re)processed by this version of the cell
    if ipr_multi not in ["IPR012334", "IPR011050"]:
        continue
    with open(f"{path_out}/{ipr_multi}.domain.sequences.corrected.v2.fasta", "w") as outfile:
        for record in SeqIO.parse(f"{path_db}/IPR_multi_fasta/{ipr_multi}.entry.sequences.fasta", "fasta"):
            # Second header field: ACC(start...end;start...end)
            mid_info = record.id.split("|")[1]
            locations = location_pattern.findall(mid_info)
            if len(locations) == 1:
                start, end = (int(position) for position in locations[0])
                # InterPro locations are 1-based and inclusive, so the slice
                # starts at start - 1 (the original dropped the first residue
                # of every domain).
                sequence = record.seq[start - 1:end]
                tag = "__tag__domain"
            else:
                # Several locations or fragments: a single int() conversion
                # is not possible, keep the entire sequence.
                sequence = record.seq
                tag = "__tag__full"
            # Sequences of 3000 residues or more are left out
            if len(sequence) < 3000:
                outfile.write(f">{record.id}{tag}\n{sequence}\n")
        

2. Build MSA of the sequences associated with each IPR entry : FAMSA

In [None]:
import pandas as pd
from tqdm import tqdm

# Root directory holding the IPR_domain_multi/ inputs and IPR_MSA/ outputs
path_db = "/home/conchae/databases/depolymerase_building"

def run_famsa(path_in, path_out):
    """Align one FASTA file with FAMSA and return what the tool printed.

    Runs ``famsa -gt sl -t 10 path_in path_out`` through the shell.

    Args:
        path_in: FASTA file to align.
        path_out: file receiving the alignment.

    Returns:
        The combined stdout/stderr of the command, as bytes.
    """
    import subprocess

    # Variant that also exports the distances:
    # famsa -gt sl -dist_export {path_in}.dist -pid -t 10 {path_in} {path_out}
    command = f"famsa -gt sl -t 10 {path_in} {path_out}"
    completed = subprocess.run(
        command,
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
    )
    return completed.stdout

# Collect the input/output paths of the alignments to run
path_rows = []
for ipr_multi in IPR_depo:
    if ipr_multi in ["IPR012334", "IPR011050"]:
        in_path = f"{path_db}/IPR_domain_multi/{ipr_multi}.domain.sequences.corrected.v2.fasta"
        out_path = f"{path_db}/IPR_MSA/{ipr_multi}.domain.corrected.MSA.v2.fasta"
        path_rows.append({"Ins": in_path, "Outs": out_path})

# Built in one go: DataFrame.append no longer exists in pandas >= 2.0.
# Naming the columns keeps the frame usable when no entry was selected.
paths = pd.DataFrame(path_rows, columns=["Ins", "Outs"])

# Run each alignment exactly once (they used to run both while collecting the
# paths and again here).
run = [
    run_famsa(msa_in, msa_out)
    for msa_in, msa_out in tqdm(list(zip(paths["Ins"].to_list(), paths["Outs"].to_list())))
]
# *****************************************************************************************************************************
#!/bin/bash
# Batch job running the MSA script above on the cluster.
# The job-name line read "#BATCH", which the scheduler does not treat as a
# directive, so the job name was never set.
#SBATCH --job-name=MSA_IPR__
#SBATCH --partition=medium
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=30
#SBATCH --mem=200gb
#SBATCH --time=02-00:00:00
#SBATCH --output=MSA_IPR__%j.log

# Make conda available in the batch shell, then activate the environment
source /storage/apps/ANACONDA/anaconda3/etc/profile.d/conda.sh
conda activate bio_phylo

python /home/conchae/databases/depolymerase_building/script_files/MSA_ipr_domains.py

3. Filter the MSA based on the pairwise percentage identity: eliminate one sequence of each pair that has >95% identity.

In [None]:
import os 

# Root directory; the MSA files are read from its IPR_v2/ sub-directory below
path_db = "/home/conchae/databases/depolymerase_building"

def filter_MSA(i_file):
    """Filter one alignment with ``hhfilter -id 95``.

    The result is written next to the input, with the ``.fasta`` suffix
    replaced by ``.filtered.v2.fasta``. The output of the command is
    captured and discarded; nothing is returned.

    Args:
        i_file: path of the alignment, using "/" as separator.
    """
    import subprocess

    # Split the path at its last "/" into directory and file name
    directory, _, base_name = i_file.rpartition("/")
    stem = base_name.split(".fasta")[0]
    o_file = f"{directory}/{stem}.filtered.v2.fasta"
    # ***********************************
    subprocess.run(
        f"hhfilter -i {i_file} -o {o_file} -id 95",
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
    )
    
# Alignments to filter. Outputs of a previous run (".filtered.") also contain
# "corrected.MSA.v2" in their name, so they are left out explicitly to avoid
# filtering them a second time on a rerun.
list_msa = [
    path_db + "/IPR_v2/" + ipr
    for ipr in os.listdir(f"{path_db}/IPR_v2")
    if "corrected.MSA.v2" in ipr and ".filtered." not in ipr
]
results = list(map(filter_MSA, list_msa))

4. Build a hmm profile from the final set of kept sequences

In [None]:
def build_hmm(i_file):
    """Build a profile from one alignment with ``hmmbuild``.

    The profile is written next to the input, with the ``.a3m`` suffix
    replaced by ``.hmm``. The output path and the captured output of the
    command are printed; nothing is returned.

    Args:
        i_file: path of the alignment, using "/" as separator.
    """
    import subprocess

    # Split the path at its last "/" into directory and file name
    directory, _, base_name = i_file.rpartition("/")
    stem = base_name.split(".a3m")[0]
    o_file = f"{directory}/{stem}.hmm"
    # ***********************************
    completed = subprocess.run(
        f"hmmbuild {o_file} {i_file}",
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
    )
    # stderr is merged into stdout, so the last printed value is always None
    print(o_file, completed.stdout, completed.stderr)
    
# Every file of IPR_v2/ whose name ends in "a3m" gets its own profile
list_msa = [
    f"{path_db}/IPR_v2/{file_name}"
    for file_name in os.listdir(f"{path_db}/IPR_v2")
    if file_name.endswith("a3m")
]
results = [build_hmm(msa_path) for msa_path in list_msa]

5. Append the hmm profiles into a single file, then press it to make the final database

In [None]:
import os 

path_db = "/home/conchae/databases/depolymerase_building"


def _merge_files(source_dir, extension, destination):
    """Concatenate the files of ``source_dir`` ending in ``extension``.

    Each file is followed by a newline in ``destination``. The files are
    taken in sorted name order so that the result is reproducible, and every
    input is closed as soon as it has been read.
    """
    with open(destination, "w") as outfile:
        for file_name in sorted(os.listdir(source_dir)):
            if file_name.endswith(extension):
                with open(f"{source_dir}/{file_name}") as infile:
                    outfile.write(infile.read() + "\n")


# Single file with all profiles
_merge_files(f"{path_db}/IPR_MSA", "hmm", f"{path_db}/depolymerase_db.v2301.hmm")

# Single file with all a3m
_merge_files(f"{path_db}/IPR_MSA", "a3m", f"{path_db}/depolymerase_db.v2301.a3m")


# ! conda activate HH-suite3
export PATH="/media/concha-eloko/Linux/softwares/hh-suite/lib/ffindex/src:$PATH"
# HHLIB must be exported: a plain assignment stays local to this shell and is
# not visible in the environment of the python process started below.
export HHLIB="/media/concha-eloko/Linux/conda_envs/HH-suite3"
# Workaround previously used while HHLIB was not exported - modify the script hhsuitedb.py :
# l 110 :  hhlib_environment = os.environ['HHLIB'] -- > hhlib_environment = "/media/concha-eloko/Linux/conda_envs/HH-suite3"
# The glob patterns are quoted so that they reach the script unexpanded,
# whatever the globbing options of the shell.
python3 /media/concha-eloko/Linux/softwares/hh-suite/scripts/hhsuitedb.py \
-o /media/concha-eloko/Linux/depolymerase_project/DBsuite_depolymerase/depolymerase_db.suite \
--ihhm="/media/concha-eloko/Linux/depolymerase_project/clean_files/*.hmm" \
--ia3m="/media/concha-eloko/Linux/depolymerase_project/clean_files/*.a3m" \
--cpu=2 \
--force

# ***************************************************************************************************************************************************
# Build the database by hand :
# 1. Pack the a3m/ and hhm/ directories into ffdata/ffindex pairs
/usr/share/hhsuite/bin/ffindex_build pdb_full_a3m.ffdata pdb_full_a3m.ffindex a3m/
/usr/share/hhsuite/bin/ffindex_build pdb_full_hhm.ffdata pdb_full_hhm.ffindex hhm/
# 2. Sort both indexes with the C locale (byte order) into new files
LC_ALL=C sort pdb_full_hhm.ffindex > pdb_full_hhm.ffindex.simpleSort
LC_ALL=C sort pdb_full_a3m.ffindex > pdb_full_a3m.ffindex.simpleSort
# 3. Keep the unsorted indexes as .orig and expose the sorted ones under the
#    original names through symbolic links
mv pdb_full_a3m.ffindex pdb_full_a3m.ffindex.orig
mv pdb_full_hhm.ffindex pdb_full_hhm.ffindex.orig
ln -s pdb_full_a3m.ffindex.simpleSort pdb_full_a3m.ffindex
ln -s pdb_full_hhm.ffindex.simpleSort pdb_full_hhm.ffindex
# 4. Set OMP_NUM_THREADS to the number of available processors, then run
#    cstranslate on the a3m database to produce pdb_full_cs219
export OMP_NUM_THREADS=$(nproc)
/usr/share/hhsuite/bin/cstranslate  -A /usr/share/hhsuite/data/cs219.lib -D /usr/share/hhsuite/data/context_data.lib -x 0.3 -c 4 -f -i pdb_full_a3m -o pdb_full_cs219 -I a3m -b