# Literature Analysis

## Automated searches with Biopython

In [1]:
from Bio.Blast import NCBIXML 
from Bio.Blast import NCBIWWW
import requests, sys, json
import re
from Bio import SeqIO
from Bio import Entrez
from Bio import Medline
import pandas as pd
import numpy as np
from tqdm import tqdm
from Bio import SearchIO
from Bio.SwissProt import KeyWList
from Bio import SwissProt
from Bio import ExPASy
from Bio.SeqRecord import SeqRecord
from Bio.Align import MultipleSeqAlignment
from Bio.Seq import Seq
from Bio import AlignIO
from Bio.Align.Applications import ClustalwCommandline
from Bio.Align import AlignInfo
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio.Phylo.TreeConstruction import DistanceCalculator
from Bio.Phylo.TreeConstruction import DistanceTreeConstructor
from Bio import Phylo

Two possible strategies for searching for articles.
The first allows searching globally, the second is more targeted and involves a pre-selection of the articles.

In [88]:
#First Strategie

database = "PubMed"#For literature will be PubMed
word = "DDX18"
res= "5" #I choose a limit of thirty
email= "guilherme.lobo@ua.pt"

Entrez.email= email
handle=Entrez.esearch(db = database, term=word, retmax= res)
record=Entrez.read(handle)
handle.close()
idlist= record['IdList']

handle = Entrez.efetch(db=database, id=idlist, rettype="medline", retmode="text") #para este caso em especifico devemos ter como database a PubMed("Medline")
records = Medline.parse(handle)

for record in records:
    #print(record) 
    #print("PMID:",record.get("PMID","-"))
    print("title:", record.get("TI", "-"))
    #print('abstract:', record.get('AB', '-'))
    print("authors:", record.get("AU", "-"))
    print("source:", record.get("SO", "-"))
    print("")

title: Detection of Selection Signatures in Anqing Six-End-White Pigs Based on Resequencing Data.
authors: ['Chen Y', 'Wu X', 'Wang J', 'Hou Y', 'Liu Y', 'Wang B', 'Hu X', 'Zheng X', 'Zhang X', 'Ding Y', 'Yin Z']
source: Genes (Basel). 2022 Dec 8;13(12):2310. doi: 10.3390/genes13122310.

title: DDX18 prevents R-loop-induced DNA damage and genome instability via PARP-1.
authors: ['Lin WL', 'Chen JK', 'Wen X', 'He W', 'Zarceno GA', 'Chen Y', 'Chen S', 'Paull TT', 'Liu HW']
source: Cell Rep. 2022 Jul 19;40(3):111089. doi: 10.1016/j.celrep.2022.111089.

title: Identification of Important Modules and Hub Gene in Chronic Kidney Disease Based on WGCNA.
authors: ['Wang J', 'Yin Y', 'Lu Q', 'Zhao YR', 'Hu YJ', 'Hu YZ', 'Wang ZY']
source: J Immunol Res. 2022 May 4;2022:4615292. doi: 10.1155/2022/4615292. eCollection 2022.

title: The RNA-Binding Protein DDX18 Promotes Gastric Cancer by Affecting the Maturation of MicroRNA-21.
authors: ['Zhang Y', 'Yu F', 'Ni B', 'Li Q', 'Bae SW', 'Choi JH', 'Yan

In [89]:
#Same strategie but for type II diabetes

database = "PubMed"
word = "Type 2 Diabetes"
res= "5" 
email= "guilherme.lobo@ua.pt"

Entrez.email= email
handle=Entrez.esearch(db = database, term=word, retmax= res)
record=Entrez.read(handle)
handle.close()
idlist= record['IdList']

handle = Entrez.efetch(db=database, id=idlist, rettype="medline", retmode="text")
records = Medline.parse(handle)

for record in records:
    print("title:", record.get("TI", "-"))
    print("authors:", record.get("AU", "-"))
    print("source:", record.get("SO", "-"))
    print("")

title: Onchidium struma polysaccharides exhibit hypoglycemic activity and modulate the gut microbiota in mice with type 2 diabetes mellitus.
authors: ['Zhao Y', 'Song P', 'Yin S', 'Fan T', 'Li F', 'Ge X', 'Liu T', 'Xu W', 'Xu S', 'Chen L']
source: Food Funct. 2023 Jan 24. doi: 10.1039/d2fo02450k.

title: Yunvjian Improves Glucose and Insulin Function in Diabetic Rats by Regulating Gastric Emptying Function.
authors: ['Luo WY', 'Gao L', 'Zhao DD', 'Zhang L', 'Gao B', 'Lei G', 'Dong GT', 'Wei JP']
source: Evid Based Complement Alternat Med. 2023 Jan 14;2023:8551406. doi: 10.1155/2023/8551406. eCollection 2023.

title: The changing food environment and neighborhood prevalence of type 2 diabetes.
authors: ['Zick CD', 'Curtis DS', 'Meeks H', 'Smith KR', 'Brown BB', 'Kole K', 'Kowaleski-Jones L']
source: SSM Popul Health. 2023 Jan 10;21:101338. doi: 10.1016/j.ssmph.2023.101338. eCollection 2023 Mar.

title: Acute decompensation of patient following an outpatient CT-guided needle biopsy: A ca

In [3]:
#Second Strategie for myarticles_DDX18; articles used in literature analysis
alldata=[]
with open("Article_DDX18.txt",encoding="utf-8") as f: #For this examples I chosse articles from PubMed about Diabetes type II
    pmids= Medline.parse(f)
    for pmid in tqdm(pmids):
        try:
            pid=pmid["PMID"]
        except:
            pid="-"
        try:
            Title=pmid["TI"]
        except:
            Title="-"
        #try:        
            #Abstract=pmid["AB"]
        #except:
            #Abstract="-"
        #try:
            #Authors=pmid["AU"]
        #except:
            #Authors="-"
        #try:
            #Source=pmid["SO"]
        #except:
            #Source="-"
        
        dic={"PMID":pid,
             "Title":Title,
             #"Abstract":Abstract,
             #"Authors":Authors,
             #"Source":Source
            }
        alldata.append(dic)

df = pd.DataFrame.from_records(alldata,index=['1', '2',"3","4","5"])
print(df)
#writer = pd.ExcelWriter('articles_ddx18.xlsx')
#df.to_Exel(writer)
#writer.save()
#df=pd.DataFrame(alldata)
#df.to_csv("articles_ddx18.csv", index=False)

5it [00:00, 2717.93it/s]

       PMID                                              Title
1  33489896  The RNA-Binding Protein DDX18 Promotes Gastric...
2  18351129                  Cellular studies of MrDb (DDX18).
3  34603195  Non-Coding RNA as Biomarkers for Type 2 Diabet...
4  33479058  The First Genome-Wide Association Study for Ty...
5  30297969  Fine-mapping type 2 diabetes loci to single-va...





# Analysis of the sequence and features present in the NCBI

## DDX18

In [90]:
database ="nucleotide"
word = "ddx18 and homo sapiens and Chromosome 2 and not predicted and not unverified"
res= "30"
Entrez.email="guilherme.lobo@ua.pt"
handle=Entrez.esearch(db = database, term=word, retmax= res)
record=Entrez.read(handle)
handle.close()
idlist= record['IdList']

handle = Entrez.efetch(db=database, id=idlist, rettype="gb") 
records = list(SeqIO.parse(handle,"gb"))
handle.close()
for x in records:
    print(x.id,x.description)

NM_006773.4 Homo sapiens DEAD-box helicase 18 (DDX18), mRNA
AC009312.4 Homo sapiens BAC clone RP11-425F6 from 2, complete sequence
AC009404.5 Homo sapiens BAC clone RP11-28H22 from 2, complete sequence
CM000253.1 Homo sapiens chromosome 2, whole genome shotgun sequence
CH471103.1 Homo sapiens 211000035839809 genomic scaffold, whole genome shotgun sequence
DQ655976.2 Homo sapiens clone Affy08254D01, mRNA sequence
DQ655975.2 Homo sapiens clone Affy08248B12, mRNA sequence
NT_086328.2 Homo sapiens chromosome 2 sequence, ENCODE region ENr121


Taking into account the description given, we think that the id (**"NM_006773.4"**) would be the most appropriate and, as such, will be used from now on.

In [91]:
with open("nucleotide_ddx18.fasta","w") as f: 
    for rec in records:
        #print(rec)
        if rec.id == "NM_006773.4":
            print("Record sequence length: ", len(rec.seq))
            print("Record features count: ", len(rec.features))
            sequence_nucleotide_DDX18=rec.seq
            print( "Nuclleotide sequence:", str(rec.seq)) #useful for blastn
            f.write(f">{rec.id}\n{sequence_nucleotide_DDX18}\n\n") # only the sequence in the file
        
            cds = []
            for i,feature in enumerate(rec.features):
                if feature.type=="CDS":
                    cds.append(i)
                    for k in cds:
                        print("ddx18 gene synonyms:",rec.features[k].qualifiers["gene_synonym"])#can help latter in the search
                        print("name:",rec.features[k].qualifiers["product"])
                        protein_ID=rec.features[k].qualifiers["protein_id"]
                        sequence_protein=rec.features[k].qualifiers["translation"]#get the protein seq of my cds
                        print("Protein sequence:",sequence_protein)
            gene = []
            for j in range(len(rec.features)):
                if rec.features[j].type == "gene":
                    gene.append(j)
                    for g in gene:
                        print("Location:",rec.features[g].location)
                        print("Strand:",rec.features[g].location.strand)
                        print("Gene:",rec.features[g].qualifiers["gene"])
#the real important thing is the nucleotide sequence  and the protein sequence of cds

Record sequence length:  3753
Record features count:  27
Nuclleotide sequence: ACGTGCGGCCGGAAGGGAAGTAACGTCAGCCTGAGAACTGAGTAGCTGTACTGTGTGGCGCCTTATTCTAGGCACTTGTTGGGCAGAATGTCACACCTGCCGATGAAACTCCTGCGTAAGAAGATCGAGAAGCGGAACCTCAAATTGCGGCAGCGGAACCTAAAGTTTCAGGGGGCCTCAAATCTGACCCTATCGGAAACTCAAAATGGAGATGTATCTGAAGAAACAATGGGAAGTAGAAAGGTTAAAAAATCAAAACAAAAGCCCATGAATGTGGGCTTATCAGAAACTCAAAATGGAGGCATGTCTCAAGAAGCAGTGGGAAATATAAAAGTTACAAAGTCTCCCCAGAAATCCACTGTATTAACCAATGGAGAAGCAGCAATGCAGTCTTCCAATTCAGAATCAAAAAAGAAAAAGAAGAAAAAGAGAAAAATGGTGAATGATGCTGAGCCTGATACGAAAAAAGCAAAAACTGAAAACAAAGGGAAATCTGAAGAAGAAAGTGCCGAGACTACTAAAGAAACAGAAAATAATGTGGAGAAGCCAGATAATGATGAAGATGAGAGTGAGGTGCCCAGTCTGCCCCTGGGACTGACAGGAGCTTTTGAGGATACTTCGTTTGCTTCTCTATGTAATCTTGTCAATGAAAACACTCTGAAGGCAATAAAAGAAATGGGTTTTACAAACATGACTGAAATTCAGCATAAAAGTATCAGACCACTTCTGGAAGGCAGGGATCTTCTAGCAGCTGCAAAAACAGGCAGTGGTAAAACCCTGGCTTTTCTCATCCCTGCAGTTGAACTCATTGTTAAGTTAAGGTTCATGCCCAGGAATGGAACAGGAGTCCTTATTCTCTCACCTACTAGAGAACTAGCCATGCAAACCTTTGGTGTTCTTAAGGAGCTGATGACTCACCAC

In [92]:
#The correspondent information of NM_006773.4 was downloaded to a file
import os
Entrez.email = "guilherme.lobo@ua.pt"
filename = "NM_006773.4.gb"
if not os.path.isfile(filename):
    net_handle = Entrez.efetch(db="nucleotide", id="NM_006773.4", rettype="gb", retmode="text")
    out_handle = open(filename, "w")
    out_handle.write(net_handle.read())
    out_handle.close()
    net_handle.close()

In [142]:
record = SeqIO.read(open("NM_006773.4.gb"), format="genbank")
position=0
record_types={}
for x in record.features:
    record_types[x.type]=record_types.get(x.type,0)+1
for i in range(len(record.features)):
    if record.features[i].type == "CDS":
        position=i
print("The length of the sequence: {}\n".format(len(record.seq)))
print("Type of features: {}\n".format(record_types))
#print("Comment from NCBI: {}\n".format(record.annotations["comment"]))
print("Location of the CDS on the original sequence: {}".format(record.features[position].location))

The length of the sequence: 3753

Type of features: {'source': 1, 'gene': 1, 'exon': 14, 'misc_feature': 4, 'CDS': 1, 'regulatory': 3, 'polyA_site': 3}

Location of the CDS on the original sequence: [87:2100](+)


In [94]:
CDS_nuc_seq=record.seq[int(record.features[position].location.start):int(record.features[position].location.end)]

In [95]:
#saving the CDS_nucleotides and CD_aminoacid seqs in files
filename = "NM_006773.4.gb"
filename_CDS_nucl = "CDS_nucleot_DDX18_seq.fasta"
filename_CDS_prot = "CDS_prot_DDX18_seq.fasta"
input_handle  = open(filename, "r")
output_handle_nucl = open(filename_CDS_nucl, "w")
output_handle_prot = open(filename_CDS_prot, "w")
for seq_record in SeqIO.parse(input_handle, "genbank") :
    output_handle_nucl.write(">\n%s" % (CDS_nuc_seq))
    output_handle_prot.write(">\n%s" % ("".join(record.features[position].qualifiers['translation']))) #without join, output is a list
    
output_handle_prot.close()
output_handle_nucl.close()
input_handle.close()

# Homology analysis by BLAST or Diamond

## Blastn

In [96]:
record = SeqIO.read(open("nucleotide_ddx18.fasta"), format="fasta") 
print(len(record.seq))

3753


In [97]:
result_handle=NCBIWWW.qblast("blastn","nt",record.seq)
with open("blastn_DNA_ddx18_file.xml","w") as out_handle: 
    out_handle.write(result_handle.read())
result_handle.close()

In [98]:
Blast=open("blastn_DNA_ddx18_file.xml")
Blast_record=NCBIXML.parse(Blast)
for x in Blast_record:
    print(x.alignments[0])#best-one

gi|1519243738|ref|NM_006773.4| Homo sapiens DEAD-box helicase 18 (DDX18), mRNA
           Length = 3753



In [99]:
result_seq = open("blastn_DNA_ddx18_file.xml")
blast_results = NCBIXML.read(result_seq)
#print("Number of alignments:",len(blast_results.alignments))
for parameter in blast_results.alignments[0:5]:
    print()
    print('Accession: ', parameter.accession)
    print('Definition: ', parameter.hit_def)    
#     for e_v in parameter.hsps:
#         print('E-value: ', e_v.expect)


Accession:  NM_006773
Definition:  Homo sapiens DEAD-box helicase 18 (DDX18), mRNA

Accession:  BC024739
Definition:  Homo sapiens DEAD (Asp-Glu-Ala-Asp) box polypeptide 18, mRNA (cDNA clone MGC:29902 IMAGE:4995871), complete cds

Accession:  XM_003819127
Definition:  PREDICTED: Pan paniscus DEAD-box helicase 18 (DDX18), mRNA

Accession:  XM_515753
Definition:  PREDICTED: Pan troglodytes DEAD-box helicase 18 (DDX18), mRNA

Accession:  XM_004031662
Definition:  PREDICTED: Gorilla gorilla gorilla DEAD-box helicase 18 (DDX18), mRNA


In [100]:
E_VALUE_THRESH = 0.001
results_Blast= open('blastn_DNA_ddx18_file.xml')
blast_records = NCBIXML.read(results_Blast)
for alignment in  blast_records.alignments:
    print (alignment)
    for hsp in alignment.hsps:
        if hsp.expect < E_VALUE_THRESH:
            print('****Alignment****')
            print('sequence: ', alignment.title)
            print('lenght:', alignment.length)
            print(hsp.query[0:75] + '...')
            print(hsp.match[0:75] + '...')
            print(hsp.sbjct[0:75] + '...')
            print()
            

gi|1519243738|ref|NM_006773.4| Homo sapiens DEAD-box helicase 18 (DDX18), mRNA
           Length = 3753

****Alignment****
sequence:  gi|1519243738|ref|NM_006773.4| Homo sapiens DEAD-box helicase 18 (DDX18), mRNA
lenght: 3753
ACGTGCGGCCGGAAGGGAAGTAACGTCAGCCTGAGAACTGAGTAGCTGTACTGTGTGGCGCCTTATTCTAGGCAC...
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||...
ACGTGCGGCCGGAAGGGAAGTAACGTCAGCCTGAGAACTGAGTAGCTGTACTGTGTGGCGCCTTATTCTAGGCAC...

gi|19353238|gb|BC024739.1| Homo sapiens DEAD (Asp-Glu-Ala-Asp) box polypeptide 18, mRNA (cDNA clone MGC:29902 IMAGE:4995871), complete cds
           Length = 3764

****Alignment****
sequence:  gi|19353238|gb|BC024739.1| Homo sapiens DEAD (Asp-Glu-Ala-Asp) box polypeptide 18, mRNA (cDNA clone MGC:29902 IMAGE:4995871), complete cds
lenght: 3764
GGGAAGTAACGTCAGCCTGAGAACTGAGTAGCTGTACTGTGTGGCGCCTTATTCTAGGCACTTGTTGGGCAGAAT...
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||...
GGGAAGTAACGTCAGCCTGAGAACTGAGTAGC

In [101]:
#filtering the "predicted" alignments
results_Blastn= open('blastn_DNA_ddx18_file.xml')
blastn_records = NCBIXML.read(results_Blastn)
E_VALUE_THRESH = 0.001
count_preditc, count_homo=0,0
list_filtered_alignments=[]
for alignment in  blastn_records.alignments:
    for hsp in alignment.hsps:
        if hsp.expect < E_VALUE_THRESH:
            exist = re.search(r'PREDICTED:\s', alignment.title)
            if exist:
                pre = re.match(r'PREDICTED:\s', exist[0] )
                if pre:
                    count_preditc+=1
            else:
                list_filtered_alignments.append(alignment.accession)
                homo=re.search(r'Homo\ssapiens',alignment.title)
                if homo:
                    count_homo+=1
print(list_filtered_alignments)
print()
print('Total {} PREDICTED seqs found and remaining {} ids are from Homo sapiens'.format(count_preditc, count_homo))

['NM_006773', 'BC024739', 'NM_001132808', 'NM_001132808', 'NM_001132808', 'AK091227', 'AK001467', 'BC003360', 'BC001238', 'X98743', 'X98743', 'NG_008704', 'NG_008704', 'NG_008704', 'AL365434', 'AL365434', 'AL365434', 'LT744377', 'KJ897930', 'AB209392', 'AB209392', 'AB209392', 'CP034492', 'CP034492', 'NG_002480', 'NG_002480', 'AL391262', 'AL391262', 'NG_008706', 'AL138725']

Total 67 PREDICTED seqs found and remaining 18 ids are from Homo sapiens


In [102]:
with open('DDX18_nucleotide_result_blast.txt', 'w') as f:
    for line in list_filtered_alignments:
        f.write(f"{line}\n")

# Blastp

## DDX18

In [128]:
record_protein = SeqIO.read(open("CDS_prot_DDX18_seq.fasta"), format="fasta") 
print(len(record_protein.seq))

670


In [18]:
result_handle=NCBIWWW.qblast("blastp","swissprot",record_protein.seq)#blast with the protein translate of my cds
with open("blastp_file_ddx18.xml","w") as out_handle:
    out_handle.write(result_handle.read())
result_handle.close()

In [7]:
result_seq = open("blastp_file_ddx18.xml")
blast_results = NCBIXML.read(result_seq)
#print("Number of alignments:",len(blast_results.alignments))
for parameter in blast_results.alignments[0:5]:
    print()
    print('Accession: ', parameter.accession)
    print('Definition: ', parameter.hit_def)    
    for e_v in parameter.hsps:
        print('E-value: ', e_v.expect)


Accession:  Q9NVP1
Definition:  RecName: Full=ATP-dependent RNA helicase DDX18; AltName: Full=DEAD box protein 18; AltName: Full=Myc-regulated DEAD box protein; Short=MrDb [Homo sapiens]
E-value:  0.0

Accession:  Q8K363
Definition:  RecName: Full=ATP-dependent RNA helicase DDX18; AltName: Full=DEAD box protein 18 [Mus musculus]
E-value:  0.0

Accession:  Q9VD51
Definition:  RecName: Full=Probable ATP-dependent RNA helicase pitchoune [Drosophila melanogaster]
E-value:  0.0

Accession:  Q1EA54
Definition:  RecName: Full=ATP-dependent RNA helicase HAS1 [Coccidioides immitis RS]
E-value:  0.0

Accession:  A4R8B5
Definition:  RecName: Full=ATP-dependent RNA helicase HAS1 [Pyricularia oryzae 70-15]
E-value:  0.0


In [8]:
results_Blastp= open('blastp_file_ddx18.xml')
blastp_records = NCBIXML.read(results_Blastp)
E_VALUE_THRESH = 0.001
list_filtered_alignments,list_species=[],[]
for alignment in  blastp_records.alignments:
    for hsp in alignment.hsps:
#         print(hsp.identities)    # maybe add more 
        if hsp.expect < E_VALUE_THRESH:
            list_filtered_alignments.append(alignment.accession)
            title_organism=re.search(r'\[.+\s.+\]', alignment.title)
            if title_organism:
                m = re.match(r'\[.+\s.+\]', title_organism[0] )
                specie = m.group(0)
                #print(specie)
                list_species.append(specie)
for x in sorted(set(list_species)):
    print("number of times: {} that appeared specie: {}".format(list_species.count(x),x))

print(list_filtered_alignments)

number of times: 2 that appeared specie: [Arabidopsis thaliana]
number of times: 2 that appeared specie: [Aspergillus clavatus NRRL 1]
number of times: 2 that appeared specie: [Aspergillus fischeri NRRL 181]
number of times: 2 that appeared specie: [Aspergillus fumigatus Af293]
number of times: 2 that appeared specie: [Aspergillus nidulans FGSC A4]
number of times: 2 that appeared specie: [Aspergillus niger CBS 513.88]
number of times: 2 that appeared specie: [Aspergillus oryzae RIB40]
number of times: 2 that appeared specie: [Aspergillus terreus NIH2624]
number of times: 1 that appeared specie: [Candida albicans SC5314]
number of times: 2 that appeared specie: [Chaetomium globosum CBS 148.51]
number of times: 2 that appeared specie: [Coccidioides immitis RS]
number of times: 1 that appeared specie: [Cryptococcus neoformans var. neoformans JEC21] >sp|P0CQ85.1| RecName: Full=ATP-dependent RNA helicase HAS1 [Cryptococcus neoformans var. neoformans B-3501A]
number of times: 1 that appeare

In [106]:
# with open('blastp_files_ID_ddx18.txt', 'w') as f:
#     for line in list_filtered_alignments:
#         f.write(f"{line}\n")

In [131]:
Blast=open("blastp_file_ddx18.xml")
Blast_record=NCBIXML.parse(Blast)
for x in Blast_record:
    print(x.database)
    print(x.matrix)
    print(x.gap_penalties)
    print(len(x.alignments))

swissprot
BLOSUM62
(11, 1)
50


In [108]:
result_handle=open("blastp_file_ddx18.xml")
blast_record=NCBIXML.read(result_handle)
E_VALUE_THRESH=0.01
for alignment in blast_record.alignments:
    for hsp in alignment.hsps:
        if hsp.expect<E_VALUE_THRESH:
            print("----Aligment----")
            print("sequence:",alignment.title)
            print("lenght:",alignment.length)
            print("e.value:",hsp.expect) 
            print(hsp.query[0:75] + "...")
            print(hsp.query[0:75] + "...")
            print(hsp.sbjct[0:75] + "...")

----Aligment----
sequence: sp|Q9NVP1.2| RecName: Full=ATP-dependent RNA helicase DDX18; AltName: Full=DEAD box protein 18; AltName: Full=Myc-regulated DEAD box protein; Short=MrDb [Homo sapiens]
lenght: 670
e.value: 0.0
MSHLPMKLLRKKIEKRNLKLRQRNLKFQGASNLTLSETQNGDVSEETMGSRKVKKSKQKPMNVGLSETQNGGMSQ...
MSHLPMKLLRKKIEKRNLKLRQRNLKFQGASNLTLSETQNGDVSEETMGSRKVKKSKQKPMNVGLSETQNGGMSQ...
MSHLPMKLLRKKIEKRNLKLRQRNLKFQGASNLTLSETQNGDVSEETMGSRKVKKSKQKPMNVGLSETQNGGMSQ...
----Aligment----
sequence: sp|Q8K363.1| RecName: Full=ATP-dependent RNA helicase DDX18; AltName: Full=DEAD box protein 18 [Mus musculus]
lenght: 660
e.value: 0.0
MSHLPMKLLRKKIEKRNLKLRQRNLKFQGASNLTLSETQNGDVSEETMGSRKVKKSKQKPMNVGLSETQNGGMSQ...
MSHLPMKLLRKKIEKRNLKLRQRNLKFQGASNLTLSETQNGDVSEETMGSRKVKKSKQKPMNVGLSETQNGGMSQ...
MSQLQMKLLRRKIEKRNAKLRQRNLKLQETSDTSLSQPQNGDVPKETGKGGKVKKALKRSVPVDSAEAQSGGMPE...
----Aligment----
sequence: sp|Q9VD51.2| RecName: Full=Probable ATP-dependent RNA helicase pitchoune [Drosophila melanogaster]
lenght: 680
e.valu

In [132]:
filtragem= []
results_Blastp= open('blastp_file_ddx18.xml')
blastp_records_r = NCBIXML.read(results_Blastp)
first=blastp_records_r.alignments[0]
len_max_seq=first.hsps[0].align_length
print(len_max_seq)
results_Blastp= open('blastp_file_ddx18.xml')
blastp_records = NCBIXML.parse(results_Blastp)
list_accession,list_identities,list_coverage_start, list_coverage_stop,list_alignLen=[],[],[],[],[]
list_organism=[]
E_VALUE_THRESH = 0.001
for alignment in  blastp_records:
    for a in alignment.alignments:
        for b in a.hsps:
            if b.expect < E_VALUE_THRESH:
                title_organism=re.search(r'\[.+\s.+\]', a.title)
                if title_organism:
                    m = re.match(r'\[(.+\s.+)\]', title_organism[0] )
                    list_accession.append(a.accession)
                    list_coverage_start.append(b.sbjct_start)
                    list_coverage_stop.append(b.sbjct_end)
                    list_identities.append(b.identities)
                    list_alignLen.append(b.align_length)
                    list_organism.append(m.group(1))
            #print(a.accession, b.align_length, b.identities, b.expect, b.sbjct_start, b.sbjct_end)
            
#calculation of %identity                        
def identity(list_i, list_a):
    perc_list_identities=[]
    for index in range(len(list_i)):
        perc_list_identities.append(round(list_i[index]*100/list_a[index]))
    return perc_list_identities

#calculation of the %coverage - for all hsps of all accessions
def coverage(list_end,list_start):
    perc_list_coverage=[]
    for index in range(len(list_identities)):
        perc_list_coverage.append(round((1+list_end[index]-list_start[index])/len_max_seq*100))
    return perc_list_coverage

#selecting the max coverage and max identity per hsps/accession
def max_values_by_id(ids, covs, idents): 
    unique_ids = []
    max_coverage = []
    max_identity = []
    for id, cov, ident in zip(ids, covs,idents):
        if id in unique_ids:
            idx = unique_ids.index(id)
            max_coverage[idx] = max(max_coverage[idx], cov)
            max_identity[idx] = max(max_identity[idx], ident)
        else:
            unique_ids.append(id)
            max_coverage.append(cov)
            max_identity.append(ident)
    return unique_ids, max_coverage, max_identity

beta_max=max_values_by_id(list_accession, coverage(list_coverage_stop, list_coverage_start),identity(list_identities,list_alignLen))

thresold_coverage=60
thresold_identity=60
print('__Thresholds__')
print('coverage: {}%   |'.format(thresold_coverage))
print('identity: {}%   |'.format(thresold_identity))
print('e-value: {}% |\n'.format(E_VALUE_THRESH))
print('{:>5}{:>14}{:>11}{:>35}'.format('ID','%coverage','%identity', 'Organism'))
count=0
filtragem= []
for id, cov, ident, specie in zip(beta_max[0], beta_max[1], beta_max[2], list_organism):
    if int(cov)>thresold_coverage and ident>thresold_identity:
        if int(cov)>100:
            print('|{:>5}|{:>9}*{:>9}{:>24}'.format(id, cov, ident, specie))
            count+=1
        else:
            print('|{:>5}|{:>9}{:>10}{:>40}'.format(id, cov, ident, specie))
        filtragem.append(f"{id} {specie}")
  
if count>0:
    print('\n* means that the subject sequence is longer than the query sequence')

670
__Thresholds__
coverage: 60%   |
identity: 60%   |
e-value: 0.001% |

   ID     %coverage  %identity                           Organism
|Q9NVP1|      100       100                            Homo sapiens
|Q8K363|       99        86                            Mus musculus
|Q9VD51|       68        79                 Drosophila melanogaster
|A2Q9T6|       75        62            Aspergillus niger CBS 513.88
|Q54S03|       70        65                Dictyostelium discoideum
|A3LNR6|       75        61       Scheffersomyces stipitis CBS 6054
|Q09916|       68        66         Schizosaccharomyces pombe 972h-
|Q6CXB7|       70        62        Kluyveromyces lactis NRRL Y-1140
|Q6BH93|       68        65            Debaryomyces hansenii CBS767
|Q6FIL3|       71        62              [Candida] glabrata CBS 138
|Q74Z73|       66        64        Eremothecium gossypii ATCC 10895
|A5DID7|       68        63     Meyerozyma guilliermondii ATCC 6260
|Q6C7D2|       67        62             Yarr

In [133]:
#teste
lista_sem_repetidos= []
seen = set()
for x in filtragem:
    parts = x.split()
    organi = parts[1] + '_' + parts[2]
    print(parts)
    print(organi)
    
    if organi in seen:
#         filtragem.remove(x)
        seen.add(organi)  
    else:
        lista_sem_repetidos.append(x)      
# print(lista_sem_repetidos)

lista_ids = []
with open('protein_result_blastp.txt', 'w') as f:
    for x in lista_sem_repetidos:
        IDS = x.split()
        lista_ids.append(IDS[0])
        f.write(f"{IDS[0]}\n")
# print(lista_ids)

['Q9NVP1', 'Homo', 'sapiens']
Homo_sapiens
['Q8K363', 'Mus', 'musculus']
Mus_musculus
['Q9VD51', 'Drosophila', 'melanogaster']
Drosophila_melanogaster
['A2Q9T6', 'Aspergillus', 'niger', 'CBS', '513.88']
Aspergillus_niger
['Q54S03', 'Dictyostelium', 'discoideum']
Dictyostelium_discoideum
['A3LNR6', 'Scheffersomyces', 'stipitis', 'CBS', '6054']
Scheffersomyces_stipitis
['Q09916', 'Schizosaccharomyces', 'pombe', '972h-']
Schizosaccharomyces_pombe
['Q6CXB7', 'Kluyveromyces', 'lactis', 'NRRL', 'Y-1140']
Kluyveromyces_lactis
['Q6BH93', 'Debaryomyces', 'hansenii', 'CBS767']
Debaryomyces_hansenii
['Q6FIL3', '[Candida]', 'glabrata', 'CBS', '138']
[Candida]_glabrata
['Q74Z73', 'Eremothecium', 'gossypii', 'ATCC', '10895']
Eremothecium_gossypii
['A5DID7', 'Meyerozyma', 'guilliermondii', 'ATCC', '6260']
Meyerozyma_guilliermondii
['Q6C7D2', 'Yarrowia', 'lipolytica', 'CLIB122']
Yarrowia_lipolytica
['Q4P6N0', 'Ustilago', 'maydis', '521']
Ustilago_maydis
['Q5AK59', 'Candida', 'albicans', 'SC5314']
Cand

In [111]:
# blast_qresult=SearchIO.read("blastp_file_ddx18.xml","blast-xml")
# blast_hsp=blast_qresult[0][0]
# print(blast_qresult[0][0])#best hsp
# print(blast_hsp.aln)

## Uniprot search of Blastp results

In [136]:
WEBSITE_API="https://rest.uniprot.org"
def get_url(url,**kwargs):
    response=requests.get(url,**kwargs);
    if not response.ok:
        print(response.text)
        response.raise_for_status()
        sys.exit()
    return response
r=get_url(f"{WEBSITE_API}/uniprotkb/search?query=ddx18 AND (taxonomy_id:9606) AND (reviewed:true)")#more detail possible
data=r.json()
n_results=len(data["results"])
print(f"Number of results: {n_results}\n")
total=r.headers.get("x-total-results")
page_total=len(data["results"])

Number of results: 2



In [113]:
# #one way to list the ids
# r=get_url(f"{WEBSITE_API}//uniprotkb/search?query=DDX18 AND (taxonomy_id:9606) AND (reviewed:true)&format=list&size=2")
# #r.text

# accessions=[r.text.replace("\n",",")]
# accessions

In [114]:
# #to format fasta that alow see the protein sequences and id in a form more familiary
# r=get_url(f"{WEBSITE_API}//uniprotkb/search?query=DDX18  AND (taxonomy_id:9606) AND (reviewed:true)&format=fasta")
# fasta=r.text
# print(fasta)

-we choose the first one the id Q9NVP1 

In [115]:
# with ExPASy.get_sprot_raw("Q9NVP1") as handle:
#     seq_record = SeqIO.read(handle, "swiss")
#     #print(seq_record.id)
#     #print(seq_record.entry_name, "\n")
#     #print(", ".join(seq_record.accessions), "\n")
#     #print(seq_record.keywords, "\n")
#     #print(seq_record.organism, "\n")
#     #print(len(seq_record.sequence), "aa", "\n")
#     print(seq_record.seq)
# #is not possible in my case!!!!!!!!!!!!!!!!!!!!1

We can't understand why the id of ddx18 doesn't work and the other genes already work

In [137]:
# print(lista_sem_repetidos)
a=", ".join(lista_sem_repetidos)
b=a.replace(" ","_")
c=b.split(",")
print(c)

['Q9NVP1_Homo_sapiens', '_Q8K363_Mus_musculus', '_Q9VD51_Drosophila_melanogaster', '_A2Q9T6_Aspergillus_niger_CBS_513.88', '_Q54S03_Dictyostelium_discoideum', '_A3LNR6_Scheffersomyces_stipitis_CBS_6054', '_Q09916_Schizosaccharomyces_pombe_972h-', '_Q6CXB7_Kluyveromyces_lactis_NRRL_Y-1140', '_Q6BH93_Debaryomyces_hansenii_CBS767', '_Q6FIL3_[Candida]_glabrata_CBS_138', '_Q74Z73_Eremothecium_gossypii_ATCC_10895', '_A5DID7_Meyerozyma_guilliermondii_ATCC_6260', '_Q6C7D2_Yarrowia_lipolytica_CLIB122', '_Q4P6N0_Ustilago_maydis_521', '_Q5AK59_Candida_albicans_SC5314']


In [138]:
fileUniprot= open("protein_result_blastp.txt", "r")
fields="sequence"
WEBSITE_API="https://rest.uniprot.org"
seqs=[]

for i in fileUniprot:
    r=get_url("{}/uniprotkb/search?query={} AND (reviewed:true)&fields={}&size=1&format=tsv".format(WEBSITE_API,i, fields))
    seqs.append(str(r.content))

with open('ddx18_blastp_file.fasta', 'w') as f:
    for index, seq in enumerate(seqs):
        existe = re.search(r'b\'Sequence\\n(.+?(?=\\n\'))', seq, re.DOTALL)
        if existe:
            m = re.match( r'b\'Sequence\\n(.+?(?=\\n\'))', seq, re.DOTALL )
            print(lista_sem_repetidos[index])
            f.write(f">{c[index]}\n{m.group(1)}\n\n")                    
            print(m.group(1))


Q9NVP1 Homo sapiens
MSHLPMKLLRKKIEKRNLKLRQRNLKFQGASNLTLSETQNGDVSEETMGSRKVKKSKQKPMNVGLSETQNGGMSQEAVGNIKVTKSPQKSTVLTNGEAAMQSSNSESKKKKKKKRKMVNDAEPDTKKAKTENKGKSEEESAETTKETENNVEKPDNDEDESEVPSLPLGLTGAFEDTSFASLCNLVNENTLKAIKEMGFTNMTEIQHKSIRPLLEGRDLLAAAKTGSGKTLAFLIPAVELIVKLRFMPRNGTGVLILSPTRELAMQTFGVLKELMTHHVHTYGLIMGGSNRSAEAQKLGNGINIIVATPGRLLDHMQNTPGFMYKNLQCLVIDEADRILDVGFEEELKQIIKLLPTRRQTMLFSATQTRKVEDLARISLKKEPLYVGVDDDKANATVDGLEQGYVVCPSEKRFLLLFTFLKKNRKKKLMVFFSSCMSVKYHYELLNYIDLPVLAIHGKQKQNKRTTTFFQFCNADSGTLLCTDVAARGLDIPEVDWIVQYDPPDDPKEYIHRVGRTARGLNGRGHALLILRPEELGFLRYLKQSKVPLSEFDFSWSKISDIQSQLEKLIEKNYFLHKSAQEAYKSYIRAYDSHSLKQIFNVNNLNLPQVALSFGFKVPPFVDLNVNSNEGKQKKRGGGGGFGYQKTKKVEKSKIFKHISKKSSDSRQFSH
Q8K363 Mus musculus
MSQLQMKLLRRKIEKRNAKLRQRNLKLQETSDTSLSQPQNGDVPKETGKGGKVKKALKRSVPVDSAEAQSGGMPEETLENGKVKKSPQKLTTLANGEAAPTPPPDSEVKKKKKKKRKMANDAGPDTKKAKTEESAEACEEPEDDVKKADDSEVPSLPLGLTGAFEDTSFASLSNLVNENTLKAIEEMGFKRMTEIQHKSIRPLLEGRDLLAAAKTGSGKTLAFLIPVIELIVKLKFMPRNGTGVLILSPTRELAMQTFGVLKELMTHHVHTYGLIMGGSNRSAEVQKLL

In [139]:
fileUniprot= open("protein_result_blastp.txt", "r")
fields=['accession','organism_name','protein_name','cc_function','cc_subcellular_location']
WEBSITE_API="https://rest.uniprot.org"

def get_field_for_id(ID_PROT, field): 
    response = get_url("{}/uniprotkb/search?query={}&fields={}&size=1&format=tsv".format(WEBSITE_API,ID_PROT,field))
    return str(response.content)

def get_info_uniprot(fileUniprot):
    results = []
    result = []
    tmp = []
    uniprot_final_list=[]
    
    for first_index in fileUniprot:
        tmp= []
        for field in fields:
            result = get_field_for_id(first_index, field)
            tmp.append(result)
        results.append(tmp)
    for ind in results:
        uniprot_list=[]
        for i in ind:
            entry = re.search(r'b\'Entry\\n(.+?(?=\\n\'))', str(i), re.DOTALL)
            organism = re.search(r'b\'Organism\\n(.+?(?=\\n\'))', str(i), re.DOTALL)
            protein = re.search(r'Protein names\\n(.+?(?=\\n))', str(i), re.DOTALL)
            function = re.search(r'Function \[CC\]\\n.{9} (.+?(?=\\n))', str(i), re.DOTALL )
            location_exist = re.search(r'b\'Subcellular location \[CC\]\\nSUBCELLULAR LOCATION: (.+?(?=\\n\'))', str(i), re.DOTALL )
            location_notexist = re.search( r'b\'Subcellular location \[CC\]\\n\\n\'',str(i), re.DOTALL )   
            if entry:
                uniprot_list.append(entry.group(1))
            if organism:
                uniprot_list.append(organism.group(1))
            if protein:
                uniprot_list.append(protein.group(1))
            if location_exist:
                uniprot_list.append(location_exist.group(1))
            if location_notexist:
                uniprot_list.append('N/A')
            if function:
                uniprot_list.append(function.group(1))
        uniprot_final_list.append(uniprot_list)
    return uniprot_final_list


In [140]:
get_info_uniprot(fileUniprot)

[['Q9NVP1',
  'Homo sapiens (Human)',
  'ATP-dependent RNA helicase DDX18 (EC 3.6.4.13) (DEAD box protein 18) (Myc-regulated DEAD box protein) (MrDb)',
  'Probable RNA-dependent helicase.',
  'Nucleus, nucleolus {ECO:0000269|PubMed:16963496}. Chromosome {ECO:0000269|PubMed:20813266}.'],
 ['Q8K363',
  'Mus musculus (Mouse)',
  'ATP-dependent RNA helicase DDX18 (EC 3.6.4.13) (DEAD box protein 18)',
  'Probable RNA-dependent helicase.',
  'Nucleus, nucleolus {ECO:0000250|UniProtKB:Q9NVP1}. Chromosome {ECO:0000250|UniProtKB:Q9NVP1}.'],
 ['Q9VD51',
  'Drosophila melanogaster (Fruit fly)',
  'Probable ATP-dependent RNA helicase pitchoune (EC 3.6.4.13)',
  'Probable RNA-dependent helicase. Functions in cell growth and proliferation. May have a role in ribosome biogenesis and, consequently, in protein biosynthesis. {ECO:0000269|PubMed:9716523}.',
  'Nucleus, nucleolus {ECO:0000269|PubMed:9716523}.'],
 ['A2Q9T6',
  'Aspergillus niger (strain CBS 513.88 / FGSC A1513)',
  'ATP-dependent RNA helic

In [141]:
fileUniprot= open("protein_result_blastp.txt", "r")
allInfo=get_info_uniprot(fileUniprot)
for index in allInfo:
    print('Id: {}\nOrganism: {}\nProtein name: {}\nSubcelular location: {}\nFunction:  {}\n\n'.format(index[0], index[1], index[2], index[4], index[3]))
    

Id: Q9NVP1
Organism: Homo sapiens (Human)
Protein name: ATP-dependent RNA helicase DDX18 (EC 3.6.4.13) (DEAD box protein 18) (Myc-regulated DEAD box protein) (MrDb)
Subcelular location: Nucleus, nucleolus {ECO:0000269|PubMed:16963496}. Chromosome {ECO:0000269|PubMed:20813266}.
Function:  Probable RNA-dependent helicase.


Id: Q8K363
Organism: Mus musculus (Mouse)
Protein name: ATP-dependent RNA helicase DDX18 (EC 3.6.4.13) (DEAD box protein 18)
Subcelular location: Nucleus, nucleolus {ECO:0000250|UniProtKB:Q9NVP1}. Chromosome {ECO:0000250|UniProtKB:Q9NVP1}.
Function:  Probable RNA-dependent helicase.


Id: Q9VD51
Organism: Drosophila melanogaster (Fruit fly)
Protein name: Probable ATP-dependent RNA helicase pitchoune (EC 3.6.4.13)
Subcelular location: Nucleus, nucleolus {ECO:0000269|PubMed:9716523}.
Function:  Probable RNA-dependent helicase. Functions in cell growth and proliferation. May have a role in ribosome biogenesis and, consequently, in protein biosynthesis. {ECO:0000269|PubMe

In [116]:
# results_Blastp= open('blastp_file_ddx18.xml')
# blastp_records = NCBIXML.read(results_Blastp)
# titles_list=[]
# for alignment in  blastp_records.alignments:   
#     titles_list.append(alignment.title)

# file= open("protein_result_blastp.txt", "r")
# fields="sequence"
# WEBSITE_API="https://rest.uniprot.org"
# seqs=[]

# for i in file:
#     r=get_url("{}/uniprotkb/search?query={} AND (reviewed:true)&fields={}&size=1&format=tsv".format(WEBSITE_API,i, fields))
#     seqs.append(str(r.content))

# with open('ddx18_blastp_file.fasta', 'w') as f:
#     for index, seq in enumerate(seqs):
#         existe = re.search(r'b\'Sequence\\n(.+?(?=\\n\'))', seq, re.DOTALL)
#         if existe:
#             m = re.match( r'b\'Sequence\\n(.+?(?=\\n\'))', seq, re.DOTALL )
#             f.write(f">{titles_list[index]}\n{m.group(1)}\n\n")                    
# #             print(m.group(1))      

In [117]:
# file= open("blastp_files_ID_ddx18.txt", "r")
# fields="accession,organism_name,protein_name,cc_subcellular_location,cc_function"
# WEBSITE_API="https://rest.uniprot.org"
# with open('uniprot_result_CDS_filtered.txt', 'w',encoding='utf-8') as f:
#     for i in file:
#         r=get_url("{}/uniprotkb/search?query={} AND (reviewed:true)&fields={}&size=1&format=tsv".format(WEBSITE_API,i, fields))
#         print(r.text)
#         f.write(r.text)
#         f.write('\n')

Entry	Organism	Protein names	Subcellular location [CC]	Function [CC]
Q9NVP1	Homo sapiens (Human)	ATP-dependent RNA helicase DDX18 (EC 3.6.4.13) (DEAD box protein 18) (Myc-regulated DEAD box protein) (MrDb)	SUBCELLULAR LOCATION: Nucleus, nucleolus {ECO:0000269|PubMed:16963496}. Chromosome {ECO:0000269|PubMed:20813266}.	FUNCTION: Probable RNA-dependent helicase.

Entry	Organism	Protein names	Subcellular location [CC]	Function [CC]
Q8K363	Mus musculus (Mouse)	ATP-dependent RNA helicase DDX18 (EC 3.6.4.13) (DEAD box protein 18)	SUBCELLULAR LOCATION: Nucleus, nucleolus {ECO:0000250|UniProtKB:Q9NVP1}. Chromosome {ECO:0000250|UniProtKB:Q9NVP1}.	FUNCTION: Probable RNA-dependent helicase.

Entry	Organism	Protein names	Subcellular location [CC]	Function [CC]
Q9VD51	Drosophila melanogaster (Fruit fly)	Probable ATP-dependent RNA helicase pitchoune (EC 3.6.4.13)	SUBCELLULAR LOCATION: Nucleus, nucleolus {ECO:0000269|PubMed:9716523}.	FUNCTION: Probable RNA-dependent helicase. Functions in cell growth

Entry	Organism	Protein names	Subcellular location [CC]	Function [CC]
Q6CXB7	Kluyveromyces lactis (strain ATCC 8585 / CBS 2359 / DSM 70799 / NBRC 1267 / NRRL Y-1140 / WM37) (Yeast) (Candida sphaerica)	ATP-dependent RNA helicase HAS1 (EC 3.6.4.13)	SUBCELLULAR LOCATION: Nucleus, nucleolus {ECO:0000250}.	FUNCTION: ATP-dependent RNA helicase involved in 40S ribosomal subunit biogenesis. Required for the processing and cleavage of 35S pre-rRNA at sites A0, A1, and A2, leading to mature 18S rRNA.

Entry	Organism	Protein names	Subcellular location [CC]	Function [CC]
Q6BH93	Debaryomyces hansenii (strain ATCC 36239 / CBS 767 / BCRC 21394 / JCM 1990 / NBRC 0083 / IGC 2968) (Yeast) (Torulaspora hansenii)	ATP-dependent RNA helicase HAS1 (EC 3.6.4.13)	SUBCELLULAR LOCATION: Nucleus, nucleolus {ECO:0000250}.	FUNCTION: ATP-dependent RNA helicase involved in 40S ribosomal subunit biogenesis. Required for the processing and cleavage of 35S pre-rRNA at sites A0, A1, and A2, leading to mature 18S rRNA.

En

Entry	Organism	Protein names	Subcellular location [CC]	Function [CC]
Q0CMM5	Aspergillus terreus (strain NIH 2624 / FGSC A1156)	ATP-dependent RNA helicase dbp4 (EC 3.6.4.13)	SUBCELLULAR LOCATION: Nucleus, nucleolus {ECO:0000250}.	FUNCTION: ATP-dependent RNA helicase required for ribosome biogenesis. Involved in the release of U14 snoRNA in pre-ribosomal complexes. Required for pre-rRNA cleavage at site A2 (By similarity). {ECO:0000250}.

Entry	Organism	Protein names	Subcellular location [CC]	Function [CC]
Q4WM60	Neosartorya fumigata (strain ATCC MYA-4609 / Af293 / CBS 101355 / FGSC A1100) (Aspergillus fumigatus)	ATP-dependent RNA helicase dbp4 (EC 3.6.4.13)	SUBCELLULAR LOCATION: Nucleus, nucleolus {ECO:0000250}.	FUNCTION: ATP-dependent RNA helicase required for ribosome biogenesis. Involved in the release of U14 snoRNA in pre-ribosomal complexes. Required for pre-rRNA cleavage at site A2 (By similarity). {ECO:0000250}.

Entry	Organism	Protein names	Subcellular location [CC]	Function [CC

# Alignment and Phylo

In [49]:

dir =r'C:\Program files (x86)\ClustalW2\clustalw2'
in_file =r'ddx18_blastp_file.fasta'

clustalw_cline = ClustalwCommandline(dir, infile=in_file)
clustalw_cline()
print(clustalw_cline)


"C:\Program files (x86)\ClustalW2\clustalw2" -infile=ddx18_blastp_file.fasta


In [50]:
cline = ClustalwCommandline("clustalw", infile="ddx18_blastp_file.fasta", outfile="ddx18_blastp_file.aln")
cline

ClustalwCommandline(cmd='clustalw', infile='ddx18_blastp_file.fasta', outfile='ddx18_blastp_file.aln')

In [51]:

align = AlignIO.read("ddx18_blastp_file.aln", "clustal")

print(format(align, "clustal")) #formato string

CLUSTAL 2.1 multiple sequence alignment


_Q6BH93_Debaryomyces_hansenii_      ----------------------MAVSSKSKTGRSPKPAKVQPSKK-----
_A5DID7_Meyerozyma_guilliermon      --------------------MSKGVKSKTRSESKPKAHKKQSDTN-----
_A3LNR6_Scheffersomyces_stipit      --------------------MAKNVNSKAN------GKKSGSVKR-----
_Q5AK59_Candida_albicans_SC531      --------------------MAKTTKVKGN------KKKSDTSKV-----
_Q6C7D2_Yarrowia_lipolytica_CL      --------------------MGLTKKEKVEK----KARSVVKGKA-----
_Q74Z73_Eremothecium_gossypii_      --------------------MG--------------AQETREDAS-----
_Q6CXB7_Kluyveromyces_lactis_N      -------------------------------------------MA-----
_Q6FIL3_[Candida]_glabrata_CBS      -------------------------------------------MA-----
_A2Q9T6_Aspergillus_niger_CBS_      --------------MPIPVDTAKSINKKRKRKHGARAATDADDASPKPAV
_Q4P6N0_Ustilago_maydis_521         ----------------------------------MAPHKSDDVDA-----
_Q09916_Schizosaccharomyces_po      --------------------MAKSELKRKKHQSGNEEVKEKRQKP-----
_

In [52]:

align = AlignIO.read("ddx18_blastp_file.aln", "clustal")
print(align)
print("Número de linhas: %i" % len(align)) 
print(format(align, "clustal"))

for record in align:
    print("%s - %s" % (record.seq, record.id)) 

Alignment with 15 rows and 718 columns
----------------------MAVSSKSKTGRSPKPAKVQPSK...--- _Q6BH93_Debaryomyces_hansenii_
--------------------MSKGVKSKTRSESKPKAHKKQSDT...--- _A5DID7_Meyerozyma_guilliermon
--------------------MAKNVNSKAN------GKKSGSVK...--- _A3LNR6_Scheffersomyces_stipit
--------------------MAKTTKVKGN------KKKSDTSK...--- _Q5AK59_Candida_albicans_SC531
--------------------MGLTKKEKVEK----KARSVVKGK...--- _Q6C7D2_Yarrowia_lipolytica_CL
--------------------MG--------------AQETREDA...--- _Q74Z73_Eremothecium_gossypii_
-------------------------------------------M...--- _Q6CXB7_Kluyveromyces_lactis_N
-------------------------------------------M...--- _Q6FIL3_[Candida]_glabrata_CBS
--------------MPIPVDTAKSINKKRKRKHGARAATDADDA...--- _A2Q9T6_Aspergillus_niger_CBS_
----------------------------------MAPHKSDDVD...WSR _Q4P6N0_Ustilago_maydis_521
--------------------MAKSELKRKKHQSGNEEVKEKRQK...--- _Q09916_Schizosaccharomyces_po
-------------------MVSTKVKPTTTTTPTTTVNKTTQPT...--- _Q54S03_Dic

In [53]:
count = AlignIO.convert("ddx18_blastp_file.aln", "clustal","ddx18_blastp_file.sth", "stockholm") 
print ("Convertidos %i alinhamentos" % count )

Convertidos 1 alinhamentos


In [54]:
alignment = AlignIO.read("ddx18_blastp_file.sth", "stockholm")

In [55]:
calculator = DistanceCalculator('blosum62')
dm = calculator.get_distance(alignment)
print(dm)

_Q6BH93_Debaryomyces_hansenii_	0
_A5DID7_Meyerozyma_guilliermon	0.16211545108883507	0
_A3LNR6_Scheffersomyces_stipit	0.15108164689462666	0.17280945757997224	0
_Q5AK59_Candida_albicans_SC531	0.20849557522123896	0.23643410852713176	0.20836261419536195	0
_Q6C7D2_Yarrowia_lipolytica_CL	0.258222533240028	0.27617391304347827	0.2833333333333333	0.31069669247009146	0
_Q74Z73_Eremothecium_gossypii_	0.18524332810047095	0.20823620823620825	0.20195312499999996	0.20740445859872614	0.23610023492560694	0
_Q6CXB7_Kluyveromyces_lactis_N	0.17411950929956466	0.18493690851735012	0.18663503361012257	0.19847020933977455	0.23571428571428577	0.13038229376257549	0
_Q6FIL3_[Candida]_glabrata_CBS	0.1633386581469649	0.18037346046881209	0.17543859649122806	0.18613138686131392	0.22346368715083798	0.13758116883116878	0.09500998003992012	0
_A2Q9T6_Aspergillus_niger_CBS_	0.34607438016528924	0.34444826400825024	0.34772099447513816	0.3715885234429671	0.37091888253113425	0.26679611650485435	0.2661417322834646	0.259611573

In [56]:
constructor = DistanceTreeConstructor() 
upgmatree = constructor.upgma(dm)
print(upgmatree)

Tree(rooted=True)
    Clade(branch_length=0, name='Inner14')
        Clade(name='Inner12')
            Clade(name='_Q9VD51_Drosophila_melanogaste')
            Clade(name='Inner2')
                Clade(name='_Q8K363_Mus_musculus')
                Clade(name='Q9NVP1_Homo_sapiens')
        Clade(name='Inner13')
            Clade(name='_Q54S03_Dictyostelium_discoide')
            Clade(name='Inner11')
                Clade(name='_A2Q9T6_Aspergillus_niger_CBS_')
                Clade(name='Inner10')
                    Clade(name='_Q09916_Schizosaccharomyces_po')
                    Clade(name='Inner9')
                        Clade(name='_Q4P6N0_Ustilago_maydis_521')
                        Clade(name='Inner8')
                            Clade(name='_Q6C7D2_Yarrowia_lipolytica_CL')
                            Clade(name='Inner7')
                                Clade(name='_Q5AK59_Candida_albicans_SC531')
                                Clade(name='Inner6')
                             

In [57]:
njtree = constructor.nj(dm)
print(njtree)

Tree(rooted=False)
    Clade(branch_length=0, name='Inner13')
        Clade(name='Inner12')
            Clade(name='Inner11')
                Clade(name='Inner10')
                    Clade(name='_Q6FIL3_[Candida]_glabrata_CBS')
                    Clade(name='_Q6CXB7_Kluyveromyces_lactis_N')
                Clade(name='Inner6')
                    Clade(name='Inner5')
                        Clade(name='Inner4')
                            Clade(name='_A3LNR6_Scheffersomyces_stipit')
                            Clade(name='_Q6BH93_Debaryomyces_hansenii_')
                        Clade(name='_A5DID7_Meyerozyma_guilliermon')
                    Clade(name='_Q5AK59_Candida_albicans_SC531')
            Clade(name='_Q6C7D2_Yarrowia_lipolytica_CL')
        Clade(name='_Q74Z73_Eremothecium_gossypii_')
        Clade(name='Inner9')
            Clade(name='Inner8')
                Clade(name='_Q09916_Schizosaccharomyces_po')
                Clade(name='Inner7')
                    Clade(name='I

In [58]:
Phylo.write([upgmatree, njtree],"phylotree_DDX18.nhx","newick")

2

In [59]:
Phylo.draw_ascii(njtree)

    _____ _Q6FIL3_[Candida]_glabrata_CBS
  _|
 | |_____ _Q6CXB7_Kluyveromyces_lactis_N
 |
 |      __________ _A3LNR6_Scheffersomyces_stipit
 |     |
 |   __|_________ _Q6BH93_Debaryomyces_hansenii_
 |  |  |
 |__|  |___________ _A5DID7_Meyerozyma_guilliermon
 |  |
 |  |_______________ _Q5AK59_Candida_albicans_SC531
 |
 |_____________________ _Q6C7D2_Yarrowia_lipolytica_CL
 |
 |________ _Q74Z73_Eremothecium_gossypii_
_|
 |   ____________________ _Q09916_Schizosaccharomyces_po
 |  |
 |  |             ___________________________ _Q9VD51_Drosophila_melanogaste
 | ,|     _______|
 | ||    |       |                   _______ _Q8K363_Mus_musculus
 | || ___|       |__________________|
 | |||   |                          |________ Q9NVP1_Homo_sapiens
 |_|||   |
   | |   |____________________________ _Q54S03_Dictyostelium_discoide
   | |
   | |____________________ _Q4P6N0_Ustilago_maydis_521
   |
   |_________________________ _A2Q9T6_Aspergillus_niger_CBS_



In [60]:
Phylo.draw_ascii(upgmatree)

        ___________________________________ _Q9VD51_Drosophila_melanogaste
  _____|
 |     |                         __________ _Q8K363_Mus_musculus
 |     |________________________|
_|                              |__________ Q9NVP1_Homo_sapiens
 |
 |   ______________________________________ _Q54S03_Dictyostelium_discoide
 |  |
 |__|        ______________________________ _A2Q9T6_Aspergillus_niger_CBS_
    |       |
    |_______|  ____________________________ _Q09916_Schizosaccharomyces_po
            | |
            |_| ___________________________ _Q4P6N0_Ustilago_maydis_521
              ||
              ||   ________________________ _Q6C7D2_Yarrowia_lipolytica_CL
               |  |
               |__|      __________________ _Q5AK59_Candida_albicans_SC531
                  |     |
                  |     |             ________ _Q6FIL3_[Candida]_glabrata_CBS
                  |_____|         ___|
                        |    ____|   |________ _Q6CXB7_Kluyveromyces_lactis_N
         