# Literature Analysis

## Automated searches with Biopython

In [1]:
from Bio.Blast import NCBIXML 
from Bio.Blast import NCBIWWW
import requests, sys, json
import re
from Bio import SeqIO
from Bio import Entrez
from Bio import Medline
import pandas as pd
import numpy as np
from tqdm import tqdm
from Bio import SearchIO
from Bio.SwissProt import KeyWList
from Bio import SwissProt
from Bio import ExPASy
from Bio.SeqRecord import SeqRecord
from Bio.Align import MultipleSeqAlignment
from Bio.Seq import Seq
from Bio import AlignIO
from Bio.Align import AlignInfo
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio.Phylo.TreeConstruction import DistanceCalculator
from Bio.Phylo.TreeConstruction import DistanceTreeConstructor
from Bio import Phylo

Two possible strategies for searching for articles.
The first allows searching globally, the second is more targeted and involves a pre-selection of the articles.

In [2]:
#First Strategie

database = "PubMed"#For literature will be PubMed
word = "DDX18"
res= "5" #I choose a limit of thirty
email= "guilherme.lobo@ua.pt"

Entrez.email= email
handle=Entrez.esearch(db = database, term=word, retmax= res)
record=Entrez.read(handle)
handle.close()
idlist= record['IdList']

handle = Entrez.efetch(db=database, id=idlist, rettype="medline", retmode="text") #para este caso em especifico devemos ter como database a PubMed("Medline")
records = Medline.parse(handle)

for record in records:
    #print(record) 
    #print("PMID:",record.get("PMID","-"))
    print("title:", record.get("TI", "-"))
    #print('abstract:', record.get('AB', '-'))
    print("authors:", record.get("AU", "-"))
    print("source:", record.get("SO", "-"))
    print("")

title: Detection of Selection Signatures in Anqing Six-End-White Pigs Based on Resequencing Data.
authors: ['Chen Y', 'Wu X', 'Wang J', 'Hou Y', 'Liu Y', 'Wang B', 'Hu X', 'Zheng X', 'Zhang X', 'Ding Y', 'Yin Z']
source: Genes (Basel). 2022 Dec 8;13(12):2310. doi: 10.3390/genes13122310.

title: DDX18 prevents R-loop-induced DNA damage and genome instability via PARP-1.
authors: ['Lin WL', 'Chen JK', 'Wen X', 'He W', 'Zarceno GA', 'Chen Y', 'Chen S', 'Paull TT', 'Liu HW']
source: Cell Rep. 2022 Jul 19;40(3):111089. doi: 10.1016/j.celrep.2022.111089.

title: Identification of Important Modules and Hub Gene in Chronic Kidney Disease Based on WGCNA.
authors: ['Wang J', 'Yin Y', 'Lu Q', 'Zhao YR', 'Hu YJ', 'Hu YZ', 'Wang ZY']
source: J Immunol Res. 2022 May 4;2022:4615292. doi: 10.1155/2022/4615292. eCollection 2022.

title: The RNA-Binding Protein DDX18 Promotes Gastric Cancer by Affecting the Maturation of MicroRNA-21.
authors: ['Zhang Y', 'Yu F', 'Ni B', 'Li Q', 'Bae SW', 'Choi JH', 'Yan

In [4]:
#Same strategie but for type II diabetes

database = "PubMed"#For literature will be PubMed
word = "Type 2 Diabetes"
res= "5" #I choose a limit of thirty
email= "guilherme.lobo@ua.pt"

Entrez.email= email
handle=Entrez.esearch(db = database, term=word, retmax= res)
record=Entrez.read(handle)
handle.close()
idlist= record['IdList']

handle = Entrez.efetch(db=database, id=idlist, rettype="medline", retmode="text") #para este caso em especifico devemos ter como database a PubMed("Medline")
records = Medline.parse(handle)

for record in records:
    #print(record) 
    #print("PMID:",record.get("PMID","-"))
    print("title:", record.get("TI", "-"))
    #print('abstract:', record.get('AB', '-'))
    print("authors:", record.get("AU", "-"))
    print("source:", record.get("SO", "-"))
    print("")

title: Association between medication regimen complexity and glycemic control among patients with type 2 diabetes.
authors: ['Russell AM', 'Opsasnick L', 'Yoon E', 'Bailey SC', "O'Brien M", 'Wolf MS']
source: J Am Pharm Assoc (2003). 2022 Dec 31:S1544-3191(22)00458-7. doi: 10.1016/j.japh.2022.12.028.

title: Mumefural prevents insulin resistance and amyloid-beta accumulation in the brain by improving lowered interstitial fluid pH in type 2 diabetes mellitus.
authors: ['Hosogi S', 'Kuwahara A', 'Kuwahara Y', 'Tanaka S', 'Shimamoto C', 'Tagawa N', 'Kato I', 'Yoshimoto K', 'Aoi W', 'Takata K', 'Miyazaki H', 'Niisato N', 'Tsubo Y', 'Yagi K', 'Nakahari T', 'Marunaka Y']
source: Biomed Res. 2023;44(1):17-29. doi: 10.2220/biomedres.44.17.

title: Economic burden of low cardiorespiratory fitness in Canada.
authors: ['Chaput JP', 'Janssen I', 'Sampasa-Kanyinga H', 'Tomkinson GR', 'Lang JJ']
source: Prev Med. 2023 Jan 19:107424. doi: 10.1016/j.ypmed.2023.107424.

title: Intestinal microbiome div

In [3]:
#Second Strategie for myarticles_DDX18; articles used in literature analysis
alldata=[]
with open("Article_DDX18.txt",encoding="utf-8") as f: #For this examples I chosse articles from PubMed about Diabetes type II
    pmids= Medline.parse(f)
    for pmid in tqdm(pmids):
        try:
            pid=pmid["PMID"]
        except:
            pid="-"
        try:
            Title=pmid["TI"]
        except:
            Title="-"
        #try:        
            #Abstract=pmid["AB"]
        #except:
            #Abstract="-"
        #try:
            #Authors=pmid["AU"]
        #except:
            #Authors="-"
        #try:
            #Source=pmid["SO"]
        #except:
            #Source="-"
        
        dic={"PMID":pid,
             "Title":Title,
             #"Abstract":Abstract,
             #"Authors":Authors,
             #"Source":Source
            }
        alldata.append(dic)
#print(alldata)
df = pd.DataFrame.from_records(alldata,index=['1', '2',"3","4","5"])
print(df)
#writer = pd.ExcelWriter('articles_ddx18.xlsx')
#df.to_Exel(writer)
#writer.save()
#df=pd.DataFrame(alldata)
#df.to_csv("articles_ddx18.csv", index=False)

5it [00:00, 2717.93it/s]

       PMID                                              Title
1  33489896  The RNA-Binding Protein DDX18 Promotes Gastric...
2  18351129                  Cellular studies of MrDb (DDX18).
3  34603195  Non-Coding RNA as Biomarkers for Type 2 Diabet...
4  33479058  The First Genome-Wide Association Study for Ty...
5  30297969  Fine-mapping type 2 diabetes loci to single-va...





# Analysis of the sequence and features present in the NCBI

## DDX18

In [4]:
database ="nucleotide"
word = "ddx18 and homo sapiens and Chromosome 2 and not predicted and not unverified"
res= "30"
Entrez.email="guilherme.lobo@ua.pt"
handle=Entrez.esearch(db = database, term=word, retmax= res)
record=Entrez.read(handle)
handle.close()
idlist= record['IdList']

handle = Entrez.efetch(db=database, id=idlist, rettype="gb") 
records = list(SeqIO.parse(handle,"gb"))
handle.close()
for x in records:
    print(x.id,x.description)

NM_006773.4 Homo sapiens DEAD-box helicase 18 (DDX18), mRNA
AC009312.4 Homo sapiens BAC clone RP11-425F6 from 2, complete sequence
AC009404.5 Homo sapiens BAC clone RP11-28H22 from 2, complete sequence
CM000253.1 Homo sapiens chromosome 2, whole genome shotgun sequence
CH471103.1 Homo sapiens 211000035839809 genomic scaffold, whole genome shotgun sequence
DQ655976.2 Homo sapiens clone Affy08254D01, mRNA sequence
DQ655975.2 Homo sapiens clone Affy08248B12, mRNA sequence
NT_086328.2 Homo sapiens chromosome 2 sequence, ENCODE region ENr121


Taking into account the description given, we think that the id (**"NM_006773.4"**) would be the most appropriate and, as such, will be used from now on.

In [5]:
with open("nucleotide_ddx18.fasta","w") as f: 
    for rec in records:
        #print(rec)
        if rec.id == "NM_006773.4":
            print("Record sequence length: ", len(rec.seq))
            print("Record features count: ", len(rec.features))
            sequence_nucleotide_DDX18=rec.seq
            print( "Nuclleotide sequence:", str(rec.seq)) #useful for blastn
            f.write(f">{rec.id}\n{sequence_nucleotide_DDX18}\n\n") # only the sequence in the file
        
            cds = []
            for i,feature in enumerate(rec.features):
                if feature.type=="CDS":
                    cds.append(i)
                    for k in cds:
                        print("ddx18 gene synonyms:",rec.features[k].qualifiers["gene_synonym"])#can help latter in the search
                        print("name:",rec.features[k].qualifiers["product"])
                        protein_ID=rec.features[k].qualifiers["protein_id"]
                        sequence_protein=rec.features[k].qualifiers["translation"]#get the protein seq of my cds
                        print("Protein sequence:",sequence_protein)
            gene = []
            for j in range(len(rec.features)):
                if rec.features[j].type == "gene":
                    gene.append(j)
                    for g in gene:
                        print("Location:",rec.features[g].location)
                        print("Strand:",rec.features[g].location.strand)
                        print("Gene:",rec.features[g].qualifiers["gene"])
#the real important thing is the nucleotide sequence  and the protein sequence of cds

Record sequence length:  3753
Record features count:  27
Nuclleotide sequence: ACGTGCGGCCGGAAGGGAAGTAACGTCAGCCTGAGAACTGAGTAGCTGTACTGTGTGGCGCCTTATTCTAGGCACTTGTTGGGCAGAATGTCACACCTGCCGATGAAACTCCTGCGTAAGAAGATCGAGAAGCGGAACCTCAAATTGCGGCAGCGGAACCTAAAGTTTCAGGGGGCCTCAAATCTGACCCTATCGGAAACTCAAAATGGAGATGTATCTGAAGAAACAATGGGAAGTAGAAAGGTTAAAAAATCAAAACAAAAGCCCATGAATGTGGGCTTATCAGAAACTCAAAATGGAGGCATGTCTCAAGAAGCAGTGGGAAATATAAAAGTTACAAAGTCTCCCCAGAAATCCACTGTATTAACCAATGGAGAAGCAGCAATGCAGTCTTCCAATTCAGAATCAAAAAAGAAAAAGAAGAAAAAGAGAAAAATGGTGAATGATGCTGAGCCTGATACGAAAAAAGCAAAAACTGAAAACAAAGGGAAATCTGAAGAAGAAAGTGCCGAGACTACTAAAGAAACAGAAAATAATGTGGAGAAGCCAGATAATGATGAAGATGAGAGTGAGGTGCCCAGTCTGCCCCTGGGACTGACAGGAGCTTTTGAGGATACTTCGTTTGCTTCTCTATGTAATCTTGTCAATGAAAACACTCTGAAGGCAATAAAAGAAATGGGTTTTACAAACATGACTGAAATTCAGCATAAAAGTATCAGACCACTTCTGGAAGGCAGGGATCTTCTAGCAGCTGCAAAAACAGGCAGTGGTAAAACCCTGGCTTTTCTCATCCCTGCAGTTGAACTCATTGTTAAGTTAAGGTTCATGCCCAGGAATGGAACAGGAGTCCTTATTCTCTCACCTACTAGAGAACTAGCCATGCAAACCTTTGGTGTTCTTAAGGAGCTGATGACTCACCAC

In [6]:
#The correspondent information of NM_006773.4 was downloaded to a file
import os
Entrez.email = "guilherme.lobo@ua.pt"
filename = "NM_006773.4.gb"
if not os.path.isfile(filename):
    net_handle = Entrez.efetch(db="nucleotide", id="NM_006773.4", rettype="gb", retmode="text")
    out_handle = open(filename, "w")
    out_handle.write(net_handle.read())
    out_handle.close()
    net_handle.close()

In [7]:
record = SeqIO.read(open("NM_006773.4.gb"), format="genbank")
position=0
record_types={}
for x in record.features:
    record_types[x.type]=record_types.get(x.type,0)+1
for i in range(len(record.features)):
    if record.features[i].type == "CDS":
        position=i
print("The length of the sequence: {}\n".format(len(record.seq)))
print("Type of features: {}\n".format(record_types))
print("Comment from NCBI: {}\n".format(record.annotations["comment"]))
#checking the location of the CDS on the original sequence
print("Location of the CDS on the original sequence: {}".format(record.features[position].location))

The length of the sequence: 3753

Type of features: {'source': 1, 'gene': 1, 'exon': 14, 'misc_feature': 4, 'CDS': 1, 'regulatory': 3, 'polyA_site': 3}

Comment from NCBI: REVIEWED REFSEQ: This record has been curated by NCBI staff. The
reference sequence was derived from BU599536.1, BC024739.1 and
AK091227.1.
On Nov 22, 2018 this sequence version replaced NM_006773.3.
Summary: DEAD box proteins, characterized by the conserved motif
Asp-Glu-Ala-Asp (DEAD), are putative RNA helicases. They are
implicated in a number of cellular processes involving alteration
of RNA secondary structure such as translation initiation, nuclear
and mitochondrial splicing, and ribosome and spliceosome assembly.
Based on their distribution patterns, some members of this family
are believed to be involved in embryogenesis, spermatogenesis, and
cellular growth and division. This gene encodes a DEAD box protein,
and it is activated by Myc protein. [provided by RefSeq, Jul 2008].
Publication Note:  This RefSeq re

In [8]:
CDS_nuc_seq=record.seq[int(record.features[position].location.start):int(record.features[position].location.end)]

In [9]:
#saving the CDS_nucleotides and CD_aminoacid seqs in files
filename = "NM_006773.4.gb"
filename_CDS_nucl = "CDS_nucleot_DDX18_seq.fasta"
filename_CDS_prot = "CDS_prot_DDX18_seq.fasta"
input_handle  = open(filename, "r")
output_handle_nucl = open(filename_CDS_nucl, "w")
output_handle_prot = open(filename_CDS_prot, "w")
for seq_record in SeqIO.parse(input_handle, "genbank") :
    output_handle_nucl.write(">\n%s" % (CDS_nuc_seq))
    output_handle_prot.write(">\n%s" % ("".join(record.features[position].qualifiers['translation']))) #without join, output is a list
    
output_handle_prot.close()
output_handle_nucl.close()
input_handle.close()

# Homology analysis by BLAST or Diamond

## DDX18

In [10]:
record = SeqIO.read(open("nucleotide_ddx18.fasta"), format="fasta") 
print(len(record.seq))

3753


In [11]:
result_handle=NCBIWWW.qblast("blastn","nt",record.seq)
with open("blastn_DNA_ddx18_file.xml","w") as out_handle: 
    out_handle.write(result_handle.read())
result_handle.close()

In [12]:
Blast=open("blastn_DNA_ddx18_file.xml")
Blast_record=NCBIXML.parse(Blast)
for x in Blast_record:
    print(x.alignments[0])#best-one

gi|1519243738|ref|NM_006773.4| Homo sapiens DEAD-box helicase 18 (DDX18), mRNA
           Length = 3753



In [14]:
result_seq = open("blastn_DNA_ddx18_file.xml")
blast_results = NCBIXML.read(result_seq)
#print("Number of alignments:",len(blast_results.alignments))
for parameter in blast_results.alignments[0:5]:
    print()
    print('Accession: ', parameter.accession)
    print('Definition: ', parameter.hit_def)    
    for e_v in parameter.hsps:
        print('E-value: ', e_v.expect)


Accession:  NM_006773
Definition:  Homo sapiens DEAD-box helicase 18 (DDX18), mRNA
E-value:  0.0
E-value:  0.0224187
E-value:  0.0224187

Accession:  BC024739
Definition:  Homo sapiens DEAD (Asp-Glu-Ala-Asp) box polypeptide 18, mRNA (cDNA clone MGC:29902 IMAGE:4995871), complete cds
E-value:  0.0
E-value:  0.0224187
E-value:  0.0224187

Accession:  XM_003819127
Definition:  PREDICTED: Pan paniscus DEAD-box helicase 18 (DDX18), mRNA
E-value:  0.0
E-value:  0.000527237
E-value:  0.0224187

Accession:  XM_515753
Definition:  PREDICTED: Pan troglodytes DEAD-box helicase 18 (DDX18), mRNA
E-value:  0.0
E-value:  0.0224187
E-value:  0.0224187

Accession:  XM_004031662
Definition:  PREDICTED: Gorilla gorilla gorilla DEAD-box helicase 18 (DDX18), mRNA
E-value:  0.0
E-value:  1.01781e-06
E-value:  0.000527237


In [15]:
E_VALUE_THRESH = 0.001
results_Blast= open('blastn_DNA_ddx18_file.xml')
blast_records = NCBIXML.read(results_Blast)
for alignment in  blast_records.alignments:
    print (alignment)
    for hsp in alignment.hsps:
        if hsp.expect < E_VALUE_THRESH:
            print('****Alignment****')
            print('sequence: ', alignment.title)
            print('lenght:', alignment.length)
            print(hsp.query[0:75] + '...')
            print(hsp.match[0:75] + '...')
            print(hsp.sbjct[0:75] + '...')
            print()
            

gi|1519243738|ref|NM_006773.4| Homo sapiens DEAD-box helicase 18 (DDX18), mRNA
           Length = 3753

****Alignment****
sequence:  gi|1519243738|ref|NM_006773.4| Homo sapiens DEAD-box helicase 18 (DDX18), mRNA
lenght: 3753
ACGTGCGGCCGGAAGGGAAGTAACGTCAGCCTGAGAACTGAGTAGCTGTACTGTGTGGCGCCTTATTCTAGGCAC...
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||...
ACGTGCGGCCGGAAGGGAAGTAACGTCAGCCTGAGAACTGAGTAGCTGTACTGTGTGGCGCCTTATTCTAGGCAC...

gi|19353238|gb|BC024739.1| Homo sapiens DEAD (Asp-Glu-Ala-Asp) box polypeptide 18, mRNA (cDNA clone MGC:29902 IMAGE:4995871), complete cds
           Length = 3764

****Alignment****
sequence:  gi|19353238|gb|BC024739.1| Homo sapiens DEAD (Asp-Glu-Ala-Asp) box polypeptide 18, mRNA (cDNA clone MGC:29902 IMAGE:4995871), complete cds
lenght: 3764
GGGAAGTAACGTCAGCCTGAGAACTGAGTAGCTGTACTGTGTGGCGCCTTATTCTAGGCACTTGTTGGGCAGAAT...
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||...
GGGAAGTAACGTCAGCCTGAGAACTGAGTAGC

In [16]:
#filtering the "predicted" alignments
results_Blastn= open('blastn_DNA_ddx18_file.xml')
blastn_records = NCBIXML.read(results_Blastn)
E_VALUE_THRESH = 0.001
count_preditc, count_homo=0,0
list_filtered_alignments=[]
for alignment in  blastn_records.alignments:
    for hsp in alignment.hsps:
        if hsp.expect < E_VALUE_THRESH:
            exist = re.search(r'PREDICTED:\s', alignment.title)
            if exist:
                # print( existe[0] )
                pre = re.match(r'PREDICTED:\s', exist[0] )
                if pre:
                    count_preditc+=1
                    #print(id)
            else:
                list_filtered_alignments.append(alignment.accession)
                homo=re.search(r'Homo\ssapiens',alignment.title)
                if homo:
                    count_homo+=1
print(list_filtered_alignments)
print('Total {} PREDICTED seqs found and remaining {} ids are from Homo sapiens'.format(count_preditc, count_homo))

['NM_006773', 'BC024739', 'NM_001132808', 'NM_001132808', 'NM_001132808', 'AK091227', 'AK001467', 'BC003360', 'BC001238', 'X98743', 'X98743', 'NG_008704', 'NG_008704', 'NG_008704', 'AL365434', 'AL365434', 'AL365434', 'LT744377', 'KJ897930', 'AB209392', 'AB209392', 'AB209392', 'CP034492', 'CP034492', 'NG_002480', 'NG_002480', 'AL391262', 'AL391262', 'NG_008706', 'AL138725']
Total 67 PREDICTED seqs found and remaining 18 ids are from Homo sapiens


# Blastp

## DDX18

In [17]:
record_protein = SeqIO.read(open("CDS_prot_DDX18_seq.fasta"), format="fasta") 
print(len(record_protein.seq))

670


In [18]:
result_handle=NCBIWWW.qblast("blastp","swissprot",record_protein.seq)#blast with the protein translate of my cds
with open("blastp_file_ddx18.xml","w") as out_handle:
    out_handle.write(result_handle.read())
result_handle.close()

In [19]:
result_seq = open("blastp_file_ddx18.xml")
blast_results = NCBIXML.read(result_seq)
#print("Number of alignments:",len(blast_results.alignments))
for parameter in blast_results.alignments[0:5]:
    print()
    print('Accession: ', parameter.accession)
    print('Definition: ', parameter.hit_def)    
    for e_v in parameter.hsps:
        print('E-value: ', e_v.expect)


Accession:  Q9NVP1
Definition:  RecName: Full=ATP-dependent RNA helicase DDX18; AltName: Full=DEAD box protein 18; AltName: Full=Myc-regulated DEAD box protein; Short=MrDb [Homo sapiens]
E-value:  0.0

Accession:  Q8K363
Definition:  RecName: Full=ATP-dependent RNA helicase DDX18; AltName: Full=DEAD box protein 18 [Mus musculus]
E-value:  0.0

Accession:  Q9VD51
Definition:  RecName: Full=Probable ATP-dependent RNA helicase pitchoune [Drosophila melanogaster]
E-value:  0.0

Accession:  Q1EA54
Definition:  RecName: Full=ATP-dependent RNA helicase HAS1 [Coccidioides immitis RS]
E-value:  0.0

Accession:  A4R8B5
Definition:  RecName: Full=ATP-dependent RNA helicase HAS1 [Pyricularia oryzae 70-15]
E-value:  0.0


In [20]:
results_Blastp= open('blastp_file_ddx18.xml')
blastp_records = NCBIXML.read(results_Blastp)
E_VALUE_THRESH = 0.001
list_filtered_alignments,list_species=[],[]
for alignment in  blastp_records.alignments:
    for hsp in alignment.hsps:
#         print(hsp.identities)    # maybe add more 
        if hsp.expect < E_VALUE_THRESH:
            list_filtered_alignments.append(alignment.accession)
            title_organism=re.search(r'\[.+\s.+\]', alignment.title)
            if title_organism:
                m = re.match(r'\[.+\s.+\]', title_organism[0] )
                specie = m.group(0)
                #print(specie)
                list_species.append(specie)
for x in sorted(set(list_species)):
    print("number of times: {} that appeared specie: {}".format(list_species.count(x),x))

print(list_filtered_alignments)

number of times: 2 that appeared specie: [Arabidopsis thaliana]
number of times: 2 that appeared specie: [Aspergillus clavatus NRRL 1]
number of times: 2 that appeared specie: [Aspergillus fischeri NRRL 181]
number of times: 2 that appeared specie: [Aspergillus fumigatus Af293]
number of times: 2 that appeared specie: [Aspergillus nidulans FGSC A4]
number of times: 2 that appeared specie: [Aspergillus niger CBS 513.88]
number of times: 2 that appeared specie: [Aspergillus oryzae RIB40]
number of times: 2 that appeared specie: [Aspergillus terreus NIH2624]
number of times: 1 that appeared specie: [Candida albicans SC5314]
number of times: 2 that appeared specie: [Chaetomium globosum CBS 148.51]
number of times: 2 that appeared specie: [Coccidioides immitis RS]
number of times: 1 that appeared specie: [Cryptococcus neoformans var. neoformans JEC21] >sp|P0CQ85.1| RecName: Full=ATP-dependent RNA helicase HAS1 [Cryptococcus neoformans var. neoformans B-3501A]
number of times: 1 that appeare

In [21]:
with open('blastp_files_ID_ddx18.txt', 'w') as f:
    for line in list_filtered_alignments:
        f.write(f"{line}\n")

In [22]:
Blast=open("blastp_file_ddx18.xml")
Blast_record=NCBIXML.parse(Blast)
for x in Blast_record:
    print(x.database)
    print(x.matrix)
    print(x.gap_penalties)
    print(len(x.alignments))

swissprot
BLOSUM62
(11, 1)
50


In [23]:
result_handle=open("blastp_file_ddx18.xml")
blast_record=NCBIXML.read(result_handle)
E_VALUE_THRESH=0.01
for alignment in blast_record.alignments:
    for hsp in alignment.hsps:
        if hsp.expect<E_VALUE_THRESH:
            print("----Aligment----")
            print("sequence:",alignment.title)
            print("lenght:",alignment.length)
            print("e.value:",hsp.expect) 
            print(hsp.query[0:75] + "...")
            print(hsp.query[0:75] + "...")
            print(hsp.sbjct[0:75] + "...")

----Aligment----
sequence: sp|Q9NVP1.2| RecName: Full=ATP-dependent RNA helicase DDX18; AltName: Full=DEAD box protein 18; AltName: Full=Myc-regulated DEAD box protein; Short=MrDb [Homo sapiens]
lenght: 670
e.value: 0.0
MSHLPMKLLRKKIEKRNLKLRQRNLKFQGASNLTLSETQNGDVSEETMGSRKVKKSKQKPMNVGLSETQNGGMSQ...
MSHLPMKLLRKKIEKRNLKLRQRNLKFQGASNLTLSETQNGDVSEETMGSRKVKKSKQKPMNVGLSETQNGGMSQ...
MSHLPMKLLRKKIEKRNLKLRQRNLKFQGASNLTLSETQNGDVSEETMGSRKVKKSKQKPMNVGLSETQNGGMSQ...
----Aligment----
sequence: sp|Q8K363.1| RecName: Full=ATP-dependent RNA helicase DDX18; AltName: Full=DEAD box protein 18 [Mus musculus]
lenght: 660
e.value: 0.0
MSHLPMKLLRKKIEKRNLKLRQRNLKFQGASNLTLSETQNGDVSEETMGSRKVKKSKQKPMNVGLSETQNGGMSQ...
MSHLPMKLLRKKIEKRNLKLRQRNLKFQGASNLTLSETQNGDVSEETMGSRKVKKSKQKPMNVGLSETQNGGMSQ...
MSQLQMKLLRRKIEKRNAKLRQRNLKLQETSDTSLSQPQNGDVPKETGKGGKVKKALKRSVPVDSAEAQSGGMPE...
----Aligment----
sequence: sp|Q9VD51.2| RecName: Full=Probable ATP-dependent RNA helicase pitchoune [Drosophila melanogaster]
lenght: 680
e.valu

In [24]:
blast_qresult=SearchIO.read("blastp_file_ddx18.xml","blast-xml")
blast_hsp=blast_qresult[0][0]
print(blast_qresult[0][0])#best hsp
print(blast_hsp.aln)

      Query: unnamed protein product
        Hit: sp|Q9NVP1.2| RecName: Full=ATP-dependent RNA helicase DDX18; Alt...
Query range: [0:670] (0)
  Hit range: [0:670] (0)
Quick stats: evalue 0; bitscore 1384.01
  Fragments: 1 (670 columns)
     Query - MSHLPMKLLRKKIEKRNLKLRQRNLKFQGASNLTLSETQNGDVSEETMGSRKVKKSKQK~~~RQFSH
             MSHLPMKLLRKKIEKRNLKLRQRNLKFQGASNLTLSETQNGDVSEETMGSRKVKKSKQK~~~RQFSH
       Hit - MSHLPMKLLRKKIEKRNLKLRQRNLKFQGASNLTLSETQNGDVSEETMGSRKVKKSKQK~~~RQFSH
Alignment with 2 rows and 670 columns
MSHLPMKLLRKKIEKRNLKLRQRNLKFQGASNLTLSETQNGDVS...FSH unnamed
MSHLPMKLLRKKIEKRNLKLRQRNLKFQGASNLTLSETQNGDVS...FSH sp|Q9NVP1.2|


## Uniprot search of Blastp results

In [2]:
WEBSITE_API="https://rest.uniprot.org"
def get_url(url,**kwargs):
    response=requests.get(url,**kwargs);
    if not response.ok:
        print(response.text)
        response.raise_for_status()
        sys.exit()
    return response
r=get_url(f"{WEBSITE_API}/uniprotkb/search?query=ddx18 AND (taxonomy_id:9606) AND (reviewed:true)")#more detail possible
data=r.json()
n_results=len(data["results"])
print(f"Number of results: {n_results}\n")
total=r.headers.get("x-total-results")
page_total=len(data["results"])

Number of results: 2



In [3]:
#one way to list the ids
r=get_url(f"{WEBSITE_API}//uniprotkb/search?query=DDX18 AND (taxonomy_id:9606) AND (reviewed:true)&format=list&size=2")
#r.text

accessions=[r.text.replace("\n",",")]
accessions

['Q9NVP1,Q76FK4,']

In [4]:
#to format fasta that alow see the protein sequences and id in a form more familiary
r=get_url(f"{WEBSITE_API}//uniprotkb/search?query=DDX18  AND (taxonomy_id:9606) AND (reviewed:true)&format=fasta")
fasta=r.text
print(fasta)

>sp|Q9NVP1|DDX18_HUMAN ATP-dependent RNA helicase DDX18 OS=Homo sapiens OX=9606 GN=DDX18 PE=1 SV=2
MSHLPMKLLRKKIEKRNLKLRQRNLKFQGASNLTLSETQNGDVSEETMGSRKVKKSKQKP
MNVGLSETQNGGMSQEAVGNIKVTKSPQKSTVLTNGEAAMQSSNSESKKKKKKKRKMVND
AEPDTKKAKTENKGKSEEESAETTKETENNVEKPDNDEDESEVPSLPLGLTGAFEDTSFA
SLCNLVNENTLKAIKEMGFTNMTEIQHKSIRPLLEGRDLLAAAKTGSGKTLAFLIPAVEL
IVKLRFMPRNGTGVLILSPTRELAMQTFGVLKELMTHHVHTYGLIMGGSNRSAEAQKLGN
GINIIVATPGRLLDHMQNTPGFMYKNLQCLVIDEADRILDVGFEEELKQIIKLLPTRRQT
MLFSATQTRKVEDLARISLKKEPLYVGVDDDKANATVDGLEQGYVVCPSEKRFLLLFTFL
KKNRKKKLMVFFSSCMSVKYHYELLNYIDLPVLAIHGKQKQNKRTTTFFQFCNADSGTLL
CTDVAARGLDIPEVDWIVQYDPPDDPKEYIHRVGRTARGLNGRGHALLILRPEELGFLRY
LKQSKVPLSEFDFSWSKISDIQSQLEKLIEKNYFLHKSAQEAYKSYIRAYDSHSLKQIFN
VNNLNLPQVALSFGFKVPPFVDLNVNSNEGKQKKRGGGGGFGYQKTKKVEKSKIFKHISK
KSSDSRQFSH
>sp|Q76FK4|NOL8_HUMAN Nucleolar protein 8 OS=Homo sapiens OX=9606 GN=NOL8 PE=1 SV=1
MKVNRETKRLYVGGLSQDISEADLQNQFSRFGEVSDVEIITRKDDQGNPQKVFAYINISV
AEADLKKCMSVLNKTKWKGGTLQIQLAKESFLHRLAQEREAAKAKKEESTTGNANLLEKT
GGVDFHMKAVPGT

-we choose the first one the id Q9NVP1 

In [6]:
with ExPASy.get_sprot_raw("Q9NVP1") as handle:
    seq_record = SeqIO.read(handle, "swiss")
    #print(seq_record.id)
    #print(seq_record.entry_name, "\n")
    #print(", ".join(seq_record.accessions), "\n")
    #print(seq_record.keywords, "\n")
    #print(seq_record.organism, "\n")
    #print(len(seq_record.sequence), "aa", "\n")
    print(seq_record.seq)
#is not possible in my case!!!!!!!!!!!!!!!!!!!!1

IndexError: list index out of range

We can't understand why the id of ddx18 doesn't work and the other genes already work

In [7]:
results_Blastp= open('blastp_file_ddx18.xml')
blastp_records = NCBIXML.read(results_Blastp)
titles_list=[]
for alignment in  blastp_records.alignments:   
    titles_list.append(alignment.title)

file= open("blastp_files_ID_ddx18.txt", "r")
fields="sequence"
WEBSITE_API="https://rest.uniprot.org"
seqs=[]

for i in file:
    r=get_url("{}/uniprotkb/search?query={} AND (reviewed:true)&fields={}&size=1&format=tsv".format(WEBSITE_API,i, fields))
    seqs.append(str(r.content))

with open('blastp_file_ddx18.fasta', 'w') as f:
    for index, seq in enumerate(seqs):
        existe = re.search(r'b\'Sequence\\n(.+?(?=\\n\'))', seq, re.DOTALL)
        if existe:
            m = re.match( r'b\'Sequence\\n(.+?(?=\\n\'))', seq, re.DOTALL )
            f.write(f">{titles_list[index]}\n{m.group(1)}\n\n")                    
            #print(m.group(1) )

In [8]:
file= open("blastp_files_ID_ddx18.txt", "r")
fields="accession,organism_name,protein_name,cc_subcellular_location,cc_function"
WEBSITE_API="https://rest.uniprot.org"
with open('uniprot_result_CDS_filtered.txt', 'w',encoding='utf-8') as f:
    for i in file:
        r=get_url("{}/uniprotkb/search?query={} AND (reviewed:true)&fields={}&size=1&format=tsv".format(WEBSITE_API,i, fields))
        print(r.text)
        f.write(r.text)
        f.write('\n')

Entry	Organism	Protein names	Subcellular location [CC]	Function [CC]
Q9NVP1	Homo sapiens (Human)	ATP-dependent RNA helicase DDX18 (EC 3.6.4.13) (DEAD box protein 18) (Myc-regulated DEAD box protein) (MrDb)	SUBCELLULAR LOCATION: Nucleus, nucleolus {ECO:0000269|PubMed:16963496}. Chromosome {ECO:0000269|PubMed:20813266}.	FUNCTION: Probable RNA-dependent helicase.

Entry	Organism	Protein names	Subcellular location [CC]	Function [CC]
Q8K363	Mus musculus (Mouse)	ATP-dependent RNA helicase DDX18 (EC 3.6.4.13) (DEAD box protein 18)	SUBCELLULAR LOCATION: Nucleus, nucleolus {ECO:0000250|UniProtKB:Q9NVP1}. Chromosome {ECO:0000250|UniProtKB:Q9NVP1}.	FUNCTION: Probable RNA-dependent helicase.

Entry	Organism	Protein names	Subcellular location [CC]	Function [CC]
Q9VD51	Drosophila melanogaster (Fruit fly)	Probable ATP-dependent RNA helicase pitchoune (EC 3.6.4.13)	SUBCELLULAR LOCATION: Nucleus, nucleolus {ECO:0000269|PubMed:9716523}.	FUNCTION: Probable RNA-dependent helicase. Functions in cell growth

ConnectionError: HTTPSConnectionPool(host='rest.uniprot.org', port=443): Max retries exceeded with url: /uniprotkb/search?query=A3LNR6%0A%20AND%20(reviewed:true)&fields=accession,organism_name,protein_name,cc_subcellular_location,cc_function&size=1&format=tsv (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000002833F5229A0>: Failed to establish a new connection: [WinError 10060] Uma tentativa de ligação falhou porque o componente ligado não respondeu\r\ncorretamente após um período de tempo, ou a ligação estabelecida falhou\r\nporque o anfitrião ligado não respondeu'))