In [4]:
from Bio.Blast import NCBIXML 
from Bio.Blast import NCBIWWW
import requests, sys, json
import re
from Bio import SeqIO
from Bio import Entrez
from Bio import Medline
import pandas as pd
import numpy as np
from tqdm import tqdm
from Bio import SearchIO
from Bio.SwissProt import KeyWList
from Bio import SwissProt
from Bio import ExPASy
from Bio.SeqRecord import SeqRecord
from Bio.Align import MultipleSeqAlignment
from Bio.Seq import Seq
from Bio import AlignIO
from Bio.Align.Applications import ClustalwCommandline
from Bio.Align import AlignInfo
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio.Phylo.TreeConstruction import DistanceCalculator
from Bio.Phylo.TreeConstruction import DistanceTreeConstructor
from Bio import Phylo

# Analysis of the sequence and features available in NCBI

## DDX18

In [4]:
# Search the NCBI Nucleotide database for human DDX18 entries, download the
# matching GenBank records and list their IDs and descriptions.
database = "nucleotide"
word = "ddx18 and homo sapiens and Chromosome 2 and not predicted and not unverified"
res = "30"  # passed to esearch as retmax
Entrez.email = "guilherme.lobo@ua.pt"

# Step 1: esearch returns the IDs of the records that match the query.
handle = Entrez.esearch(db=database, term=word, retmax=res)
try:
    record = Entrez.read(handle)
finally:
    # Close the handle even when parsing the search result fails.
    handle.close()
idlist = record['IdList']
if not idlist:
    # Stop early with a clear message instead of sending an empty ID list to efetch.
    raise ValueError(f"No {database} records matched the query: {word!r}")

# Step 2: efetch downloads those records in GenBank format.
handle = Entrez.efetch(db=database, id=idlist, rettype="gb")
try:
    records = list(SeqIO.parse(handle, "gb"))
finally:
    handle.close()

for x in records:
    print(x.id, x.description)

NM_006773.4 Homo sapiens DEAD-box helicase 18 (DDX18), mRNA
AC009312.4 Homo sapiens BAC clone RP11-425F6 from 2, complete sequence
AC009404.5 Homo sapiens BAC clone RP11-28H22 from 2, complete sequence
CM000253.1 Homo sapiens chromosome 2, whole genome shotgun sequence
CH471103.1 Homo sapiens 211000035839809 genomic scaffold, whole genome shotgun sequence
DQ655976.2 Homo sapiens clone Affy08254D01, mRNA sequence
DQ655975.2 Homo sapiens clone Affy08248B12, mRNA sequence
NT_086328.2 Homo sapiens chromosome 2 sequence, ENCODE region ENr121


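Based on the descriptions listed above, the record **"NM_006773.4"** is the only hit described as the DDX18 mRNA itself (the others are BAC clones, whole-chromosome or scaffold sequences, or unnamed mRNA clones). We therefore consider it the most appropriate entry, and it will be used from now on.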

In [5]:
# Write the selected record (NM_006773.4) to a FASTA file and print its CDS and
# gene annotations. The values that matter most here are the nucleotide
# sequence and the protein sequence of the CDS.
with open("nucleotide_ddx18.fasta","w") as f:
    for rec in records:
        if rec.id != "NM_006773.4":
            continue
        print("Record sequence length: ", len(rec.seq))
        print("Record features count: ", len(rec.features))
        sequence_nucleotide_DDX18 = rec.seq
        print( "Nuclleotide sequence:", str(rec.seq)) #useful for blastn
        f.write(f">{rec.id}\n{sequence_nucleotide_DDX18}\n\n") # only the sequence in the file

        # Collect the CDS indices first, then report each CDS exactly once.
        # (Reporting inside the scanning loop re-printed every earlier CDS
        # each time a new one was found.)
        cds = [i for i, feature in enumerate(rec.features) if feature.type == "CDS"]
        for k in cds:
            cds_qualifiers = rec.features[k].qualifiers
            print("ddx18 gene synonyms:", cds_qualifiers["gene_synonym"]) # can help later in the search
            print("name:", cds_qualifiers["product"])
            protein_ID = cds_qualifiers["protein_id"]
            sequence_protein = cds_qualifiers["translation"] # protein sequence of the CDS
            print("Protein sequence:", sequence_protein)

        # Same approach for the gene features: find them all, then print each once.
        gene = [j for j, feature in enumerate(rec.features) if feature.type == "gene"]
        for g in gene:
            gene_feature = rec.features[g]
            print("Location:", gene_feature.location)
            print("Strand:", gene_feature.location.strand)
            print("Gene:", gene_feature.qualifiers["gene"])

Record sequence length:  3753
Record features count:  27
Nuclleotide sequence: ACGTGCGGCCGGAAGGGAAGTAACGTCAGCCTGAGAACTGAGTAGCTGTACTGTGTGGCGCCTTATTCTAGGCACTTGTTGGGCAGAATGTCACACCTGCCGATGAAACTCCTGCGTAAGAAGATCGAGAAGCGGAACCTCAAATTGCGGCAGCGGAACCTAAAGTTTCAGGGGGCCTCAAATCTGACCCTATCGGAAACTCAAAATGGAGATGTATCTGAAGAAACAATGGGAAGTAGAAAGGTTAAAAAATCAAAACAAAAGCCCATGAATGTGGGCTTATCAGAAACTCAAAATGGAGGCATGTCTCAAGAAGCAGTGGGAAATATAAAAGTTACAAAGTCTCCCCAGAAATCCACTGTATTAACCAATGGAGAAGCAGCAATGCAGTCTTCCAATTCAGAATCAAAAAAGAAAAAGAAGAAAAAGAGAAAAATGGTGAATGATGCTGAGCCTGATACGAAAAAAGCAAAAACTGAAAACAAAGGGAAATCTGAAGAAGAAAGTGCCGAGACTACTAAAGAAACAGAAAATAATGTGGAGAAGCCAGATAATGATGAAGATGAGAGTGAGGTGCCCAGTCTGCCCCTGGGACTGACAGGAGCTTTTGAGGATACTTCGTTTGCTTCTCTATGTAATCTTGTCAATGAAAACACTCTGAAGGCAATAAAAGAAATGGGTTTTACAAACATGACTGAAATTCAGCATAAAAGTATCAGACCACTTCTGGAAGGCAGGGATCTTCTAGCAGCTGCAAAAACAGGCAGTGGTAAAACCCTGGCTTTTCTCATCCCTGCAGTTGAACTCATTGTTAAGTTAAGGTTCATGCCCAGGAATGGAACAGGAGTCCTTATTCTCTCACCTACTAGAGAACTAGCCATGCAAACCTTTGGTGTTCTTAAGGAGCTGATGACTCACCAC

In [92]:
# Download the GenBank record of NM_006773.4 once and keep it in a local file;
# the download is skipped when the file already exists.
import os
Entrez.email = "guilherme.lobo@ua.pt"
filename = "NM_006773.4.gb"
if not os.path.isfile(filename):
    net_handle = Entrez.efetch(db="nucleotide", id="NM_006773.4", rettype="gb", retmode="text")
    try:
        # Read the whole response before creating the output file, so a failed
        # download cannot leave an empty or truncated file that would pass the
        # isfile check on the next run.
        genbank_text = net_handle.read()
    finally:
        net_handle.close()
    with open(filename, "w") as out_handle:
        out_handle.write(genbank_text)

In [6]:
# Load the saved GenBank record, count its features by type and find the CDS.
with open("NM_006773.4.gb") as gb_handle:
    # The context manager closes the file once the record has been parsed.
    record = SeqIO.read(gb_handle, format="genbank")

position = 0       # index of the CDS feature; stays 0 when the record has no CDS
record_types = {}  # feature type -> number of occurrences
for i, x in enumerate(record.features):
    record_types[x.type] = record_types.get(x.type, 0) + 1
    if x.type == "CDS":
        position = i  # with several CDS features, the last one is kept
print("The length of the sequence: {}\n".format(len(record.seq)))
print("Type of features: {}\n".format(record_types))
#print("Comment from NCBI: {}\n".format(record.annotations["comment"]))
print("Location of the CDS on the original sequence: {}".format(record.features[position].location))

The length of the sequence: 3753

Type of features: {'source': 1, 'gene': 1, 'exon': 14, 'misc_feature': 4, 'CDS': 1, 'regulatory': 3, 'polyA_site': 3}

Location of the CDS on the original sequence: [87:2100](+)


In [7]:
# Slice the CDS out of the full sequence, using the start and end of the
# feature stored at index `position`.
cds_location = record.features[position].location
cds_start = int(cds_location.start)
cds_end = int(cds_location.end)
CDS_nuc_seq = record.seq[cds_start:cds_end]

In [8]:
# Save the CDS nucleotide sequence and the CDS amino-acid sequence in FASTA files.
# Both values come from `record`, `position` and `CDS_nuc_seq`, so the GenBank
# file does not need to be parsed again here.
filename_CDS_nucl = "CDS_nucleot_DDX18_seq.fasta"
filename_CDS_prot = "CDS_prot_DDX18_seq.fasta"

# qualifiers['translation'] is a list; join it so a plain string is written.
cds_translation = "".join(record.features[position].qualifiers['translation'])

# NOTE(review): the FASTA header line is written empty (">"), exactly as before;
# add an identifier after ">" if downstream tools need one.
with open(filename_CDS_nucl, "w") as output_handle_nucl:
    output_handle_nucl.write(">\n%s" % (CDS_nuc_seq))
with open(filename_CDS_prot, "w") as output_handle_prot:
    output_handle_prot.write(">\n%s" % (cds_translation))