In [2]:
from Bio.Blast import NCBIXML 
from Bio.Blast import NCBIWWW
import requests, sys, json
import re
from Bio import SeqIO
from Bio import Entrez
from Bio import Medline
import pandas as pd
import numpy as np
from tqdm import tqdm
from Bio import SearchIO
from Bio.SwissProt import KeyWList
from Bio import SwissProt
from Bio import ExPASy
from Bio.SeqRecord import SeqRecord
from Bio.Align import MultipleSeqAlignment
from Bio.Seq import Seq
from Bio import AlignIO
from Bio.Align.Applications import ClustalwCommandline
from Bio.Align import AlignInfo
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio.Phylo.TreeConstruction import DistanceCalculator
from Bio.Phylo.TreeConstruction import DistanceTreeConstructor
from Bio import Phylo

# Homology analysis by BLAST or Diamond

## Blastn

In [3]:
record = SeqIO.read(open("nucleotide_ddx18.fasta"), format="fasta") 
print(len(record.seq))

3753


In [4]:
result_handle=NCBIWWW.qblast("blastn","nt",record.seq)
with open("blastn_DNA_ddx18_file.xml","w") as out_handle: 
    out_handle.write(result_handle.read())
result_handle.close()

In [5]:
Blast=open("blastn_DNA_ddx18_file.xml")
Blast_record=NCBIXML.parse(Blast)
for x in Blast_record:
    print(x.alignments[0])#best-one

gi|1519243738|ref|NM_006773.4| Homo sapiens DEAD-box helicase 18 (DDX18), mRNA
           Length = 3753



In [6]:
result_seq = open("blastn_DNA_ddx18_file.xml")
blast_results = NCBIXML.read(result_seq)
#print("Number of alignments:",len(blast_results.alignments))
for parameter in blast_results.alignments[0:5]:
    print()
    print('Accession: ', parameter.accession)
    print('Definition: ', parameter.hit_def)    
    for e_v in parameter.hsps:
         print('E-value: ', e_v.expect)


Accession:  NM_006773
Definition:  Homo sapiens DEAD-box helicase 18 (DDX18), mRNA
E-value:  0.0
E-value:  0.0225354
E-value:  0.0225354

Accession:  BC024739
Definition:  Homo sapiens DEAD (Asp-Glu-Ala-Asp) box polypeptide 18, mRNA (cDNA clone MGC:29902 IMAGE:4995871), complete cds
E-value:  0.0
E-value:  0.0225354
E-value:  0.0225354

Accession:  XM_003819127
Definition:  PREDICTED: Pan paniscus DEAD-box helicase 18 (DDX18), mRNA
E-value:  0.0
E-value:  0.000529982
E-value:  0.0225354

Accession:  XM_515753
Definition:  PREDICTED: Pan troglodytes DEAD-box helicase 18 (DDX18), mRNA
E-value:  0.0
E-value:  0.0225354
E-value:  0.0225354

Accession:  XM_004031662
Definition:  PREDICTED: Gorilla gorilla gorilla DEAD-box helicase 18 (DDX18), mRNA
E-value:  0.0
E-value:  1.02311e-06
E-value:  0.000529982


In [15]:
E_VALUE_THRESH = 0.00001
results_Blast= open('blastn_DNA_ddx18_file.xml')
blast_records = NCBIXML.read(results_Blast)
for alignment in  blast_records.alignments:
    print (alignment)
    for hsp in alignment.hsps:
        if hsp.expect < E_VALUE_THRESH:
            print('****Alignment****')
            print('sequence: ', alignment.title)
            print('lenght:', alignment.length)
            print(hsp.query[0:75] + '...')
            print(hsp.match[0:75] + '...')
            print(hsp.sbjct[0:75] + '...')
            print()
            

gi|1519243738|ref|NM_006773.4| Homo sapiens DEAD-box helicase 18 (DDX18), mRNA
           Length = 3753

****Alignment****
sequence:  gi|1519243738|ref|NM_006773.4| Homo sapiens DEAD-box helicase 18 (DDX18), mRNA
lenght: 3753
ACGTGCGGCCGGAAGGGAAGTAACGTCAGCCTGAGAACTGAGTAGCTGTACTGTGTGGCGCCTTATTCTAGGCAC...
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||...
ACGTGCGGCCGGAAGGGAAGTAACGTCAGCCTGAGAACTGAGTAGCTGTACTGTGTGGCGCCTTATTCTAGGCAC...

gi|19353238|gb|BC024739.1| Homo sapiens DEAD (Asp-Glu-Ala-Asp) box polypeptide 18, mRNA (cDNA clone MGC:29902 IMAGE:4995871), complete cds
           Length = 3764

****Alignment****
sequence:  gi|19353238|gb|BC024739.1| Homo sapiens DEAD (Asp-Glu-Ala-Asp) box polypeptide 18, mRNA (cDNA clone MGC:29902 IMAGE:4995871), complete cds
lenght: 3764
GGGAAGTAACGTCAGCCTGAGAACTGAGTAGCTGTACTGTGTGGCGCCTTATTCTAGGCACTTGTTGGGCAGAAT...
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||...
GGGAAGTAACGTCAGCCTGAGAACTGAGTAGC

In [16]:
#filtering the "predicted" alignments
results_Blastn= open('blastn_DNA_ddx18_file.xml')
blastn_records = NCBIXML.read(results_Blastn)
E_VALUE_THRESH = 0.00001
count_preditc, count_homo=0,0
list_filtered_alignments=[]
for alignment in  blastn_records.alignments:
    for hsp in alignment.hsps:
        if hsp.expect < E_VALUE_THRESH:
            exist = re.search(r'PREDICTED:\s', alignment.title)
            if exist:
                pre = re.match(r'PREDICTED:\s', exist[0] )
                if pre:
                    count_preditc+=1
            else:
                list_filtered_alignments.append(alignment.accession)
                homo=re.search(r'Homo\ssapiens',alignment.title)
                if homo:
                    count_homo+=1
print(list_filtered_alignments)
print()
print('Total {} PREDICTED seqs found and remaining {} ids are from Homo sapiens'.format(count_preditc, count_homo))

['NM_006773', 'BC024739', 'NM_001132808', 'AK091227', 'AK001467', 'BC003360', 'BC001238', 'X98743', 'X98743', 'NG_008704', 'NG_008704', 'NG_008704', 'AL365434', 'AL365434', 'AL365434', 'LT744377', 'KJ897930', 'AB209392', 'AB209392', 'AB209392', 'CP034492', 'NG_002480', 'AL391262', 'NG_008706', 'AL138725']

Total 43 PREDICTED seqs found and remaining 16 ids are from Homo sapiens


In [17]:
#ids after filtration
with open('DDX18_nucleotide_result_blast.txt', 'w') as f:
    for line in list_filtered_alignments:
        f.write(f"{line}\n")