-
Notifications
You must be signed in to change notification settings - Fork 0
/
Extract_Seqs_using_id.py
36 lines (29 loc) · 1.14 KB
/
Extract_Seqs_using_id.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
###Copyrighter:Environmental Biotechnology Lab, The University of Hong Kong
# Extract the FASTA records whose ids are listed in a text file (one id per
# line), and report the listed ids that were not found in the FASTA database.
import os

try:
    read_input = raw_input  # Python 2: raw_input returns the typed text as-is
except NameError:
    read_input = input  # Python 3: input() has raw_input's behaviour


def read_ids(lines):
    """Collect the ids found in *lines*, one id per line.

    Surrounding whitespace is stripped from every line, and lines that are
    empty after stripping are skipped so that blank or trailing lines are
    not counted (or later reported) as ids.

    Args:
        lines: iterable of text lines, e.g. an open file.

    Returns:
        A ``(total, unique_ids)`` tuple: ``total`` is the number of non-blank
        lines, ``unique_ids`` is the list of distinct ids in order of first
        appearance.
    """
    total = 0
    seen = set()
    unique_ids = []
    for line in lines:
        seq_id = str(line).strip()
        if not seq_id:
            continue
        total += 1
        if seq_id not in seen:
            seen.add(seq_id)
            unique_ids.append(seq_id)
    return total, unique_ids


def extract_records(records, wanted_ids, out_handle, report_every=100000):
    """Write the records whose id is in *wanted_ids* to *out_handle*.

    Each match is written as two lines: ``>`` followed by the id, then the
    sequence. Only the first record carrying a given id is written; the id
    is dropped from the pending set as soon as it has been found.

    Args:
        records: iterable of objects exposing ``id`` and ``seq`` attributes.
        wanted_ids: iterable of id strings to look for.
        out_handle: object with a ``write`` method receiving the output text.
        report_every: print a progress line each time this many records
            have been scanned.

    Returns:
        The ids from *wanted_ids* that matched no record, in the order they
        were given.
    """
    wanted = list(wanted_ids)
    # A set gives O(1) membership tests and removals per record.
    remaining = set(wanted)
    for searched, record in enumerate(records, 1):
        seq_id = str(record.id).strip()
        if seq_id in remaining:
            out_handle.write('>' + seq_id + '\n')
            out_handle.write(str(record.seq).strip() + '\n')
            remaining.discard(seq_id)
        if searched % report_every == 0:
            print('%d sequences have been searched!' % searched)
    return [seq_id for seq_id in wanted if seq_id in remaining]


def prefixed_path(prefix, path):
    """Return *path* with *prefix* prepended to its file-name component.

    The directory part is left alone, so a bare file name gives
    ``prefix + name`` and ``dir/name`` gives ``dir/prefix + name``.
    """
    directory, base = os.path.split(path)
    return os.path.join(directory, prefix + base)


def main():
    """Prompt for the id list and FASTA database, then run the extraction.

    Writes the matching sequences to ``Extracted_sequences_<id file name>``
    and, when some ids are missing from the database, lists them in
    ``ID_Not_Found_<id file name>``.
    """
    # Imported here so the helpers above can be imported without Biopython.
    from Bio import SeqIO

    file_name1 = read_input("Enter the full name of id.txt: ")
    file_name2 = read_input("Enter the full name of the database fasta: ")
    print('The Python script is running... Pls wait!')

    with open(file_name1, 'r') as id_handle:
        total, unique_ids = read_ids(id_handle)
    print('%d ids in %s' % (total, file_name1))
    print('%d unique ids in %s' % (len(unique_ids), file_name1))

    extracted_name = prefixed_path('Extracted_sequences_', file_name1)
    with open(extracted_name, 'w') as fileoutput:
        missing = extract_records(
            SeqIO.parse(file_name2, 'fasta'), unique_ids, fileoutput)

    print('%d ids in %s are not found in %s'
          % (len(missing), file_name1, file_name2))
    if missing:
        not_found_name = prefixed_path('ID_Not_Found_', file_name1)
        with open(not_found_name, 'w') as fileoutput1:
            for item in missing:
                fileoutput1.write(item + '\n')

    print('OK, Finished!')
    # Keeps a double-clicked console window open until the user confirms.
    read_input("Press <Enter> to close this window: ")


if __name__ == '__main__':
    main()