In [1]:
import re
from collections import Counter
import pandas as pd
from Bio import SeqIO
import seaborn as sns

In [2]:
# Build an accession -> organism lookup from 'taxonomy.txt', which is read as
# one "accession<TAB>organism" record per line. If an accession occurs more
# than once, the last record wins.
taxonomy_map = dict()
with open('taxonomy.txt') as handle:
    for lineno, line in enumerate(handle, start=1):
        line = line.strip()
        if not line:
            # Blank lines (e.g. stray newlines at the end of the file) carry no
            # record; skipping them avoids an opaque unpacking error.
            continue
        fields = line.split('\t')
        if len(fields) != 2:
            # Still fail on malformed input, but say exactly where it is.
            raise ValueError(
                'taxonomy.txt line {}: expected 2 tab-separated fields, got {}'
                .format(lineno, len(fields))
            )
        accession, organism = fields
        taxonomy_map[accession] = organism

In [29]:
# Input locations for the barcode11 sample:
#   blast_outfile - the BLAST result table (going by its name and directory)
#   seqfile       - the FASTA file holding the reads
blast_outfile, seqfile = (
    '/media/NGS/Nanopore_1/mNGS/20210121_spike/untrim/Blast/barcode11_ref-prok.txt',
    '/media/NGS/Nanopore_1/mNGS/20210121_spike/untrim/Blast/barcode11.fa',
)

In [30]:
# Ids of every FASTA record in `seqfile` whose sequence is at least 300
# residues long; shorter records are left out of the set.
seqids = {
    rec.id
    for rec in SeqIO.parse(seqfile, 'fasta')
    if len(rec.seq) >= 300
}

In [31]:
# Column labels applied to the tab-separated BLAST table, in file order.
names = [
    'qseqid', 'sseqid', 'pident', 'length', 'mismatch', 'gapopen',
    'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore',
    'qlen', 'slen',
]

# No row of the file is treated as a header; the labels above are used instead.
df = pd.read_csv(blast_outfile, sep='\t', header=None, names=names)

In [32]:
# Keep only the hits whose query read id is a member of `seqids`.
is_selected_read = df['qseqid'].isin(seqids)
df = df[is_selected_read]

In [33]:
# Order the hits from highest to lowest bitscore, then keep only the first
# row seen for each query read, i.e. its top-scoring hit.
ranked_hits = df.sort_values(by='bitscore', ascending=False)
df = ranked_hits.drop_duplicates(subset='qseqid', keep='first')

In [34]:
# A hit is retained only when it passes all three cut-offs:
#   - percent identity of at least 85
#   - e-value of at most 1e-6
#   - alignment length covering at least 80% of the query length
identity_ok = df['pident'] >= 85
evalue_ok = df['evalue'] <= 1e-6
coverage_ok = (df['length'] / df['qlen']) >= 0.8
df = df[identity_ok & evalue_ok & coverage_ok]

In [35]:
def _first_match(pattern, text, what):
    """Return the first match of `pattern` in `text`.

    Raises ValueError naming the offending value when nothing matches, instead
    of failing with an AttributeError on `None.group()`.
    """
    found = pattern.search(text)
    if found is None:
        raise ValueError('no {} found in {!r}'.format(what, text))
    return found.group()


# Take an independent copy so the column assignments below never write into a
# slice of an earlier frame (this is what triggers SettingWithCopyWarning).
df = df.copy()

# Reduce every subject id to its bare accession, e.g. 'NZ_CP012345.1'.
# Raw string: '\w' is not a valid Python string escape. The dot is escaped so
# it only matches a literal '.', and the version may have more than one digit
# (the previous pattern cut '.12' down to '.1'). The version part is optional
# so ids without one still match.
prog = re.compile(r'[A-Z]+_\w+(?:\.\d+)?')
df['sseqid'] = [_first_match(prog, i, 'accession') for i in df['sseqid']]

# Translate accessions to organism names and stop early, with a readable
# message, if any accession is absent from the taxonomy table.
df['organism'] = df['sseqid'].map(taxonomy_map)
unmapped = df.loc[df['organism'].isna(), 'sseqid'].unique()
if len(unmapped) > 0:
    raise KeyError(
        '{} accession(s) missing from taxonomy_map, e.g. {}'
        .format(len(unmapped), list(unmapped[:5]))
    )

# Strip the square brackets that can wrap part of a name, then keep only the
# leading "Genus species" pair. The brackets are removed with plain str.replace
# so the result does not depend on pandas' default for the `regex` argument
# of Series.str.replace.
prog = re.compile(r'[A-Z][a-z]+ [a-z]+')
df['organism'] = [
    _first_match(prog, name.replace('[', '').replace(']', ''), 'species name')
    for name in df['organism']
]

In [36]:
# Read tallies:
#   num_reads              - size of the `seqids` set
#   num_classified_reads   - number of rows left in `df`
#   num_unclassified_reads - the difference between the two
num_reads = len(seqids)
num_classified_reads = len(df.index)
num_unclassified_reads = num_reads - num_classified_reads

In [37]:
# Number of rows of `df` per distinct value in the 'organism' column.
count = Counter()
count.update(df['organism'])

In [38]:
# Print a tab-separated summary: overall totals, the reads assigned to the
# organism of interest, and every other classified read lumped together.
target = 'Klebsiella pneumoniae'
num_target_reads = count[target]
summary_rows = [
    ('total reads', num_reads),
    ('unclassified reads', num_unclassified_reads),
    (target, num_target_reads),
    ('other', num_classified_reads - num_target_reads),
]
for label, value in summary_rows:
    print(label, value, sep='\t')

total reads	95980
unclassified reads	36184
Klebsiella pneumoniae	53402
other	6394


In [39]:
# Turn the organism tallies into a Series: organism names become the index
# and the counts become the values.
s = pd.Series(data=count)

In [40]:
# Reorder so the largest counts come first.
by_count_desc = s.sort_values(ascending=False)
s = by_count_desc

In [41]:
# Save the sorted tallies as a tab-separated file without a header row; the
# index (organism) and the value (count) make up the two columns.
match_outfile = '/media/NGS/Nanopore_1/mNGS/20210121_spike/untrim/Blast/barcode11_match_organism.txt'
s.to_csv(match_outfile, sep='\t', header=False)