In [1]:
from io import StringIO
from pathlib import Path
import pandas as pd
from Bio import SeqIO
from Bio.Blast.Applications import NcbiblastnCommandline

In [13]:
def removal_overlap(records, by=None):
    """Drop hits whose query interval overlaps an earlier hit of the same group.

    Rows are grouped by ``by`` and scanned in their original order within each
    group.  A row is kept only when its half-open interval ``[qstart, qend)``
    shares no position with any row already kept for that group, so the first
    hit of every overlapping cluster wins.

    Parameters
    ----------
    records : pandas.DataFrame
        Hit table with numeric ``qstart`` and ``qend`` columns plus the
        grouping column.
    by : str, optional
        Column to group on.  When omitted, ``'Position in contig'`` is used if
        the frame has it (the historical behaviour); otherwise ``'qseqid'`` is
        used so that tabular BLAST output can be passed without renaming.

    Returns
    -------
    pandas.DataFrame
        The retained rows, ordered by group key and then by original row
        order, carrying their original index labels.  When no row is retained
        (e.g. empty input) an empty frame with the columns of ``records`` is
        returned.

    Raises
    ------
    KeyError
        If ``by`` is omitted and neither default grouping column exists, or if
        the requested column is missing.
    """
    if by is None:
        for candidate in ('Position in contig', 'qseqid'):
            if candidate in records.columns:
                by = candidate
                break
        else:
            raise KeyError('Position in contig')

    kept = []
    for _, group in records.groupby(by):
        group_kept = []
        for _, row in group.iterrows():
            start, end = row['qstart'], row['qend']
            # Size of the intersection of two half-open intervals, computed
            # arithmetically instead of materialising every position in a set.
            # ``qend`` itself is excluded, so hits that merely touch at a
            # shared coordinate are both kept.
            clashes = any(
                min(end, other['qend']) - max(start, other['qstart']) > 0
                for other in group_kept
            )
            if not clashes:
                group_kept.append(row)
        kept += group_kept

    if not kept:
        # Preserve the schema so downstream column access keeps working.
        return records.iloc[0:0].copy()
    return pd.DataFrame(kept)

In [51]:
# Contig FASTA file that is handed to blastn as its `query` input.
query = '/media/Central_Lab_Storage/MinION/mNGS/20211006_CMUH/denovo/barcode10/contigs.fa'

In [52]:
# Directory holding the ResFinder reference FASTA files (*.fsa).
database = '/media/GenomicResearch/Tools/CGE/resfinder_db'

# Tabular BLAST fields; one list drives both the -outfmt string and the
# column labels of the parsed table so the two cannot drift apart.
blast_fields = ['qseqid', 'sseqid', 'sstrand', 'length', 'qstart', 'qend', 'sstart', 'send', 'slen']

align_results = []
for reference_fasta in Path(database).glob('*.fsa'):
    # One blastn run per reference file, contigs as query, >= 90 % identity.
    blastn = NcbiblastnCommandline(
        query=query,
        subject=reference_fasta,
        outfmt='6 ' + ' '.join(blast_fields),
        perc_identity=90,
    )
    blast_stdout, blast_stderr = blastn()

    hits = pd.read_csv(StringIO(blast_stdout), sep='\t', header=None, names=blast_fields)
    # Alignment length as a percentage of the subject (reference gene) length.
    hits['cov'] = hits['length'].div(hits['slen']).mul(100)
    covered_hits = hits.loc[hits['cov'].ge(60)]

    align_result = removal_overlap(covered_hits)
    align_results.append(align_result)

In [53]:
# Stack the per-reference-file hit tables into one frame with a fresh 0..n-1 index.
df = pd.concat(align_results, ignore_index=True)

In [54]:
# Gene name = text before the first underscore of the subject id.
# `n` is passed by keyword: since pandas 2.0 every `str.split` argument except
# `pat` is keyword-only, so the positional form raises TypeError there.
df['gene'] = df['sseqid'].str.split('_', n=1).str[0]

In [55]:
# Keep only the first row for each gene name; later rows with the same `gene` are discarded.
df = df.drop_duplicates('gene')

In [56]:
# Load the tab-separated phenotype table from the database directory, reading only
# the accession column (the join key for the merge with the hits) and the phenotype text.
phenotypes = pd.read_csv(Path(database, 'phenotypes.txt'), sep='\t', usecols=['Gene_accession no.', 'Phenotype'])

In [57]:
# Attach the phenotype text to every hit whose subject id has an entry in the
# phenotype table, then hide the two join-key columns from the displayed result.
(
    df
    .merge(phenotypes, left_on='sseqid', right_on='Gene_accession no.')
    .drop(columns=['sseqid', 'Gene_accession no.'])
)

Unnamed: 0,qseqid,sstrand,length,qstart,qend,sstart,send,slen,cov,gene,Phenotype
0,contig_1,plus,1443,120668,122110,1,1440,1440,100.208333,aac(6')-aph(2''),"Gentamicin, Tobramycin, Netilmicin, Kanamycin,..."
1,contig_1,plus,771,1937629,1938399,1,771,771,100.0,aadD,"Amikacin, Tobramycin"
2,contig_1,plus,2009,1932178,1934186,1,2007,2007,100.099651,mecA,"Amoxicillin, Amoxicillin+Clavulanic acid, Ampi..."
3,contig_1,minus,888,217899,218785,888,1,888,100.0,blaZ,"Amoxicillin, Ampicillin, Penicillin, Piperacillin"
4,contig_1,plus,1545,112828,114372,1,1545,1545,100.0,qacA,"Benzylkonium Chloride, Ethidium Bromide, Chlor..."
5,contig_1,plus,430,2133156,2133585,1,429,429,100.2331,fosB,Fosfomycin
6,contig_1,plus,642,196772,197413,1,642,642,100.0,fusB,Fusidic acid
7,contig_2,minus,735,2724,3458,735,1,735,100.0,erm(C),"Erythromycin, Lincomycin, Clindamycin, Quinupr..."


In [3]:
# Load a tab-separated ResFinder result table; note that this rebinds `df`,
# replacing whatever frame the name referred to before.
df = pd.read_csv('/media/Central_Lab_Storage/MinION/mNGS/20211028_ICU001/resfinder/barcode30/ResFinder_results_tab.txt', sep='\t')

In [16]:
# Split "start..end" into two integer columns.  `expand=True` returns the parts
# as columns directly; unpacking the `.str` accessor relied on iterating
# StringMethods, which pandas deprecated and later removed.  The raw string
# keeps the regex escapes valid Python.
position_bounds = df['Position in contig'].str.split(r'\.\.', expand=True)
df['qstart'] = position_bounds[0].astype(int)
df['qend'] = position_bounds[1].astype(int)

  """Entry point for launching an IPython kernel.


In [17]:
# Filter the ResFinder table down to non-overlapping hits; the returned frame is the cell's output.
removal_overlap(df)

Unnamed: 0,Resistance gene,Identity,Alignment Length/Gene Length,Coverage,Position in reference,Contig,Position in contig,Phenotype,Accession no.,qstart,qend
18,sul2,100.0,816/816,100.0,1..816,contig_1_polish,109253..110068,Sulphonamide resistance,FN995456,109253,110068
4,aph(3'')-Ib,100.0,804/804,100.0,1..804,contig_1_polish,110129..110932,Aminoglycoside resistance Alternate name; aph(...,AF321551,110129,110932
2,aph(6)-Id,100.0,837/837,100.0,1..837,contig_1_polish,110932..111768,Aminoglycoside resistance Alternate name; aph(...,M28829,110932,111768
3,aph(3')-Ia,100.0,816/816,100.0,1..816,contig_1_polish,112684..113499,Aminoglycoside resistance,V00359,112684,113499
22,dfrA7,100.0,474/474,100.0,1..474,contig_1_polish,121273..121746,Trimethoprim resistance,AB161450,121273,121746
27,qacE,100.0,282/333,84.684685,1..282,contig_1_polish,121976..122257,Disinfectant resistance,X68232,121976,122257
17,sul1,100.0,840/840,100.0,1..840,contig_1_polish,122317..123156,Sulphonamide resistance,U12338,122317,123156
20,tet(D),100.0,1185/1185,100.0,1..1185,contig_1_polish,126957..128141,Tetracycline resistance,AF467077,126957,128141
5,blaSHV-187,99.88,867/867,100.0,1..867,contig_7_polish,2195720..2196586,Beta-lactam resistance,LN515533,2195720,2196586
19,tet(A),99.75,1200/1200,100.0,1..1200,contig_9_polish,25096..26295,Tetracycline resistance,AY196695,25096,26295
