In [1]:
import os
import sys
import shutil
from pathlib import Path
from multiprocessing import Pool
from tempfile import TemporaryDirectory
import pandas as pd

sys.path.append('../src')

from run import run_resfinder
from utils import run_cmd

In [12]:
# Database locations and target species; these are passed to run_resfinder
# in the cells that follow.
db_res = '/media/GenomicResearch/Tools/CGE/resfinder_db'
db_point = '/media/GenomicResearch/Tools/CGE/pointfinder_db'
species = 'enterococcus_faecium'

In [13]:
# Flags forwarded to run_resfinder as its `point` and `acquired` keyword arguments.
point = True
acquired = True

# Single-sample run. Positional arguments are the contigs FASTA, the output
# directory and the two database paths, followed by the keyword options.
# NOTE(review): the exact meaning of min_cov / threshold is defined in the
# project-local `run` module, which is not visible here — confirm there.
run_resfinder(
    '/media/Central_Lab_Storage/MinION/mNGS/20220216_ICU31_34_strain/denovo/barcode74/contigs.fasta',
    '/media/Central_Lab_Storage/MinION/mNGS/20220216_ICU31_34_strain/resfinder/barcode74',
    db_res, db_point, species=species, point=point, acquired=acquired, min_cov=0.6, threshold=0.9)

In [None]:
# Batch run locations: `dirpath` is iterated for per-sample sub-directories and
# `outpath` is the root under which one output directory per sample is named.
# Note that `dirpath` is re-bound to other locations by later cells.
dirpath = Path('/media/GenomicResearch/Issue/20210909_estimate_nanopore_depth/amplification/20211028_R18-1656H/denovo')
outpath = Path('/media/GenomicResearch/Issue/20210909_estimate_nanopore_depth/amplification/20211028_R18-1656H/resfinder')

In [None]:
# Run run_resfinder for every entry of `dirpath` in parallel worker processes.
# NOTE(review): the pool size of 64 is hard-coded — confirm it suits the host.
with Pool(64) as p:
    try:
        async_results = {}
        for i in dirpath.iterdir():
            infile = i/'contigs.fasta'
            outdir = outpath/i.name
            async_results[i.name] = p.apply_async(
                run_resfinder,
                args=(infile, outdir, db_res, db_point),
                kwds={'species': species, 'acquired': True, 'point': True, 'min_cov': 0.6, 'threshold': 0.9},
            )
        p.close()
        p.join()
        # apply_async keeps a worker's exception inside its AsyncResult; unless
        # get() is called a failed sample goes completely unnoticed. Failures
        # are reported instead of raised so one bad sample does not hide the rest.
        for sample_name, async_result in async_results.items():
            try:
                async_result.get()
            except Exception as exc:
                print('run_resfinder failed for {}: {!r}'.format(sample_name, exc), file=sys.stderr)
    except KeyboardInterrupt:
        # Stop outstanding workers immediately when the cell is interrupted.
        p.terminate()

In [None]:
def removal_overlap(records):
    """Drop hits that overlap a previously kept hit of the same group.

    Rows are grouped by their 'Position in contig' value. Inside each group
    the rows are visited in their existing order and a row is kept only when
    its half-open interval ``[qstart, qend)`` shares no position with any row
    already kept from that group.

    Parameters
    ----------
    records : pandas.DataFrame
        Needs the columns 'Position in contig', 'qstart' and 'qend'
        (numeric start/end coordinates).

    Returns
    -------
    pandas.DataFrame
        The kept rows, group by group in groupby order. For input without
        any rows an empty frame with the input's columns is returned.
    """
    results = []
    # NOTE(review): rows are only compared with each other when their
    # 'Position in contig' strings are identical — confirm that grouping on
    # this column (rather than on the contig) is what is intended.
    for _, group in records.groupby('Position in contig'):
        kept = []
        for _, row in group.iterrows():
            start, end = row['qstart'], row['qend']
            # min(ends) - max(starts) is the length of the shared part of two
            # half-open intervals (<= 0 when disjoint or when either is empty);
            # same answer as intersecting two range() sets, without building them.
            if all(min(end, other['qend']) - max(start, other['qstart']) <= 0 for other in kept):
                kept.append(row)
        results += kept
    if not results:
        # pd.DataFrame([]) has no columns at all, which breaks callers that
        # immediately select a column such as 'Resistance gene'.
        return records.iloc[0:0]
    return pd.DataFrame(results)

In [None]:
# Build a {gene name: phenotype} lookup from the ResFinder phenotype table.
db_res = '/media/GenomicResearch/Tools/CGE/resfinder_db'
phenotypes = pd.read_csv(os.path.join(db_res, 'phenotypes.txt'), sep='\t', usecols=['Gene_accession no.', 'Phenotype'])
# Keep only the text before the first underscore. `n` has to be given by
# keyword: pandas 2.0 made every argument of str.split after `pat` keyword-only,
# so the positional form raises TypeError there.
phenotypes['Gene_accession no.'] = phenotypes['Gene_accession no.'].str.split("_", n=1).str[0]

# After truncation several rows can share a name; the first one wins.
phenotypes = phenotypes.drop_duplicates('Gene_accession no.')
phenotypes = dict(zip(phenotypes['Gene_accession no.'], phenotypes['Phenotype']))

In [None]:
# Result directory of the single sample inspected in the next cells.
dirpath = Path('/media/Central_Lab_Storage/MinION/mNGS/20211028_ICU001/resfinder/barcode27')

In [None]:
# Load the tabular ResFinder hits of the sample directory in `dirpath`.
df = pd.read_csv(dirpath/'ResFinder_results_tab.txt', sep='\t')

# Split 'Position in contig' on the literal '..' into start and end columns.
# expand=True yields the two parts as columns; unpacking the `.str` accessor
# itself is not iterable in pandas 2.x. The pattern is a raw string because
# '\.' is an invalid escape sequence in a normal literal.
positions = df['Position in contig'].str.split(r'\.\.', n=1, expand=True)
df['qstart'], df['qend'] = positions[0].astype(int), positions[1].astype(int)

# Names of the hits that remain after overlap removal.
genes = removal_overlap(df)['Resistance gene'].to_list()

In [None]:
# Load the PointFinder results of the same sample directory.
# Note: this re-binds `df`, so the ResFinder frame loaded before is no longer
# reachable under that name after this cell has run.
df = pd.read_csv(dirpath/'PointFinder_results.txt', sep='\t')
# Normalise the casing of the resistance text (first character upper, rest lower).
df['Resistance'] = df['Resistance'].str.capitalize()

In [None]:
# Combine acquired genes (phenotype looked up in `phenotypes`; None when the
# gene is missing there) with point mutations (Mutation -> Resistance).
# On a key collision the point-mutation entry wins because it is unpacked last.
phenotype_map = {**{gene: phenotypes.get(gene) for gene in genes}, **dict(zip(df['Mutation'], df['Resistance']))}

In [None]:
# Display the combined gene/mutation -> phenotype mapping.
phenotype_map

In [None]:
import re
from io import StringIO
from collections import defaultdict
import pandas as pd

In [None]:
def point_summary(file):
    """Summarise the mutations of a PointFinder result file per gene.

    Each value of the 'Mutation' column is split into the text before its
    last " <word char><any char>" marker (used as the gene name) and the text
    after it (the change); changes of the same gene are joined with ', '.

    NOTE(review): a later cell defines another function that is also called
    ``point_summary`` with a different return shape; whichever cell ran last
    is the one in effect.

    Parameters
    ----------
    file : str or path-like
        Tab-separated file with a 'Mutation' column.

    Returns
    -------
    dict
        ``{gene_name: 'change1, change2, ...'}`` in order of first appearance.

    Raises
    ------
    ValueError
        If a mutation entry does not match the expected layout.
    """
    # Raw string: '\w' inside a normal literal is an invalid escape sequence.
    # Compiled once instead of on every loop iteration.
    pattern = re.compile(r'^(.*) \w.(.*$)')
    df = pd.read_csv(file, sep='\t', usecols=['Mutation'])
    summary = defaultdict(list)
    for mutation in df['Mutation']:
        match = pattern.search(mutation)
        if match is None:
            # Name the offending entry instead of failing on None.groups().
            raise ValueError('Unrecognised mutation format in {}: {!r}'.format(file, mutation))
        gene_name, position = match.groups()
        summary[gene_name].append(position)
    return {gene_name: ', '.join(positions) for gene_name, positions in summary.items()}

In [None]:
# Collect one point-mutation summary per sub-directory of the run's result folder.
dirpath = Path('/media/NAS/Central_Lab_Storage/MinION/mNGS/20210901/time_filt/resfinder')

# Keyed by the sub-directory name.
# NOTE(review): every entry of `dirpath` is assumed to be a directory that
# contains PointFinder_results.txt — nothing here checks that.
summaries = dict()
for i in dirpath.iterdir():
    target = i/'PointFinder_results.txt'
    summaries[i.name] = point_summary(target)

In [None]:
# Turn the dict of per-sample dicts into a table: after the transpose each
# key of `summaries` is one row; rows are then sorted by that key.
df = pd.DataFrame(summaries).T.sort_index()
df.index.name = 'Key'
df.head()

In [None]:
# Write the summary table as a tab-separated file.
df.to_csv('/media/NAS/Central_Lab_Storage/MinION/mNGS/20210901/time_filt/PointFinder.txt', sep='\t')

In [None]:
def split_by_database(file):
    """Yield ``(database_name, table_text)`` for every section of `file`.

    Sections are separated by blank lines. The first non-blank line of a
    section is taken as the database name; the remaining lines (whitespace
    stripped at both ends) are joined with newlines to form the table text.

    Parameters
    ----------
    file : str or path-like
        Path of the text file to read.

    Yields
    ------
    tuple of (str, str)
        The section's name and its joined content lines.
    """
    with open(file) as f:
        database_name = ''
        data = []
        for line in f:
            line = line.strip()
            if not line:
                # A blank line closes the current section. Leading or repeated
                # blank lines have no open section, so nothing is emitted for
                # them (previously they produced bogus ('', '') items).
                if database_name:
                    yield (database_name, '\n'.join(data))
                database_name = ''
                data = []
            elif not database_name:
                database_name = line
            else:
                data.append(line)
        # Flush the last section when the file does not end with a blank
        # line; previously that section was silently dropped.
        if database_name:
            yield (database_name, '\n'.join(data))

def resistance_summary(file):
    """List the distinct resistance genes of a result table per database.

    Every section delivered by ``split_by_database`` whose text is not
    'No hit found' is parsed as a tab-separated table and tagged with its
    database name. Genes are de-duplicated across all sections (first
    occurrence wins), then sorted and joined with ', ' per database.

    Parameters
    ----------
    file : str or path-like
        Path of the sectioned result table.

    Returns
    -------
    pandas.Series or None
        Series indexed by 'Database' holding the joined gene names, or None
        when no section produced a table with a 'Resistance gene' column.
    """
    frames = []
    for database_name, result in split_by_database(file):
        # Sections without any table text cannot be parsed by read_csv.
        if result and result != 'No hit found':
            df = pd.read_csv(StringIO(result), sep='\t')
            df['Database'] = database_name
            frames.append(df)
    if not frames:
        return None
    # Concatenate once instead of re-copying a growing frame on every section.
    summary = pd.concat(frames)
    if 'Resistance gene' not in summary.columns:
        # Same outcome the former `except KeyError` path produced.
        return None
    unique_hits = summary.drop_duplicates('Resistance gene')
    return unique_hits.groupby('Database')['Resistance gene'].apply(lambda x: x.sort_values().str.cat(sep=', '))

In [None]:
# Collect one acquired-gene summary per sub-directory of the run's result folder.
dirpath = Path('/media/NAS/Central_Lab_Storage/MinION/mNGS/20210901/time_filt/resfinder')

# Keyed by the sub-directory name.
# NOTE(review): resistance_summary may return None for a sample — confirm
# pd.DataFrame handles such entries the way this table is meant to look.
summaries = dict()
for i in dirpath.iterdir():
    summaries[i.name] = resistance_summary(i/'ResFinder_results_table.txt')
# After the transpose each key of `summaries` is one row, sorted by that key.
df = pd.DataFrame(summaries).T.sort_index()
df.index.name = 'Key'
df.head()

In [None]:
# Write the summary table as a tab-separated file.
df.to_csv('/media/NAS/Central_Lab_Storage/MinION/mNGS/20210901/time_filt/ResFinder.txt', sep='\t')

In [None]:
# Spot-check on a single sample; note the path is not below the `time_filt`
# folder used for the batch summary.
resistance_summary('/media/NAS/Central_Lab_Storage/MinION/mNGS/20210901/resfinder/barcode18/ResFinder_results_table.txt')

In [None]:
def point_summary(file):
    """Return presence flags for the mutations of a PointFinder result file.

    Each value of the 'Mutation' column is reduced to "<gene> <change>" by
    dropping the " <word char><any char>" marker between the two parts; the
    result is used as a key with the constant value 1.

    NOTE(review): an earlier cell defines a function with the same name that
    returns per-gene joined strings instead; this definition replaces it
    once this cell has run.

    Parameters
    ----------
    file : str or path-like
        Tab-separated file with a 'Mutation' column.

    Returns
    -------
    dict
        ``{'<gene> <change>': 1}`` for every mutation entry.

    Raises
    ------
    ValueError
        If a mutation entry does not match the expected layout.
    """
    # Raw string: '\w' inside a normal literal is an invalid escape sequence.
    pattern = re.compile(r'^(.*) \w.(.*$)')
    df = pd.read_csv(file, sep='\t', usecols=['Mutation'])
    flags = {}
    for mutation in df['Mutation']:
        match = pattern.search(mutation)
        if match is None:
            # Name the offending entry instead of failing on None.groups().
            raise ValueError('Unrecognised mutation format in {}: {!r}'.format(file, mutation))
        flags[' '.join(match.groups())] = 1
    return flags

In [None]:
# Collect point-mutation flags for every entry of the Shigella analysis folder.
dirpath = Path('/media/NAS/Central_Lab_Storage/MiSeq/Shigella/Analysis')

# Keyed by the sub-directory name; each sample's file is expected under
# <sample>/resfinder/PointFinder_results.txt.
summaries = dict()
for i in dirpath.iterdir():
    target = i/'resfinder'/'PointFinder_results.txt'
    summaries[i.name] = point_summary(target)

In [None]:
# After the transpose each key of `summaries` is one row and each mutation
# key one column; both axes are then sorted by label.
df = pd.DataFrame(summaries).T.sort_index(axis=1).sort_index(axis=0)
df.index.name = 'Key'
df.head()

In [None]:
# Write the mutation table as a tab-separated file.
df.to_csv('/media/NAS/Central_Lab_Storage/MiSeq/Shigella/point_mutation.txt', sep='\t')

In [None]:
# Result table of a single sample, read by the cells that follow.
file = '/media/NAS/Central_Lab_Storage/MinION/mNGS/20210802_clBC_Test66/resfinder/barcode13/ResFinder_results_table.txt'

In [None]:
def fn(file):
    """Yield ``(database_name, table_text)`` for every section of `file`.

    Sections are separated by blank lines. The first non-blank line of a
    section is taken as the database name; the remaining lines (whitespace
    stripped at both ends) are joined with newlines to form the table text.
    The logic is the same as ``split_by_database`` defined in an earlier cell.

    Parameters
    ----------
    file : str or path-like
        Path of the text file to read.

    Yields
    ------
    tuple of (str, str)
        The section's name and its joined content lines.
    """
    with open(file) as f:
        database_name = ''
        data = []
        for line in f:
            line = line.strip()
            if not line:
                # A blank line closes the current section. Leading or repeated
                # blank lines have no open section, so nothing is emitted for
                # them (previously they produced bogus ('', '') items).
                if database_name:
                    yield (database_name, '\n'.join(data))
                database_name = ''
                data = []
            elif not database_name:
                database_name = line
            else:
                data.append(line)
        # Flush the last section when the file does not end with a blank
        # line; previously that section was silently dropped.
        if database_name:
            yield (database_name, '\n'.join(data))

In [None]:
# Parse every section of `file` that has hits and tag it with its database name.
frames = []
for db_name, result in fn(file):
    if result != 'No hit found':
        df = pd.read_csv(StringIO(result), sep='\t')
        df['Database'] = db_name
        frames.append(df)
# Concatenate once instead of re-copying a growing frame on every section;
# pd.concat rejects an empty list, so fall back to an empty frame as before.
summary = pd.concat(frames) if frames else pd.DataFrame()

In [None]:
# Distinct resistance genes joined per database. Unlike resistance_summary,
# the names are not sorted before joining.
summary.drop_duplicates('Resistance gene').groupby('Database')['Resistance gene'].apply(lambda x: x.str.cat(sep=', '))

In [None]:
# Inspect the loop variable left over from the parsing cell (the name of the
# last section that was yielded).
db_name