In [1]:
import os
from pathlib import Path
import pandas as pd

In [2]:
import sys
# Extend the module search path so that the `parse` helpers imported below can
# be found under ../../src. The entry is relative, so it is resolved against
# the kernel's current working directory.
sys.path.append('../../src')
from parse import parse_amrfinder_result, parse_resfinder_result, parse_mlst_result, parse_sistr_result, parse_plasmidfinder_result, parse_pointfinder_result

In [3]:
def parse_text_result(path):
    """Gather the per-tool results of one sample into a single flat dict.

    Parameters
    ----------
    path : str or os.PathLike
        Directory containing the sample's result files: ``amrfinder.txt``,
        ``mlst/data.json``, ``resfinder/PointFinder_results.txt``,
        ``sistr.json`` and ``plasmidfinder/results_tab.tsv``.

    Returns
    -------
    dict
        In insertion order:

        * ``'AMR'`` - sorted, comma-separated ``gene_symbol`` values of the
          AMRFinder records whose ``element_subtype`` is ``'AMR'``.
        * ``'Point'`` - sorted, comma-separated, de-duplicated union of the
          AMRFinder ``'POINT'`` gene symbols and the PointFinder entries.
        * every key returned by ``parse_mlst_result`` and then by
          ``parse_sistr_result`` (merged with ``dict.update``).
        * ``'Inc type'`` - sorted, comma-separated PlasmidFinder entries.
    """
    def locate(*parts):
        # Path of a result file relative to the sample directory.
        return os.path.join(path, *parts)

    amr_hits = parse_amrfinder_result(locate('amrfinder.txt'))
    summary = {
        'AMR': ', '.join(sorted(
            hit['gene_symbol']
            for hit in amr_hits
            if hit['element_subtype'] == 'AMR'
        )),
    }

    # ResFinder's ResFinder_results_tab.txt is not parsed here; only AMRFinder
    # and PointFinder contribute resistance information to the summary.
    amrfinder_points = [
        hit['gene_symbol']
        for hit in amr_hits
        if hit['element_subtype'] == 'POINT'
    ]
    pointfinder_points = parse_pointfinder_result(
        locate('resfinder', 'PointFinder_results.txt')
    )
    unique_points = set(amrfinder_points + pointfinder_points)
    summary['Point'] = ', '.join(sorted(unique_points))

    # MLST first, then SISTR: on a key clash the SISTR value wins.
    for parser, parts in (
        (parse_mlst_result, ('mlst', 'data.json')),
        (parse_sistr_result, ('sistr.json',)),
    ):
        summary.update(parser(locate(*parts)))

    replicons = parse_plasmidfinder_result(locate('plasmidfinder', 'results_tab.tsv'))
    summary['Inc type'] = ', '.join(sorted(replicons))
    return summary

In [4]:
# One summary dict per entry of the Analysis directory, keyed by entry name.
# NOTE(review): entries are not filtered, so every item found in the directory
# (including any non-directory) is handed to parse_text_result.
dirpath = Path('/media/GenomicResearch/MiSeq/Salmonella_enterica/Analysis')
summeries = dict(
    (sample_dir.name, parse_text_result(sample_dir))
    for sample_dir in dirpath.iterdir()
)

In [5]:
# Samples become rows (sorted by sample name); summary fields become columns.
df = (
    pd.DataFrame(summeries)
    .transpose()
    .sort_index()
    .rename_axis('Key')
)
df.head()

Unnamed: 0_level_0,AMR,Point,ST,hisD,thrA,purE,hemD,aroC,sucA,dnaN,subspecies,serovar,serogroup,Inc type
Key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
04-901-043322,,,2,1,5,1,2,1,1,1,enterica,Typhi,D1,
04-901-141083,,,2,1,5,1,2,1,1,1,enterica,Typhi,D1,
04-901-326992,,,2,1,5,1,2,1,1,1,enterica,Typhi,D1,
04-901-347232,,,1,1,5,1,1,1,1,1,enterica,Typhi,D1,
04-901-480280,,,2,1,5,1,2,1,1,1,enterica,I D1:j:-,D1,


In [7]:
# Persist the per-sample summary table as a tab-separated file.
df.to_csv(
    path_or_buf='/media/GenomicResearch/MiSeq/Salmonella_enterica/summaries.txt',
    sep='\t',
)

In [None]:
# Export a sample x gene table of AMRFinder 'AMR' hits to amr2bns.txt.
# Cell values: 1 = reference coverage at or above the threshold, 0.5 = below
# it; genes not reported for a sample are left empty (NaN).
basepath = Path('/media/GenomicResearch/MiSeq/Salmonella_enterica')
dirpath = basepath/'Analysis'

# Minimum `coverage_of_reference_sequence` for a hit to be scored as 1.
# NOTE(review): presumably a percentage - confirm against the parser output.
FULL_COVERAGE_THRESHOLD = 90

summaries = dict()
for i in dirpath.iterdir():
    summary = dict()
    for rec in parse_amrfinder_result(i/'amrfinder.txt'):
        if rec['element_subtype'] != 'AMR':
            continue
        score = 1 if rec['coverage_of_reference_sequence'] >= FULL_COVERAGE_THRESHOLD else 0.5
        gene = rec['gene_symbol']
        # A gene symbol can be reported more than once for a sample. Keep the
        # best score so that a later low-coverage hit cannot downgrade an
        # earlier full-coverage hit of the same gene.
        summary[gene] = max(summary.get(gene, 0), score)
    summaries[i.name] = summary

# Rows = samples, columns = genes, both sorted alphabetically.
summaries_tab = pd.DataFrame(summaries).T.sort_index().sort_index(axis=1)
summaries_tab.index.name = 'Key'
summaries_tab.to_csv(basepath/'amr2bns.txt', sep='\t')

In [None]:
# Export a sample x replicon presence table (1 = reported by PlasmidFinder,
# empty otherwise) to plasmid2bns.txt.
basepath = Path('/media/GenomicResearch/MiSeq/Salmonella_enterica')
dirpath = basepath.joinpath('Analysis')

summaries = {}
for sample_dir in dirpath.iterdir():
    replicons = parse_plasmidfinder_result(
        sample_dir.joinpath('plasmidfinder', 'results_tab.tsv')
    )
    summaries[sample_dir.name] = dict.fromkeys(replicons, 1)

# Rows = samples, columns = replicons, both sorted alphabetically.
summaries_tab = (
    pd.DataFrame(summaries)
    .transpose()
    .sort_index(axis=0)
    .sort_index(axis=1)
    .rename_axis('Key')
)
summaries_tab.to_csv(basepath.joinpath('plasmid2bns.txt'), sep='\t')

In [None]:
# Export a sample x point-mutation presence table (1 = reported by AMRFinder
# and/or PointFinder, empty otherwise) to point2bns.txt.
basepath = Path('/media/GenomicResearch/MiSeq/Salmonella_enterica')
dirpath = basepath/'Analysis'

summaries = dict()
for i in dirpath.iterdir():
    amrfinder_points = [rec['gene_symbol'] for rec in parse_amrfinder_result(i/'amrfinder.txt') if rec['element_subtype'] == 'POINT']
    # PointFinder_results.txt must be read with the PointFinder parser, the
    # same one parse_text_result uses for this file and concatenates with a
    # list. The ResFinder parser is meant for ResFinder_results_tab.txt.
    pointfinder_points = parse_pointfinder_result(i/'resfinder'/'PointFinder_results.txt')
    # The set removes mutations reported by both tools.
    summaries[i.name] = {x: 1 for x in set(amrfinder_points + pointfinder_points)}

# Rows = samples, columns = mutations, both sorted alphabetically.
summaries_tab = pd.DataFrame(summaries).T.sort_index().sort_index(axis=1)
summaries_tab.index.name = 'Key'

summaries_tab.to_csv(basepath/'point2bns.txt', sep='\t')

In [None]:
# Load only the 'Mutation' column of one sample's PointFinder result file.
# Note: this rebinds `df`, which earlier held the per-sample summary table.
df = pd.read_csv(
    filepath_or_buffer="/media/GenomicResearch/MiSeq/Salmonella_enterica/Analysis/R21.0464/resfinder/PointFinder_results.txt",
    sep='\t',
    usecols=['Mutation'],
)

In [None]:
# Display `df`. When run right after the preceding read_csv cell this is the
# PointFinder 'Mutation' column, not the summary table built earlier.
df