In [1]:
import os
import json
from pathlib import Path
import pandas as pd

In [2]:
# Extend the module search path with '../../src' (resolved relative to the
# kernel's working directory). Presumably this is where the project-local
# `parse` and `tree` modules imported by the next cell live -- verify.
import sys
sys.path.append('../../src')

In [22]:
from parse import parse_amrfinder_result, parse_mlst_result, parse_plasmidfinder_result
from tree import to_numeric

In [10]:
def parse_text_result(path):
    """Summarise the typing results stored in one sample directory.

    Reads three files below ``path``: ``amrfinder.txt``,
    ``plasmidfinder/results_tab.tsv`` and ``mlst/data.json``.

    Parameters
    ----------
    path : str or os.PathLike
        Directory that holds the result files of a single sample.

    Returns
    -------
    dict
        ``'AMR'`` and ``'Point'`` hold the sorted, comma-separated gene
        symbols of the AMRFinder records whose ``element_subtype`` is
        ``'AMR'`` / ``'POINT'``; ``'Inc type'`` holds the sorted,
        comma-separated PlasmidFinder entries.  Everything returned by
        ``parse_mlst_result`` is merged in last, so it wins on key clashes.
    """
    def joined_symbols(hits, subtype):
        # Sorted gene symbols of all hits with the requested element subtype.
        return ', '.join(sorted(
            hit['gene_symbol'] for hit in hits if hit['element_subtype'] == subtype
        ))

    amr_hits = parse_amrfinder_result(os.path.join(path, 'amrfinder.txt'))
    result = {
        'AMR': joined_symbols(amr_hits, 'AMR'),
        'Point': joined_symbols(amr_hits, 'POINT'),
    }

    replicons = parse_plasmidfinder_result(
        os.path.join(path, 'plasmidfinder', 'results_tab.tsv')
    )
    result['Inc type'] = ', '.join(sorted(replicons))

    result.update(parse_mlst_result(os.path.join(path, 'mlst', 'data.json')))
    return result

In [11]:
# Summarise every entry of the analysis directory, keyed by the entry's name.
# The spelling `summeries` is kept on purpose: the next cell reads this name.
dirpath = Path('/media/GenomicResearch/MiSeq/Campylobacter/Analysis')
summeries = {}
for sample_dir in dirpath.iterdir():
    summeries[sample_dir.name] = parse_text_result(sample_dir)

In [12]:
# One row per sample (dict keys become the index after transposing), ordered
# by sample name; the index is labelled 'Key'.
df = pd.DataFrame(summeries).T
df = df.sort_index().rename_axis('Key')
df.head()

Unnamed: 0_level_0,AMR,Point,Inc type,ST,tkt,gltA,glyA,pgm,glnA,uncA,aspA
Key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
R16.0120,"blaOXA, lnu(P), tet(O)",,,609,5,5,72,2,2,39,24
R16.0139,"blaOXA-193, tet(O)","50S_L22_A103V, gyrA_T86I",,9243,3,2,2,86,2,1,24
R16.0174,"blaOXA, tet(O/M/O)",gyrA_T86I,,760,100,52,3,2,1,5,2
R16.0745,blaOXA,gyrA_T86I,,257,5,4,62,4,2,6,9
R16.0746,"blaOXA-193, tet(O)",50S_L22_A103V,,5,3,5,2,10,2,6,7


In [13]:
# Write the full per-sample summary table as a tab-separated file.
summary_table_path = '/media/GenomicResearch/MiSeq/Campylobacter/summaries.txt'
df.to_csv(summary_table_path, sep='\t')

In [32]:
# Restrict the summary table to the samples of one sequencing run.
# Sample keys are derived from the entry names in the run's Assembly folder:
# the file stem with its first '-' turned into '.'.
dirpath = Path('/media/MiSeqOutput/210826_M04855_0083_000000000-JYGBK/Assembly')
accs = {entry.stem.replace('-', '.', 1) for entry in dirpath.iterdir()}

# NOTE(review): the output name carries nine digits ('202108026') although the
# run folder is dated 210826 -- confirm whether '20210826' was intended. The
# name is left unchanged here.
run_summary = df.filter(accs, axis=0).sort_index()
run_summary.to_csv('/media/GenomicResearch/MiSeq/Campylobacter/202108026_summaries.txt', sep='\t')

In [15]:
# Build a sample x gene matrix of AMR genes and export it as a tab-separated file.
# Cell values: 1 when a hit reaches the coverage threshold, 0.5 for a hit
# below it, empty (NaN) when the gene was not reported for the sample.
dirpath = Path('/media/GenomicResearch/MiSeq/Campylobacter/Analysis')
summaries = dict()
for i in dirpath.iterdir():
    summary = dict()
    records = parse_amrfinder_result(i/'amrfinder.txt')
    for rec in records:
        if rec['element_subtype'] != 'AMR':
            continue
        # Threshold of 90 is presumably a percentage of the reference
        # sequence covered by the hit -- confirm against the parser.
        score = 1 if rec['coverage_of_reference_sequence'] >= 90 else 0.5
        gene = rec['gene_symbol']
        # A gene symbol can occur in more than one hit of the same sample.
        # Keep the best score instead of letting the last hit overwrite a
        # better earlier one (which made the result depend on record order).
        summary[gene] = max(summary.get(gene, 0), score)
    summaries[i.name] = summary
# Rows = samples, columns = gene symbols, both sorted alphabetically.
summaries_tab = pd.DataFrame(summaries).T.sort_index().sort_index(axis=1)
summaries_tab.index.name = 'Key'

summaries_tab.to_csv('/media/GenomicResearch/MiSeq/Campylobacter/amr2bns.txt', sep='\t')

In [16]:
# Build a sample x PlasmidFinder-entry presence matrix (1 = reported for the
# sample, empty/NaN otherwise) and export it as a tab-separated file.
dirpath = Path('/media/GenomicResearch/MiSeq/Campylobacter/Analysis')
summaries = {
    sample_dir.name: dict.fromkeys(
        parse_plasmidfinder_result(sample_dir/'plasmidfinder'/'results_tab.tsv'), 1
    )
    for sample_dir in dirpath.iterdir()
}

# Rows = samples, columns = PlasmidFinder entries, both sorted alphabetically.
summaries_tab = pd.DataFrame(summaries).T
summaries_tab = summaries_tab.sort_index().sort_index(axis=1)
summaries_tab.index.name = 'Key'

summaries_tab.to_csv('/media/GenomicResearch/MiSeq/Campylobacter/plasmid2bns.txt', sep='\t')

In [20]:
# Build a sample x point-mutation presence matrix from the AMRFinder records
# whose element_subtype is 'POINT' (1 = reported, empty/NaN otherwise) and
# export it as a tab-separated file.
dirpath = Path('/media/GenomicResearch/MiSeq/Campylobacter/Analysis')
summaries = dict()
for sample_dir in dirpath.iterdir():
    point_flags = {}
    for hit in parse_amrfinder_result(sample_dir/'amrfinder.txt'):
        if hit['element_subtype'] == 'POINT':
            point_flags[hit['gene_symbol']] = 1
    summaries[sample_dir.name] = point_flags

# Rows = samples, columns = mutation symbols, both sorted alphabetically.
by_sample = pd.DataFrame(summaries).T
summaries_tab = by_sample.sort_index().sort_index(axis=1)
summaries_tab.index.name = 'Key'

summaries_tab.to_csv('/media/GenomicResearch/MiSeq/Campylobacter/point2bns.txt', sep='\t')

In [25]:
# Load every file of the Profile directory as one column and join the columns
# side by side. Each file is read as tab-separated text with its first line
# treated as a header row, its first column used as the index, and the data
# column named after the file stem.
dirpath = Path('/media/GenomicResearch/MiSeq/Campylobacter/Profile/')
profile_columns = [
    pd.read_csv(profile_file, sep='\t', header=0, index_col=0, names=[profile_file.stem])
    for profile_file in dirpath.iterdir()
]
profile = pd.concat(profile_columns, axis=1)

In [26]:
# Convert the profile table with the project's `to_numeric` helper and write
# the result as a tab-separated file without the index column.
numeric_profile = to_numeric(profile)
numeric_profile.to_csv('/media/GenomicResearch/MiSeq/Campylobacter/cgmlst2bns.txt', sep='\t', index=False)