In [2]:
import pandas, pathlib, json

from collections import defaultdict

## Parsing the JSON files produced by EIT Pathogena

We've downloaded the `main_report` and `resistance_prediction_report` for each sample using the CLI tool; now we need to aggregate the relevant data into some pandas DataFrames to make downstream analysis easier.

We will produce two tables.

`EFFECTS`: one row per sample, drug and mutation, i.e. every mutation in a sample that leads to a predicted effect on a drug

`PREDICTIONS`: the above collapsed down to one row per sample per drug

In [50]:
def collect_reports(root, folders):
    """Gather the per-sample resistance prediction reports into column lists.

    Every file matching ``*.resistance_prediction_report.json`` directly inside
    ``root/<folder>`` is read, for each entry of ``folders``. The sample name is
    the part of the filename before the first dot.

    Parameters
    ----------
    root : str or pathlib.Path
        Directory that contains one sub-directory per entry of ``folders``.
    folders : iterable of str
        Names of the sub-directories to scan.

    Returns
    -------
    (predictions, effects) : tuple of defaultdict(list)
        Column-name -> list-of-values mappings. ``predictions`` has the columns
        ENA_RUN_ACCESSION, DRUG, PREDICTION (one entry per item of the report's
        ``data.antibiogram``). ``effects`` has ENA_RUN_ACCESSION, DRUG, GENE,
        MUTATION, PREDICTION (one entry per item of ``data.effects[drug]`` that
        does not carry a ``phenotype`` key).
    """
    predictions = defaultdict(list)
    effects = defaultdict(list)
    root = pathlib.Path(root)

    for folder in folders:
        for report_path in (root / folder).glob('*.resistance_prediction_report.json'):

            sample = report_path.stem.split('.')[0]

            # Context manager so the handle is closed straight away; the loop
            # visits thousands of files and must not leak one handle per file.
            with open(report_path, encoding='utf-8') as handle:
                data = json.load(handle)

            for drug, prediction in data['data']['antibiogram'].items():
                predictions['ENA_RUN_ACCESSION'].append(sample)
                predictions['DRUG'].append(drug)
                predictions['PREDICTION'].append(prediction)

            for drug, drug_effects in data['data']['effects'].items():
                for effect in drug_effects:
                    # Entries with a "phenotype" key are skipped; presumably
                    # they are the per-drug summary rather than a mutation
                    # (TODO confirm against the report schema).
                    if "phenotype" in effect:
                        continue
                    effects['ENA_RUN_ACCESSION'].append(sample)
                    effects['DRUG'].append(drug)
                    effects['GENE'].append(effect['gene'])
                    effects['MUTATION'].append(effect['mutation'])
                    effects['PREDICTION'].append(effect['prediction'])

    return predictions, effects


prediction_columns, effect_columns = collect_reports('dat/outputs/', ['ukmyc', 'mgit'])

predictions = pandas.DataFrame.from_dict(prediction_columns)
effects = pandas.DataFrame.from_dict(effect_columns)

In [51]:
# Peek at the first three rows of the EFFECTS table
effects.head(3)

Unnamed: 0,ENA_RUN_ACCESSION,DRUG,GENE,MUTATION,PREDICTION
0,ERR4829376,CAP,tlyA,L11L,S
1,ERR4829376,DLM,fgd1,F320F,S
2,ERR4829376,ETH,ethA,Y147!,R


In [52]:
# Size of the EFFECTS table: distinct samples and total rows
n_effect_samples = effects['ENA_RUN_ACCESSION'].nunique()
n_effect_rows = len(effects)
print(f"The EFFECTS table contains {n_effect_samples} samples and {n_effect_rows} rows")

The EFFECTS table contains 2663 samples and 53398 rows


In [53]:
# Peek at the first three rows of the PREDICTIONS table
predictions.head(3)

Unnamed: 0,ENA_RUN_ACCESSION,DRUG,PREDICTION
0,ERR4829376,AMI,S
1,ERR4829376,BDQ,S
2,ERR4829376,CAP,S


In [54]:
# Size of the PREDICTIONS table. The drug count is derived from the data and the
# "one row per sample per drug" claim is asserted, so the printed sentence cannot
# silently become false if the catalogue or the set of reports changes.
n_prediction_samples = predictions['ENA_RUN_ACCESSION'].nunique()
n_prediction_rows = len(predictions)
n_drugs = predictions['DRUG'].nunique()
assert n_prediction_rows == n_prediction_samples * n_drugs, \
    "expected exactly one prediction per sample per drug"
print(f"The PREDICTIONS table contains {n_prediction_samples} samples and {n_prediction_rows} rows, the latter being {n_drugs} times the former since the WHOv2 catalogue contains {n_drugs} drugs")

The PREDICTIONS table contains 2663 samples and 39945 rows, the latter being 15 times the former since the WHOv2 catalogue contains 15 drugs


In [56]:
# Key columns that uniquely identify a row in each table.
EFFECTS_KEY = ['ENA_RUN_ACCESSION', 'DRUG', 'GENE', 'MUTATION']
PREDICTIONS_KEY = ['ENA_RUN_ACCESSION', 'DRUG']

# Rebind instead of mutating in place, and skip a table that is already indexed,
# so that re-running this cell does not raise KeyError (the key columns are no
# longer columns after the first run). verify_integrity=True still raises if the
# key contains duplicates.
if list(effects.index.names) != EFFECTS_KEY:
    effects = effects.set_index(EFFECTS_KEY, verify_integrity=True)
if list(predictions.index.names) != PREDICTIONS_KEY:
    predictions = predictions.set_index(PREDICTIONS_KEY, verify_integrity=True)


In [57]:
# First three rows of EFFECTS, for a look at the current index
effects.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,PREDICTION
ENA_RUN_ACCESSION,DRUG,GENE,MUTATION,Unnamed: 4_level_1
ERR4829376,CAP,tlyA,L11L,S
ERR4829376,DLM,fgd1,F320F,S
ERR4829376,ETH,ethA,Y147!,R


In [58]:
# First three rows of PREDICTIONS, for a look at the current index
predictions.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,PREDICTION
ENA_RUN_ACCESSION,DRUG,Unnamed: 2_level_1
ERR4829376,AMI,S
ERR4829376,BDQ,S
ERR4829376,CAP,S


In [59]:
# Write both tables to disk as CSV (the index is written as leading columns)
for table, csv_path in ((effects, 'dat/EFFECTS.csv'), (predictions, 'dat/PREDICTIONS.csv')):
    table.to_csv(csv_path)