In [1]:
import pandas, pathlib, json

from collections import defaultdict

## Parsing the JSON files produced by EIT Pathogena

We've downloaded the `main_report` and `resistance_prediction_report` for each sample using the CLI tool; now we need to aggregate the relevant data into some pandas DataFrames to make downstream analysis easier.

We will produce two tables.

`EFFECTS`: one row per sample, drug, gene and mutation, for every mutation that leads to a predicted effect on a drug

`PREDICTIONS`: the above collapsed down to one row per sample per drug, taken from the `antibiogram` section of each report

In [2]:
# Column-oriented accumulators: each key becomes one DataFrame column.
# They are kept under their own names so that `predictions` / `effects`
# only ever refer to the finished DataFrames.
prediction_columns = defaultdict(list)
effect_columns = defaultdict(list)

# Root folder holding one sub-folder of downloaded reports per dataset.
path = pathlib.Path('dat/outputs/')

for folder in ['ukmyc', 'mgit']:

    for report_path in (path / folder).glob('*.resistance_prediction_report.json'):

        # The sample name is everything before the first '.' of the filename.
        sample = report_path.stem.split('.')[0]

        # The context manager closes the handle as soon as the JSON is parsed,
        # rather than leaving one open file per report.
        with open(report_path, encoding='utf-8') as handle:
            data = json.load(handle)

        # One row per drug listed in the antibiogram.
        for drug, prediction in data['data']['antibiogram'].items():
            prediction_columns['ENA_RUN_ACCESSION'].append(sample)
            prediction_columns['DRUG'].append(drug)
            prediction_columns['PREDICTION'].append(prediction)

        # One row per entry listed under each drug's effects.
        for drug, drug_effects in data['data']['effects'].items():
            for effect in drug_effects:
                # Entries carrying a "phenotype" key are skipped.
                if "phenotype" in effect:
                    continue
                effect_columns['ENA_RUN_ACCESSION'].append(sample)
                effect_columns['DRUG'].append(drug)
                effect_columns['GENE'].append(effect['gene'])
                effect_columns['MUTATION'].append(effect['mutation'])
                effect_columns['PREDICTION'].append(effect['prediction'])
                # Flag the row when the evidence has an expert rule
                # that mentions epistasis.
                evidence = effect['evidence']
                epistasis = (
                    'expert_rule' in evidence
                    and "epistasis" in evidence['expert_rule']
                )
                effect_columns['EPISTASIS'].append(epistasis)

predictions = pandas.DataFrame.from_dict(prediction_columns)
effects = pandas.DataFrame.from_dict(effect_columns)

In [3]:
# Preview the first three rows of the EFFECTS table
effects.head(3)

Unnamed: 0,ENA_RUN_ACCESSION,DRUG,GENE,MUTATION,PREDICTION,EPISTASIS
0,ERR4829376,CAP,tlyA,L11L,S,False
1,ERR4829376,DLM,fgd1,F320F,S,False
2,ERR4829376,ETH,ethA,Y147!,R,False


In [4]:
# Show only the rows whose EPISTASIS flag is True
effects.loc[effects['EPISTASIS']]

Unnamed: 0,ENA_RUN_ACCESSION,DRUG,GENE,MUTATION,PREDICTION,EPISTASIS
13109,ERR4829406,AMI,,eis@626_ins_g&eis@c-14t,S,True
13124,ERR4829406,KAN,,eis@626_ins_g&eis@c-14t,S,True
33524,ERR5917746,AMI,,eis@268_del_acgcatcgccg&eis@c-14t,S,True
33538,ERR5917746,KAN,,eis@268_del_acgcatcgccg&eis@c-14t,S,True


In [5]:
def assign_booleans(row):
    """Derive the minor-allele and null flags from a row's mutation string.

    Reads ``row.MUTATION`` and returns a two-element ``pandas.Series``:
    element 0 is True when the mutation contains ":", element 1 is True
    when it contains "x" or "X".
    """
    mutation = row.MUTATION
    has_colon = ":" in mutation
    has_null_marker = any(marker in mutation for marker in ('x', 'X'))
    return pandas.Series([has_colon, has_null_marker])

# Compute both flags in one row-wise pass, then attach them as new columns
mutation_flags = effects.apply(assign_booleans, axis=1)
effects[['IS_MINOR_ALLELE', 'IS_NULL']] = mutation_flags

# Cross-tabulate the two flags to see how often they co-occur
pandas.crosstab(effects['IS_MINOR_ALLELE'], effects['IS_NULL'])

IS_NULL,False,True
IS_MINOR_ALLELE,Unnamed: 1_level_1,Unnamed: 2_level_1
False,51966,375
True,1057,0


In [6]:
# Summarise the EFFECTS table: distinct samples and total rows
effects_sample_count = effects['ENA_RUN_ACCESSION'].nunique()
effects_row_count = effects.shape[0]
print(f"The EFFECTS table contains {effects_sample_count} samples and {effects_row_count} rows")

The EFFECTS table contains 2663 samples and 53398 rows


In [7]:
# Preview the first three rows of the PREDICTIONS table
predictions.head(3)

Unnamed: 0,ENA_RUN_ACCESSION,DRUG,PREDICTION
0,ERR4829376,AMI,S
1,ERR4829376,BDQ,S
2,ERR4829376,CAP,S


In [8]:
# Summarise the PREDICTIONS table: distinct samples and total rows
predictions_sample_count = predictions['ENA_RUN_ACCESSION'].nunique()
predictions_row_count = predictions.shape[0]
print(f"The PREDICTIONS table contains {predictions_sample_count} samples and {predictions_row_count} rows, the latter being 15 times the former since the WHOv2 catalogue contains 15 drugs")

The PREDICTIONS table contains 2663 samples and 39945 rows, the latter being 15 times the former since the WHOv2 catalogue contains 15 drugs


In [9]:
# Index each table by the columns that identify a row; verify_integrity
# makes pandas raise if any index entry is duplicated.
for table, key_columns in (
    (effects, ['ENA_RUN_ACCESSION','DRUG','GENE','MUTATION']),
    (predictions, ['ENA_RUN_ACCESSION','DRUG']),
):
    table.set_index(key_columns, inplace=True, verify_integrity=True)


In [10]:
# Preview the first three rows of the re-indexed EFFECTS table
effects.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,PREDICTION,EPISTASIS,IS_MINOR_ALLELE,IS_NULL
ENA_RUN_ACCESSION,DRUG,GENE,MUTATION,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ERR4829376,CAP,tlyA,L11L,S,False,False,False
ERR4829376,DLM,fgd1,F320F,S,False,False,False
ERR4829376,ETH,ethA,Y147!,R,False,False,False


In [11]:
# Preview the first three rows of the re-indexed PREDICTIONS table
predictions.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,PREDICTION
ENA_RUN_ACCESSION,DRUG,Unnamed: 2_level_1
ERR4829376,AMI,S
ERR4829376,BDQ,S
ERR4829376,CAP,S


In [12]:
# Write both tables (index included) to CSV for downstream analysis
for table, destination in (
    (effects, 'dat/RAW_EFFECTS.csv'),
    (predictions, 'dat/RAW_PREDICTIONS.csv'),
):
    table.to_csv(destination)