In [1]:
import pandas, pathlib, json

from collections import defaultdict

## Parsing the JSON files produced by EIT GPAS

We've downloaded the `vcf` file for each sample and run `gnomonicus` locally; now we need to aggregate the relevant data into some pandas DataFrames to make downstream analysis easier.

We will produce two tables.

`EFFECTS`: one row per mutation that leads to a predicted effect on a drug

`PREDICTIONS`: the above collapsed down to one row per sample per drug

In [2]:
# Samples excluded from the analysis because of read naming issues
EXCLUDED_SAMPLES = {'ERR4796519', 'ERR4796408', 'ERR4796312', 'ERR4796311', 'ERR4796303'}


def append_sample_records(sample, data, prediction_columns, effect_columns):
    """Append one sample's parsed gnomonicus output to the column accumulators.

    Parameters
    ----------
    sample : str
        Identifier written to the ENA_RUN_ACCESSION column of both tables.
    data : dict
        Parsed JSON document; ``data['data']['antibiogram']`` and
        ``data['data']['effects']`` are read.
    prediction_columns, effect_columns : mapping of column name -> list
        Mutated in place: one value is appended to each column per new row.
    """
    # One PREDICTIONS row per drug listed in the antibiogram
    for drug, prediction in data['data']['antibiogram'].items():
        prediction_columns['ENA_RUN_ACCESSION'].append(sample)
        prediction_columns['DRUG'].append(drug)
        prediction_columns['PREDICTION'].append(prediction)

    # One EFFECTS row per entry listed under each drug
    for drug, drug_effects in data['data']['effects'].items():
        for effect in drug_effects:
            # Entries carrying a "phenotype" key are skipped
            # (presumably per-drug summaries rather than mutations -- TODO confirm)
            if "phenotype" in effect:
                continue
            effect_columns['ENA_RUN_ACCESSION'].append(sample)
            effect_columns['DRUG'].append(drug)
            effect_columns['GENE'].append(effect['gene'])
            effect_columns['MUTATION'].append(effect['mutation'])
            effect_columns['PREDICTION'].append(effect['prediction'])
            evidence = effect['evidence']
            # Flag rows whose expert rule mentions epistasis
            effect_columns['EPISTASIS'].append(
                'expert_rule' in evidence and "epistasis" in evidence['expert_rule']
            )


predictions = defaultdict(list)
effects = defaultdict(list)

path = pathlib.Path('dat/outputs/')

for folder in ['mgit', 'ukmyc']:

    for i in (path / folder).glob('*.gnomonicus-out.json'):

        sample = i.stem.split('.')[0]

        # Exclude the five samples with read naming issues
        if sample in EXCLUDED_SAMPLES:
            print(f"Skipping {i.stem} due to read naming issues")
            continue

        # The context manager closes each file as soon as it has been parsed
        with open(i) as f:
            data = json.load(f)

        append_sample_records(sample, data, predictions, effects)

predictions = pandas.DataFrame.from_dict(predictions)
effects = pandas.DataFrame.from_dict(effects)

Skipping ERR4796519.gnomonicus-out due to read naming issues
Skipping ERR4796311.gnomonicus-out due to read naming issues
Skipping ERR4796408.gnomonicus-out due to read naming issues
Skipping ERR4796303.gnomonicus-out due to read naming issues
Skipping ERR4796312.gnomonicus-out due to read naming issues


In [3]:
# Preview the first three rows of the EFFECTS table
effects.head(3)

Unnamed: 0,ENA_RUN_ACCESSION,DRUG,GENE,MUTATION,PREDICTION,EPISTASIS
0,ERR13286073,AMI,rrs,c492t:11,S,False
1,ERR13286073,BDQ,Rv0678,c-11a:11,S,False
2,ERR13286073,BDQ,mmpL5,D767N:5,S,False


In [4]:
# Show only the rows flagged as arising from an epistasis rule
effects.loc[effects['EPISTASIS']]

Unnamed: 0,ENA_RUN_ACCESSION,DRUG,GENE,MUTATION,PREDICTION,EPISTASIS
2219,ERR5917746,AMI,,eis@268_del_acgcatcgccg&eis@c-14t,S,True
2241,ERR5917746,KAN,,eis@268_del_acgcatcgccg&eis@c-14t,S,True
47241,ERR4829406,AMI,,eis@626_ins_g&eis@c-14t,S,True
47262,ERR4829406,KAN,,eis@626_ins_g&eis@c-14t,S,True


In [5]:
def assign_booleans(row):
    """Derive two boolean flags from a row's MUTATION string.

    Returns a two-element pandas Series: the first element is True when the
    mutation contains ":", the second is True when it contains "x" or "X".
    """
    mutation = row.MUTATION
    has_minor_call = ":" in mutation
    has_null_call = any(marker in mutation for marker in ('x', 'X'))
    return pandas.Series([has_minor_call, has_null_call])

# Apply the row-wise flagging function and store its two outputs as new columns
effects[['IS_MINOR_ALLELE', 'IS_NULL']] = effects.apply(func=assign_booleans, axis='columns')

# Cross-tabulate the two flags to see how often they co-occur
pandas.crosstab(effects['IS_MINOR_ALLELE'], effects['IS_NULL'])

IS_NULL,False,True
IS_MINOR_ALLELE,Unnamed: 1_level_1,Unnamed: 2_level_1
False,61774,503
True,3596,0


In [6]:
# Report the number of distinct samples and the total row count
print(f"The EFFECTS table contains {effects['ENA_RUN_ACCESSION'].nunique()} samples and {len(effects)} rows")

The EFFECTS table contains 2658 samples and 65873 rows


In [7]:
# Preview the first three rows of the PREDICTIONS table
predictions.head(3)

Unnamed: 0,ENA_RUN_ACCESSION,DRUG,PREDICTION
0,ERR13286073,AMI,S
1,ERR13286073,BDQ,S
2,ERR13286073,CAP,S


In [8]:
# Report the number of distinct samples and the total row count
print(f"The PREDICTIONS table contains {predictions['ENA_RUN_ACCESSION'].nunique()} samples and {len(predictions)} rows, the latter being 15 times the former since the WHOv2 catalogue contains 15 drugs")

The PREDICTIONS table contains 2658 samples and 39870 rows, the latter being 15 times the former since the WHOv2 catalogue contains 15 drugs


In [9]:
# Promote the identifying columns to the index. verify_integrity=True raises
# if any key combination is duplicated. The returned frame is rebound rather
# than mutating with inplace=True, which pandas discourages.
effects = effects.set_index(['ENA_RUN_ACCESSION', 'DRUG', 'GENE', 'MUTATION'], verify_integrity=True)
predictions = predictions.set_index(['ENA_RUN_ACCESSION', 'DRUG'], verify_integrity=True)


In [10]:
# Preview the first three rows of the EFFECTS table
effects.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,PREDICTION,EPISTASIS,IS_MINOR_ALLELE,IS_NULL
ENA_RUN_ACCESSION,DRUG,GENE,MUTATION,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ERR13286073,AMI,rrs,c492t:11,S,False,True,False
ERR13286073,BDQ,Rv0678,c-11a:11,S,False,True,False
ERR13286073,BDQ,mmpL5,D767N:5,S,False,True,False


In [11]:
# Preview the first three rows of the PREDICTIONS table
predictions.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,PREDICTION
ENA_RUN_ACCESSION,DRUG,Unnamed: 2_level_1
ERR13286073,AMI,S
ERR13286073,BDQ,S
ERR13286073,CAP,S


In [12]:
# Write both tables to CSV files under dat/
effects.to_csv(path_or_buf='dat/RAW_EFFECTS.csv')
predictions.to_csv(path_or_buf='dat/RAW_PREDICTIONS.csv')