In [11]:
import pandas

Define a `dict` so we can convert 3-letter amino acids to 1-letter codes

In [12]:
amino_acid_lookup = {'Cys': 'C', 'Asp': 'D', 'Ser': 'S', 'Gln': 'Q', 'Lys': 'K', 'Ile': 'I', 'Pro': 'P', 'Thr': 'T', 'Phe': 'F', 'Asn': 'N', 'Gly': 'G', 'His': 'H', 'Leu': 'L', 'Arg': 'R', 'Trp': 'W', 'Ala': 'A', 'Val':'V', 'Glu': 'E', 'Tyr': 'Y', 'Met': 'M', 'stop': '!'}

Load in the amino acid sequence of version 3 of H37Rv / NC_000962.

Retrieved from mycobrowser.epfl.ch on 6 Jan 2023

In [13]:
with open('data/reference/NC_000962.3_pncA_aminoacids.fasta','r') as INPUT:
    INPUT.readline()
    pncA_amino_acid_sequence = INPUT.readline()


# Test/train dataset

This is derived from three sources
1. an in vitro mutagenesis study by Yadon et al. (Yadon2017)
2. the NEJM resistance catalogue (NEJM2018)
3. the WHO resistance catalogue (WHO2021)



## Yadon2017

In [14]:
YADON = pandas.read_excel('./data/source-data/41467_2017_721_MOESM4_ESM.xlsx')

# rename the columns to not contain spaces
YADON.rename(columns={  'Substitution':'MUTATION',
                        'Under-Represented?': 'UNDER_REPRESENTED',
                        'Catalogue': 'CATALOGUE',
                        'Miotto et. al.': 'MIOTTO2014',
                        'Walker et. al.': 'WALKER2015'}, inplace=True)

# make a Boolean IS_NONSYN columns
def define_nonsyn(row):
    IS_NONSYN = False
    IS_SYN = False
    CONTAINS_STOP = False
    if row['MUTATION'][-1]=='*':
        CONTAINS_STOP=True
    if row['MUTATION'][0]==row['MUTATION'][-1]:
        IS_SYN=True
    elif row['MUTATION'][0]!=row['MUTATION'][-1]:
        IS_NONSYN=True
    return pandas.Series([IS_NONSYN,IS_SYN,CONTAINS_STOP])

YADON[['IS_NONSYNONYMOUS','IS_SYNONYMOUS','CONTAINS_STOP']] = YADON.apply(define_nonsyn, axis=1)       
YADON['UNDER_REPRESENTED'] = YADON.UNDER_REPRESENTED=='Y'

# replace the NaNs with zeros and then convert to True/False                        
for i in ['in vitro Resistant', 'in vitro Susceptible', 'in vivo Resistant', 'in vivo Susceptible','MIOTTO2014','WALKER2015']:
    YADON[i] = YADON[i].fillna(0).astype('bool')                        

# combine the one-hot encoded columns
def assign_phenotypes(row, string):
    if row[string+' Resistant'] and not row[string+' Susceptible']:
        return('R')
    elif not row[string+' Resistant'] and row[string+' Susceptible']:
        return('S')
    else:
        return(None)

YADON['IN_VITRO_PHENOTYPE'] = YADON.apply(assign_phenotypes, args=('in vitro',), axis=1)
YADON['IN_VIVO_PHENOTYPE'] = YADON.apply(assign_phenotypes, args=('in vivo',), axis=1)

# drop the one-hot columns
YADON.drop(columns=['in vitro Resistant', 'in vitro Susceptible', 'in vivo Resistant', 'in vivo Susceptible'], inplace=True)

def interpet_catalogue(row):
    if row.CATALOGUE=='E':
        return('R')
    elif row.CATALOGUE=='D':
        return('S')

YADON['YADON_RESULT'] = YADON.apply(interpet_catalogue, axis=1)       

def replace_stop_codon(row):
    if "*" in row.MUTATION:
        return row.MUTATION.replace("*","!")
    else:
        return row.MUTATION

YADON['MUTATION'] = YADON.apply(replace_stop_codon, axis=1)

# reorder the dataset
YADON = YADON[['MUTATION', 'UNDER_REPRESENTED', 'IN_VITRO_PHENOTYPE', 'IN_VIVO_PHENOTYPE', 'YADON_RESULT', 'MIOTTO2014',
       'WALKER2015', 'IS_NONSYNONYMOUS','IS_SYNONYMOUS','CONTAINS_STOP']]

YADON = YADON[(~YADON.UNDER_REPRESENTED) & (YADON.YADON_RESULT.notna())]
YADON.set_index('MUTATION', inplace=True)

YADON[['YADON_RESULT']].to_csv('data/catalogues/ds-yadon2017.csv', index=True)

YADON[:3]

  warn(msg)


Unnamed: 0_level_0,UNDER_REPRESENTED,IN_VITRO_PHENOTYPE,IN_VIVO_PHENOTYPE,YADON_RESULT,MIOTTO2014,WALKER2015,IS_NONSYNONYMOUS,IS_SYNONYMOUS,CONTAINS_STOP
MUTATION,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
M1I,False,,R,R,False,False,True,False,False
M1L,False,R,R,R,False,False,True,False,False
M1V,False,,S,S,False,False,True,False,False


## NEJM2018

In [15]:
nejm2018 = pandas.read_csv('data/source-data/NC_000962.3_NEJM2018_v1.1_GARC1_RUS.csv')

nejm2018 = nejm2018[(nejm2018.DRUG=='PZA') & (nejm2018.PREDICTION.isin(['R','S'])) & (~nejm2018.MUTATION.str.contains('*', regex=False))]

def infer_type(row):
    mut = row.MUTATION
    cols = mut.split('@')
    IS_SNP = True
    IN_CDS = True
    if 'ins' in mut:
        IS_SNP=False
    elif 'del' in mut:
        IS_SNP=False
    elif '-' in mut:
        IN_CDS=False
    return pandas.Series([IS_SNP, IN_CDS, cols[0], cols[1]])        

nejm2018[['IS_SNP', 'IN_CDS', 'GENE', 'MUTATION']] = nejm2018.apply(infer_type, axis=1)    

nejm2018[['MUTATION', 'PREDICTION']].to_csv('data/catalogues/ds-nejm2018.csv', index=False)

nejm2018[:3]

Unnamed: 0.1,Unnamed: 0,GENBANK_REFERENCE,CATALOGUE_NAME,CATALOGUE_VERSION,CATALOGUE_GRAMMAR,PREDICTION_VALUES,DRUG,MUTATION,PREDICTION,SOURCE,EVIDENCE,OTHER,IS_SNP,IN_CDS,GENE
450,469,NC_000962.3,NEJM2018,v1.1,GARC1,RUS,PZA,-3_indel,S,"{""DOI"": ""10.1056/NEJMoa1800474""}",{},"{""LITERATURE_SOURCE"": ""Walker2015"", ""LITERATUR...",False,True,pncA
451,472,NC_000962.3,NEJM2018,v1.1,GARC1,RUS,PZA,145_indel,S,"{""DOI"": ""10.1056/NEJMoa1800474""}",{},"{""LITERATURE_SOURCE"": ""Walker2015"", ""LITERATUR...",False,True,pncA
452,473,NC_000962.3,NEJM2018,v1.1,GARC1,RUS,PZA,185_indel,R,"{""DOI"": ""10.1056/NEJMoa1800474""}",{},"{""LITERATURE_SOURCE"": ""Walker2015"", ""LITERATUR...",False,True,pncA


## WHO2021

In [16]:
who = pandas.read_csv('data/source-data/NC_000962.3_WHO-UCN-GTB-PCI-2021.7_v1.0_GARC1_RUS.csv')

who = who[(who.DRUG=='PZA') & (who.PREDICTION.isin(['R','S'])) & (~who.MUTATION.str.contains('*', regex=False))]

def infer_type(row):
    mut = row.MUTATION
    cols = mut.split('@')
    IS_SNP = True
    IN_CDS = True
    if 'ins' in mut:
        IS_SNP=False
    elif 'del' in mut:
        IS_SNP=False
    elif '-' in mut:
        IN_CDS=False
    return pandas.Series([IS_SNP, IN_CDS, cols[0], cols[1]])        

who[['IS_SNP', 'IN_CDS', 'GENE', 'MUTATION']] = who.apply(infer_type, axis=1)    

who[['MUTATION', 'PREDICTION']].to_csv('data/catalogues/ds-who2021.csv', index=False)

who[:3]

Unnamed: 0.1,GENBANK_REFERENCE,CATALOGUE_NAME,CATALOGUE_VERSION,CATALOGUE_GRAMMAR,PREDICTION_VALUES,DRUG,MUTATION,PREDICTION,SOURCE,EVIDENCE,OTHER,Unnamed: 0,IS_SNP,IN_CDS,GENE
228,NC_000962.3,WHO-UCN-GTB-PCI-2021.7,1.0,GARC1,RUS,PZA,-5_del_g,R,{},"{""Present_SOLO_R"": 9, ""Present_SOLO_SR"": 10, ""...","{""FINAL_CONFIDENCE_GRADING"": ""1) Assoc w R""}",,False,True,pncA
229,NC_000962.3,WHO-UCN-GTB-PCI-2021.7,1.0,GARC1,RUS,PZA,108_ins_tacctggc,R,{},"{""Present_SOLO_R"": 1, ""Present_SOLO_SR"": 1, ""P...","{""FINAL_CONFIDENCE_GRADING"": ""2) Assoc w R - I...",,False,True,pncA
230,NC_000962.3,WHO-UCN-GTB-PCI-2021.7,1.0,GARC1,RUS,PZA,116_del_cggactaccatcacgtc,R,{},"{""Present_SOLO_R"": 1, ""Present_SOLO_SR"": 1, ""P...","{""FINAL_CONFIDENCE_GRADING"": ""2) Assoc w R - I...",,False,True,pncA


# Validation dataset

This is also derived from three sources

## CRyPTIC2021

In [17]:
DST_CORE = pandas.read_csv('data/source-data/cryptic1-dst.csv')
DST_CORE.set_index('UNIQUEID', inplace=True)

MUTATIONS_CORE = pandas.read_csv('data/source-data/cryptic1-mutations.csv')
MUTATIONS_CORE.set_index('UNIQUEID', inplace=True)
samples_with_mutations=set(MUTATIONS_CORE.index)

DST_CORE.loc[DST_CORE.index.isin(samples_with_mutations), 'HAS_MUTATION']=True
DST_CORE.HAS_MUTATION.fillna(False,inplace=True)
DST_CORE.HAS_MUTATION.value_counts()

False    15991
True      6851
Name: HAS_MUTATION, dtype: int64

There are 3,549 samples with one or more mutations in pncA (and 19,293 that do not). Mutation here means a non-synonymous mutation (missense and nonsense), an assumed promoter mutation or a detected insertion or deletion.

In [18]:
MUTATIONS_CORE.reset_index(inplace=True)
MUTATIONS_CORE_COUNTS=MUTATIONS_CORE[['UNIQUEID','SITEID']].groupby('UNIQUEID').count()
MUTATIONS_CORE.set_index('UNIQUEID',inplace=True)
MUTATIONS_CORE_COUNTS.rename(columns={'SITEID':'N_MUTATIONS'},inplace=True)
MUTATIONS_CORE_COUNTS
MUTATIONS_CORE_COUNTS.N_MUTATIONS.value_counts()

1    6622
2     228
3       1
Name: N_MUTATIONS, dtype: int64

In [19]:
DST_CORE=DST_CORE.join(MUTATIONS_CORE_COUNTS,how='left')
DST_CORE.N_MUTATIONS.fillna(0,inplace=True) 
DST_CORE['IS_SOLO']=DST_CORE.N_MUTATIONS==1
SOLOS=MUTATIONS_CORE.join(DST_CORE[DST_CORE.IS_SOLO][['PHENOTYPE', 'SOURCE']],how='inner')
SOLOS[:3]

Unnamed: 0_level_0,GENE,MUTATION,POSITION,AMINO_ACID_NUMBER,GENOME_INDEX,NUCLEOTIDE_NUMBER,REF,ALT,IS_SNP,IS_INDEL,...,IS_FILTER_PASS,ELEMENT_TYPE,MUTATION_TYPE,INDEL_LENGTH,INDEL_1,INDEL_2,SITEID,NUMBER_NUCLEOTIDE_CHANGES,PHENOTYPE,SOURCE
UNIQUEID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
site.00.subj.1000347.lab.H111540004.iso.1,pncA,Q141P,141.0,141.0,,,cag,ccg,True,False,...,True,GENE,AAM,,,,0,1,R,NEJM2018
site.00.subj.1000595.lab.H123460044.iso.1,pncA,D12A,12.0,12.0,,,gac,gcc,True,False,...,True,GENE,AAM,,,,0,1,S,NEJM2018
site.00.subj.1004213.lab.H111060034.iso.1,pncA,392_indel,392.0,131.0,2288850.0,392.0,,,False,True,...,True,GENE,INDEL,2.0,392_ins,392_ins_2,0,0,R,NEJM2018


In [20]:
SOLO_MUTATIONS_CROSSTAB=pandas.crosstab([SOLOS.MUTATION,SOLOS.IS_SNP,SOLOS.IN_CDS],SOLOS.PHENOTYPE,margins=False,margins_name='TOTAL')
SOLO_MUTATIONS_CROSSTAB.to_csv('data/clinical-samples/ds-cryptic2021.csv')
SOLO_MUTATIONS_CROSSTAB

Unnamed: 0_level_0,Unnamed: 1_level_0,PHENOTYPE,R,S
MUTATION,IS_SNP,IN_CDS,Unnamed: 3_level_1,Unnamed: 4_level_1
-2_indel,False,False,0,12
-32_indel,False,False,0,1
-3_indel,False,False,2,0
-4_indel,False,False,10,1
-9_indel,False,False,0,1
...,...,...,...,...
g-9a,True,False,0,1
t-10c,True,False,0,2
t-12c,True,False,8,1
t-7c,True,False,5,0


## Miotto2014 data

> Miotto, P., Cabibbe, A. M., Feuerriegel, S., Casali, N., Drobniewski, F., Rodionova, Y., Bakonyte, D., Stakenas, P., Pimkina, E., Augustynowicz-Kopeć, E., Degano, M., Ambrosi, A., Hoffner, S., Mansjö, M., Werngren, J., Rüsch-Gerdes, S., Niemann, S., & Cirillo, D. M. (2014). Mycobacterium tuberculosis pyrazinamide resistance determinants: a multicenter study. mBio, 5(5), e01819-14. https://doi.org/10.1128/mBio.01819-14

Load the raw Excel file from the supplement of this paper.

In [21]:
MIOTTO2014 = pandas.read_excel('./data/source-data/mbo005142032st1.xlsx', header=2)

def lookup_mutation(row):

    mutation = None
    IS_SNP = None
    IN_CDS = None
    
    # skipping rows with a comma skips samples with multiple mutations
    if row['pncA aa'] != 'WT' and ',' not in row['pncA nt'] and ',' not in row['pncA aa'] and row['pncA nt'] != 'del >200 nt':
        if row['pncA aa'][:3] not in amino_acid_lookup.keys():
            if 'promoter' in row['pncA aa']: # or 'shift' not in row['pncA aa']:
                IN_CDS = False
                if row['pncA nt'][:3]  in ['Ins', 'Del']: 
                    IS_SNP = False
                    mut = row['pncA nt']
                    pos = mut.split(' ')[0][3:]
                    mutation = ('indel_'+str(int(pos)))
                else:
                    IS_SNP = True
                    mut = row['pncA nt']
                    assert mut[0] in ['a','t','c','g'], mut
                    assert mut[-1] in ['a','t','c','g'], mut
                    mutation = (row['pncA nt'])
        else:
            mut = row['pncA aa']
            IS_SNP = True
            IN_CDS = True
            if 'stop' in mut:
                if mut[:4] == 'stop':
                    mutation = (amino_acid_lookup[mut[:4]] + mut[4:-3] + amino_acid_lookup[mut[-3:]])
                else:
                    mutation = (amino_acid_lookup[mut[:3]] + mut[3:-4] + amino_acid_lookup[mut[-4:]])
            else:
                ref = amino_acid_lookup[mut[:3]]
                pos = mut[3:-3]
                alt = amino_acid_lookup[mut[-3:]]
                assert pncA_amino_acid_sequence[int(pos)-1] == ref, mut

                mutation = (ref + pos + alt)
    return pandas.Series([mutation, IS_SNP, IN_CDS])
    
MIOTTO2014[['MUTATION', 'IS_SNP', 'IN_CDS']] = MIOTTO2014.apply(lookup_mutation, axis=1)

MIOTTO2014.rename(columns = {'PZA': 'PHENOTYPE'}, inplace=True)

MIOTTO2014[:3]

Unnamed: 0,Isolate n,From,Other (SIRE),PHENOTYPE,pncA nt,pncA aa,Δ free energy (pH 6),Notes,Structure analysis,Enzymatic activity,...,Structure,Free energy,Summary structure+energy,Lineage.1,p.S,p.R,Category,MUTATION,IS_SNP,IN_CDS
0,1,Location_12,none,S,WT,WT,-,,,na,...,S,S,S,West African 1,0.841935,0.158065,B,,,
1,2,Location_12,multi,S,WT,WT,-,,,na,...,S,S,S,LAM,0.841935,0.158065,B,,,
2,3,Location_12,mono,S,WT,WT,-,,,na,...,S,S,S,Sierra Leone-1,0.841935,0.158065,B,,,


In [22]:
MIOTTO2014_MUTATIONS_SUMMARY=pandas.crosstab([MIOTTO2014.MUTATION,MIOTTO2014.IS_SNP,MIOTTO2014.IN_CDS],MIOTTO2014.PHENOTYPE,margins=False)
MIOTTO2014_MUTATIONS_SUMMARY

Unnamed: 0_level_0,Unnamed: 1_level_0,PHENOTYPE,R,S
MUTATION,IS_SNP,IN_CDS,Unnamed: 3_level_1,Unnamed: 4_level_1
A102P,True,True,0,1
A102R,True,True,0,1
A102T,True,True,1,0
A102V,True,True,1,1
A134V,True,True,2,0
...,...,...,...,...
g-13t,True,False,0,1
indel_-3,False,False,1,1
indel_-5,False,False,2,0
t-7c,True,False,4,0


In [23]:
n_miotto = MIOTTO2014_MUTATIONS_SUMMARY.R.sum() + MIOTTO2014_MUTATIONS_SUMMARY.S.sum()
print("There are a total of " + str(n_miotto) + " samples in the Miotto2014 dataset")

There are a total of 755 samples in the Miotto2014 dataset


In [24]:
MIOTTO2014_MUTATIONS_SUMMARY.reset_index(inplace=True)
n_miotto_nonsyn = MIOTTO2014_MUTATIONS_SUMMARY[(MIOTTO2014_MUTATIONS_SUMMARY.IS_SNP) & (MIOTTO2014_MUTATIONS_SUMMARY.IN_CDS)]
print("There are " + str(len(n_miotto_nonsyn)) + " non-synoymous mutations in " + str(n_miotto_nonsyn.R.sum() + n_miotto_nonsyn.S.sum()) + " samples")
MIOTTO2014_MUTATIONS_SUMMARY.set_index(['MUTATION', 'IS_SNP', 'IN_CDS'], inplace=True)
MIOTTO2014_MUTATIONS_SUMMARY[:3]

There are 191 non-synoymous mutations in 704 samples


Unnamed: 0_level_0,Unnamed: 1_level_0,PHENOTYPE,R,S
MUTATION,IS_SNP,IN_CDS,Unnamed: 3_level_1,Unnamed: 4_level_1
A102P,True,True,0,1
A102R,True,True,0,1
A102T,True,True,1,0


In [133]:
MIOTTO2014_MUTATIONS_SUMMARY.to_csv('data/clinical-samples/ds-miotto2014.csv',index=True)

## Whitfield2015

> Whitfield, M. G., Soeters, H. M., Warren, R. M., York, T., Sampson, S. L., Streicher, E. M., Van Helden, P. D., & Van Rie, A. (2015). A global perspective on pyrazinamide resistance: Systematic review and meta-analysis. PLoS ONE, 10(7), 1–16. https://doi.org/10.1371/journal.pone.0133869

Load in the Whitfield data -- this was copied from `zjm999094564so1.pdf`, missing out any promoter/indels, into an Excel sheet.

In [134]:
WHITFIELD2015 = pandas.read_excel('./data/source-data/zjm999094564so1.xlsx', header=1)

def parse_whitfield2015(row):

    mut = row.MUTATION.rstrip()
    IS_SNP = True
    IN_CDS = True

    if mut[:4]== 'stop':
        ref = amino_acid_lookup[mut[:4]]
        pos = mut[4:-3]
        alt = amino_acid_lookup[mut[-3:]]
    elif mut[-4:] == 'stop':
        ref = amino_acid_lookup[mut[:3]]
        pos = mut[3:-4]
        alt = amino_acid_lookup[mut[-4:]]
    else:
        ref = amino_acid_lookup[mut[:3]]
        pos = mut[3:-3]
        alt = amino_acid_lookup[mut[-3:]]

    mutation = ref + pos + alt
    if int(pos) <= 186:
        assert pncA_amino_acid_sequence[int(pos)-1] == ref, mut
    else:
        assert '!' == ref , mut

    
    return(pandas.Series([mutation, IS_SNP, IN_CDS]))

WHITFIELD2015[['MUTATION', 'IS_SNP', 'IN_CDS']] = WHITFIELD2015.apply(parse_whitfield2015, axis=1)
# WHITFIELD2015.rename(columns={'R': 'Rw', 'S': 'Sw'}, inplace=True)
WHITFIELD2015 = WHITFIELD2015[WHITFIELD2015.MUTATION.notna()][['MUTATION', 'IS_SNP', 'IN_CDS', 'R', 'S']]
WHITFIELD2015.set_index(['MUTATION', 'IS_SNP', 'IN_CDS'], inplace=True)
WHITFIELD2015.to_csv('data/clinical-samples/ds-whitfield2015.csv')
WHITFIELD2015

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,R,S
MUTATION,IS_SNP,IN_CDS,Unnamed: 3_level_1,Unnamed: 4_level_1
I5T,True,True,0,1
I6L,True,True,0,128
V9V,True,True,0,1
N11D,True,True,0,1
D12A,True,True,26,2
...,...,...,...,...
T177P,True,True,1,1
L182S,True,True,2,1
L182W,True,True,0,1
!187G,True,True,1,1


In [135]:
print("There are " + str(len(WHITFIELD2015)) + " non-synonymous mutations in " + str(WHITFIELD2015.Rw.sum() + WHITFIELD2015.Sw.sum()) + " samples in the Whitfield2015 dataset.")

AttributeError: 'DataFrame' object has no attribute 'Rw'