# `01-parse-original-data`

This notebook reads the original (if possible) source of the data, such as an Excel sheet provided as Supplementary Information to the cited reference. Since all the data sources use slightly different conventions there is a substantial amount of cleaning to get each into the same, simple format that the next notebooks then consume.

In [1]:
import pandas

# allow up to 999 columns to be shown when a wide DataFrame is displayed
pandas.options.display.max_columns=999

Define a `dict` so we can convert 3-letter amino acids to 1-letter codes

In [2]:
# 3-letter amino acid name -> 1-letter code; a stop codon ('stop') is written as '!'
amino_acid_lookup = {
    'Cys': 'C', 'Asp': 'D', 'Ser': 'S', 'Gln': 'Q', 'Lys': 'K',
    'Ile': 'I', 'Pro': 'P', 'Thr': 'T', 'Phe': 'F', 'Asn': 'N',
    'Gly': 'G', 'His': 'H', 'Leu': 'L', 'Arg': 'R', 'Trp': 'W',
    'Ala': 'A', 'Val': 'V', 'Glu': 'E', 'Tyr': 'Y', 'Met': 'M',
    'stop': '!',
}

Load in the amino acid sequence of version 3 of H37Rv / NC_000962. Retrieved from mycobrowser.epfl.ch on 6 Jan 2023.

In [3]:
# The first line of the file is skipped and the second is taken to be the whole sequence.
with open('data/reference/NC_000962.3_pncA_aminoacids.fasta','r') as INPUT:
    INPUT.readline()
    # strip the trailing newline so that it is not treated as an extra residue
    # (otherwise indexing one past the last amino acid silently returns '\n')
    pncA_amino_acid_sequence = INPUT.readline().strip()


# 1. Test/Train dataset

This is derived from three sources
(a). an in vitro mutagenesis study by Yadon et al. (Yadon2017)
(b). the NEJM resistance catalogue (NEJM2018)
(c). the WHO resistance catalogue (WHO2021)



### 1 (a). Yadon2017

In [4]:
# load the spreadsheet and give the columns used below short, upper-case names
YADON = pandas.read_excel('./data/source-data/41467_2017_721_MOESM4_ESM.xlsx').rename(
    columns={
        'Substitution': 'MUTATION',
        'Under-Represented?': 'UNDER_REPRESENTED',
        'Catalogue': 'CATALOGUE',
        'Miotto et. al.': 'MIOTTO2014',
        'Walker et. al.': 'WALKER2015',
    }
)

# make a Boolean IS_NONSYN columns
def define_nonsyn(row):
    """Classify a mutation by comparing its first and last characters.

    `row['MUTATION']` is a string such as 'M1I' whose first character is the
    reference residue and whose last character is the alternate residue.

    Returns a pandas.Series [IS_NONSYN, IS_SYN, CONTAINS_STOP]:
    IS_SYN is True when both characters are identical, IS_NONSYN is its
    opposite, and CONTAINS_STOP is True when the last character is '*'.
    """
    mutation = row['MUTATION']
    ref, alt = mutation[0], mutation[-1]
    is_syn = ref == alt
    return pandas.Series([not is_syn, is_syn, alt == '*'])

# store the three flags returned by define_nonsyn as new columns
YADON[['IS_NONSYNONYMOUS', 'IS_SYNONYMOUS', 'CONTAINS_STOP']] = YADON.apply(define_nonsyn, axis=1)
# 'Y' becomes True, anything else (including a missing value) becomes False
YADON['UNDER_REPRESENTED'] = YADON['UNDER_REPRESENTED'].eq('Y')

# missing values (NaN) become 0 and hence False; everything is then cast to True/False
for column in ['in vitro Resistant', 'in vitro Susceptible', 'in vivo Resistant', 'in vivo Susceptible','MIOTTO2014','WALKER2015']:
    YADON[column] = YADON[column].fillna(0).astype('bool')

# combine the one-hot encoded columns
def assign_phenotypes(row, string):
    """Collapse a pair of one-hot columns into a single phenotype.

    `string` is the column prefix (e.g. 'in vitro'); the columns read are
    `string + ' Resistant'` and `string + ' Susceptible'`.

    Returns 'R' if only the Resistant flag is set, 'S' if only the
    Susceptible flag is set and None if both or neither are set.
    """
    resistant = bool(row[string + ' Resistant'])
    susceptible = bool(row[string + ' Susceptible'])
    if resistant == susceptible:
        # either no call at all or contradictory calls
        return None
    return 'R' if resistant else 'S'

# one R/S/None phenotype column per pair of one-hot columns
for source, column in (('in vitro', 'IN_VITRO_PHENOTYPE'), ('in vivo', 'IN_VIVO_PHENOTYPE')):
    YADON[column] = YADON.apply(assign_phenotypes, args=(source,), axis=1)

# the one-hot columns are no longer needed
YADON = YADON.drop(columns=['in vitro Resistant', 'in vitro Susceptible', 'in vivo Resistant', 'in vivo Susceptible'])

def interpet_catalogue(row):
    """Translate the CATALOGUE code of a row into a phenotype.

    Returns 'R' for 'E', 'S' for 'D' and None for any other value.
    """
    phenotype_for_code = {'E': 'R', 'D': 'S'}
    return phenotype_for_code.get(row.CATALOGUE)

# CATALOGUE 'E' is recorded as R and 'D' as S; any other value leaves YADON_RESULT missing
YADON['YADON_RESULT'] = YADON.apply(interpet_catalogue, axis=1)       

def replace_stop_codon(row):
    """Return `row.MUTATION` with every '*' replaced by '!'.

    Mutations that do not contain a '*' are returned unchanged.
    """
    mutation = row.MUTATION
    return mutation.replace("*","!") if "*" in mutation else mutation

# write stop codons as '!' rather than '*'
YADON['MUTATION'] = YADON.apply(replace_stop_codon, axis=1)

# keep only these columns, in this order
YADON = YADON[['MUTATION', 'UNDER_REPRESENTED', 'IN_VITRO_PHENOTYPE', 'IN_VIVO_PHENOTYPE',
               'YADON_RESULT', 'MIOTTO2014', 'WALKER2015',
               'IS_NONSYNONYMOUS', 'IS_SYNONYMOUS', 'CONTAINS_STOP']]

# discard under-represented mutations and those without a result, then index by mutation
# (verify_integrity=True raises if a mutation appears more than once)
YADON = (
    YADON[(~YADON.UNDER_REPRESENTED) & (YADON.YADON_RESULT.notna())]
    .set_index('MUTATION', verify_integrity=True)
)
YADON[['YADON_RESULT']].to_csv('data/catalogues/ds-yadon2017.csv', index=True)
YADON[['YADON_RESULT']].head(3)

  warn(msg)


Unnamed: 0_level_0,YADON_RESULT
MUTATION,Unnamed: 1_level_1
M1I,R
M1L,R
M1V,S


### 1 (b). NEJM2018

In [5]:
nejm2018 = pandas.read_csv('data/source-data/NC_000962.3_NEJM2018_v1.1_GARC1_RUS.csv')

# keep the PZA rows that have a definite R/S prediction and whose mutation does not contain a literal '*'
# .copy() makes the filtered table independent of the raw one, so that adding columns to it
# further down is a plain assignment rather than a write to a slice
nejm2018 = nejm2018[(nejm2018.DRUG=='PZA') & (nejm2018.PREDICTION.isin(['R','S'])) & (~nejm2018.MUTATION.str.contains('*', regex=False))].copy()

def infer_type(row):
    """Split a `gene@mutation` string and classify the mutation.

    Only `row.MUTATION` is read; it must contain an '@'.

    Returns a pandas.Series [IS_SNP, IN_CDS, gene, mutation] where
    - IS_SNP is False if the string contains 'ins' or 'del'
      (note that '_indel' contains 'del'), True otherwise;
    - IN_CDS is False if the string contains a '-', i.e. a negative
      position such as `a-11g` or `-3_indel`, True otherwise;
    - gene and mutation are the text before and after the '@'.
    """
    mut = row.MUTATION
    cols = mut.split('@')
    IS_SNP = ('ins' not in mut) and ('del' not in mut)
    # Tested independently of the indel check. This used to be an `elif` following the
    # indel tests, so an indel at a negative position kept IN_CDS=True.
    IN_CDS = '-' not in mut
    return pandas.Series([IS_SNP, IN_CDS, cols[0], cols[1]])

# replace MUTATION by the part after the '@' and add the IS_SNP / IN_CDS / GENE columns
nejm2018[['IS_SNP', 'IN_CDS', 'GENE', 'MUTATION']] = nejm2018.apply(infer_type, axis=1)
# verify_integrity=True raises if a mutation appears more than once
nejm2018 = nejm2018.set_index('MUTATION', verify_integrity=True)
nejm2018[['PREDICTION']].to_csv('data/catalogues/ds-nejm2018.csv', index=True)
nejm2018[['PREDICTION']].head(3)

Unnamed: 0_level_0,PREDICTION
MUTATION,Unnamed: 1_level_1
-3_indel,S
145_indel,S
185_indel,R


### 1 (c). WHO2021

In [6]:
who = pandas.read_csv('data/source-data/NC_000962.3_WHO-UCN-GTB-PCI-2021.7_v1.0_GARC1_RUS.csv')

# keep the PZA rows that have a definite R/S prediction and whose mutation does not contain a literal '*'
# .copy() makes the filtered table independent of the raw one, so that adding columns to it
# further down is a plain assignment rather than a write to a slice
who = who[(who.DRUG=='PZA') & (who.PREDICTION.isin(['R','S'])) & (~who.MUTATION.str.contains('*', regex=False))].copy()

def infer_type(row):
    """Split a `gene@mutation` string, classify it and generalise indels.

    Only `row.MUTATION` is read; it must contain an '@'.

    Returns a pandas.Series [IS_SNP, IN_CDS, gene, mutation] where
    - IS_SNP is False if the string contains 'ins' or 'del', True otherwise;
    - IN_CDS is False if the string contains a '-', i.e. a negative
      position such as `a-11g` or `-5_indel`, True otherwise;
    - gene is the text before the '@';
    - mutation is the text after the '@', except that for an insertion or
      deletion everything after the first '_' is replaced so that it reads
      `<position>_indel`.
    """
    mut = row.MUTATION
    cols = mut.split('@')
    gene = cols[0]
    mutation = cols[1]
    IS_SNP = not (('ins' in mut) or ('del' in mut))
    if not IS_SNP:
        # describe every insertion/deletion at a position generically
        bits = mutation.split('_')
        mutation = bits[0]+"_indel"
    # Tested independently of the indel check. This used to be an `elif` following the
    # indel test, so an indel at a negative position kept IN_CDS=True.
    IN_CDS = '-' not in mut
    return pandas.Series([IS_SNP, IN_CDS, gene, mutation])

# replace MUTATION by the (generalised) part after the '@' and add the IS_SNP / IN_CDS / GENE columns
who[['IS_SNP', 'IN_CDS', 'GENE', 'MUTATION']] = who.apply(infer_type, axis=1)

# By using a generic `X_indel` description there are now several different indels at a specific position, however they all confer resistance according to the catalogue so we can drop the duplicate rows
foo = who.loc[~who.IS_SNP, 'MUTATION'].value_counts()
print(who[who.MUTATION.isin(foo[foo>1].index)].PREDICTION.value_counts())

# keep the first row for each mutation, then index by mutation
# (verify_integrity=True raises if any duplicate is left)
who = who.drop_duplicates(subset='MUTATION', keep='first').set_index('MUTATION', verify_integrity=True)
who[['PREDICTION']].to_csv('data/catalogues/ds-who2021.csv', index=True)
who[['PREDICTION']].head(3)

R    38
Name: PREDICTION, dtype: int64


Unnamed: 0_level_0,PREDICTION
MUTATION,Unnamed: 1_level_1
-5_indel,R
108_indel,R
116_indel,R


# 2. Validation dataset

This is derived from three sets of clinical samples with DST data

### 2 (a). CRyPTIC2021

In [7]:
# all three tables are indexed by the sample identifier
DST_CORE = pandas.read_csv('data/source-data/cryptic1-dst.csv').set_index('UNIQUEID')

LINEAGES_CORE = pandas.read_csv('data/source-data/cryptic1-lineages.csv').set_index('UNIQUEID')

# attach the lineage columns to every mutation row of the same sample
MUTATIONS_CORE = (
    pandas.read_csv('data/source-data/cryptic1-mutations.csv')
    .set_index('UNIQUEID')
    .join(LINEAGES_CORE, how='left')
)

print("There are %i samples in the CRyPTIC dataset with both genetics and a pncA phenotype" % len(DST_CORE))

There are 22842 samples in the CRyPTIC dataset with both genetics and a pncA phenotype


If a sample has a mutation in *pncA*, it could have one, two or more than two mutations.

In [8]:
# count the rows (non-null SITEID values) recorded for each sample
MUTATIONS_CORE_COUNTS = (
    MUTATIONS_CORE.groupby(level='UNIQUEID')[['SITEID']]
    .count()
    .rename(columns={'SITEID':'N_MUTATIONS'})
)
MUTATIONS_CORE_COUNTS.N_MUTATIONS.value_counts()

1    6622
2     228
3       1
Name: N_MUTATIONS, dtype: int64

Some of these are likely to be lineage-associated. Let's check whether any of the synonymous mutations are potentially lineage-defining or lineage-associated.

In [9]:
# number of rows in which each mutation is seen
mutation_counts = MUTATIONS_CORE.MUTATION.value_counts()

# synonymous mutations that are seen at least three times
df = MUTATIONS_CORE[
    MUTATIONS_CORE.MUTATION.isin(mutation_counts[mutation_counts>=3].index)
    & MUTATIONS_CORE.IS_SYNONYMOUS
]

# tabulate each of them against the lineage of the samples they occur in
pandas.crosstab(index=df.MUTATION, columns=df.MYKROBE_LINEAGE_NAME_1)

MYKROBE_LINEAGE_NAME_1,Lineage 3,Lineage 4,Lineage 5,Lineage Bovis,Mixed,Unknown
MUTATION,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A38A,0,17,0,0,0,0
G75G,0,5,0,0,0,0
G97G,0,1,2,0,0,0
L117L,3,0,0,0,0,0
L35L,1,1,0,0,0,0
S65S,2872,0,0,0,21,1
S67S,2,0,0,0,0,0
S74S,0,0,0,1,0,0
T100T,2,0,0,0,0,0


The synonymous mutations S65S and A38A are each associated with a single lineage (Lineage 3 and Lineage 4, respectively) and have n>10, so we assume they are lineage-associated and ignore them. This reduces the number of samples with >=1 mutation present.

In [10]:
# drop the two synonymous mutations assumed to be lineage-associated
MUTATIONS_CORE = MUTATIONS_CORE[~MUTATIONS_CORE.MUTATION.isin(['S65S','A38A'])]

# recount the rows (non-null SITEID values) remaining for each sample
MUTATIONS_CORE_COUNTS = (
    MUTATIONS_CORE.groupby(level='UNIQUEID')[['SITEID']]
    .count()
    .rename(columns={'SITEID':'N_MUTATIONS'})
)
MUTATIONS_CORE_COUNTS.N_MUTATIONS.value_counts()

1    3578
2      33
Name: N_MUTATIONS, dtype: int64

Overall we end up with 3,611 samples with DST data and one or more mutations

In [11]:
samples_with_mutations=set(MUTATIONS_CORE.index)
# True for every sample that still has at least one row in MUTATIONS_CORE, False otherwise.
# Building the column in one assignment replaces a partial `.loc` assignment followed by
# `DST_CORE.HAS_MUTATION.fillna(False, inplace=True)`: an in-place fillna called through a
# column accessor is deprecated and does not update the DataFrame under copy-on-write.
# It also makes the cell safe to re-run.
DST_CORE['HAS_MUTATION'] = DST_CORE.index.isin(samples_with_mutations)
DST_CORE.HAS_MUTATION.value_counts()

False    19231
True      3611
Name: HAS_MUTATION, dtype: int64

In [12]:
# presence of a mutation against phenotype; margins=True adds the 'All' row and column
df = pandas.crosstab(
    index=DST_CORE.HAS_MUTATION,
    columns=DST_CORE.PHENOTYPE,
    margins=True,
)
df

PHENOTYPE,R,S,All
HAS_MUTATION,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,684,18547,19231
True,2667,944,3611
All,3351,19491,22842


In [13]:
# percentage of resistant samples among those with (True) and without (False) a mutation
resistant_counts = df['R']
total_counts = df['All']
prop_R_mut = 100*resistant_counts[True] / total_counts[True]
prop_R_nomut = 100*resistant_counts[False] / total_counts[False]
print("%.1f %% of samples with a mutation are resistant, whilst only %.1f %% of samples without a mutation are resistant" % (prop_R_mut, prop_R_nomut))

73.9 % of samples with a mutation are resistant, whilst only 3.6 % of samples without a mutation are resistant


Let's take a quick look at the most common mutations

In [14]:
# the ten most frequently seen mutations
MUTATIONS_CORE['MUTATION'].value_counts().head(10)

H57D     242
a-11g    134
I6L      128
Q10R      98
Q10P      82
H51R      79
G97D      58
L4S       47
H57R      46
V139A     46
Name: MUTATION, dtype: int64

Now we need to identify those samples which have one (and only one) mutation in *pncA*

In [15]:
# attach the number of mutations per sample; samples absent from MUTATIONS_CORE_COUNTS get NaN
DST_CORE=DST_CORE.join(MUTATIONS_CORE_COUNTS,how='left')
# ... and those samples have no mutations. The column is assigned back explicitly because
# `DST_CORE.N_MUTATIONS.fillna(0, inplace=True)` is an in-place call through a column accessor,
# which is deprecated and does not update the DataFrame under copy-on-write.
DST_CORE['N_MUTATIONS'] = DST_CORE['N_MUTATIONS'].fillna(0)
DST_CORE['IS_SOLO']=DST_CORE.N_MUTATIONS==1
# keep the mutation rows of the solo samples only and attach their phenotype and source
SOLOS=MUTATIONS_CORE.join(DST_CORE[DST_CORE.IS_SOLO][['PHENOTYPE', 'SOURCE']],how='inner')
SOLOS[:3]

Unnamed: 0_level_0,GENE,MUTATION,POSITION,AMINO_ACID_NUMBER,GENOME_INDEX,NUCLEOTIDE_NUMBER,REF,ALT,IS_SNP,IS_INDEL,IN_CDS,IN_PROMOTER,IS_SYNONYMOUS,IS_NONSYNONYMOUS,IS_HET,IS_NULL,IS_FILTER_PASS,ELEMENT_TYPE,MUTATION_TYPE,INDEL_LENGTH,INDEL_1,INDEL_2,SITEID,NUMBER_NUCLEOTIDE_CHANGES,MYKROBE_LINEAGE_NAME_1,MYKROBE_LINEAGE_NAME_2,PHENOTYPE,SOURCE
UNIQUEID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1
site.00.subj.1000347.lab.H111540004.iso.1,pncA,Q141P,141.0,141.0,,,cag,ccg,True,False,True,False,False,True,False,False,True,GENE,AAM,,,,0,1,Lineage 2,lineage2.2.10,R,NEJM2018
site.00.subj.1000595.lab.H123460044.iso.1,pncA,D12A,12.0,12.0,,,gac,gcc,True,False,True,False,False,True,False,False,True,GENE,AAM,,,,0,1,Lineage 2,lineage2.2.9,S,NEJM2018
site.00.subj.1004213.lab.H111060034.iso.1,pncA,392_indel,392.0,131.0,2288850.0,392.0,,,False,True,True,False,False,False,False,False,True,GENE,INDEL,2.0,392_ins,392_ins_2,0,0,Lineage 2,lineage2.2.6,R,NEJM2018


Now we can summarise and write to disc

In [16]:
# number of R and S samples for each (mutation, IS_SNP, IN_CDS) combination
SOLO_MUTATIONS_CROSSTAB = pandas.crosstab(
    index=[SOLOS.MUTATION, SOLOS.IS_SNP, SOLOS.IN_CDS],
    columns=SOLOS.PHENOTYPE,
    margins=False,
    margins_name='TOTAL',
)
SOLO_MUTATIONS_CROSSTAB.to_csv('data/clinical-samples/ds-cryptic2021.csv')
SOLO_MUTATIONS_CROSSTAB

Unnamed: 0_level_0,Unnamed: 1_level_0,PHENOTYPE,R,S
MUTATION,IS_SNP,IN_CDS,Unnamed: 3_level_1,Unnamed: 4_level_1
-29_indel,False,False,0,1
-2_indel,False,False,0,12
-32_indel,False,False,0,1
-3_indel,False,False,2,0
-4_indel,False,False,10,1
...,...,...,...,...
g-9a,True,False,0,1
t-10c,True,False,0,2
t-12c,True,False,9,1
t-7c,True,False,5,0


### 2 (b). Miotto2014 data

> Miotto, P., Cabibbe, A. M., Feuerriegel, S., Casali, N., Drobniewski, F., Rodionova, Y., Bakonyte, D., Stakenas, P., Pimkina, E., Augustynowicz-Kopeć, E., Degano, M., Ambrosi, A., Hoffner, S., Mansjö, M., Werngren, J., Rüsch-Gerdes, S., Niemann, S., & Cirillo, D. M. (2014). Mycobacterium tuberculosis pyrazinamide resistance determinants: a multicenter study. mBio, 5(5), e01819-14. https://doi.org/10.1128/mBio.01819-14

Load the raw Excel file from the supplement of this paper.

In [17]:
# header=2: the column names are taken from the third row of the sheet
MIOTTO2014 = pandas.read_excel('./data/source-data/mbo005142032st1.xlsx', header=2)

def lookup_mutation(row):
    """Translate one row of the Miotto2014 table into this notebook's nomenclature.

    Reads the 'pncA aa' and 'pncA nt' columns of `row` and returns a
    pandas.Series [MUTATION, IS_SNP, IN_CDS]:

    - amino acid changes written with 3-letter codes (or 'stop') on either side
      of the position become `<ref><position><alt>` in 1-letter codes, with a
      stop codon written as '!'                        -> [mutation, True, True]
    - rows whose 'pncA aa' mentions 'promoter' and whose 'pncA nt' starts with
      'Ins' or 'Del' become `<position>_indel`         -> [mutation, False, False]
    - other rows whose 'pncA aa' mentions 'promoter' keep 'pncA nt' as given
                                                       -> [mutation, True, False]
    - wild type rows, rows containing a comma (several mutations), the
      'del >200 nt' rows and anything not recognised   -> [None, None, None]

    Raises AssertionError if the reference amino acid of a substitution does not
    match `pncA_amino_acid_sequence`, or if a promoter SNP does not start and
    end with one of a/t/c/g.
    """
    mutation = None
    IS_SNP = None
    IN_CDS = None

    aa = row['pncA aa']

    # nothing to parse for wild type; 'pncA nt' is only inspected for other rows
    if aa == 'WT':
        return pandas.Series([mutation, IS_SNP, IN_CDS])

    nt = row['pncA nt']

    # skipping rows with a comma skips samples with multiple mutations
    if ',' in nt or ',' in aa or nt == 'del >200 nt':
        return pandas.Series([mutation, IS_SNP, IN_CDS])

    # 'stop' is four characters long, so a mutation that *starts* with it can never be
    # recognised by looking up its first three characters. It has to be detected explicitly,
    # otherwise such rows fall through and are silently dropped. Requiring a known amino acid
    # at the end keeps free-text descriptions that merely begin with 'stop' out of this branch.
    starts_with_stop = aa[:4] == 'stop' and aa[-3:] in amino_acid_lookup

    if aa[:3] in amino_acid_lookup or starts_with_stop:
        IS_SNP = True
        IN_CDS = True
        if 'stop' in aa:
            if starts_with_stop:
                mutation = (amino_acid_lookup[aa[:4]] + aa[4:-3] + amino_acid_lookup[aa[-3:]])
            else:
                mutation = (amino_acid_lookup[aa[:3]] + aa[3:-4] + amino_acid_lookup[aa[-4:]])
        else:
            ref = amino_acid_lookup[aa[:3]]
            pos = aa[3:-3]
            alt = amino_acid_lookup[aa[-3:]]
            # sanity check against the reference protein sequence
            assert pncA_amino_acid_sequence[int(pos)-1] == ref, aa

            mutation = (ref + pos + alt)
    elif 'promoter' in aa:
        IN_CDS = False
        if nt[:3] in ['Ins', 'Del']:
            IS_SNP = False
            # the position follows the three-letter 'Ins'/'Del' prefix of the first word
            pos = nt.split(' ')[0][3:]
            mutation = (str(int(pos))+'_indel')
        else:
            IS_SNP = True
            assert nt[0] in ['a','t','c','g'], nt
            assert nt[-1] in ['a','t','c','g'], nt
            mutation = nt

    return pandas.Series([mutation, IS_SNP, IN_CDS])
    
# parse every row, then count the R and S samples for each (mutation, IS_SNP, IN_CDS) combination
MIOTTO2014[['MUTATION', 'IS_SNP', 'IN_CDS']] = MIOTTO2014.apply(lookup_mutation, axis=1)
MIOTTO2014 = MIOTTO2014.rename(columns={'PZA': 'PHENOTYPE'})
MIOTTO2014_MUTATIONS_SUMMARY = pandas.crosstab(
    index=[MIOTTO2014.MUTATION, MIOTTO2014.IS_SNP, MIOTTO2014.IN_CDS],
    columns=MIOTTO2014.PHENOTYPE,
    margins=False,
)
MIOTTO2014_MUTATIONS_SUMMARY

Unnamed: 0_level_0,Unnamed: 1_level_0,PHENOTYPE,R,S
MUTATION,IS_SNP,IN_CDS,Unnamed: 3_level_1,Unnamed: 4_level_1
-3_indel,False,False,1,1
-5_indel,False,False,2,0
A102P,True,True,0,1
A102R,True,True,0,1
A102T,True,True,1,0
...,...,...,...,...
a-11g,True,False,31,4
a-11t,True,False,0,1
g-13t,True,False,0,1
t-7c,True,False,4,0


In [18]:
# total number of samples counted in the summary table (resistant plus susceptible)
n_miotto = sum(MIOTTO2014_MUTATIONS_SUMMARY[phenotype].sum() for phenotype in ('R', 'S'))
print("There are a total of " + str(n_miotto) + " samples in the Miotto2014 dataset")

There are a total of 755 samples in the Miotto2014 dataset


In [19]:
# rows that are SNPs inside the coding sequence, selected from a flattened copy of the summary
# so that the summary itself keeps its (MUTATION, IS_SNP, IN_CDS) index
n_miotto_nonsyn = MIOTTO2014_MUTATIONS_SUMMARY.reset_index().loc[lambda table: (table.IS_SNP) & (table.IN_CDS)]
print("There are " + str(len(n_miotto_nonsyn)) + " non-synoymous mutations in " + str(n_miotto_nonsyn.R.sum() + n_miotto_nonsyn.S.sum()) + " samples")
MIOTTO2014_MUTATIONS_SUMMARY.to_csv('data/clinical-samples/ds-miotto2014.csv',index=True)
MIOTTO2014_MUTATIONS_SUMMARY.head(3)

There are 191 non-synoymous mutations in 704 samples


Unnamed: 0_level_0,Unnamed: 1_level_0,PHENOTYPE,R,S
MUTATION,IS_SNP,IN_CDS,Unnamed: 3_level_1,Unnamed: 4_level_1
-3_indel,False,False,1,1
-5_indel,False,False,2,0
A102P,True,True,0,1


### 2 (c). Whitfield2015

> Whitfield, M. G., Soeters, H. M., Warren, R. M., York, T., Sampson, S. L., Streicher, E. M., Van Helden, P. D., & Van Rie, A. (2015). A global perspective on pyrazinamide resistance: Systematic review and meta-analysis. PLoS ONE, 10(7), 1–16. https://doi.org/10.1371/journal.pone.0133869

Load in the Whitfield data -- this was copied from `zjm999094564so1.pdf` into an Excel sheet, omitting any promoter mutations and indels.

In [20]:
# header=1: the column names are taken from the second row of the sheet
WHITFIELD2015 = pandas.read_excel('./data/source-data/zjm999094564so1.xlsx', header=1)

def parse_whitfield2015(row):
    """Convert a 3-letter amino acid mutation into 1-letter nomenclature.

    `row.MUTATION` is read with trailing whitespace removed; it consists of a
    reference (a 3-letter code or 'stop'), a position and an alternate
    (a 3-letter code or 'stop').

    Returns a pandas.Series [mutation, IS_SNP, IN_CDS] in which mutation is
    `<ref><position><alt>` in 1-letter codes ('!' for a stop codon) and both
    flags are always True.

    Raises AssertionError if, for positions up to 186, the reference does not
    match `pncA_amino_acid_sequence`, or if the reference at a later position
    is not a stop codon.
    """
    mut = row.MUTATION.rstrip()

    # 'stop' is one character longer than an amino acid code, so work out how many
    # characters the reference and alternate occupy before slicing
    if mut[:4] == 'stop':
        ref_width, alt_width = 4, 3
    elif mut[-4:] == 'stop':
        ref_width, alt_width = 3, 4
    else:
        ref_width, alt_width = 3, 3

    ref = amino_acid_lookup[mut[:ref_width]]
    pos = mut[ref_width:-alt_width]
    alt = amino_acid_lookup[mut[-alt_width:]]

    # positions up to 186 are checked against the reference sequence;
    # anything beyond that must have a stop codon as its reference
    if int(pos) > 186:
        assert '!' == ref , mut
    else:
        assert pncA_amino_acid_sequence[int(pos)-1] == ref, mut

    return pandas.Series([ref + pos + alt, True, True])

# convert every mutation to 1-letter nomenclature and add the IS_SNP / IN_CDS flags
WHITFIELD2015[['MUTATION', 'IS_SNP', 'IN_CDS']] = WHITFIELD2015.apply(parse_whitfield2015, axis=1)

# keep the rows with a mutation and only the columns that are written out, indexed like the
# other clinical datasets (verify_integrity=True raises if a combination appears more than once)
WHITFIELD2015 = (
    WHITFIELD2015.loc[WHITFIELD2015.MUTATION.notna(), ['MUTATION', 'IS_SNP', 'IN_CDS', 'R', 'S']]
    .set_index(['MUTATION', 'IS_SNP', 'IN_CDS'], verify_integrity=True)
)
WHITFIELD2015.to_csv('data/clinical-samples/ds-whitfield2015.csv')
print("There are " + str(len(WHITFIELD2015)) + " non-synonymous mutations in " + str(WHITFIELD2015.R.sum() + WHITFIELD2015.S.sum()) + " samples in the Whitfield2015 dataset.")
WHITFIELD2015.head(3)

There are 65 non-synonymous mutations in 634 samples in the Whitfield2015 dataset.


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,R,S
MUTATION,IS_SNP,IN_CDS,Unnamed: 3_level_1,Unnamed: 4_level_1
I5T,True,True,0,1
I6L,True,True,0,128
V9V,True,True,0,1
