# `1-create-dataset-testtrain`

Read in two catalogues of mutations with assigned phenotypic effects and one in vitro study to form the TRAIN/TEST dataset

In [1]:
import sbmlcore, pandas, numpy, copy
import itertools

pandas.options.display.max_columns=999

The CSV files read in below are created by the previous notebook, `0-parse-original-data`.

In [2]:
filestem = 'data/ds-traintest'

testtrain_dfs = {}
testtrain_sets = {}
testtrain_csvs=[]

testtrain_csvs.append('yadon')
testtrain_csvs.append('who')
testtrain_csvs.append('nejm')

testtrain_dfs['yadon'] = pandas.read_csv('data/catalogues/ds-yadon2017.csv')
testtrain_dfs['yadon'].set_index('MUTATION', inplace=True)
testtrain_dfs['yadon'].rename(columns={'YADON_RESULT':'YADON_PHENOTYPE'}, inplace=True)
testtrain_sets['yadon'] = set(testtrain_dfs['yadon'].index)

testtrain_dfs['who'] = pandas.read_csv('data/catalogues/ds-who2021.csv')
testtrain_dfs['who'].set_index('MUTATION', inplace=True)
testtrain_dfs['who'].rename(columns={'PREDICTION':'WHO_PHENOTYPE'}, inplace=True)
testtrain_sets['who'] = set(testtrain_dfs['who'].index)

testtrain_dfs['nejm'] = pandas.read_csv('data/catalogues/ds-nejm2018.csv')
testtrain_dfs['nejm'].set_index('MUTATION', inplace=True)
testtrain_dfs['nejm'].rename(columns={'PREDICTION':'NEJM_PHENOTYPE'}, inplace=True)
testtrain_sets['nejm'] = set(testtrain_dfs['nejm'].index)

Let's do some simple statistics of how many samples each has etc and what the overlaps are

In [3]:
df_set = {}
for dataset in ['yadon', 'who', 'nejm']:

    df = testtrain_dfs[dataset]
    n_total = len(df)
    n_r = df[dataset.upper() + '_PHENOTYPE'].value_counts(dropna=False).R
    n_s = df[dataset.upper() + '_PHENOTYPE'].value_counts(dropna=False).S
    df_set[dataset] = set(df.index)

    print("The %s dataset contains %i samples of which %i are resistant and %i susceptible." % (dataset, n_total, n_r, n_s))

The yadon dataset contains 781 samples of which 351 are resistant and 430 susceptible.
The who dataset contains 322 samples of which 306 are resistant and 16 susceptible.
The nejm dataset contains 437 samples of which 398 are resistant and 39 susceptible.


In [4]:
total = 0
n = len(df_set['yadon'] & df_set['who'] & df_set['nejm'])
total += n
print("%i samples occured in all three datasets" % n)

n = len(df_set['yadon'] & df_set['who'] - df_set['nejm'])
total += n
print("%i samples occured in yadon and who but not nejm" % n)

n = len(df_set['yadon'] & df_set['nejm'] - df_set['who'])
total += n
print("%i samples occured in yadon and nejm but not who" % n)

n = len(df_set['who'] & df_set['nejm'] - df_set['yadon'])
total += n
print("%i samples occured in nejm and who but not yadon" % n)

n = len(df_set['yadon'] - df_set['who'] - df_set['nejm'])
total += n
print("%i samples occured in yadon and not in either of who and nejm" % n)

n = len(df_set['who'] - df_set['yadon'] - df_set['nejm'])
total += n
print("%i samples occured in who and not in either of yadon and nejm" % n)

n = len(df_set['nejm'] - df_set['yadon'] - df_set['who'])
total += n
print("%i samples occured in nejm and not in either of who and yadon" % n)

print("Overall there were %i samples" % total)

117 samples occured in all three datasets
4 samples occured in yadon and who but not nejm
242 samples occured in yadon and nejm but not who
42 samples occured in nejm and who but not yadon
418 samples occured in yadon and not in either of who and nejm
159 samples occured in who and not in either of yadon and nejm
36 samples occured in nejm and not in either of who and yadon
Overall there were 1018 samples


To reduce errors, we will only keep mutations where there is no disagreement between any of the three catalogues

In [5]:
TESTTRAIN=None
for i in testtrain_csvs:
    if TESTTRAIN is None:
        TESTTRAIN = copy.deepcopy(testtrain_dfs[i])
    else:
        TESTTRAIN = TESTTRAIN.join(testtrain_dfs[i], how='outer')    

number_definites = len(testtrain_csvs)

def consistent_phenotype(row):
    yadon = row.YADON_PHENOTYPE
    who = row.WHO_PHENOTYPE
    nejm = row.NEJM_PHENOTYPE

    if number_definites==1:
        if use_yadon:
            return yadon
        elif use_who:
            return who
        elif use_nejm:
            return nejm

    # count how many Nones there are    
    number_of_nones = 0
    if isinstance(yadon, float):
        number_of_nones+=1
    if isinstance(who, float):
        number_of_nones+=1
    if isinstance(nejm, float):
        number_of_nones+=1

    # if all three are None, no result is possible
    if number_of_nones==3:
        return None

    # there are 2 Nones so any string is the result
    elif number_of_nones==2:
        if isinstance(yadon,str):
            return(yadon)
        elif isinstance(who,str):
            return(who)
        elif isinstance(nejm,str):
            return(nejm)

    # there is only 1 None, so either there are two datasets, in which case check for concordance
    elif number_of_nones==1:
        if isinstance(yadon,float):
            if who==nejm:
                return who
        elif isinstance(who,float):
            if yadon==nejm:
                return yadon
        elif isinstance(nejm,float):
            if yadon==who:
                return yadon
    
    # if there are three results, only permit exact concordance
    elif number_of_nones==0:
        if yadon==who==nejm:
            return yadon

TESTTRAIN['CONSISTENT_PHENOTYPE'] = TESTTRAIN.apply(consistent_phenotype, axis=1)    
TESTTRAIN.CONSISTENT_PHENOTYPE.value_counts(dropna=False)


R       546
S       447
None     25
Name: CONSISTENT_PHENOTYPE, dtype: int64

There are 25 samples with inconsistent phenotypes between the three catalogues

In [6]:
TESTTRAIN[TESTTRAIN.CONSISTENT_PHENOTYPE.isna()]

Unnamed: 0_level_0,YADON_PHENOTYPE,WHO_PHENOTYPE,NEJM_PHENOTYPE,CONSISTENT_PHENOTYPE
MUTATION,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
145_indel,,R,S,
189_indel,,R,S,
231_indel,,R,S,
395_indel,,R,S,
47_indel,,R,S,
A102P,R,R,S,
A146T,R,R,S,
D129N,R,,S,
D136N,,S,R,
D53E,R,,S,


Let's remove these 25 samples as we cannot be sure what phenotype to give them

In [7]:
DATASET = TESTTRAIN[TESTTRAIN.CONSISTENT_PHENOTYPE.notna()]
DATASET.CONSISTENT_PHENOTYPE.value_counts(dropna=False)

R    546
S    447
Name: CONSISTENT_PHENOTYPE, dtype: int64

With the remainder, let's classify the types of mutations we have

In [8]:
def classify_variant(row):
    is_cds=False
    is_snp=False
    is_nonsyn=False
    is_missense=False
    cols = row.MUTATION.split('_')
    if '-' not in row.MUTATION:
        is_cds=True        
        if len(cols)!=3 and 'indel' not in row.MUTATION:
            is_snp=True
            if row.MUTATION[0]!=row.MUTATION[-1]:
                is_nonsyn=True
                if row.MUTATION[-1]!='!':
                    is_missense=True
    else:
        if len(cols)!=3 and 'indel' not in row.MUTATION:
            is_snp=True
    
    return pandas.Series([is_cds,is_snp,is_nonsyn,is_missense]) 

DATASET.reset_index(inplace=True)
DATASET[['IN_CDS', 'IS_SNP', 'IS_NONSYN', 'IS_MISSENSE']] = DATASET.apply(classify_variant, axis=1)
DATASET.set_index('MUTATION',inplace=True)

pandas.crosstab(DATASET.IN_CDS,[DATASET.IS_SNP, DATASET.IS_NONSYN, DATASET.IS_MISSENSE])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  DATASET[['IN_CDS', 'IS_SNP', 'IS_NONSYN', 'IS_MISSENSE']] = DATASET.apply(classify_variant, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  DATASET[['IN_CDS', 'IS_SNP', 'IS_NONSYN', 'IS_MISSENSE']] = DATASET.apply(classify_variant, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  DATAS

IS_SNP,False,True,True,True
IS_NONSYN,False,False,True,True
IS_MISSENSE,False,False,False,True
IN_CDS,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3
False,4,12,0,0
True,132,144,33,668


In [9]:
print("Table 1: There are %i non-redundant missense mutations in this dataset" % (len(DATASET[ DATASET.IS_MISSENSE])))

Table 1: There are 668 non-redundant missense mutations in this dataset


But we cannot structurally model (i) mutations in the Stop codon and also (ii) mutations in resid 186 since it is not resolved in the protein structure

In [10]:
DATASET.reset_index(inplace=True)

def valid_for_structure(row):
    if row.IS_MISSENSE:
        if "!" in row.MUTATION:
            return False
        elif '186' in row.MUTATION:
            return False
        else:
            return True
    else:
        return False

DATASET['STRUCTURALLY_VALID'] = DATASET.apply(valid_for_structure, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  DATASET['STRUCTURALLY_VALID'] = DATASET.apply(valid_for_structure, axis=1)


In [11]:
print("There are %i non-redundant missense mutations but only %i can be mapped onto the protein structure" % (len(DATASET[DATASET.IS_MISSENSE]), len(DATASET[(DATASET.IS_MISSENSE) & (DATASET.STRUCTURALLY_VALID)])))

There are 668 non-redundant missense mutations but only 664 can be mapped onto the protein structure


Let's subset down to those we can model onto the structure and write to disc

In [12]:
DATASET = DATASET[(DATASET.STRUCTURALLY_VALID) & (DATASET.IS_MISSENSE)]
DATASET.drop(columns=['STRUCTURALLY_VALID'],inplace=True)

DATASET.to_csv(filestem+'-full.csv',index=False)

DATASET[['MUTATION','CONSISTENT_PHENOTYPE']].to_csv(filestem+'-phen.csv',index=False)

DATASET[['MUTATION']].to_csv(filestem+'-muts.csv',index=False, header=False)

DATASET['SEGID']='A'
DATASET[['SEGID','MUTATION']].to_csv(filestem+'-semu.csv',index=False, header=False, sep=' ')

print("This leaves %i non-redundant missense mutations in this dataset" % (len(DATASET)) )

DATASET.CONSISTENT_PHENOTYPE.value_counts(dropna=False)

This leaves 664 non-redundant missense mutations in this dataset


R    349
S    315
Name: CONSISTENT_PHENOTYPE, dtype: int64

In [13]:
DATASET

Unnamed: 0,MUTATION,YADON_PHENOTYPE,WHO_PHENOTYPE,NEJM_PHENOTYPE,CONSISTENT_PHENOTYPE,IN_CDS,IS_SNP,IS_NONSYN,IS_MISSENSE,SEGID
138,A102V,S,,,S,True,True,True,True,A
140,A134D,,,S,S,True,True,True,True,A
141,A134P,R,,R,R,True,True,True,True,A
142,A134S,S,,,S,True,True,True,True,A
143,A134V,R,R,R,R,True,True,True,True,A
...,...,...,...,...,...,...,...,...,...,...
974,Y95N,S,,,S,True,True,True,True,A
977,Y99C,S,,,S,True,True,True,True,A
978,Y99D,S,,,S,True,True,True,True,A
979,Y99F,S,,,S,True,True,True,True,A
