# `03-create-dataset-validation`

Read in three sets of clinical samples with mutations and recorded phenotypes to form the VALIDATION dataset

In [29]:
import pandas, copy
import itertools

pandas.options.display.max_columns=999
pandas.options.display.max_rows=170

The CSV files read in below are created by the previous notebook, `0-parse-original-data`.

In [3]:
validation_csvs = ['miotto2014','whitfield2015','cryptic2021']
stem='data/clinical-samples/ds-'
validation_dfs = {}
validation_sets = {}
for i in validation_csvs:
    validation_dfs[i] = pandas.read_csv(stem+i+'.csv')
    validation_dfs[i].set_index('MUTATION', inplace=True)
    validation_sets[i] = set(validation_dfs[i].index)
    validation_dfs[i].reset_index(inplace=True)
    validation_dfs[i].set_index(['MUTATION', 'IS_SNP', 'IN_CDS'], inplace=True)
    validation_dfs[i]['total'] = validation_dfs[i].R + validation_dfs[i].S

print("The number of mutations and samples in each set are:")
for i in validation_csvs:
    print(i, len(validation_sets[i]), validation_dfs[i].total.sum())

print("\n..and the intersection in numbers of mutations between sets are:")
if len(validation_csvs)>1:
    for i in list(itertools.combinations(validation_csvs,2)):
        print(i, len(validation_sets[i[0]].intersection(validation_sets[i[1]])))

print("\n..and the number of mutations in all three sets is:")
if len(validation_csvs)>2:
    for i in list(itertools.combinations(validation_csvs,3)):
        print(i, len(validation_sets[i[0]] & validation_sets[i[1]] & validation_sets[i[2]]))

The number of mutations and samples in each set are:
miotto2014 199 755
whitfield2015 65 634
cryptic2021 561 3578

..and the intersection in numbers of mutations between sets are:
('miotto2014', 'whitfield2015') 41
('miotto2014', 'cryptic2021') 179
('whitfield2015', 'cryptic2021') 47

..and the number of mutations in all three sets is:
('miotto2014', 'whitfield2015', 'cryptic2021') 35


Let's join them all and aggregate the AST results

In [4]:
VALIDATION = validation_dfs['cryptic2021'].join(validation_dfs['miotto2014'], lsuffix='s', rsuffix = 'm', how='outer')
VALIDATION = VALIDATION.join(validation_dfs['whitfield2015'], rsuffix='w',how='outer')
VALIDATION.rename(columns={'R': 'Rw', 'S': 'Sw', 'total': 'totalw'}, inplace=True)
VALIDATION.fillna(0, inplace=True)   
VALIDATION = VALIDATION.astype('int')
VALIDATION['R'] = VALIDATION['Rs'] + VALIDATION['Rm'] + VALIDATION['Rw']
VALIDATION['S'] = VALIDATION['Ss'] + VALIDATION['Sm'] + VALIDATION['Sw']
VALIDATION['TOTAL'] = VALIDATION['R'] + VALIDATION['S']
VALIDATION.drop(columns=['Rs', 'Ss', 'Rm', 'Sm', 'Rw', 'Sw', 'totals', 'totalm', 'totalw'], inplace=True)
VALIDATION.reset_index(inplace=True)
VALIDATION['PROP_R']=VALIDATION['R']/VALIDATION['TOTAL']
VALIDATION['PROP_S']=VALIDATION['S']/VALIDATION['TOTAL']
VALIDATION.columns.name='index'
VALIDATION

index,MUTATION,IS_SNP,IN_CDS,R,S,TOTAL,PROP_R,PROP_S
0,!187G,True,True,1,1,2,0.5,0.5
1,!187R,True,True,0,1,1,0.0,1.0
2,-29_indel,False,False,0,1,1,0.0,1.0
3,-2_indel,False,False,0,12,12,0.0,1.0
4,-32_indel,False,False,0,1,1,0.0,1.0
...,...,...,...,...,...,...,...,...
588,g-9a,True,False,0,1,1,0.0,1.0
589,t-10c,True,False,0,2,2,0.0,1.0
590,t-12c,True,False,9,1,10,0.9,0.1
591,t-7c,True,False,9,0,9,1.0,0.0


In [5]:
print("There are %i samples in total with %i unique mutations" % (VALIDATION.TOTAL.sum(), len(VALIDATION.MUTATION.unique())))

There are 4967 samples in total with 593 unique mutations


Apply the arbitrary rules described in the Methods to assign an overall phenotype 

In [6]:
def reliable_phenotype(row):
    reliable_phenotype=False
    phenotype='U'
    if row['TOTAL']>=4:
        if row['PROP_R']>=0.75:
            reliable_phenotype=True
            phenotype='R'
        elif row['PROP_S']>=0.75:
            reliable_phenotype=True
            phenotype='S'
    elif row['TOTAL']>=2:
        if row['R']==row['TOTAL']:
            reliable_phenotype=True
            phenotype='R'
        elif row['S']==row['TOTAL']:
            reliable_phenotype=True
            phenotype='S'

    return pandas.Series([reliable_phenotype, phenotype]) 

VALIDATION[['RELIABLE_PHENOTYPE', 'CONSISTENT_PHENOTYPE']]=VALIDATION.apply(reliable_phenotype,axis=1)   

print("%i samples cannot be assigned a phenotype" % (VALIDATION[VALIDATION.RELIABLE_PHENOTYPE].TOTAL.sum()))

VALIDATION[VALIDATION.RELIABLE_PHENOTYPE].CONSISTENT_PHENOTYPE.value_counts(dropna=False)

3811 samples cannot be assigned a phenotype


CONSISTENT_PHENOTYPE
R    221
S     65
Name: count, dtype: int64

In [7]:
VALIDATION

index,MUTATION,IS_SNP,IN_CDS,R,S,TOTAL,PROP_R,PROP_S,RELIABLE_PHENOTYPE,CONSISTENT_PHENOTYPE
0,!187G,True,True,1,1,2,0.5,0.5,False,U
1,!187R,True,True,0,1,1,0.0,1.0,False,U
2,-29_indel,False,False,0,1,1,0.0,1.0,False,U
3,-2_indel,False,False,0,12,12,0.0,1.0,True,S
4,-32_indel,False,False,0,1,1,0.0,1.0,False,U
...,...,...,...,...,...,...,...,...,...,...
588,g-9a,True,False,0,1,1,0.0,1.0,False,U
589,t-10c,True,False,0,2,2,0.0,1.0,True,S
590,t-12c,True,False,9,1,10,0.9,0.1,True,R
591,t-7c,True,False,9,0,9,1.0,0.0,True,R


In [8]:
def classify_variant(row):
    is_cds=False
    is_snp=False
    is_nonsyn=False
    is_missense=False
    cols = row.MUTATION.split('_')
    if '-' not in row.MUTATION:
        is_cds=True        
        if len(cols)!=3 and 'indel' not in row.MUTATION:
            is_snp=True
            if row.MUTATION[0]!=row.MUTATION[-1]:
                is_nonsyn=True
                if row.MUTATION[-1]!='!':
                    is_missense=True
    else:
        if len(cols)!=3 and 'indel' not in row.MUTATION:
            is_snp=True
    
    return pandas.Series([is_cds,is_snp,is_nonsyn,is_missense])    

VALIDATION[['IN_CDS', 'IS_SNP', 'IS_NONSYN', 'IS_MISSENSE']] = VALIDATION.apply(classify_variant, axis=1)

pandas.crosstab(VALIDATION.IN_CDS,[VALIDATION.IS_SNP, VALIDATION.IS_NONSYN, VALIDATION.IS_MISSENSE])

IS_SNP,False,True,True,True
IS_NONSYN,False,False,True,True
IS_MISSENSE,False,False,False,True
IN_CDS,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3
False,7,14,0,0
True,144,40,19,369


Hence we find there are 21 promoter mutations (incl. 7 indels) and 144 indels in the CDS, with 40 synoymous mutations, 19 nonsense mutations and 369 missense mutations

In [9]:
print("Table 1: There are %i non-redundant missense mutations in this dataset" % (len(VALIDATION[ VALIDATION.IS_MISSENSE])))

Table 1: There are 369 non-redundant missense mutations in this dataset


But we cannot structurally model (i) mutations in the Stop codon and also (ii) mutations in resid 186 since it is not resolved in the protein structure so let's identify these

In [10]:
def valid_for_structure(row):
    if row.IN_CDS and row.IS_SNP and row.IS_NONSYN and row.IS_MISSENSE:
        if "!" in row.MUTATION:
            return False
        elif '186' in row.MUTATION:
            return False
        else:
            return True
    else:
        return False

VALIDATION['STRUCTURALLY_VALID'] = VALIDATION.apply(valid_for_structure, axis=1)        

But how many of these can we map onto the protein structure?

In [11]:
VALIDATION[(VALIDATION.STRUCTURALLY_VALID) & (VALIDATION.IS_MISSENSE)].CONSISTENT_PHENOTYPE.value_counts(dropna=False)

CONSISTENT_PHENOTYPE
U    168
R    155
S     44
Name: count, dtype: int64

Let's subset down to only those mutations we can train a protein-structure based model on

In [12]:
DATASET = VALIDATION[(VALIDATION.STRUCTURALLY_VALID) & (VALIDATION.IS_MISSENSE)]
DATASET.drop(columns=['STRUCTURALLY_VALID', 'IS_MISSENSE', 'PROP_R', 'PROP_S'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  DATASET.drop(columns=['STRUCTURALLY_VALID', 'IS_MISSENSE', 'PROP_R', 'PROP_S'], inplace=True)


In [13]:
filestem = 'data/ds-validation-mutations'

DATASET.loc[DATASET.CONSISTENT_PHENOTYPE.isin(['R','S','U'])].to_csv(filestem+'-full.csv',index=False)

DATASET.loc[DATASET.CONSISTENT_PHENOTYPE.isin(['R','S'])][['MUTATION','CONSISTENT_PHENOTYPE']].to_csv(filestem+'-phen.csv',index=False)

DATASET.loc[(DATASET.CONSISTENT_PHENOTYPE.isin(['R','S','U']))][['MUTATION']].to_csv(filestem+'-muts.csv',index=False, header=False)

DATASET['SEGID']='A'
DATASET.loc[(DATASET.CONSISTENT_PHENOTYPE.isin(['R','S','U']))][['SEGID','MUTATION']].to_csv(filestem+'-semu.csv',index=False, header=False, sep=' ')

DATASET.CONSISTENT_PHENOTYPE.value_counts(dropna=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  DATASET['SEGID']='A'


CONSISTENT_PHENOTYPE
U    168
R    155
S     44
Name: count, dtype: int64

In [14]:
AGGREGATED_DATASET=copy.deepcopy(VALIDATION[(VALIDATION.STRUCTURALLY_VALID) & (VALIDATION.IS_MISSENSE)])
# AGGREGATED_DATASET.rename(columns={'PHENOTYPE':'CONSISTENT_PHENOTYPE'}, inplace=True)
AGGREGATED_DATASET.set_index(['MUTATION', 'CONSISTENT_PHENOTYPE'], inplace=True)
df = AGGREGATED_DATASET[['R','S']].stack()
DATASET = df.index.repeat(df).to_frame(index=False)
DATASET.rename(columns={'index':'PHENOTYPE'}, inplace=True)
DATASET

Unnamed: 0,MUTATION,CONSISTENT_PHENOTYPE,PHENOTYPE
0,A102P,U,R
1,A102P,U,R
2,A102P,U,R
3,A102P,U,R
4,A102P,U,R
...,...,...,...
4022,Y64S,U,R
4023,Y64S,U,S
4024,Y64S,U,S
4025,Y95D,U,R


In [15]:
pandas.crosstab(DATASET.PHENOTYPE, DATASET.CONSISTENT_PHENOTYPE)

CONSISTENT_PHENOTYPE,R,S,U
PHENOTYPE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
R,2281,31,472
S,202,605,436


In [16]:
filestem = 'data/ds-validation-samples'

DATASET.loc[DATASET.PHENOTYPE.notna()].to_csv(filestem+'-full.csv',index=False)

DATASET.loc[DATASET.PHENOTYPE.notna()][['MUTATION','PHENOTYPE']].to_csv(filestem+'-phen.csv',index=False)

DATASET.loc[(DATASET.PHENOTYPE.notna())][['MUTATION']].to_csv(filestem+'-muts.csv',index=False, header=False)

DATASET['SEGID']='A'
DATASET.loc[(DATASET.PHENOTYPE.notna())][['SEGID','MUTATION']].to_csv(filestem+'-semu.csv',index=False, header=False, sep=' ')

DATASET.PHENOTYPE.value_counts(dropna=False)

PHENOTYPE
R    2784
S    1243
Name: count, dtype: int64

In [17]:
AGGREGATED_DATASET

Unnamed: 0_level_0,index,IS_SNP,IN_CDS,R,S,TOTAL,PROP_R,PROP_S,RELIABLE_PHENOTYPE,IS_NONSYN,IS_MISSENSE,STRUCTURALLY_VALID
MUTATION,CONSISTENT_PHENOTYPE,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
A102P,U,True,True,8,4,12,0.666667,0.333333,False,True,True,True
A102R,S,True,True,0,3,3,0.000000,1.000000,True,True,True,True
A102T,R,True,True,3,0,3,1.000000,0.000000,True,True,True,True
A102V,U,True,True,12,13,25,0.480000,0.520000,False,True,True,True
A134D,U,True,True,0,1,1,0.000000,1.000000,False,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...
Y41C,U,True,True,0,1,1,0.000000,1.000000,False,True,True,True
Y64D,U,True,True,2,4,6,0.333333,0.666667,False,True,True,True
Y64S,U,True,True,1,2,3,0.333333,0.666667,False,True,True,True
Y95D,U,True,True,1,0,1,1.000000,0.000000,False,True,True,True


In [18]:
DATASET.loc[DATASET.PHENOTYPE.notna()]

Unnamed: 0,MUTATION,CONSISTENT_PHENOTYPE,PHENOTYPE,SEGID
0,A102P,U,R,A
1,A102P,U,R,A
2,A102P,U,R,A
3,A102P,U,R,A
4,A102P,U,R,A
...,...,...,...,...
4022,Y64S,U,R,A
4023,Y64S,U,S,A
4024,Y64S,U,S,A
4025,Y95D,U,R,A


In [19]:
AGGREGATED_DATASET.reset_index(inplace=True)

In [20]:
AGGREGATED_DATASET[AGGREGATED_DATASET.CONSISTENT_PHENOTYPE=='U'].TOTAL.sum(), len(AGGREGATED_DATASET[AGGREGATED_DATASET.CONSISTENT_PHENOTYPE=='U'])

(908, 168)

In [21]:
AGGREGATED_DATASET[AGGREGATED_DATASET.CONSISTENT_PHENOTYPE!='U'].TOTAL.sum()

3119

In [22]:

AGGREGATED_DATASET

index,MUTATION,CONSISTENT_PHENOTYPE,IS_SNP,IN_CDS,R,S,TOTAL,PROP_R,PROP_S,RELIABLE_PHENOTYPE,IS_NONSYN,IS_MISSENSE,STRUCTURALLY_VALID
0,A102P,U,True,True,8,4,12,0.666667,0.333333,False,True,True,True
1,A102R,S,True,True,0,3,3,0.000000,1.000000,True,True,True,True
2,A102T,R,True,True,3,0,3,1.000000,0.000000,True,True,True,True
3,A102V,U,True,True,12,13,25,0.480000,0.520000,False,True,True,True
4,A134D,U,True,True,0,1,1,0.000000,1.000000,False,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
362,Y41C,U,True,True,0,1,1,0.000000,1.000000,False,True,True,True
363,Y64D,U,True,True,2,4,6,0.333333,0.666667,False,True,True,True
364,Y64S,U,True,True,1,2,3,0.333333,0.666667,False,True,True,True
365,Y95D,U,True,True,1,0,1,1.000000,0.000000,False,True,True,True


In [56]:
traintest = pandas.read_csv('data/ds-traintest-phen.csv')

In [60]:
TRAINTEST = set(traintest.MUTATION)
VALIDATION = set(AGGREGATED_DATASET.MUTATION)
VALIDATIONNOU = set(AGGREGATED_DATASET[AGGREGATED_DATASET.CONSISTENT_PHENOTYPE!='U'].MUTATION)

In [61]:
len(TRAINTEST), len(VALIDATION), len(VALIDATIONNOU)

(664, 367, 199)

In [63]:
len(TRAINTEST.intersection(VALIDATION)), len(TRAINTEST.intersection(VALIDATIONNOU))

(266, 169)

In [64]:
AGGREGATED_DATASET[AGGREGATED_DATASET.MUTATION.isin(TRAINTEST.intersection(VALIDATIONNOU))].TOTAL.sum()

2958

In [27]:
AGGREGATED_DATASET[~AGGREGATED_DATASET.MUTATION.isin(TRAINTEST.intersection(VALIDATION))].TOTAL.sum()

360

In [51]:
AGGREGATED_DATASET[(AGGREGATED_DATASET.CONSISTENT_PHENOTYPE=='U') & (AGGREGATED_DATASET.TOTAL>=4)].TOTAL.sum()

748

In [55]:
AGGREGATED_DATASET[(AGGREGATED_DATASET.CONSISTENT_PHENOTYPE=='U') & (AGGREGATED_DATASET.TOTAL<4)].TOTAL.sum()

160

In [67]:
DATASET[:3]

Unnamed: 0,MUTATION,CONSISTENT_PHENOTYPE,PHENOTYPE,SEGID
0,A102P,U,R,A
1,A102P,U,R,A
2,A102P,U,R,A


In [69]:
DATASET[DATASET.MUTATION.isin(TRAINTEST.intersection(VALIDATIONNOU))]

Unnamed: 0,MUTATION,CONSISTENT_PHENOTYPE,PHENOTYPE,SEGID
47,A134V,R,R,A
48,A134V,R,R,A
49,A134V,R,R,A
50,A134V,R,R,A
51,A134V,R,R,A
...,...,...,...,...
4009,Y34D,R,R,A
4010,Y34D,R,R,A
4011,Y34D,R,R,A
4012,Y34D,R,S,A


In [74]:
traintest.rename(columns={'CONSISTENT_PHENOTYPE': 'TRAINTEST_PHENOTYPE'}, inplace=True)
traintest

Unnamed: 0,MUTATION,TRAINTEST_PHENOTYPE
0,A102V,S
1,A134D,S
2,A134P,R
3,A134S,S
4,A134V,R
...,...,...
659,Y95N,S
660,Y99C,S
661,Y99D,S
662,Y99F,S


In [82]:
AGGREGATED_DATASET.set_index('MUTATION', inplace=True)

In [75]:
traintest.set_index('MUTATION', inplace=True)
DATASET.set_index('MUTATION', inplace=True)

In [83]:
df = AGGREGATED_DATASET[AGGREGATED_DATASET.index.isin(TRAINTEST.intersection(VALIDATIONNOU))]
df = df.join(traintest,how='left')
pandas.crosstab(df.CONSISTENT_PHENOTYPE, df.TRAINTEST_PHENOTYPE)

TRAINTEST_PHENOTYPE,R,S
CONSISTENT_PHENOTYPE,Unnamed: 1_level_1,Unnamed: 2_level_1
R,138,2
S,6,23


In [85]:
df[:3]

Unnamed: 0_level_0,CONSISTENT_PHENOTYPE,IS_SNP,IN_CDS,R,S,TOTAL,PROP_R,PROP_S,RELIABLE_PHENOTYPE,IS_NONSYN,IS_MISSENSE,STRUCTURALLY_VALID,TRAINTEST_PHENOTYPE
MUTATION,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
A134V,R,True,True,23,2,25,0.92,0.08,True,True,True,True,R
A143G,R,True,True,6,0,6,1.0,0.0,True,True,True,True,R
A143V,S,True,True,0,6,6,0.0,1.0,True,True,True,True,S


In [86]:
DATASET.CONSISTENT_PHENOTYPE.value_counts()

CONSISTENT_PHENOTYPE
R    2483
U     908
S     636
Name: count, dtype: int64