In [20]:
import sbmlcore, pandas, numpy, copy
import itertools

pandas.options.display.max_columns=999
pandas.options.display.max_rows=150

## Read in three sets of clinical samples with mutations and recorded phenotypes to form the VALIDATION dataset

In [21]:
filestem = 'data/ds-validation'

validation_csvs = ['miotto2014','whitfield2015','cryptic2021']
stem='data/clinical-samples/ds-'
validation_dfs = {}
validation_sets = {}
for i in validation_csvs:
    validation_dfs[i] = pandas.read_csv(stem+i+'.csv')
    validation_dfs[i].set_index('MUTATION', inplace=True)
    validation_sets[i] = set(validation_dfs[i].index)
    validation_dfs[i].reset_index(inplace=True)
    validation_dfs[i].set_index(['MUTATION', 'IS_SNP', 'IN_CDS'], inplace=True)
    validation_dfs[i]['total'] = validation_dfs[i].R + validation_dfs[i].S

print("The number of mutations and samples in each set are:")
for i in validation_csvs:
    print(i, len(validation_sets[i]), validation_dfs[i].total.sum())

print("\n..and the intersection in numbers of mutations between sets are:")
if len(validation_csvs)>1:
    for i in list(itertools.combinations(validation_csvs,2)):
        print(i, len(validation_sets[i[0]].intersection(validation_sets[i[1]])))

print("\n..and the number of mutations in all three sets is:")
if len(validation_csvs)>2:
    for i in list(itertools.combinations(validation_csvs,3)):
        print(i, len(validation_sets[i[0]] & validation_sets[i[1]] & validation_sets[i[2]]))

The number of mutations and samples in each set are:
miotto2014 199 755
whitfield2015 65 634
cryptic2021 561 3578

..and the intersection in numbers of mutations between sets are:
('miotto2014', 'whitfield2015') 41
('miotto2014', 'cryptic2021') 179
('whitfield2015', 'cryptic2021') 47

..and the number of mutations in all three sets is:
('miotto2014', 'whitfield2015', 'cryptic2021') 35


Let's join them all and aggregate the AST results

In [22]:
VALIDATION = validation_dfs['cryptic2021'].join(validation_dfs['miotto2014'], lsuffix='s', rsuffix = 'm', how='outer')
VALIDATION = VALIDATION.join(validation_dfs['whitfield2015'], rsuffix='w',how='outer')
VALIDATION.rename(columns={'R': 'Rw', 'S': 'Sw', 'total': 'totalw'}, inplace=True)
VALIDATION.fillna(0, inplace=True)   
VALIDATION = VALIDATION.astype('int')
VALIDATION['R'] = VALIDATION['Rs'] + VALIDATION['Rm'] + VALIDATION['Rw']
VALIDATION['S'] = VALIDATION['Ss'] + VALIDATION['Sm'] + VALIDATION['Sw']
VALIDATION['TOTAL'] = VALIDATION['R'] + VALIDATION['S']
VALIDATION.drop(columns=['Rs', 'Ss', 'Rm', 'Sm', 'Rw', 'Sw', 'totals', 'totalm', 'totalw'], inplace=True)
VALIDATION.reset_index(inplace=True)
VALIDATION['PROP_R']=VALIDATION['R']/VALIDATION['TOTAL']
VALIDATION['PROP_S']=VALIDATION['S']/VALIDATION['TOTAL']
VALIDATION.columns.name='index'
VALIDATION

index,MUTATION,IS_SNP,IN_CDS,R,S,TOTAL,PROP_R,PROP_S
0,!187G,True,True,1,1,2,0.5,0.5
1,!187R,True,True,0,1,1,0.0,1.0
2,-29_indel,False,False,0,1,1,0.0,1.0
3,-2_indel,False,False,0,12,12,0.0,1.0
4,-32_indel,False,False,0,1,1,0.0,1.0
...,...,...,...,...,...,...,...,...
588,g-9a,True,False,0,1,1,0.0,1.0
589,t-10c,True,False,0,2,2,0.0,1.0
590,t-12c,True,False,9,1,10,0.9,0.1
591,t-7c,True,False,9,0,9,1.0,0.0


Apply the arbitrary rules described in the Methods to assign an overall phenotype 

In [23]:
def reliable_phenotype(row):
    reliable_phenotype=False
    phenotype='U'
    if row['TOTAL']>=4:
        if row['PROP_R']>=0.75:
            reliable_phenotype=True
            phenotype='R'
        elif row['PROP_S']>=0.75:
            reliable_phenotype=True
            phenotype='S'
    elif row['TOTAL']>=2:
        if row['R']==row['TOTAL']:
            reliable_phenotype=True
            phenotype='R'
        elif row['S']==row['TOTAL']:
            reliable_phenotype=True
            phenotype='S'

    return pandas.Series([reliable_phenotype, phenotype]) 

VALIDATION[['RELIABLE_PHENOTYPE', 'PHENOTYPE']]=VALIDATION.apply(reliable_phenotype,axis=1)   

print("%i samples cannot be assigned a phenotype" % ((~VALIDATION.RELIABLE_PHENOTYPE).sum()))

VALIDATION[VALIDATION.RELIABLE_PHENOTYPE].PHENOTYPE.value_counts()

307 samples cannot be assigned a phenotype


R    221
S     65
Name: PHENOTYPE, dtype: int64

Let's take a look at the mutations that cannot be assigned a phenotype using these rules

In [24]:
VALIDATION[~VALIDATION.RELIABLE_PHENOTYPE][:10]

index,MUTATION,IS_SNP,IN_CDS,R,S,TOTAL,PROP_R,PROP_S,RELIABLE_PHENOTYPE,PHENOTYPE
0,!187G,True,True,1,1,2,0.5,0.5,False,U
1,!187R,True,True,0,1,1,0.0,1.0,False,U
2,-29_indel,False,False,0,1,1,0.0,1.0,False,U
4,-32_indel,False,False,0,1,1,0.0,1.0,False,U
8,-9_indel,False,False,0,1,1,0.0,1.0,False,U
9,102_indel,False,True,1,0,1,1.0,0.0,False,U
10,108_indel,False,True,1,0,1,1.0,0.0,False,U
11,109_indel,False,True,1,0,1,1.0,0.0,False,U
12,116_indel,False,True,1,0,1,1.0,0.0,False,U
13,117_indel,False,True,1,0,1,1.0,0.0,False,U


Many of them only have a single measurement:

In [25]:
VALIDATION[~VALIDATION.RELIABLE_PHENOTYPE].TOTAL.value_counts().sort_index()

1     229
2      14
3      16
4       3
5       2
6       8
7       3
8       3
9       3
10      2
11      1
12      2
13      1
14      1
15      2
17      1
18      1
19      1
20      1
22      1
23      2
24      1
25      2
27      1
29      1
51      1
55      1
59      1
76      1
84      1
Name: TOTAL, dtype: int64

Looking at the mutations with more than one sample we find an ambigious distribution of R and S results suggesting perhaps these are close to the MGIT breakpoint.

In [26]:
VALIDATION[(~VALIDATION.RELIABLE_PHENOTYPE) & (VALIDATION.TOTAL>10)]

index,MUTATION,IS_SNP,IN_CDS,R,S,TOTAL,PROP_R,PROP_S,RELIABLE_PHENOTYPE,PHENOTYPE
153,A102P,True,True,8,4,12,0.666667,0.333333,False,U
156,A102V,True,True,12,13,25,0.48,0.52,False,U
173,A171T,True,True,4,8,12,0.333333,0.666667,False,U
186,A46V,True,True,13,6,19,0.684211,0.315789,False,U
194,C14G,True,True,8,9,17,0.470588,0.529412,False,U
207,D12G,True,True,17,6,23,0.73913,0.26087,False,U
213,D136G,True,True,11,13,24,0.458333,0.541667,False,U
229,D63G,True,True,6,7,13,0.461538,0.538462,False,U
256,F58L,True,True,14,13,27,0.518519,0.481481,False,U
263,F94L,True,True,17,6,23,0.73913,0.26087,False,U


Let's remove all these samples where a phenotype cannot be inferred using the arbitary rules

In [27]:
VALIDATION = VALIDATION[VALIDATION.RELIABLE_PHENOTYPE]

In [28]:
def classify_variant(row):
    is_cds=False
    is_snp=False
    is_nonsyn=False
    is_missense=False
    cols = row.MUTATION.split('_')
    if '-' not in row.MUTATION:
        is_cds=True        
        if len(cols)!=3 and 'indel' not in row.MUTATION:
            is_snp=True
            if row.MUTATION[0]!=row.MUTATION[-1]:
                is_nonsyn=True
                if row.MUTATION[-1]!='!':
                    is_missense=True
    else:
        if len(cols)!=3 and 'indel' not in row.MUTATION:
            is_snp=True
    
    return pandas.Series([is_cds,is_snp,is_nonsyn,is_missense])    

VALIDATION[['IN_CDS', 'IS_SNP', 'IS_NONSYN', 'IS_MISSENSE']] = VALIDATION.apply(classify_variant, axis=1)

pandas.crosstab(VALIDATION.IN_CDS,[VALIDATION.IS_SNP, VALIDATION.IS_NONSYN, VALIDATION.IS_MISSENSE])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  VALIDATION[['IN_CDS', 'IS_SNP', 'IS_NONSYN', 'IS_MISSENSE']] = VALIDATION.apply(classify_variant, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  VALIDATION[['IN_CDS', 'IS_SNP', 'IS_NONSYN', 'IS_MISSENSE']] = VALIDATION.apply(classify_variant, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-

IS_SNP,False,True,True,True
IS_NONSYN,False,False,True,True
IS_MISSENSE,False,False,False,True
IN_CDS,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3
False,4,9,0,0
True,46,15,13,199


Hence we find there are 12 promoter mutations (incl. 4 indels) and 45 indels in the CDS, with 12 synoymous mutations, 13 nonsense mutations and 195 missense mutations

In [29]:
print("Table 1: There are %i non-redundant missense mutations in this dataset" % (len(VALIDATION[ VALIDATION.IS_MISSENSE])))

Table 1: There are 199 non-redundant missense mutations in this dataset


But we cannot structurally model (i) mutations in the Stop codon and also (ii) mutations in resid 186 since it is not resolved in the protein structure so let's identify these

In [30]:
def valid_for_structure(row):
    if row.IN_CDS and row.IS_SNP and row.IS_NONSYN and row.IS_MISSENSE:
        if "!" in row.MUTATION:
            return False
        elif '186' in row.MUTATION:
            return False
        else:
            return True
    else:
        return False

VALIDATION['STRUCTURALLY_VALID'] = VALIDATION.apply(valid_for_structure, axis=1)        

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  VALIDATION['STRUCTURALLY_VALID'] = VALIDATION.apply(valid_for_structure, axis=1)


In [31]:
VALIDATION[ VALIDATION.IS_MISSENSE].PHENOTYPE.value_counts()

R    155
S     44
Name: PHENOTYPE, dtype: int64

But how many of these can we not map onto the protein structure?

In [32]:
VALIDATION[(VALIDATION.STRUCTURALLY_VALID) & (VALIDATION.IS_MISSENSE)].PHENOTYPE.value_counts(dropna=False)

R    155
S     44
Name: PHENOTYPE, dtype: int64

None are lost for this dataset, but let's subset down anyway

In [33]:
DATASET=VALIDATION[(VALIDATION.RELIABLE_PHENOTYPE) & (VALIDATION.STRUCTURALLY_VALID) & (VALIDATION.IS_MISSENSE)]
DATASET.rename(columns={'PHENOTYPE':'CONSISTENT_PHENOTYPE'}, inplace=True)
DATASET

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  DATASET.rename(columns={'PHENOTYPE':'CONSISTENT_PHENOTYPE'}, inplace=True)


index,MUTATION,IS_SNP,IN_CDS,R,S,TOTAL,PROP_R,PROP_S,RELIABLE_PHENOTYPE,CONSISTENT_PHENOTYPE,IS_NONSYN,IS_MISSENSE,STRUCTURALLY_VALID
154,A102R,True,True,0,3,3,0.000000,1.000000,True,S,True,True,True
155,A102T,True,True,3,0,3,1.000000,0.000000,True,R,True,True,True
159,A134V,True,True,23,2,25,0.920000,0.080000,True,R,True,True,True
161,A143G,True,True,6,0,6,1.000000,0.000000,True,R,True,True,True
163,A143T,True,True,1,6,7,0.142857,0.857143,True,S,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
559,W68S,True,True,2,0,2,1.000000,0.000000,True,R,True,True,True
562,Y103D,True,True,4,0,4,1.000000,0.000000,True,R,True,True,True
563,Y103H,True,True,5,0,5,1.000000,0.000000,True,R,True,True,True
564,Y103S,True,True,2,0,2,1.000000,0.000000,True,R,True,True,True


In [34]:
DATASET.loc[DATASET.CONSISTENT_PHENOTYPE.notna()].to_csv(filestem+'-full.csv',index=False)

DATASET.loc[DATASET.CONSISTENT_PHENOTYPE.notna()][['MUTATION','CONSISTENT_PHENOTYPE']].to_csv(filestem+'-phen.csv',index=False)

DATASET.loc[(DATASET.CONSISTENT_PHENOTYPE.notna())][['MUTATION']].to_csv(filestem+'-muts.csv',index=False, header=False)

DATASET['SEGID']='A'
DATASET.loc[(DATASET.CONSISTENT_PHENOTYPE.notna())][['SEGID','MUTATION']].to_csv(filestem+'-semu.csv',index=False, header=False, sep=' ')

DATASET.CONSISTENT_PHENOTYPE.value_counts(dropna=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  DATASET['SEGID']='A'


R    155
S     44
Name: CONSISTENT_PHENOTYPE, dtype: int64