In [1]:
import sbmlcore, pandas, numpy, copy
import itertools

pandas.options.display.max_columns=999
pandas.options.display.max_rows=150

## Read in three sets of clinical samples with mutations and recorded phenotypes to form the VALIDATION dataset

In [2]:
filestem = 'data/ds-validation'

validation_csvs = ['miotto2014','whitfield2015','cryptic2021']
stem='data/clinical-samples/ds-'
validation_dfs = {}
validation_sets = {}
for i in validation_csvs:
    validation_dfs[i] = pandas.read_csv(stem+i+'.csv')
    validation_dfs[i].set_index('MUTATION', inplace=True)
    validation_sets[i] = set(validation_dfs[i].index)
    validation_dfs[i].reset_index(inplace=True)
    validation_dfs[i].set_index(['MUTATION', 'IS_SNP', 'IN_CDS'], inplace=True)
    validation_dfs[i]['total'] = validation_dfs[i].R + validation_dfs[i].S

print("The number of mutations and samples in each set are:")
for i in validation_csvs:
    print(i, len(validation_sets[i]), validation_dfs[i].total.sum())

print("\n..and the intersection in numbers of mutations between sets are:")
if len(validation_csvs)>1:
    for i in list(itertools.combinations(validation_csvs,2)):
        print(i, len(validation_sets[i[0]].intersection(validation_sets[i[1]])))

print("\n..and the number of mutations in all three sets is:")
if len(validation_csvs)>2:
    for i in list(itertools.combinations(validation_csvs,3)):
        print(i, len(validation_sets[i[0]] & validation_sets[i[1]] & validation_sets[i[2]]))

The number of mutations and samples in each set are:
miotto2014 199 755
whitfield2015 65 634
cryptic2021 561 3578

..and the intersection in numbers of mutations between sets are:
('miotto2014', 'whitfield2015') 41
('miotto2014', 'cryptic2021') 179
('whitfield2015', 'cryptic2021') 47

..and the number of mutations in all three sets is:
('miotto2014', 'whitfield2015', 'cryptic2021') 35


Let's join them all and aggregate the AST results

In [3]:
VALIDATION = validation_dfs['cryptic2021'].join(validation_dfs['miotto2014'], lsuffix='s', rsuffix = 'm', how='outer')
VALIDATION = VALIDATION.join(validation_dfs['whitfield2015'], rsuffix='w',how='outer')
VALIDATION.rename(columns={'R': 'Rw', 'S': 'Sw', 'total': 'totalw'}, inplace=True)
VALIDATION.fillna(0, inplace=True)   
VALIDATION = VALIDATION.astype('int')
VALIDATION['R'] = VALIDATION['Rs'] + VALIDATION['Rm'] + VALIDATION['Rw']
VALIDATION['S'] = VALIDATION['Ss'] + VALIDATION['Sm'] + VALIDATION['Sw']
VALIDATION['TOTAL'] = VALIDATION['R'] + VALIDATION['S']
VALIDATION.drop(columns=['Rs', 'Ss', 'Rm', 'Sm', 'Rw', 'Sw', 'totals', 'totalm', 'totalw'], inplace=True)
VALIDATION.reset_index(inplace=True)
VALIDATION['PROP_R']=VALIDATION['R']/VALIDATION['TOTAL']
VALIDATION['PROP_S']=VALIDATION['S']/VALIDATION['TOTAL']
VALIDATION.columns.name='index'
VALIDATION

index,MUTATION,IS_SNP,IN_CDS,R,S,TOTAL,PROP_R,PROP_S
0,!187G,True,True,1,1,2,0.5,0.5
1,!187R,True,True,0,1,1,0.0,1.0
2,-29_indel,False,False,0,1,1,0.0,1.0
3,-2_indel,False,False,0,12,12,0.0,1.0
4,-32_indel,False,False,0,1,1,0.0,1.0
...,...,...,...,...,...,...,...,...
588,g-9a,True,False,0,1,1,0.0,1.0
589,t-10c,True,False,0,2,2,0.0,1.0
590,t-12c,True,False,9,1,10,0.9,0.1
591,t-7c,True,False,9,0,9,1.0,0.0


In [4]:
VALIDATION.TOTAL.sum()

4967

Apply the arbitrary rules described in the Methods to assign an overall phenotype 

In [5]:
def reliable_phenotype(row):
    reliable_phenotype=False
    phenotype='U'
    if row['TOTAL']>=4:
        if row['PROP_R']>=0.75:
            reliable_phenotype=True
            phenotype='R'
        elif row['PROP_S']>=0.75:
            reliable_phenotype=True
            phenotype='S'
    elif row['TOTAL']>=2:
        if row['R']==row['TOTAL']:
            reliable_phenotype=True
            phenotype='R'
        elif row['S']==row['TOTAL']:
            reliable_phenotype=True
            phenotype='S'

    return pandas.Series([reliable_phenotype, phenotype]) 

VALIDATION[['RELIABLE_PHENOTYPE', 'PHENOTYPE']]=VALIDATION.apply(reliable_phenotype,axis=1)   

print("%i samples cannot be assigned a phenotype" % ((~VALIDATION.RELIABLE_PHENOTYPE).sum()))

VALIDATION[VALIDATION.RELIABLE_PHENOTYPE].PHENOTYPE.value_counts(dropna=False)

307 samples cannot be assigned a phenotype


R    221
S     65
Name: PHENOTYPE, dtype: int64

In [6]:
def classify_variant(row):
    is_cds=False
    is_snp=False
    is_nonsyn=False
    is_missense=False
    cols = row.MUTATION.split('_')
    if '-' not in row.MUTATION:
        is_cds=True        
        if len(cols)!=3 and 'indel' not in row.MUTATION:
            is_snp=True
            if row.MUTATION[0]!=row.MUTATION[-1]:
                is_nonsyn=True
                if row.MUTATION[-1]!='!':
                    is_missense=True
    else:
        if len(cols)!=3 and 'indel' not in row.MUTATION:
            is_snp=True
    
    return pandas.Series([is_cds,is_snp,is_nonsyn,is_missense])    

VALIDATION[['IN_CDS', 'IS_SNP', 'IS_NONSYN', 'IS_MISSENSE']] = VALIDATION.apply(classify_variant, axis=1)

pandas.crosstab(VALIDATION.IN_CDS,[VALIDATION.IS_SNP, VALIDATION.IS_NONSYN, VALIDATION.IS_MISSENSE])

IS_SNP,False,True,True,True
IS_NONSYN,False,False,True,True
IS_MISSENSE,False,False,False,True
IN_CDS,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3
False,7,14,0,0
True,144,40,19,369


Hence we find there are 12 promoter mutations (incl. 4 indels) and 45 indels in the CDS, with 12 synoymous mutations, 13 nonsense mutations and 195 missense mutations

In [7]:
print("Table 1: There are %i non-redundant missense mutations in this dataset" % (len(VALIDATION[ VALIDATION.IS_MISSENSE])))

Table 1: There are 369 non-redundant missense mutations in this dataset


But we cannot structurally model (i) mutations in the Stop codon and also (ii) mutations in resid 186 since it is not resolved in the protein structure so let's identify these

In [8]:
def valid_for_structure(row):
    if row.IN_CDS and row.IS_SNP and row.IS_NONSYN and row.IS_MISSENSE:
        if "!" in row.MUTATION:
            return False
        elif '186' in row.MUTATION:
            return False
        else:
            return True
    else:
        return False

VALIDATION['STRUCTURALLY_VALID'] = VALIDATION.apply(valid_for_structure, axis=1)        

In [9]:
VALIDATION[ VALIDATION.IS_MISSENSE].PHENOTYPE.value_counts()

U    170
R    155
S     44
Name: PHENOTYPE, dtype: int64

But how many of these can we not map onto the protein structure?

In [10]:
VALIDATION[(VALIDATION.STRUCTURALLY_VALID) & (VALIDATION.IS_MISSENSE)].PHENOTYPE.value_counts(dropna=False)

U    168
R    155
S     44
Name: PHENOTYPE, dtype: int64

None are lost for this dataset, but let's subset down anyway

In [11]:
AGGREGATED_DATASET=VALIDATION[(VALIDATION.STRUCTURALLY_VALID) & (VALIDATION.IS_MISSENSE)]
AGGREGATED_DATASET.rename(columns={'PHENOTYPE':'CONSISTENT_PHENOTYPE'}, inplace=True)
AGGREGATED_DATASET.set_index('MUTATION', inplace=True)
AGGREGATED_DATASET

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  AGGREGATED_DATASET.rename(columns={'PHENOTYPE':'CONSISTENT_PHENOTYPE'}, inplace=True)


index,IS_SNP,IN_CDS,R,S,TOTAL,PROP_R,PROP_S,RELIABLE_PHENOTYPE,CONSISTENT_PHENOTYPE,IS_NONSYN,IS_MISSENSE,STRUCTURALLY_VALID
MUTATION,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
A102P,True,True,8,4,12,0.666667,0.333333,False,U,True,True,True
A102R,True,True,0,3,3,0.000000,1.000000,True,S,True,True,True
A102T,True,True,3,0,3,1.000000,0.000000,True,R,True,True,True
A102V,True,True,12,13,25,0.480000,0.520000,False,U,True,True,True
A134D,True,True,0,1,1,0.000000,1.000000,False,U,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...
Y41C,True,True,0,1,1,0.000000,1.000000,False,U,True,True,True
Y64D,True,True,2,4,6,0.333333,0.666667,False,U,True,True,True
Y64S,True,True,1,2,3,0.333333,0.666667,False,U,True,True,True
Y95D,True,True,1,0,1,1.000000,0.000000,False,U,True,True,True


In [21]:
df = AGGREGATED_DATASET[['R','S']].stack()
DATASET = df.index.repeat(df).to_frame(index=False)
DATASET.rename(columns={'index':'CONSISTENT_PHENOTYPE'}, inplace=True)
DATASET

Unnamed: 0,MUTATION,CONSISTENT_PHENOTYPE
0,A102P,R
1,A102P,R
2,A102P,R
3,A102P,R
4,A102P,R
...,...,...
4022,Y64S,R
4023,Y64S,S
4024,Y64S,S
4025,Y95D,R


In [22]:
DATASET.CONSISTENT_PHENOTYPE.value_counts(dropna=False)

R    2784
S    1243
Name: CONSISTENT_PHENOTYPE, dtype: int64

In [23]:
DATASET.loc[DATASET.CONSISTENT_PHENOTYPE.notna()].to_csv(filestem+'-full.csv',index=False)

DATASET.loc[DATASET.CONSISTENT_PHENOTYPE.notna()][['MUTATION','CONSISTENT_PHENOTYPE']].to_csv(filestem+'-phen.csv',index=False)

DATASET.loc[(DATASET.CONSISTENT_PHENOTYPE.notna())][['MUTATION']].to_csv(filestem+'-muts.csv',index=False, header=False)

DATASET['SEGID']='A'
DATASET.loc[(DATASET.CONSISTENT_PHENOTYPE.notna())][['SEGID','MUTATION']].to_csv(filestem+'-semu.csv',index=False, header=False, sep=' ')

DATASET.CONSISTENT_PHENOTYPE.value_counts(dropna=False)

R    2784
S    1243
Name: CONSISTENT_PHENOTYPE, dtype: int64

Not sure if we need the below - think might be for SuspectPZA prediction

In [11]:
df1 = pandas.read_csv('data/ds-traintest-semu.csv', names=['chain','mutation'], sep=' ')
df2 = pandas.read_csv('data/ds-validation-semu.csv', names=['chain','mutation'], sep=' ')

In [12]:
df1

Unnamed: 0,chain,mutation
0,A,A102V
1,A,A134D
2,A,A134P
3,A,A134S
4,A,A134V
...,...,...
659,A,Y95N
660,A,Y99C
661,A,Y99D
662,A,Y99F


In [15]:
validation = set(df2.mutation)
tt = set(df1.mutation)

In [22]:
df = pandas.DataFrame(list(validation | tt), columns=['mutation'])
df['chain'] = 'A'
df = df[['chain', 'mutation']]

In [23]:
df

Unnamed: 0,chain,mutation
0,A,W68G
1,A,M1V
2,A,P62A
3,A,A36V
4,A,F58S
...,...,...
689,A,D12H
690,A,S59F
691,A,R123C
692,A,Q141E


In [24]:
df.to_csv('data/ds-all-semu.csv',index=False, header=False, sep=' ')