In [2]:
import sbmlcore, pandas, numpy, copy
import itertools

pandas.options.display.max_columns=999

## Read in three sets of clinical samples with mutations and recorded phenotypes

In [3]:
filestem = 'data/ds-validation'

clinical_sample_csvs = ['miotto2014','whitfield2015','cryptic2021']
stem='data/clinical-samples/ds-'
clinical_sample_dfs = {}
clinical_sample_sets = {}
for i in clinical_sample_csvs:
    clinical_sample_dfs[i] = pandas.read_csv(stem+i+'.csv')
    clinical_sample_dfs[i].set_index('MUTATION', inplace=True)
    clinical_sample_sets[i] = set(clinical_sample_dfs[i].index)
    clinical_sample_dfs[i].reset_index(inplace=True)
    clinical_sample_dfs[i].set_index(['MUTATION', 'IS_SNP', 'IN_CDS'], inplace=True)
    clinical_sample_dfs[i]['total'] = clinical_sample_dfs[i].R + clinical_sample_dfs[i].S

for i in clinical_sample_csvs:
    print(i, len(clinical_sample_sets[i]), clinical_sample_dfs[i].total.sum())
if len(clinical_sample_csvs)>1:
    for i in list(itertools.combinations(clinical_sample_csvs,2)):
        print(i, len(clinical_sample_sets[i[0]].intersection(clinical_sample_sets[i[1]])))
if len(clinical_sample_csvs)>2:
    for i in list(itertools.combinations(clinical_sample_csvs,3)):
        print(i, len(clinical_sample_sets[i[0]] & clinical_sample_sets[i[1]] & clinical_sample_sets[i[2]]))

miotto2014 199 755
whitfield2015 65 634
cryptic2021 526 3525
('miotto2014', 'whitfield2015') 41
('miotto2014', 'cryptic2021') 178
('whitfield2015', 'cryptic2021') 46
('miotto2014', 'whitfield2015', 'cryptic2021') 35


In [4]:
CLINCAL_SAMPLES = clinical_sample_dfs['cryptic2021'].join(clinical_sample_dfs['miotto2014'], lsuffix='s', rsuffix = 'm', how='outer')
CLINCAL_SAMPLES = CLINCAL_SAMPLES.join(clinical_sample_dfs['whitfield2015'], rsuffix='w',how='outer')
CLINCAL_SAMPLES.rename(columns={'R': 'Rw', 'S': 'Sw', 'total': 'totalw'}, inplace=True)
CLINCAL_SAMPLES.fillna(0, inplace=True)   
CLINCAL_SAMPLES = CLINCAL_SAMPLES.astype('int')
CLINCAL_SAMPLES['R'] = CLINCAL_SAMPLES['Rs'] + CLINCAL_SAMPLES['Rm'] + CLINCAL_SAMPLES['Rw']
CLINCAL_SAMPLES['S'] = CLINCAL_SAMPLES['Ss'] + CLINCAL_SAMPLES['Sm'] + CLINCAL_SAMPLES['Sw']
CLINCAL_SAMPLES['TOTAL'] = CLINCAL_SAMPLES['R'] + CLINCAL_SAMPLES['S']
CLINCAL_SAMPLES.drop(columns=['Rs', 'Ss', 'Rm', 'Sm', 'Rw', 'Sw', 'totals', 'totalm', 'totalw'], inplace=True)
CLINCAL_SAMPLES.reset_index(inplace=True)
CLINCAL_SAMPLES['PROP_R']=CLINCAL_SAMPLES['R']/CLINCAL_SAMPLES['TOTAL']
CLINCAL_SAMPLES['PROP_S']=CLINCAL_SAMPLES['S']/CLINCAL_SAMPLES['TOTAL']
CLINCAL_SAMPLES.columns.name='index'
CLINCAL_SAMPLES[:3]

index,MUTATION,IS_SNP,IN_CDS,R,S,TOTAL,PROP_R,PROP_S
0,!187G,True,True,1,1,2,0.5,0.5
1,!187R,True,True,0,1,1,0.0,1.0
2,-29_indel,False,False,0,1,1,0.0,1.0


In [5]:
def classify(row):
    reliable_phenotype=False
    phenotype='U'
    if row['TOTAL']>=4:
        if row['PROP_R']>=0.75:
            reliable_phenotype=True
            phenotype='R'
        elif row['PROP_S']>=0.75:
            reliable_phenotype=True
            phenotype='S'
    elif row['TOTAL']>=2:
        if row['R']==row['TOTAL']:
            reliable_phenotype=True
            phenotype='R'
        elif row['S']==row['TOTAL']:
            reliable_phenotype=True
            phenotype='S'

    return pandas.Series([reliable_phenotype, phenotype]) 

def valid_for_structure(row):
    if "!" in row.MUTATION:
        return False
    elif '186' in row.MUTATION:
        return False
    elif row.MUTATION[0]==row.MUTATION[-1]:
        return False
    elif not row.IS_SNP:
        return False
    elif not row.IN_CDS:
        return False
    else:
        return True

CLINCAL_SAMPLES[['RELIABLE_PHENOTYPE', 'PHENOTYPE']]=CLINCAL_SAMPLES.apply(classify,axis=1)   
CLINCAL_SAMPLES.set_index('MUTATION', inplace=True)
CLINCAL_SAMPLES[:3]

index,IS_SNP,IN_CDS,R,S,TOTAL,PROP_R,PROP_S,RELIABLE_PHENOTYPE,PHENOTYPE
MUTATION,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
!187G,True,True,1,1,2,0.5,0.5,False,U
!187R,True,True,0,1,1,0.0,1.0,False,U
-29_indel,False,False,0,1,1,0.0,1.0,False,U


In [6]:
CLINCAL_SAMPLES[(CLINCAL_SAMPLES.RELIABLE_PHENOTYPE) & (CLINCAL_SAMPLES.IS_SNP) & (CLINCAL_SAMPLES.IN_CDS)].PHENOTYPE.value_counts(dropna=False)

R    168
S     45
Name: PHENOTYPE, dtype: int64

In [7]:
DATASET = CLINCAL_SAMPLES[(CLINCAL_SAMPLES.RELIABLE_PHENOTYPE) & (CLINCAL_SAMPLES.IS_SNP) & (CLINCAL_SAMPLES.IN_CDS)]
DATASET.rename(columns={'PHENOTYPE':'CONSISTENT_PHENOTYPE'}, inplace=True)
DATASET

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  DATASET.rename(columns={'PHENOTYPE':'CONSISTENT_PHENOTYPE'}, inplace=True)


index,IS_SNP,IN_CDS,R,S,TOTAL,PROP_R,PROP_S,RELIABLE_PHENOTYPE,CONSISTENT_PHENOTYPE
MUTATION,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
A102R,True,True,0,3,3,0.000000,1.000000,True,S
A102T,True,True,3,0,3,1.000000,0.000000,True,R
A134V,True,True,23,2,25,0.920000,0.080000,True,R
A143G,True,True,6,0,6,1.000000,0.000000,True,R
A143T,True,True,1,6,7,0.142857,0.857143,True,S
...,...,...,...,...,...,...,...,...,...
Y34!,True,True,7,0,7,1.000000,0.000000,True,R
Y34D,True,True,11,2,13,0.846154,0.153846,True,R
Y64!,True,True,11,0,11,1.000000,0.000000,True,R
Y95!,True,True,5,1,6,0.833333,0.166667,True,R


In [8]:
def syn(row):
    if row.MUTATION[0]==row.MUTATION[-1]:
        return True
    else:
        return False

DATASET.reset_index(inplace=True)
DATASET['IS_SYN'] = DATASET.apply(syn, axis=1)
DATASET.set_index('MUTATION',inplace=True)
DATASET = DATASET.loc[~DATASET.IS_SYN]
DATASET[:3]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  DATASET['IS_SYN'] = DATASET.apply(syn, axis=1)


index,IS_SNP,IN_CDS,R,S,TOTAL,PROP_R,PROP_S,RELIABLE_PHENOTYPE,CONSISTENT_PHENOTYPE,IS_SYN
MUTATION,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
A102R,True,True,0,3,3,0.0,1.0,True,S,False
A102T,True,True,3,0,3,1.0,0.0,True,R,False
A134V,True,True,23,2,25,0.92,0.08,True,R,False


In [9]:
DATASET.CONSISTENT_PHENOTYPE.value_counts(dropna=False)

R    168
S     44
Name: CONSISTENT_PHENOTYPE, dtype: int64

In [10]:
DATASET.reset_index(inplace=True)

def valid_for_structure(row):
    if "!" in row.MUTATION:
        return False
    elif '186' in row.MUTATION:
        return False
    elif row.MUTATION[0]==row.MUTATION[-1]:
        return False
    elif not row.IS_SNP:
        return False
    elif not row.IN_CDS:
        return False
    else:
        return True

DATASET['STRUCTURALLY_VALID'] = DATASET.apply(valid_for_structure, axis=1)
DATASET=DATASET[DATASET.STRUCTURALLY_VALID]
DATASET.drop(columns=['STRUCTURALLY_VALID'],inplace=True)

DATASET.loc[DATASET.CONSISTENT_PHENOTYPE.notna()].to_csv(filestem+'-full.csv',index=False)

DATASET.loc[DATASET.CONSISTENT_PHENOTYPE.notna()][['MUTATION','CONSISTENT_PHENOTYPE']].to_csv(filestem+'-phen.csv',index=False)

DATASET.loc[(DATASET.CONSISTENT_PHENOTYPE.notna())][['MUTATION']].to_csv(filestem+'-muts.csv',index=False, header=False)

DATASET['SEGID']='A'
DATASET.loc[(DATASET.CONSISTENT_PHENOTYPE.notna())][['SEGID','MUTATION']].to_csv(filestem+'-semu.csv',index=False, header=False, sep=' ')

DATASET.CONSISTENT_PHENOTYPE.value_counts(dropna=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  DATASET['STRUCTURALLY_VALID'] = DATASET.apply(valid_for_structure, axis=1)


R    155
S     44
Name: CONSISTENT_PHENOTYPE, dtype: int64

In [11]:
DST = pandas.read_csv('data/source-data/cryptic1-dst.csv')
MUTATIONS = pandas.read_csv('data/source-data/cryptic1-mutations.csv')

In [12]:
DST

Unnamed: 0,UNIQUEID,DRUG,SOURCE,METHOD_1,METHOD_2,METHOD_3,METHOD_CC,METHOD_MIC,PHENOTYPE
0,site.24.subj.PT-2.lab.1997-606.iso.1,PZA,SEQTREAT2020,liquid media,MGIT,BACTEC460,100.0,,S
1,site.24.subj.PT-4.lab.1998-151.iso.1,PZA,SEQTREAT2020,liquid media,MGIT,BACTEC460,100.0,,S
2,site.24.subj.PT-5.lab.1998-713.iso.1,PZA,SEQTREAT2020,liquid media,MGIT,BACTEC460,100.0,,S
3,site.24.subj.PT-7.lab.1999-097.iso.1,PZA,SEQTREAT2020,liquid media,MGIT,BACTEC460,100.0,,S
4,site.24.subj.PT-8.lab.1999-131.iso.1,PZA,SEQTREAT2020,liquid media,MGIT,BACTEC460,100.0,,S
...,...,...,...,...,...,...,...,...,...
22837,site.00.subj.LE10KTB_21.lab.7627886.iso.1,PZA,NEJM2018,liquid media,MGIT,,,,S
22838,site.00.subj.LE10KTB_8.lab.7627900.iso.1,PZA,NEJM2018,liquid media,MGIT,,,,S
22839,site.00.subj.LE10KTB_12.lab.7628121.iso.1,PZA,NEJM2018,liquid media,MGIT,,,,S
22840,site.00.subj.LE10KTB_14.lab.7628143.iso.1,PZA,NEJM2018,liquid media,MGIT,,,,S


In [14]:
len(MUTATIONS.UNIQUEID.unique())

3549

In [15]:
len(MUTATIONS)

3573

In [20]:
len(MUTATIONS[MUTATIONS.AMINO_ACID_NUMBER.notna()].UNIQUEID.unique())

3331

In [23]:
foo = MUTATIONS[MUTATIONS.AMINO_ACID_NUMBER.notna()][['UNIQUEID', 'POSITION']].groupby('UNIQUEID').count()

In [25]:
foo.POSITION.value_counts()

1    3314
2      17
Name: POSITION, dtype: int64

In [29]:
singles = foo[foo.POSITION==1].index

In [32]:

MUTATIONS[(MUTATIONS.AMINO_ACID_NUMBER.notna()) & (MUTATIONS.UNIQUEID.isin(singles))].IS_INDEL.value_counts()

False    2911
True      403
Name: IS_INDEL, dtype: int64

In [43]:

indels = MUTATIONS[(MUTATIONS.AMINO_ACID_NUMBER.notna()) & (MUTATIONS.UNIQUEID.isin(singles)) & (MUTATIONS.IS_INDEL | MUTATIONS.IN_PROMOTER)].UNIQUEID.unique()


In [45]:
MUTATIONS[(MUTATIONS.AMINO_ACID_NUMBER.notna()) & (MUTATIONS.UNIQUEID.isin(singles)) & (MUTATIONS.IN_PROMOTER)].UNIQUEID.unique()

array([], dtype=object)

In [41]:
MUTATIONS

Unnamed: 0,UNIQUEID,GENE,MUTATION,POSITION,AMINO_ACID_NUMBER,GENOME_INDEX,NUCLEOTIDE_NUMBER,REF,ALT,IS_SNP,IS_INDEL,IN_CDS,IN_PROMOTER,IS_SYNONYMOUS,IS_NONSYNONYMOUS,IS_HET,IS_NULL,IS_FILTER_PASS,ELEMENT_TYPE,MUTATION_TYPE,INDEL_LENGTH,INDEL_1,INDEL_2,SITEID,NUMBER_NUCLEOTIDE_CHANGES
0,site.05.subj.PSLM-0791.lab.SLM-049.iso.1,pncA,L120P,120.0,120.0,,,ctg,ccg,True,False,True,False,False,True,False,False,True,GENE,AAM,,,,05,1
1,site.05.subj.LR-2032.lab.FN-00407-15.iso.1,pncA,W119L,119.0,119.0,,,tgg,ttg,True,False,True,False,False,True,False,False,True,GENE,AAM,,,,05,1
2,site.05.subj.PMK-1015.lab.MK-1781.iso.1,pncA,F58L,58.0,58.0,,,ttc,ctc,True,False,True,False,False,True,False,False,True,GENE,AAM,,,,05,1
3,site.05.subj.LR-2417.lab.FN-01304-17.iso.1,pncA,H51R,51.0,51.0,,,cac,cgc,True,False,True,False,False,True,False,False,True,GENE,AAM,,,,05,1
4,site.05.subj.LR-2162.lab.FN-00284-16.iso.1,pncA,V139L,139.0,139.0,,,gtg,ctg,True,False,True,False,False,True,False,False,True,GENE,AAM,,,,05,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3568,site.10.subj.YA00125758.lab.YA00125758.iso.1,pncA,467_indel,467.0,156.0,2288775.0,467.0,,,False,True,True,False,False,False,False,False,True,GENE,INDEL,1.0,467_ins,467_ins_1,10,0
3569,site.10.subj.SADG00497215_S8.lab.DG00497215_S8...,pncA,C138R,138.0,138.0,,,tgt,cgt,True,False,True,False,False,True,False,False,True,GENE,AAM,,,,10,1
3570,site.10.subj.KD01666167.lab.KD01666167.iso.1,pncA,H71Y,71.0,71.0,,,cat,tat,True,False,True,False,False,True,False,False,True,GENE,AAM,,,,10,1
3571,site.10.subj.YA00029870.lab.YA00029870.iso.1,pncA,a-11g,-11.0,,2289252.0,-11.0,a,g,True,False,False,True,False,False,False,False,True,GENE,SNP,,,,10,0


In [44]:

DST[DST.UNIQUEID.isin(indels)].PHENOTYPE.value_counts()

R    355
S     48
Name: PHENOTYPE, dtype: int64