In [86]:
import sbmlcore, pandas, numpy, copy
import itertools

pandas.options.display.max_columns=999
pandas.options.display.max_rows=150

## Read in three catalogues of mutations with assigned phenotypic effects

In [87]:
filestem = 'data/ds-traintest'

definite_sample_dfs = {}
definite_sample_sets = {}
definite_sample_csvs=[]

definite_sample_csvs.append('yadon')
definite_sample_csvs.append('who')
definite_sample_csvs.append('nejm')

definite_sample_dfs['yadon'] = pandas.read_csv('data/catalogues/ds-yadon2017.csv')
definite_sample_dfs['yadon'].set_index('MUTATION', inplace=True)
definite_sample_dfs['yadon'].rename(columns={'YADON_RESULT':'YADON_PHENOTYPE'}, inplace=True)
definite_sample_sets['yadon'] = set(definite_sample_dfs['yadon'].index)

definite_sample_dfs['who'] = pandas.read_csv('data/catalogues/ds-who2021.csv')
definite_sample_dfs['who'].set_index('MUTATION', inplace=True)
definite_sample_dfs['who'].rename(columns={'PREDICTION':'WHO_PHENOTYPE'}, inplace=True)
definite_sample_sets['who'] = set(definite_sample_dfs['who'].index)

definite_sample_dfs['nejm'] = pandas.read_csv('data/catalogues/ds-nejm2018.csv')
definite_sample_dfs['nejm'].set_index('MUTATION', inplace=True)
definite_sample_dfs['nejm'].rename(columns={'PREDICTION':'NEJM_PHENOTYPE'}, inplace=True)
definite_sample_sets['nejm'] = set(definite_sample_dfs['nejm'].index)

In [88]:
df_set = {}
for dataset in ['yadon', 'who', 'nejm']:

    df = definite_sample_dfs[dataset]
    n_total = len(df)
    n_r = df[dataset.upper() + '_PHENOTYPE'].value_counts(dropna=False).R
    n_s = df[dataset.upper() + '_PHENOTYPE'].value_counts(dropna=False).S
    df_set[dataset] = set(df.index)

    print("The %s dataset contains %i samples of which %i are resistant and %i susceptible." % (dataset, n_total, n_r, n_s))

The yadon dataset contains 781 samples of which 351 are resistant and 430 susceptible.
The who dataset contains 342 samples of which 326 are resistant and 16 susceptible.
The nejm dataset contains 437 samples of which 398 are resistant and 39 susceptible.


In [89]:
total = 0
n = len(df_set['yadon'] & df_set['who'] & df_set['nejm'])
total += n
print("%i samples occured in all three datasets" % n)

n = len(df_set['yadon'] & df_set['who'] - df_set['nejm'])
total += n
print("%i samples occured in yadon and who but not nejm" % n)

n = len(df_set['yadon'] & df_set['nejm'] - df_set['who'])
total += n
print("%i samples occured in yadon and nejm but not who" % n)

n = len(df_set['who'] & df_set['nejm'] - df_set['yadon'])
total += n
print("%i samples occured in nejm and who but not yadon" % n)

n = len(df_set['yadon'] - df_set['who'] - df_set['nejm'])
total += n
print("%i samples occured in yadon and not in either of who and nejm" % n)

n = len(df_set['who'] - df_set['yadon'] - df_set['nejm'])
total += n
print("%i samples occured in who and not in either of yadon and nejm" % n)

n = len(df_set['nejm'] - df_set['yadon'] - df_set['who'])
total += n
print("%i samples occured in nejm and not in either of who and yadon" % n)

print("Overall there were %i samples" % total)

117 samples occured in all three datasets
4 samples occured in yadon and who but not nejm
242 samples occured in yadon and nejm but not who
31 samples occured in nejm and who but not yadon
418 samples occured in yadon and not in either of who and nejm
190 samples occured in who and not in either of yadon and nejm
47 samples occured in nejm and not in either of who and yadon
Overall there were 1049 samples


In [90]:
DEFINITE_SAMPLES=None
for i in definite_sample_csvs:
    if DEFINITE_SAMPLES is None:
        DEFINITE_SAMPLES = copy.deepcopy(definite_sample_dfs[i])
    else:
        DEFINITE_SAMPLES = DEFINITE_SAMPLES.join(definite_sample_dfs[i], how='outer')    

number_definites = len(definite_sample_csvs)

def consistent_phenotype(row):
    yadon = row.YADON_PHENOTYPE
    who = row.WHO_PHENOTYPE
    nejm = row.NEJM_PHENOTYPE

    if number_definites==1:
        if use_yadon:
            return yadon
        elif use_who:
            return who
        elif use_nejm:
            return nejm

    # count how many Nones there are    
    number_of_nones = 0
    if isinstance(yadon, float):
        number_of_nones+=1
    if isinstance(who, float):
        number_of_nones+=1
    if isinstance(nejm, float):
        number_of_nones+=1

    # if all three are None, no result is possible
    if number_of_nones==3:
        return None

    # there are 2 Nones so any string is the result
    elif number_of_nones==2:
        if isinstance(yadon,str):
            return(yadon)
        elif isinstance(who,str):
            return(who)
        elif isinstance(nejm,str):
            return(nejm)

    # there is only 1 None, so either there are two datasets, in which case check for concordance
    elif number_of_nones==1:
        if isinstance(yadon,float):
            if who==nejm:
                return who
        elif isinstance(who,float):
            if yadon==nejm:
                return yadon
        elif isinstance(nejm,float):
            if yadon==who:
                return yadon
    
    # if there are three results, only permit exact concordance
    elif number_of_nones==0:
        if yadon==who==nejm:
            return yadon

DEFINITE_SAMPLES['definite_samples_phenotype'] = DEFINITE_SAMPLES.apply(consistent_phenotype, axis=1)             
DEFINITE_SAMPLES[:3]


Unnamed: 0_level_0,YADON_PHENOTYPE,WHO_PHENOTYPE,NEJM_PHENOTYPE,definite_samples_phenotype
MUTATION,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
!187W,S,,,S
!187Y,S,,,S
-126_del_c,,S,,S


In [91]:
DEFINITE_SAMPLES.definite_samples_phenotype.value_counts(dropna=False)

R       577
S       452
None     20
Name: definite_samples_phenotype, dtype: int64

In [92]:
DATASET = DEFINITE_SAMPLES[['definite_samples_phenotype']]
DATASET.rename(columns={'definite_samples_phenotype':'CONSISTENT_PHENOTYPE'}, inplace=True)
DATASET.CONSISTENT_PHENOTYPE.value_counts(dropna=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  DATASET.rename(columns={'definite_samples_phenotype':'CONSISTENT_PHENOTYPE'}, inplace=True)


R       577
S       452
None     20
Name: CONSISTENT_PHENOTYPE, dtype: int64

In [93]:
DATASET=DATASET[DATASET.CONSISTENT_PHENOTYPE.notna()]

In [94]:
def classify(row):
    is_cds=False
    is_snp=False
    is_nonsyn=False
    is_missense=False
    if '-' not in row.MUTATION:
        is_cds=True
        cols = row.MUTATION.split('_')
        if len(cols)!=3 and 'indel' not in row.MUTATION:
            is_snp=True
            if row.MUTATION[0]!=row.MUTATION[-1]:
                is_nonsyn=True
                if row.MUTATION[-1]!='!':
                    is_missense=True
    
    return pandas.Series([is_cds,is_snp,is_nonsyn,is_missense])    


DATASET.reset_index(inplace=True)
DATASET[['IN_CDS', 'IS_SNP', 'IS_NONSYN', 'IS_MISSENSE']] = DATASET.apply(classify, axis=1)
DATASET.set_index('MUTATION',inplace=True)
DATASET[:3]

Unnamed: 0_level_0,CONSISTENT_PHENOTYPE,IN_CDS,IS_SNP,IS_NONSYN,IS_MISSENSE
MUTATION,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
!187W,S,True,True,True,True
!187Y,S,True,True,True,True
-126_del_c,S,False,False,False,False


In [95]:
DATASET[DATASET.IN_CDS & DATASET.IS_SNP & DATASET.IS_NONSYN & DATASET.IS_MISSENSE]

Unnamed: 0_level_0,CONSISTENT_PHENOTYPE,IN_CDS,IS_SNP,IS_NONSYN,IS_MISSENSE
MUTATION,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
!187W,S,True,True,True,True
!187Y,S,True,True,True,True
A102V,S,True,True,True,True
A134D,S,True,True,True,True
A134P,R,True,True,True,True
...,...,...,...,...,...
Y95N,S,True,True,True,True
Y99C,S,True,True,True,True
Y99D,S,True,True,True,True
Y99F,S,True,True,True,True


In [96]:
DATASET.CONSISTENT_PHENOTYPE.value_counts(dropna=False)

R    577
S    452
Name: CONSISTENT_PHENOTYPE, dtype: int64

In [97]:
DATASET.reset_index(inplace=True)

def valid_for_structure(row):
    if row.IN_CDS and row.IS_SNP and row.IS_NONSYN and row.IS_MISSENSE:
        if "!" in row.MUTATION:
            return False
        elif '186' in row.MUTATION:
            return False
        else:
            return True
    else:
        return False


DATASET['STRUCTURALLY_VALID'] = DATASET.apply(valid_for_structure, axis=1)

DATASET = DATASET[DATASET.STRUCTURALLY_VALID]

DATASET.drop(columns=['STRUCTURALLY_VALID'],inplace=True)

DATASET.to_csv(filestem+'-full.csv',index=False)

DATASET[['MUTATION','CONSISTENT_PHENOTYPE']].to_csv(filestem+'-phen.csv',index=False)

DATASET[['MUTATION']].to_csv(filestem+'-muts.csv',index=False, header=False)

DATASET['SEGID']='A'
DATASET[['SEGID','MUTATION']].to_csv(filestem+'-semu.csv',index=False, header=False, sep=' ')

DATASET.CONSISTENT_PHENOTYPE.value_counts(dropna=False)

R    349
S    315
Name: CONSISTENT_PHENOTYPE, dtype: int64