In [27]:
import sbmlcore, pandas, numpy, copy
import itertools

pandas.options.display.max_columns=999

## Read in three catalogues of mutations with assigned phenotypic effects

In [28]:
filestem = 'data/ds-traintest'

definite_sample_dfs = {}
definite_sample_sets = {}
definite_sample_csvs=[]

definite_sample_csvs.append('yadon')
definite_sample_csvs.append('who')
definite_sample_csvs.append('nejm')

definite_sample_dfs['yadon'] = pandas.read_csv('data/catalogues/ds-yadon2017.csv')
definite_sample_dfs['yadon'].set_index('MUTATION', inplace=True)
definite_sample_dfs['yadon'].rename(columns={'YADON_RESULT':'YADON_PHENOTYPE'}, inplace=True)
definite_sample_sets['yadon'] = set(definite_sample_dfs['yadon'].index)

definite_sample_dfs['who'] = pandas.read_csv('data/catalogues/ds-who2021.csv')
definite_sample_dfs['who'].set_index('MUTATION', inplace=True)
definite_sample_dfs['who'].rename(columns={'PREDICTION':'WHO_PHENOTYPE'}, inplace=True)
definite_sample_sets['who'] = set(definite_sample_dfs['who'].index)

definite_sample_dfs['nejm'] = pandas.read_csv('data/catalogues/ds-nejm2018.csv')
definite_sample_dfs['nejm'].set_index('MUTATION', inplace=True)
definite_sample_dfs['nejm'].rename(columns={'PREDICTION':'NEJM_PHENOTYPE'}, inplace=True)
definite_sample_sets['nejm'] = set(definite_sample_dfs['nejm'].index)

In [29]:
df_set = {}
for dataset in ['yadon', 'who', 'nejm']:

    df = definite_sample_dfs[dataset]
    n_total = len(df)
    n_r = df[dataset.upper() + '_PHENOTYPE'].value_counts(dropna=False).R
    n_s = df[dataset.upper() + '_PHENOTYPE'].value_counts(dropna=False).S
    df_set[dataset] = set(df.index)

    print("The %s dataset contains %i samples of which %i are resistant and %i susceptible." % (dataset, n_total, n_r, n_s))

The yadon dataset contains 643 samples of which 333 are resistant and 310 susceptible.
The who dataset contains 182 samples of which 170 are resistant and 12 susceptible.
The nejm dataset contains 413 samples of which 384 are resistant and 29 susceptible.


In [30]:
total = 0
n = len(df_set['yadon'] & df_set['who'] & df_set['nejm'])
total += n
print("%i samples occured in all three datasets" % n)

n = len(df_set['yadon'] & df_set['who'] - df_set['nejm'])
total += n
print("%i samples occured in yadon and who but not nejm" % n)

n = len(df_set['yadon'] & df_set['nejm'] - df_set['who'])
total += n
print("%i samples occured in yadon and nejm but not who" % n)

n = len(df_set['who'] & df_set['nejm'] - df_set['yadon'])
total += n
print("%i samples occured in nejm and who but not yadon" % n)

n = len(df_set['yadon'] - df_set['who'] - df_set['nejm'])
total += n
print("%i samples occured in yadon and not in either of who and nejm" % n)

n = len(df_set['who'] - df_set['yadon'] - df_set['nejm'])
total += n
print("%i samples occured in who and not in either of yadon and nejm" % n)

n = len(df_set['nejm'] - df_set['yadon'] - df_set['who'])
total += n
print("%i samples occured in nejm and not in either of who and yadon" % n)

print("Overall there were %i samples" % total)

117 samples occured in all three datasets
4 samples occured in yadon and who but not nejm
224 samples occured in yadon and nejm but not who
31 samples occured in nejm and who but not yadon
298 samples occured in yadon and not in either of who and nejm
30 samples occured in who and not in either of yadon and nejm
41 samples occured in nejm and not in either of who and yadon
Overall there were 745 samples


In [31]:
DEFINITE_SAMPLES=None
for i in definite_sample_csvs:
    if DEFINITE_SAMPLES is None:
        DEFINITE_SAMPLES = copy.deepcopy(definite_sample_dfs[i])
    else:
        DEFINITE_SAMPLES = DEFINITE_SAMPLES.join(definite_sample_dfs[i], how='outer')    

number_definites = len(definite_sample_csvs)

def consistent_phenotype(row):
    yadon = row.YADON_PHENOTYPE
    who = row.WHO_PHENOTYPE
    nejm = row.NEJM_PHENOTYPE

    if number_definites==1:
        if use_yadon:
            return yadon
        elif use_who:
            return who
        elif use_nejm:
            return nejm
    
    number_of_nones = 0
    if isinstance(yadon, float):
        number_of_nones+=1
    if isinstance(who, float):
        number_of_nones+=1
    if isinstance(nejm, float):
        number_of_nones+=1

    if number_of_nones==3:
        return None
    # there are 2 Nones so any string is the result
    elif number_of_nones==2:
        if isinstance(yadon,str):
            return(yadon)
        elif isinstance(who,str):
            return(who)
        elif isinstance(nejm,str):
            return(nejm)
    # there is only 1 None, so either there are two datasets, in which case
    elif number_of_nones==1:
        if isinstance(yadon,float):
            if who==nejm:
                return who
        elif isinstance(who,float):
            if yadon==nejm:
                return yadon
        elif isinstance(nejm,float):
            if yadon==who:
                return yadon
    elif number_of_nones==0:
        if yadon==who==nejm:
            return yadon

DEFINITE_SAMPLES['definite_samples_phenotype'] = DEFINITE_SAMPLES.apply(consistent_phenotype, axis=1)             
DEFINITE_SAMPLES[:3]


Unnamed: 0_level_0,YADON_PHENOTYPE,WHO_PHENOTYPE,NEJM_PHENOTYPE,definite_samples_phenotype
MUTATION,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
!187W,S,,,S
!187Y,S,,,S
A102P,R,R,S,


In [32]:
DEFINITE_SAMPLES.definite_samples_phenotype.value_counts(dropna=False)

R       407
S       318
None     20
Name: definite_samples_phenotype, dtype: int64

In [33]:
DATASET = DEFINITE_SAMPLES[['definite_samples_phenotype']]
DATASET.rename(columns={'definite_samples_phenotype':'CONSISTENT_PHENOTYPE'}, inplace=True)
DATASET[:3]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  DATASET.rename(columns={'definite_samples_phenotype':'CONSISTENT_PHENOTYPE'}, inplace=True)


Unnamed: 0_level_0,CONSISTENT_PHENOTYPE
MUTATION,Unnamed: 1_level_1
!187W,S
!187Y,S
A102P,


In [34]:
def syn(row):
    if row.MUTATION[0]==row.MUTATION[-1]:
        return True
    else:
        return False

DATASET.reset_index(inplace=True)
DATASET['IS_SYN'] = DATASET.apply(syn, axis=1)
DATASET.set_index('MUTATION',inplace=True)
DATASET = DATASET.loc[~DATASET.IS_SYN]
DATASET[:3]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  DATASET['IS_SYN'] = DATASET.apply(syn, axis=1)


Unnamed: 0_level_0,CONSISTENT_PHENOTYPE,IS_SYN
MUTATION,Unnamed: 1_level_1,Unnamed: 2_level_1
!187W,S,False
!187Y,S,False
A102P,,False


In [35]:
DATASET.CONSISTENT_PHENOTYPE.value_counts(dropna=False)

R       383
S       318
None     20
Name: CONSISTENT_PHENOTYPE, dtype: int64

In [36]:
DATASET.reset_index(inplace=True)

def valid_for_structure(row):
    if "!" in row.MUTATION:
        return False
    elif '186' in row.MUTATION:
        return False
    else:
        return True

DATASET['STRUCTURALLY_VALID'] = DATASET.apply(valid_for_structure, axis=1)
DATASET=DATASET[DATASET.STRUCTURALLY_VALID]
DATASET.drop(columns=['STRUCTURALLY_VALID'],inplace=True)

DATASET.loc[DATASET.CONSISTENT_PHENOTYPE.notna()].to_csv(filestem+'-full.csv',index=False)

DATASET.loc[DATASET.CONSISTENT_PHENOTYPE.notna()][['MUTATION','CONSISTENT_PHENOTYPE']].to_csv(filestem+'-phen.csv',index=False)

DATASET.loc[(DATASET.CONSISTENT_PHENOTYPE.notna())][['MUTATION']].to_csv(filestem+'-muts.csv',index=False, header=False)

DATASET['SEGID']='A'
DATASET.loc[(DATASET.CONSISTENT_PHENOTYPE.notna())][['SEGID','MUTATION']].to_csv(filestem+'-semu.csv',index=False, header=False, sep=' ')

In [37]:
DATASET.CONSISTENT_PHENOTYPE.value_counts(dropna=False)

R       349
S       315
None     20
Name: CONSISTENT_PHENOTYPE, dtype: int64

In [90]:

DATASET.reset_index(inplace=True)
def find_position(row):
    return int(row.MUTATION[1:-1])

DATASET['POSITION'] = DATASET.apply(find_position, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  DATASET['POSITION'] = DATASET.apply(find_position, axis=1)


In [93]:
DATASET.set_index('POSITION', inplace=True)

In [96]:
DATASET.sort_index()

Unnamed: 0_level_0,MUTATION,CONSISTENT_PHENOTYPE
POSITION,Unnamed: 1_level_1,Unnamed: 2_level_1
1,M1T,R
1,M1L,R
1,M1I,
1,M1V,S
2,R2Q,S
3,A3A,R
3,A3V,S
3,A3S,S
3,A3E,R
4,L4!,R
