In [1]:
import pandas

pandas.options.display.max_columns=999

Load mutations data set

In [35]:
MUTATIONS = pandas.read_pickle('tables/MUTATIONS.pkl.gz')
MUTATIONS.reset_index(inplace=True)
MUTATIONS = MUTATIONS[(MUTATIONS.IS_FILTER_PASS) & (~MUTATIONS.IS_HET) & (~MUTATIONS.IS_NULL)]
MUTATIONS[:4]

Unnamed: 0,UNIQUEID,GENE,MUTATION,POSITION,AMINO_ACID_NUMBER,GENOME_INDEX,NUCLEOTIDE_NUMBER,REF,ALT,IS_SNP,IS_INDEL,IN_CDS,IN_PROMOTER,IS_SYNONYMOUS,IS_NONSYNONYMOUS,IS_HET,IS_NULL,IS_FILTER_PASS,ELEMENT_TYPE,MUTATION_TYPE,INDEL_LENGTH,INDEL_1,INDEL_2,SITEID,NUMBER_NUCLEOTIDE_CHANGES
0,site.02.subj.0958.lab.22A197.iso.1,rpoB,P45S,45.0,45.0,,,ccg,tcg,True,False,True,False,False,True,False,False,True,GENE,AAM,,,,2,1
1,site.02.subj.0958.lab.22A197.iso.1,rpoB,S450L,450.0,450.0,,,tcg,ttg,True,False,True,False,False,True,False,False,True,GENE,AAM,,,,2,1
2,site.02.subj.0958.lab.22A197.iso.1,rpoB,A1075A,1075.0,1075.0,,,gct,gcc,True,False,True,False,True,False,False,False,True,GENE,AAM,,,,2,1
3,site.02.subj.0958.lab.22A197.iso.1,rpoC,D271E,271.0,271.0,,,gac,gag,True,False,True,False,False,True,False,False,True,GENE,AAM,,,,2,1


Load effects data set

In [252]:
EFFECTS = pandas.read_pickle('tables/EFFECTS.pkl.gz')
EFFECTS.reset_index(inplace=True)
EFFECTS[:20]

pandas.core.frame.DataFrame

Find all samples with resistance mutations in EFFECTS and save UNIQUEID

In [17]:
res_samples = EFFECTS[(EFFECTS.PREDICTION=='R') & (~EFFECTS.MUTATION.str[-1].isin(['O','X']))].UNIQUEID
res_samples.head()

3         site.02.subj.0958.lab.22A197.iso.1
16        site.02.subj.0918.lab.22A153.iso.1
45    site.02.subj.1033.lab.2013185075.iso.1
51    site.02.subj.0739.lab.2013221518.iso.1
89        site.02.subj.0104.lab.22A057.iso.1
Name: UNIQUEID, dtype: object

Filter out all mutations in samples that have resistance mutations

In [51]:
co_occurr=pandas.DataFrame()
co_occurr['mutation'] = MUTATIONS.loc[MUTATIONS['UNIQUEID'].isin(res_samples)].MUTATION
co_occurr['gene']=MUTATIONS.loc[MUTATIONS['UNIQUEID'].isin(res_samples)].GENE
co_occurr

Unnamed: 0,mutation,gene
0,P45S,rpoB
1,S450L,rpoB
2,A1075A,rpoB
3,D271E,rpoC
8,D435G,rpoB
...,...,...
517503,A172V,rpoC
517504,R173R,rpoC
517505,V1252L,rpoC
517509,D435V,rpoB


Remove mutations from list that are resistance mutations

In [53]:
res_mut = pandas.DataFrame()
res_mut['mutation'] = EFFECTS[(EFFECTS.PREDICTION=='R') & (~EFFECTS.MUTATION.str[-1].isin(['O','X']))].MUTATION
res_mut['gene'] = EFFECTS[(EFFECTS.PREDICTION=='R') & (~EFFECTS.MUTATION.str[-1].isin(['O','X']))].GENE
co_occurr_only = co_occurr.loc[~co_occurr['mutation'].isin(res_mut['mutation'])]
co_occurr_only.value_counts()[:20]

mutation  gene
A1075A    rpoB    12240
A542A     rpoC     2880
E1092D    rpoC     2439
G594E     rpoC     1864
G876G     rpoB     1458
c-61t     rpoB     1402
V483G     rpoC     1236
D103D     rpoB     1041
I491V     rpoC      666
V483A     rpoC      599
R173R     rpoC      595
A172V     rpoC      589
I491T     rpoC      457
P1040R    rpoC      400
F452S     rpoC      346
C62C      rpoC      337
E761D     rpoB      304
P601L     rpoC      248
E561E     rpoB      229
L731P     rpoB      226
dtype: int64

**Either** look only at co-occuring mutations that do not occur without resistance mutations:

- first determine how often mutations that co-occurr are found in general

In [151]:
gen_occurr=pandas.DataFrame()
gen_occurr['mutation']=MUTATIONS.loc[MUTATIONS['MUTATION'].isin(co_occurr_only.mutation)].MUTATION
gen_occurr['gene']=MUTATIONS.loc[MUTATIONS['MUTATION'].isin(co_occurr_only.mutation)].GENE
gen_occurr.value_counts()

mutation  gene
A1075A    rpoB    37400
G594E     rpoC    10610
A542A     rpoC    10544
G876G     rpoB     9236
c-61t     rpoB     8752
                  ...  
S401P     rpoC        1
F438L     sigA        1
F424I     rpoB        1
F424C     rpoB        1
E830Q     rpoC        1
Length: 3149, dtype: int64

In [263]:
df_gen=gen_occurr.value_counts().reset_index(name='count')
df_res=co_occurr_only.value_counts().reset_index(name='count')
df_gen
#gen_occurr.value_counts()-co_occurr_only.value_counts()

Unnamed: 0,mutation,gene,count
0,A1075A,rpoB,37400
1,G594E,rpoC,10610
2,A542A,rpoC,10544
3,G876G,rpoB,9236
4,c-61t,rpoB,8752
...,...,...,...
3144,S401P,rpoC,1
3145,F438L,sigA,1
3146,F424I,rpoB,1
3147,F424C,rpoB,1


- then identify if difference of the general occurrence and occurrence with resistance mutations is zero, if yes, mutations only occurr with resistance mutations

In [271]:
#df_gen.set_index('mutation')
#df_res.set_index('mutation')
df_gen.sort_values('mutation',inplace=True)
df_res.sort_values('mutation',inplace=True)

df_gen['diff']=df_gen['count']-df_res['count']
df_gen[(df_gen['diff']==0)]
#df_gen[(df_gen['diff']==0)&(df_gen['count']>1)]

Unnamed: 0,mutation,gene,count,diff
2619,-11_indel,rpoZ,1,0.0
2617,-3_indel,rpoA,1,0.0
2632,1023_indel,rpoA,1,0.0
2624,1286_indel,rpoB,1,0.0
2744,1361_indel,rpoB,1,0.0
...,...,...,...,...
2637,g-60a,rpoZ,1,0.0
2640,g-7a,sigA,1,0.0
2611,g-92t,rpoB,1,0.0
2626,t-45g,rpoB,1,0.0


**Or** determine if co-occurrence is significant compared to general occurrence in mutations dataset:

- find number of samples in mutations set

In [159]:
len(MUTATIONS.UNIQUEID.value_counts()) 

64722

- find number of samples with resistance mutations in effects dataset

In [160]:
len(EFFECTS[(EFFECTS.PREDICTION=='R')].UNIQUEID.value_counts())

21046

- determine relative occurrence of mutations in co-occurr list in general mutations set and in resistant samples

In [161]:
co_occurr_only.mutation

0           P45S
2         A1075A
3          D271E
9          I491L
10        A1075A
           ...  
517502      Y61Y
517503     A172V
517504     R173R
517505    V1252L
517510    A1075A
Name: mutation, Length: 48118, dtype: object

In [162]:
#gen_occurr=pandas.DataFrame()
#gen_occurr['mutation']=MUTATIONS.loc[MUTATIONS['MUTATION'].isin(co_occurr_only.mutation)].MUTATION
#gen_occurr['gene']=MUTATIONS.loc[MUTATIONS['MUTATION'].isin(co_occurr_only.mutation)].GENE
#gen_occurr.value_counts()[:20]

In [163]:
rel_occurr_res=co_occurr_only.value_counts().div(21046)
rel_occurr_gen=gen_occurr.value_counts().div(64722)
print(rel_occurr_res[:20])
print(rel_occurr_gen[:20])

mutation  gene
A1075A    rpoB    0.581583
A542A     rpoC    0.136843
E1092D    rpoC    0.115889
G594E     rpoC    0.088568
G876G     rpoB    0.069277
c-61t     rpoB    0.066616
V483G     rpoC    0.058728
D103D     rpoB    0.049463
I491V     rpoC    0.031645
V483A     rpoC    0.028461
R173R     rpoC    0.028271
A172V     rpoC    0.027986
I491T     rpoC    0.021714
P1040R    rpoC    0.019006
F452S     rpoC    0.016440
C62C      rpoC    0.016013
E761D     rpoB    0.014445
P601L     rpoC    0.011784
E561E     rpoB    0.010881
L731P     rpoB    0.010738
dtype: float64
mutation  gene
A1075A    rpoB    0.577856
G594E     rpoC    0.163932
A542A     rpoC    0.162912
G876G     rpoB    0.142703
c-61t     rpoB    0.135224
D103D     rpoB    0.104400
R173R     rpoC    0.103643
A172V     rpoC    0.101496
E1092D    rpoC    0.061818
P601L     rpoC    0.060180
P54P      rpoC    0.020828
V483G     rpoC    0.019205
V228V     rpoB    0.016795
A621T     rpoC    0.015142
E319K     rpoA    0.014400
I491V     

In [164]:
concat_rel_occurr=pandas.DataFrame()
concat_rel_occurr['resistance']=rel_occurr_res
concat_rel_occurr['general']=rel_occur_gen
concat_rel_occurr

Unnamed: 0_level_0,Unnamed: 1_level_0,resistance,general
mutation,gene,Unnamed: 2_level_1,Unnamed: 3_level_1
A1075A,rpoB,0.581583,
A542A,rpoC,0.136843,
E1092D,rpoC,0.115889,
G594E,rpoC,0.088568,
G876G,rpoB,0.069277,
...,...,...,...
R665W,rpoB,0.000048,
L181I,rpoC,0.000048,
L194L,rpoA,0.000048,
F1175C,rpoC,0.000048,
