# tb-rnap-compensation

In [1]:
import pandas

# Show (up to) 999 columns so the wide tables below are not truncated on display.
pandas.set_option("display.max_columns", 999)

Let's load in the `MUTATIONS` table and have a look. Importantly, this table also records `NULL`s (where there are no reads at an amino acid, so we have no evidence of what is there) and `FILTER_FAIL`s (where there is some evidence but not enough to be statistically significant). These need excluding; the cell below also drops rows flagged `IS_HET`.

In [5]:
# Read the mutation table and move its index into ordinary columns.
# NOTE(review): unpickling can execute arbitrary code — only load tables from a trusted source.
MUTATIONS = pandas.read_pickle('tables/MUTATIONS.pkl.gz').reset_index()

# Keep rows that pass the filter and are flagged neither IS_HET nor IS_NULL.
MUTATIONS = MUTATIONS.loc[
    MUTATIONS["IS_FILTER_PASS"] & ~MUTATIONS["IS_HET"] & ~MUTATIONS["IS_NULL"]
]
MUTATIONS.head(4)

Unnamed: 0,UNIQUEID,GENE,MUTATION,POSITION,AMINO_ACID_NUMBER,GENOME_INDEX,NUCLEOTIDE_NUMBER,REF,ALT,IS_SNP,IS_INDEL,IN_CDS,IN_PROMOTER,IS_SYNONYMOUS,IS_NONSYNONYMOUS,IS_HET,IS_NULL,IS_FILTER_PASS,ELEMENT_TYPE,MUTATION_TYPE,INDEL_LENGTH,INDEL_1,INDEL_2,SITEID,NUMBER_NUCLEOTIDE_CHANGES
0,site.02.subj.0958.lab.22A197.iso.1,rpoB,P45S,45.0,45.0,,,ccg,tcg,True,False,True,False,False,True,False,False,True,GENE,AAM,,,,2,1
1,site.02.subj.0958.lab.22A197.iso.1,rpoB,S450L,450.0,450.0,,,tcg,ttg,True,False,True,False,False,True,False,False,True,GENE,AAM,,,,2,1
2,site.02.subj.0958.lab.22A197.iso.1,rpoB,A1075A,1075.0,1075.0,,,gct,gcc,True,False,True,False,True,False,False,False,True,GENE,AAM,,,,2,1
3,site.02.subj.0958.lab.22A197.iso.1,rpoC,D271E,271.0,271.0,,,gac,gag,True,False,True,False,False,True,False,False,True,GENE,AAM,,,,2,1


To get a quick feel for the amount of reversion that may be happening, let's do a quick cross-tab of gene against the number of nucleotide changes in each mutated codon.

In [29]:
# Rows: gene; columns: number of nucleotide changes; cells: how many mutation rows fall in each.
pandas.crosstab(
    index=MUTATIONS["GENE"],
    columns=MUTATIONS["NUMBER_NUCLEOTIDE_CHANGES"],
)

NUMBER_NUCLEOTIDE_CHANGES,0,1,2,3
GENE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
rpoA,219,15384,429,41
rpoB,9491,115155,3053,911
rpoC,1175,101868,3631,1108
rpoZ,102,1593,32,1
sigA,343,13279,395,79


Sure enough there are a good number of codons in `rpoB` and `rpoC` which have two or three bases different to the reference genome.

Let's look at those where two bases are different, as they fit our hypothesis (three are harder to explain!).

In [7]:
# Every row whose NUMBER_NUCLEOTIDE_CHANGES is exactly two.
MUTATIONS.query("NUMBER_NUCLEOTIDE_CHANGES == 2")

Unnamed: 0,UNIQUEID,GENE,MUTATION,POSITION,AMINO_ACID_NUMBER,GENOME_INDEX,NUCLEOTIDE_NUMBER,REF,ALT,IS_SNP,IS_INDEL,IN_CDS,IN_PROMOTER,IS_SYNONYMOUS,IS_NONSYNONYMOUS,IS_HET,IS_NULL,IS_FILTER_PASS,ELEMENT_TYPE,MUTATION_TYPE,INDEL_LENGTH,INDEL_1,INDEL_2,SITEID,NUMBER_NUCLEOTIDE_CHANGES
817,site.02.subj.0926.lab.22A161.iso.1,rpoB,S450F,450.0,450.0,,,tcg,ttc,True,False,True,False,False,True,False,False,True,GENE,AAM,,,,02,2
859,site.02.subj.0893.lab.22A127.iso.1,rpoB,D435F,435.0,435.0,,,gac,ttc,True,False,True,False,False,True,False,False,True,GENE,AAM,,,,02,2
1022,site.02.subj.0197.lab.2013221241.iso.1,sigA,A55S,55.0,55.0,,,gcc,tcg,True,False,True,False,False,True,False,False,True,GENE,AAM,,,,02,2
1506,site.02.subj.0074.lab.22A026.iso.1,rpoB,H445C,445.0,445.0,,,cac,tgc,True,False,True,False,False,True,False,False,True,GENE,AAM,,,,02,2
2244,site.05.subj.LR-2335.lab.FN-01418-18.iso.1,rpoB,S450M,450.0,450.0,,,tcg,atg,True,False,True,False,False,True,False,False,True,GENE,AAM,,,,05,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
515444,site.10.subj.YA00134694.lab.YA00134694.iso.1,rpoB,H445G,445.0,445.0,,,cac,ggc,True,False,True,False,False,True,False,False,True,GENE,AAM,,,,10,2
516308,site.10.subj.YA00128989.lab.YA00128989.iso.1,rpoB,S441V,441.0,441.0,,,tcg,gtg,True,False,True,False,False,True,False,False,True,GENE,AAM,,,,10,2
516789,site.10.subj.YA00022288.lab.YA00022288.iso.1,rpoB,H445G,445.0,445.0,,,cac,ggc,True,False,True,False,False,True,False,False,True,GENE,AAM,,,,10,2
517411,site.10.subj.LA00835602.lab.LA00835602.iso.1,rpoB,S450Q,450.0,450.0,,,tcg,cag,True,False,True,False,False,True,False,False,True,GENE,AAM,,,,10,2


In [31]:
# First five rows whose NUMBER_NUCLEOTIDE_CHANGES is exactly two.
MUTATIONS.loc[MUTATIONS["NUMBER_NUCLEOTIDE_CHANGES"] == 2].head(5)

Unnamed: 0,UNIQUEID,GENE,MUTATION,POSITION,AMINO_ACID_NUMBER,GENOME_INDEX,NUCLEOTIDE_NUMBER,REF,ALT,IS_SNP,IS_INDEL,IN_CDS,IN_PROMOTER,IS_SYNONYMOUS,IS_NONSYNONYMOUS,IS_HET,IS_NULL,IS_FILTER_PASS,ELEMENT_TYPE,MUTATION_TYPE,INDEL_LENGTH,INDEL_1,INDEL_2,SITEID,NUMBER_NUCLEOTIDE_CHANGES
817,site.02.subj.0926.lab.22A161.iso.1,rpoB,S450F,450.0,450.0,,,tcg,ttc,True,False,True,False,False,True,False,False,True,GENE,AAM,,,,2,2
859,site.02.subj.0893.lab.22A127.iso.1,rpoB,D435F,435.0,435.0,,,gac,ttc,True,False,True,False,False,True,False,False,True,GENE,AAM,,,,2,2
1022,site.02.subj.0197.lab.2013221241.iso.1,sigA,A55S,55.0,55.0,,,gcc,tcg,True,False,True,False,False,True,False,False,True,GENE,AAM,,,,2,2
1506,site.02.subj.0074.lab.22A026.iso.1,rpoB,H445C,445.0,445.0,,,cac,tgc,True,False,True,False,False,True,False,False,True,GENE,AAM,,,,2,2
2244,site.05.subj.LR-2335.lab.FN-01418-18.iso.1,rpoB,S450M,450.0,450.0,,,tcg,atg,True,False,True,False,False,True,False,False,True,GENE,AAM,,,,5,2


In [12]:
# Restrict to rpoB rows with exactly two nucleotide changes.
df = MUTATIONS.loc[
    MUTATIONS["GENE"].eq('rpoB') & MUTATIONS["NUMBER_NUCLEOTIDE_CHANGES"].eq(2)
]
# Tally each mutation, then display only those observed more than 50 times.
a = df["MUTATION"].value_counts()
a.loc[a > 50]



S450F    182
H656A    134
E640K    134
R662H     97
V243T     87
D435F     82
A670E     73
H445C     70
D545K     69
H445S     63
L443L     59
S641A     53
S431S     52
Name: MUTATION, dtype: int64

In [14]:
# Tally the rpoB mutations at amino acid 450 that involve exactly two nucleotide changes.
(
    MUTATIONS
    .loc[
        (MUTATIONS["GENE"] == 'rpoB')
        & (MUTATIONS["NUMBER_NUCLEOTIDE_CHANGES"] == 2)
        & (MUTATIONS["AMINO_ACID_NUMBER"] == 450),
        "MUTATION",
    ]
    .value_counts()
)

S450F    182
S450Q     15
S450M      9
S450V      7
S450Y      7
S450C      6
S450G      5
S450L      1
Name: MUTATION, dtype: int64

In [17]:
# Read the effects (catalogue prediction) table and move its index into ordinary columns.
# NOTE(review): unpickling can execute arbitrary code — only load tables from a trusted source.
EFFECTS = pandas.read_pickle('tables/EFFECTS.pkl.gz').reset_index()
EFFECTS.head(5)

Unnamed: 0,UNIQUEID,DRUG,GENE,MUTATION,CATALOGUE_NAME,CATALOGUE_VERSION,CATALOGUE_GRAMMAR,SITEID,PREDICTION,DEFAULT_CATALOGUE
0,site.02.subj.0958.lab.22A197.iso.1,RFB,rpoB,P45S,CRyPTIC,v1.31,GARC1,2,U,True
1,site.02.subj.0958.lab.22A197.iso.1,RIF,rpoB,P45S,CRyPTIC,v1.31,GARC1,2,U,True
2,site.02.subj.0958.lab.22A197.iso.1,RFB,rpoB,S450L,CRyPTIC,v1.31,GARC1,2,U,True
3,site.02.subj.0958.lab.22A197.iso.1,RIF,rpoB,S450L,CRyPTIC,v1.31,GARC1,2,R,True
4,site.02.subj.0958.lab.22A197.iso.1,RFB,rpoB,A1075A,CRyPTIC,v1.31,GARC1,2,S,True


In [23]:
# Count mutations predicted 'R', skipping any whose name ends in 'O' or 'X'
# (presumably filter-fail / null call suffixes in the catalogue grammar — TODO confirm).
# NOTE: `df` holds a Series of counts here, not a DataFrame.
df = (
    EFFECTS
    .loc[
        EFFECTS["PREDICTION"].eq('R')
        & ~EFFECTS["MUTATION"].str[-1].isin(['O', 'X']),
        "MUTATION",
    ]
    .value_counts()
)

In [25]:
# Show only the entries of `df` whose value exceeds 5.
df.loc[df.gt(5)]

S450L         13609
D435V          1430
H445Y           756
H445D           693
D435Y           569
L452P           549
L430P           411
S450W           304
H445L           281
I491F           232
H445N           224
D435G           212
H445R           184
S450F           182
V170F           141
D435F            82
H445C            70
H445S            63
S441L            55
Q432P            54
Q432K            54
D545E            43
Q432L            43
M434I            38
L430R            33
H445Q            32
V359A            25
N437D            22
S441Q            20
D435A            20
1296_indel       19
H445G            18
Q429L            18
S441A            16
Q429H            15
S450Q            15
A451V            14
K446Q            14
S431G            12
L449M            11
D435E            11
M434V            10
S428R             9
Q432E             9
H445P             9
S450M             9
1292_indel        8
H445T             7
T427A             7
S450Y             7
