In [1]:
import gumpy, numpy, copy, piezo

In [2]:
# Record the numpy and gumpy versions this notebook was run with
numpy.__version__, gumpy.__version__

('1.25.2', '1.2.4')

The key object in `gumpy` is the `Genome`, which you instantiate from a GenBank file -- this can take 1-2 min

In [3]:
# GenBank file describing the reference genome; parsing it can take 1-2 min
GENBANK_PATH = 'NC_000962.3.gbk'
reference = gumpy.Genome(GENBANK_PATH)

In [4]:
# Bare expression: the notebook displays the Genome's summary representation
reference

NC_000962
NC_000962.3
Mycobacterium tuberculosis H37Rv, complete genome
4411532 bases
ttgacc...acgtcg
metadata for all genes/loci have been included

One of the things you can do with a `Genome` object is build `Gene` object(s)

In [5]:
# Build a Gene object for pncA from the reference genome
pnca_reference = reference.build_gene('pncA')

`__repr__`  is overloaded so you can do this

In [6]:
# Displays the Gene summary: nucleotides with their numbering, then amino acids with theirs
pnca_reference

pncA gene
601 nucleotides, codes for protein
['g' 'c' 'a' ... 'c' 'g' 't']
[-40 -39 -38 ...  -3  -2  -1]
['M' 'R' 'A' ... 'S' 'S' '!']
[  1   2   3 ... 185 186 187]

Various properties, such as the amino acid sequence, are natively exposed as `numpy` arrays

In [7]:
# The protein sequence as a numpy array of one-letter amino acid codes
pnca_reference.amino_acid_sequence

array(['M', 'R', 'A', 'L', 'I', 'I', 'V', 'D', 'V', 'Q', 'N', 'D', 'F',
       'C', 'E', 'G', 'G', 'S', 'L', 'A', 'V', 'T', 'G', 'G', 'A', 'A',
       'L', 'A', 'R', 'A', 'I', 'S', 'D', 'Y', 'L', 'A', 'E', 'A', 'A',
       'D', 'Y', 'H', 'H', 'V', 'V', 'A', 'T', 'K', 'D', 'F', 'H', 'I',
       'D', 'P', 'G', 'D', 'H', 'F', 'S', 'G', 'T', 'P', 'D', 'Y', 'S',
       'S', 'S', 'W', 'P', 'P', 'H', 'C', 'V', 'S', 'G', 'T', 'P', 'G',
       'A', 'D', 'F', 'H', 'P', 'S', 'L', 'D', 'T', 'S', 'A', 'I', 'E',
       'A', 'V', 'F', 'Y', 'K', 'G', 'A', 'Y', 'T', 'G', 'A', 'Y', 'S',
       'G', 'F', 'E', 'G', 'V', 'D', 'E', 'N', 'G', 'T', 'P', 'L', 'L',
       'N', 'W', 'L', 'R', 'Q', 'R', 'G', 'V', 'D', 'E', 'V', 'D', 'V',
       'V', 'G', 'I', 'A', 'T', 'D', 'H', 'C', 'V', 'R', 'Q', 'T', 'A',
       'E', 'D', 'A', 'V', 'R', 'N', 'G', 'L', 'A', 'T', 'R', 'V', 'L',
       'V', 'D', 'L', 'T', 'A', 'G', 'V', 'S', 'A', 'D', 'T', 'T', 'V',
       'A', 'A', 'L', 'E', 'E', 'M', 'R', 'T', 'A', 'S', 'V', 'E

Or we can make a simple string

In [8]:
# str.join accepts the array of one-letter codes directly; no generator needed
''.join(pnca_reference.amino_acid_sequence)


'MRALIIVDVQNDFCEGGSLAVTGGAALARAISDYLAEAADYHHVVATKDFHIDPGDHFSGTPDYSSSWPPHCVSGTPGADFHPSLDTSAIEAVFYKGAYTGAYSGFEGVDENGTPLLNWLRQRGVDEVDVVGIATDHCVRQTAEDAVRNGLATRVLVDLTAGVSADTTVAALEEMRTASVELVCSS!'

Because the gene has a promoter region associated with it, we have to remove that to get the nucleotide sequence of the CDS

In [9]:
# Promoter bases carry negative numbering, so a positive nucleotide number
# selects only the coding sequence.
is_cds = pnca_reference.nucleotide_number > 0
pnca_sequence = ''.join(pnca_reference.nucleotide_sequence[is_cds])
print(pnca_sequence)

atgcgggcgttgatcatcgtcgacgtgcagaacgacttctgcgagggtggctcgctggcggtaaccggtggcgccgcgctggcccgcgccatcagcgactacctggccgaagcggcggactaccatcacgtcgtggcaaccaaggacttccacatcgacccgggtgaccacttctccggcacaccggactattcctcgtcgtggccaccgcattgcgtcagcggtactcccggcgcggacttccatcccagtctggacacgtcggcaatcgaggcggtgttctacaagggtgcctacaccggagcgtacagcggcttcgaaggagtcgacgagaacggcacgccactgctgaattggctgcggcaacgcggcgtcgatgaggtcgatgtggtcggtattgccaccgatcattgtgtgcgccagacggccgaggacgcggtacgcaatggcttggccaccagggtgctggtggacctgacagcgggtgtgtcggccgataccaccgtcgccgcgctggaggagatgcgcaccgccagcgtcgagttggtttgcagctcctga


If we want to work out all the mutations possible via a single SNP, we need to know the codon table

In [10]:
# Standard codon table. The amino acid string is ordered to match codons
# generated with the bases in t, c, a, g order (first base varying slowest);
# '!' marks a stop codon.
aminoacids = 'FFLLSSSSYY!!CC!WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG'
bases = list('tcag')
all_codons = numpy.array([
    first + second + third
    for first in bases
    for second in bases
    for third in bases
])
codon_to_amino_acid = {
    codon: residue for codon, residue in zip(all_codons, aminoacids)
}

Note that the code below won't allow a stop codon (here `!`) to be introduced

In [11]:
# For every codon of the CDS, collect the amino acids reachable by changing
# exactly one nucleotide. Synonymous changes and changes that introduce a stop
# codon ('!') are excluded; each amino acid is listed once, in first-seen order.
index = 1  # 1-based codon number, used as the dict key

maximum_amino_acids = []  # number of distinct amino acid changes at each codon
amino_acid_position = []  # codon number, parallel to maximum_amino_acids
possible_mutations = {}   # codon number -> list of reachable amino acids

for start in range(0, len(pnca_sequence), 3):
    codon = pnca_sequence[start:start + 3]
    aminoacid = codon_to_amino_acid[codon]

    # Every codon one substitution away: positions 0..2 in turn and, within
    # each position, the alternative bases in the order given by `bases`.
    neighbours = [
        codon[:pos] + base + codon[pos + 1:]
        for pos in range(3)
        for base in bases
        if base != codon[pos]
    ]
    candidates = [codon_to_amino_acid[neighbour] for neighbour in neighbours]

    # dict.fromkeys de-duplicates while preserving first-seen order
    mutations = list(dict.fromkeys(
        residue for residue in candidates
        if residue != aminoacid and residue != "!"
    ))

    possible_mutations[index] = mutations
    maximum_amino_acids.append(len(mutations))
    amino_acid_position.append(index)

    index += 1

This gives us a `dict` of all the possible amino acid mutations that can arise at each position through a single nucleotide change

In [12]:
# Codon number -> amino acids reachable through a single nucleotide change
possible_mutations

{1: ['L', 'V', 'T', 'K', 'R', 'I'],
 2: ['W', 'G', 'L', 'P', 'Q'],
 3: ['S', 'P', 'T', 'V', 'E', 'G'],
 4: ['M', 'V', 'S', 'W', 'F'],
 5: ['F', 'L', 'V', 'T', 'N', 'S', 'M'],
 6: ['F', 'L', 'V', 'T', 'N', 'S', 'M'],
 7: ['F', 'L', 'I', 'A', 'D', 'G'],
 8: ['Y', 'H', 'N', 'V', 'A', 'G', 'E'],
 9: ['L', 'M', 'A', 'E', 'G'],
 10: ['K', 'E', 'L', 'P', 'R', 'H'],
 11: ['Y', 'H', 'D', 'I', 'T', 'S', 'K'],
 12: ['Y', 'H', 'N', 'V', 'A', 'G', 'E'],
 13: ['L', 'I', 'V', 'S', 'Y', 'C'],
 14: ['R', 'S', 'G', 'F', 'Y', 'W'],
 15: ['Q', 'K', 'V', 'A', 'G', 'D'],
 16: ['C', 'R', 'S', 'V', 'A', 'D'],
 17: ['C', 'R', 'S', 'V', 'A', 'D'],
 18: ['P', 'T', 'A', 'L', 'W'],
 19: ['M', 'V', 'P', 'Q', 'R'],
 20: ['S', 'P', 'T', 'V', 'E', 'G'],
 21: ['L', 'I', 'A', 'E', 'G'],
 22: ['S', 'P', 'A', 'I', 'N'],
 23: ['C', 'R', 'S', 'V', 'A', 'D'],
 24: ['C', 'R', 'S', 'V', 'A', 'D'],
 25: ['S', 'P', 'T', 'V', 'D', 'G'],
 26: ['S', 'P', 'T', 'V', 'E', 'G'],
 27: ['M', 'V', 'P', 'Q', 'R'],
 28: ['S', 'P', 'T', 'V',

We don't have to use `piezo` to load the resistance catalogue as it is just a CSV file but it does create some extra columns for us which makes it easier to select the rows we want

In [13]:
# Resistance catalogue CSV; the rules table is shown below
CATALOGUE_PATH = 'NC_000962.3_WHO-UCN-GTB-PCI-2021.7_v1.0_GARC1_RUS.csv'
catalogue = piezo.ResistanceCatalogue(CATALOGUE_PATH)
catalogue.catalogue.rules

Unnamed: 0.1,DRUG,MUTATION,PREDICTION,SOURCE,EVIDENCE,OTHER,Unnamed: 0,GENE,POSITION,MUTATION_AFFECTS,MUTATION_TYPE,MINOR
0,RIF,1274_del_cggcac,R,{},"{'Present_SOLO_R': 0, 'Present_SOLO_SR': 1, 'P...",{'FINAL_CONFIDENCE_GRADING': '2) Assoc w R - I...,,rpoB,1274,CDS,INDEL,
1,RIF,1277_del_caccagcca,R,{},"{'Present_SOLO_R': 1, 'Present_SOLO_SR': 1, 'P...",{'FINAL_CONFIDENCE_GRADING': '2) Assoc w R - I...,,rpoB,1277,CDS,INDEL,
2,RIF,1278_del_accagc,R,{},"{'Present_SOLO_R': 1, 'Present_SOLO_SR': 1, 'P...",{'FINAL_CONFIDENCE_GRADING': '2) Assoc w R - I...,,rpoB,1278,CDS,INDEL,
3,RIF,1278_del_accagccagctg,R,{},"{'Present_SOLO_R': 0, 'Present_SOLO_SR': 0, 'P...",{'FINAL_CONFIDENCE_GRADING': '2) Assoc w R - I...,,rpoB,1278,CDS,INDEL,
4,RIF,1279_del_ccagcc,R,{},"{'Present_SOLO_R': 0, 'Present_SOLO_SR': 0, 'P...",{'FINAL_CONFIDENCE_GRADING': '2) Assoc w R - I...,,rpoB,1279,CDS,INDEL,
...,...,...,...,...,...,...,...,...,...,...,...,...
1685,STM,del_0.0,U,{},{},{},,rpsL,,GENE,INDEL,
1686,ETH,del_0.0,R,{},{},{},,ethA,,GENE,INDEL,
1687,CAP,del_0.0,U,{},{},{},,tlyA,,GENE,INDEL,
1688,ETH,del_0.0,U,{},{},{},,fabG1,,GENE,INDEL,


But the catalogue includes a wide range of rules, including variants known to be associated with `S`, as well as wildcard rules, etc.

To start with, let's keep it simple and focus on just the SNPs in the CDS which confer resistance to a drug

In [14]:
# Keep only rules that predict resistance ('R') for a SNP in a coding
# sequence, dropping rules whose POSITION is the '*' wildcard.
rules = catalogue.catalogue.rules
is_specific_resistance_snp = (
    (rules.PREDICTION == 'R')
    & (rules.MUTATION_TYPE == 'SNP')
    & (rules.MUTATION_AFFECTS == 'CDS')
    & (rules.POSITION != '*')
)
specifc_resistance_mutations = rules[is_specific_resistance_snp]

In [15]:
# Display the filtered resistance rules
specifc_resistance_mutations

Unnamed: 0.1,DRUG,MUTATION,PREDICTION,SOURCE,EVIDENCE,OTHER,Unnamed: 0,GENE,POSITION,MUTATION_AFFECTS,MUTATION_TYPE,MINOR
38,RIF,A451G,R,{},"{'Present_SOLO_R': 0, 'Present_SOLO_SR': 0, 'P...",{'FINAL_CONFIDENCE_GRADING': '2) Assoc w R - I...,,rpoB,451,CDS,SNP,
39,RIF,A451V,R,{},"{'Present_SOLO_R': 0, 'Present_SOLO_SR': 4, 'P...",{'FINAL_CONFIDENCE_GRADING': '2) Assoc w R - I...,,rpoB,451,CDS,SNP,
40,RIF,D435A,R,{},"{'Present_SOLO_R': 0, 'Present_SOLO_SR': 1, 'P...",{'FINAL_CONFIDENCE_GRADING': '2) Assoc w R - I...,,rpoB,435,CDS,SNP,
41,RIF,D435E,R,{},"{'Present_SOLO_R': 0, 'Present_SOLO_SR': 0, 'P...",{'FINAL_CONFIDENCE_GRADING': '2) Assoc w R - I...,,rpoB,435,CDS,SNP,
42,RIF,D435F,R,{},"{'Present_SOLO_R': 35, 'Present_SOLO_SR': 38, ...",{'FINAL_CONFIDENCE_GRADING': '1) Assoc w R'},,rpoB,435,CDS,SNP,
...,...,...,...,...,...,...,...,...,...,...,...,...
1347,RIF,A451?,R,{},{},{},,rpoB,451,CDS,SNP,
1348,RIF,L452?,R,{},{},{},,rpoB,452,CDS,SNP,
1438,PZA,H71D,R,{},{},{},,pncA,71,CDS,SNP,
1439,PZA,L116R,R,{},{},{},,pncA,116,CDS,SNP,


Now we can subset further to only those mutations in *pncA* which confer resistance to PZA.

In [16]:
# Narrow the resistance SNP rules to pyrazinamide (PZA) rules in pncA
is_pza = specifc_resistance_mutations.DRUG == 'PZA'
is_pnca = specifc_resistance_mutations.GENE == 'pncA'
pnca_resistant_mutations = specifc_resistance_mutations[is_pza & is_pnca]
pnca_resistant_mutations

Unnamed: 0.1,DRUG,MUTATION,PREDICTION,SOURCE,EVIDENCE,OTHER,Unnamed: 0,GENE,POSITION,MUTATION_AFFECTS,MUTATION_TYPE,MINOR
458,PZA,A102P,R,{},"{'Present_SOLO_R': 5, 'Present_SOLO_SR': 6, 'P...",{'FINAL_CONFIDENCE_GRADING': '1) Assoc w R'},,pncA,102,CDS,SNP,
459,PZA,A134V,R,{},"{'Present_SOLO_R': 12, 'Present_SOLO_SR': 13, ...",{'FINAL_CONFIDENCE_GRADING': '1) Assoc w R'},,pncA,134,CDS,SNP,
460,PZA,A143G,R,{},"{'Present_SOLO_R': 5, 'Present_SOLO_SR': 5, 'P...",{'FINAL_CONFIDENCE_GRADING': '1) Assoc w R'},,pncA,143,CDS,SNP,
461,PZA,A146T,R,{},"{'Present_SOLO_R': 7, 'Present_SOLO_SR': 9, 'P...",{'FINAL_CONFIDENCE_GRADING': '1) Assoc w R'},,pncA,146,CDS,SNP,
462,PZA,A146V,R,{},"{'Present_SOLO_R': 15, 'Present_SOLO_SR': 15, ...",{'FINAL_CONFIDENCE_GRADING': '1) Assoc w R'},,pncA,146,CDS,SNP,
...,...,...,...,...,...,...,...,...,...,...,...,...
623,PZA,Y64D,R,{},"{'Present_SOLO_R': 2, 'Present_SOLO_SR': 4, 'P...",{'FINAL_CONFIDENCE_GRADING': '2) Assoc w R - I...,,pncA,64,CDS,SNP,
624,PZA,Y99!,R,{},"{'Present_SOLO_R': 2, 'Present_SOLO_SR': 2, 'P...",{'FINAL_CONFIDENCE_GRADING': '2) Assoc w R - I...,,pncA,99,CDS,SNP,
1438,PZA,H71D,R,{},{},{},,pncA,71,CDS,SNP,
1439,PZA,L116R,R,{},{},{},,pncA,116,CDS,SNP,


In [17]:
# Plain Python list of the mutation labels
pnca_resistant_mutations['MUTATION'].tolist()

['A102P',
 'A134V',
 'A143G',
 'A146T',
 'A146V',
 'A171E',
 'A3E',
 'A46E',
 'A46V',
 'C138R',
 'C14!',
 'C14R',
 'C72R',
 'C72Y',
 'D12A',
 'D12E',
 'D12G',
 'D12N',
 'D49A',
 'D49E',
 'D49G',
 'D49N',
 'D63A',
 'D63G',
 'D8A',
 'D8E',
 'D8G',
 'D8N',
 'E15!',
 'E181!',
 'E91!',
 'F106S',
 'F13I',
 'F13L',
 'F58L',
 'F81V',
 'F94C',
 'F94L',
 'F94S',
 'G105D',
 'G105V',
 'G108R',
 'G132A',
 'G132D',
 'G132S',
 'G162D',
 'G17D',
 'G24D',
 'G97C',
 'G97D',
 'G97R',
 'G97S',
 'G97V',
 'H51D',
 'H51P',
 'H51Q',
 'H51R',
 'H51Y',
 'H57D',
 'H57R',
 'H57Y',
 'H71P',
 'H71R',
 'H71Y',
 'H82R',
 'I133T',
 'I31S',
 'I5S',
 'I6T',
 'I90S',
 'I90T',
 'K48E',
 'K48T',
 'K96E',
 'K96Q',
 'K96R',
 'K96T',
 'L116P',
 'L120P',
 'L120Q',
 'L120R',
 'L151S',
 'L159R',
 'L172P',
 'L172R',
 'L182S',
 'L182W',
 'L19P',
 'L27P',
 'L4S',
 'L4W',
 'L85P',
 'L85R',
 'M175T',
 'M175V',
 'M1T',
 'P54L',
 'P54Q',
 'P62L',
 'P62S',
 'P62T',
 'P69L',
 'Q10!',
 'Q10H',
 'Q10P',
 'Q10R',
 'Q122!',
 'Q141!',
 'Q141P