In [17]:
import gumpy, numpy, copy

In [16]:
# Show the numpy version this notebook was run with (recorded in the output below).
numpy.__version__

'1.25.2'

In [4]:
# Load the reference genome from a GenBank file. The path is relative, so the
# file must be in the notebook's working directory.
# NOTE(review): NC_000962.3 is presumably the M. tuberculosis H37Rv reference — confirm.
reference = gumpy.Genome('NC_000962.3.gbk')

In [5]:
# Build the pncA gene object from the reference genome. As its summary below
# shows, it holds 601 nucleotides numbered from -40 and 187 amino-acid positions.
pnca_reference = reference.build_gene('pncA')

`__repr__` is overloaded, so evaluating the gene object on its own displays a readable summary

In [6]:
# Evaluating the gene as the cell's last expression displays its summary.
pnca_reference

pncA gene
601 nucleotides, codes for protein
['g' 'c' 'a' ... 'c' 'g' 't']
[-40 -39 -38 ...  -3  -2  -1]
['M' 'R' 'A' ... 'S' 'S' '!']
[  1   2   3 ... 185 186 187]

The amino acid sequence is natively exposed as a `numpy` array

In [7]:
# One single-letter entry per amino-acid position; displayed as a numpy array.
pnca_reference.amino_acid_sequence

array(['M', 'R', 'A', 'L', 'I', 'I', 'V', 'D', 'V', 'Q', 'N', 'D', 'F',
       'C', 'E', 'G', 'G', 'S', 'L', 'A', 'V', 'T', 'G', 'G', 'A', 'A',
       'L', 'A', 'R', 'A', 'I', 'S', 'D', 'Y', 'L', 'A', 'E', 'A', 'A',
       'D', 'Y', 'H', 'H', 'V', 'V', 'A', 'T', 'K', 'D', 'F', 'H', 'I',
       'D', 'P', 'G', 'D', 'H', 'F', 'S', 'G', 'T', 'P', 'D', 'Y', 'S',
       'S', 'S', 'W', 'P', 'P', 'H', 'C', 'V', 'S', 'G', 'T', 'P', 'G',
       'A', 'D', 'F', 'H', 'P', 'S', 'L', 'D', 'T', 'S', 'A', 'I', 'E',
       'A', 'V', 'F', 'Y', 'K', 'G', 'A', 'Y', 'T', 'G', 'A', 'Y', 'S',
       'G', 'F', 'E', 'G', 'V', 'D', 'E', 'N', 'G', 'T', 'P', 'L', 'L',
       'N', 'W', 'L', 'R', 'Q', 'R', 'G', 'V', 'D', 'E', 'V', 'D', 'V',
       'V', 'G', 'I', 'A', 'T', 'D', 'H', 'C', 'V', 'R', 'Q', 'T', 'A',
       'E', 'D', 'A', 'V', 'R', 'N', 'G', 'L', 'A', 'T', 'R', 'V', 'L',
       'V', 'D', 'L', 'T', 'A', 'G', 'V', 'S', 'A', 'D', 'T', 'T', 'V',
       'A', 'A', 'L', 'E', 'E', 'M', 'R', 'T', 'A', 'S', 'V', 'E

Or we can join the array into a plain string

In [23]:
# str.join accepts any iterable of strings, so the array can be passed directly.
''.join(pnca_reference.amino_acid_sequence)


'MRALIIVDVQNDFCEGGSLAVTGGAALARAISDYLAEAADYHHVVATKDFHIDPGDHFSGTPDYSSSWPPHCVSGTPGADFHPSLDTSAIEAVFYKGAYTGAYSGFEGVDENGTPLLNWLRQRGVDEVDVVGIATDHCVRQTAEDAVRNGLATRVLVDLTAGVSADTTVAALEEMRTASVELVCSS!'

Because the gene has a promoter region associated with it, we have to remove that region (the nucleotides with negative numbering) to get the nucleotide sequence of the CDS

In [24]:
# Keep only the positively numbered nucleotides (dropping the upstream part,
# which carries negative numbering) and collapse them into a single string.
in_cds = pnca_reference.nucleotide_number > 0
pnca_sequence = ''.join(pnca_reference.nucleotide_sequence[in_cds])
print(pnca_sequence)

atgcgggcgttgatcatcgtcgacgtgcagaacgacttctgcgagggtggctcgctggcggtaaccggtggcgccgcgctggcccgcgccatcagcgactacctggccgaagcggcggactaccatcacgtcgtggcaaccaaggacttccacatcgacccgggtgaccacttctccggcacaccggactattcctcgtcgtggccaccgcattgcgtcagcggtactcccggcgcggacttccatcccagtctggacacgtcggcaatcgaggcggtgttctacaagggtgcctacaccggagcgtacagcggcttcgaaggagtcgacgagaacggcacgccactgctgaattggctgcggcaacgcggcgtcgatgaggtcgatgtggtcggtattgccaccgatcattgtgtgcgccagacggccgaggacgcggtacgcaatggcttggccaccagggtgctggtggacctgacagcgggtgtgtcggccgataccaccgtcgccgcgctggaggagatgcgcaccgccagcgtcgagttggtttgcagctcctga


If we want to work out all the mutations that are possible with a single SNP, we need to know the codon table

In [18]:
# Codon table laid out in t, c, a, g order: the k-th letter of `aminoacids`
# is the residue encoded by the k-th entry of `all_codons` ('!' = stop codon).
bases = ['t', 'c', 'a', 'g']
aminoacids = 'FFLLSSSSYY!!CC!WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG'

# Enumerate all 64 codons with the first base varying slowest and the third
# base varying fastest, which is the ordering `aminoacids` is written in.
all_codons = numpy.array([
    bases[k // 16] + bases[(k // 4) % 4] + bases[k % 4]
    for k in range(64)
])
codon_to_amino_acid = {codon: residue for codon, residue in zip(all_codons, aminoacids)}

Note that the code below does not allow a stop codon (here `!`) to be introduced

In [27]:
def single_snp_amino_acids(sequence, codon_table, alphabet, allow_stop=False):
    """Map each codon position to the amino acids reachable by one base change.

    Parameters
    ----------
    sequence : str
        Coding sequence, read in-frame from its first character. Its length
        must be a multiple of three.
    codon_table : mapping
        Maps every three-letter codon to a one-letter amino acid code, with
        '!' denoting a stop codon.
    alphabet : iterable of str
        The nucleotides that may be substituted in at each codon position.
    allow_stop : bool, optional
        When False (the default) a substitution that creates a stop codon
        ('!') is not reported.

    Returns
    -------
    dict
        Keys are 1-based codon positions in sequence order. Each value is a
        list of the distinct amino acids, different from the reference one,
        that a single-nucleotide substitution can produce, in order of first
        discovery (codon position 0 to 2, then `alphabet` order).

    Raises
    ------
    ValueError
        If `sequence` cannot be split into whole codons.
    KeyError
        If a codon is missing from `codon_table`.
    """
    if len(sequence) % 3 != 0:
        # Fail early with a clear message; otherwise the trailing partial
        # codon would only surface later as an obscure KeyError.
        raise ValueError(
            "sequence length (%d) is not a multiple of 3" % len(sequence)
        )

    reachable_by_position = {}

    for position, start in enumerate(range(0, len(sequence), 3), start=1):
        codon = sequence[start:start + 3]
        reference_amino_acid = codon_table[codon]
        reachable = []

        for pos in range(3):
            for base in alphabet:
                if base == codon[pos]:
                    # Not a mutation: same nucleotide as the reference.
                    continue

                new_codon = codon[:pos] + base + codon[pos + 1:]
                new_amino_acid = codon_table[new_codon]

                if new_amino_acid == reference_amino_acid:
                    # Synonymous change.
                    continue
                if new_amino_acid == "!" and not allow_stop:
                    continue
                if new_amino_acid not in reachable:
                    reachable.append(new_amino_acid)

        reachable_by_position[position] = reachable

    return reachable_by_position


# position -> list of amino acids reachable by a single SNP (no stop codons)
possible_mutations = single_snp_amino_acids(pnca_sequence, codon_to_amino_acid, bases)

# Parallel lists, in codon order: the 1-based amino acid position and the
# number of distinct amino acids reachable at that position.
amino_acid_position = list(possible_mutations)
maximum_amino_acids = [len(reachable) for reachable in possible_mutations.values()]

In [28]:
# Display the mapping of amino acid position -> amino acids reachable by one SNP.
possible_mutations

{1: ['L', 'V', 'T', 'K', 'R', 'I'],
 2: ['W', 'G', 'L', 'P', 'Q'],
 3: ['S', 'P', 'T', 'V', 'E', 'G'],
 4: ['M', 'V', 'S', 'W', 'F'],
 5: ['F', 'L', 'V', 'T', 'N', 'S', 'M'],
 6: ['F', 'L', 'V', 'T', 'N', 'S', 'M'],
 7: ['F', 'L', 'I', 'A', 'D', 'G'],
 8: ['Y', 'H', 'N', 'V', 'A', 'G', 'E'],
 9: ['L', 'M', 'A', 'E', 'G'],
 10: ['K', 'E', 'L', 'P', 'R', 'H'],
 11: ['Y', 'H', 'D', 'I', 'T', 'S', 'K'],
 12: ['Y', 'H', 'N', 'V', 'A', 'G', 'E'],
 13: ['L', 'I', 'V', 'S', 'Y', 'C'],
 14: ['R', 'S', 'G', 'F', 'Y', 'W'],
 15: ['Q', 'K', 'V', 'A', 'G', 'D'],
 16: ['C', 'R', 'S', 'V', 'A', 'D'],
 17: ['C', 'R', 'S', 'V', 'A', 'D'],
 18: ['P', 'T', 'A', 'L', 'W'],
 19: ['M', 'V', 'P', 'Q', 'R'],
 20: ['S', 'P', 'T', 'V', 'E', 'G'],
 21: ['L', 'I', 'A', 'E', 'G'],
 22: ['S', 'P', 'A', 'I', 'N'],
 23: ['C', 'R', 'S', 'V', 'A', 'D'],
 24: ['C', 'R', 'S', 'V', 'A', 'D'],
 25: ['S', 'P', 'T', 'V', 'D', 'G'],
 26: ['S', 'P', 'T', 'V', 'E', 'G'],
 27: ['M', 'V', 'P', 'Q', 'R'],
 28: ['S', 'P', 'T', 'V',