In [5]:
# import sys
# !{sys.executable} -m pip install gumpy
# !{sys.executable} -m pip install piezo

In [3]:
import gumpy, piezo, numpy, copy, random

In [4]:
gene = 'pncA'
drug = 'PZA'
catalogue_file = 'NC_000962.3_WHO-UCN-GTB-PCI-2021.7_v1.0_GARC1_RUS.csv'

Some lookup `dict` and `list`s we will need later to translate from codons to amino acids and vice versa

In [6]:
aminoacids = 'FFLLSSSSYY!!CC!WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG'
bases = ['t', 'c', 'a', 'g']
all_codons = numpy.array([a+b+c for a in bases for b in bases for c in bases])
codon_to_amino_acid = dict(zip(all_codons, aminoacids))
amino_acid_to_codon={}
for i,j in zip(aminoacids, all_codons):
    if i in amino_acid_to_codon:
        amino_acid_to_codon[i].append(j)
    else:
        amino_acid_to_codon[i] = [j]

Here we are using a supplied catalogue to define the mutations associated with resistance

In [13]:
catalogue = piezo.ResistanceCatalogue(catalogue_file)

specific_resistance_mutations = catalogue.catalogue.rules[ (catalogue.catalogue.rules.PREDICTION=='R') &\
                                                          (catalogue.catalogue.rules.MUTATION_TYPE=='SNP') &\
                                                          (catalogue.catalogue.rules.MUTATION_AFFECTS=='CDS') &\
                                                          (catalogue.catalogue.rules.MUTATION.str[-1]!='!') &\
                                                          (catalogue.catalogue.rules.POSITION!='*')]

pnca_resistant_mutations = specific_resistance_mutations[(specific_resistance_mutations.DRUG==drug) & (specific_resistance_mutations.GENE==gene)]

In [17]:
print("For pncA there are %i mutations associated with resistance to PZA" % len(pnca_resistant_mutations) )

For pncA there are 154 mutations associated with resistance to PZA


..where as for the susceptible mutations we are simply randomly choosing a 1SNP amino acid mutation that results in a missense mutation anywhere in the protein (except at the codons where we are introducing resistance)

In [52]:
# TODO: explain susceptible mutation logic?? how can we call a 1SNP mutation anywhere in protein a susceptible mutation?

genbank_file = 'NC_000962.3.gbk'
n_samples = 10
proportion_resistant = 0.5
resistant_mutations = list(pnca_resistant_mutations.MUTATION)
susceptible_mutations = 'uniform'
n_res = 1
n_sus = 1
output = 'mutations' 
debug = False

Now let's build the `Genome` object so we can access the genes later

In [21]:
reference = gumpy.Genome(genbank_file)

We are only dealing with one gene here

In [23]:
reference_gene = reference.build_gene(gene)

In [26]:
reference_gene.amino_acid_number

array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
        27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
        40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
        53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
        66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
        79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
        92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104,
       105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117,
       118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130,
       131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
       144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156,
       157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169,
       170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 18

Because we want to produce `n_samples` we have to wrap it all in a big `for` loop...

In [53]:
for n_sample in range(n_samples):

    # let's take a deepcopy so we don't accidentally alter the reference
    sample_gene = copy.deepcopy(reference_gene)

    # WORK OUT RESISTANT MUTATIONS
    if random.random() < proportion_resistant:
        label='R'
        if debug:
            print('Resistant Sample!')
        # assuming that a Poisson distribution describes the expected number
        number_resistant = numpy.random.poisson(n_res)
    else:
        label='S'
        if debug:
            print('Susceptible Sample!')
        number_resistant = 0

    # choose the resistant mutations we want to incorporate
    selected_resistant_mutations = random.choices(resistant_mutations, k=number_resistant)

    if debug:
        print("R: ", selected_resistant_mutations)

    # first, identify the codons being mutated as we will want to avoid these for susceptible mutations
    positions_altered = []
    for mutation in selected_resistant_mutations:
        aa_pos = int(mutation[1:-1])
        positions_altered.append(aa_pos)

    # Get amino acid positions that are not altered by selected resistant mutations
    remaining_aa_positions = sample_gene.amino_acid_number[~numpy.isin(sample_gene.amino_acid_number, positions_altered)]

    # WORK OUT SUSCEPTIBLE MUTATIONS
    number_susceptible = numpy.random.poisson(n_sus)

    selected_susceptible_mutations = []

    # now randomly choose some susceptible mutations (i.e. "uniformly")
    for susceptible_codon in random.choices(remaining_aa_positions, k=number_susceptible):
        
        ref_codon = sample_gene.codons[sample_gene.amino_acid_number==susceptible_codon][0]
        ref_aa = codon_to_amino_acid[ref_codon]

        possible_alt_aa = []

        for alt_codon in codon_to_amino_acid:

            # no synoymous mutations
            if codon_to_amino_acid[alt_codon] != ref_aa:
                possible_aa = codon_to_amino_acid[alt_codon]

                # no premature Stop codons -- may want to make this a parameter in future
                if possible_aa != "!":                
                    n_snps = sum(1 for a, b in zip(ref_codon, alt_codon) if a != b)

                    # only look for SNPs and mutations not in our list of resistance associated mutations
                    if n_snps == 1 and possible_aa not in resistant_mutations:
                        possible_alt_aa.append(possible_aa)

        if debug:
            print('Possible alternate amino acid for susceptible mutation:', possible_alt_aa)

        alt_aa = random.choice(possible_alt_aa)

        alt_mutation = ref_aa + str(susceptible_codon) + alt_aa

        selected_susceptible_mutations.append(alt_mutation)

    if debug:
        print("S: ", selected_susceptible_mutations)

    selected_mutations = selected_resistant_mutations + selected_susceptible_mutations


    # INTRODUCE MUTATIONS TO GENE
    for mutation in selected_mutations:

        ref_aa = mutation[0]
        alt_aa = mutation[-1]
        aa_pos = int(mutation[1:-1])

        if debug:
            print(mutation)

        ref_codon = sample_gene.codons[sample_gene.amino_acid_number==aa_pos][0]
    
    #! Work out what this is doing?
        alt_codon=None
        for codon in amino_acid_to_codon[alt_aa]:
            counter = sum(1 for a, b in zip(ref_codon, codon) if a != b)
            if counter==1:
                alt_codon = codon
                break

        base_pos = 3*aa_pos -2
        for i,j in zip(ref_codon, alt_codon):
            if i!=j:
                ref_base = i
                alt_base = j
                break
            base_pos+=1

        assert reference_gene.nucleotide_sequence[reference_gene.nucleotide_number==base_pos][0] == ref_base

        sample_gene.nucleotide_sequence[sample_gene.nucleotide_number==base_pos] = alt_base
    #!
    
    sample_gene._translate_sequence()
    print("SAMPLE %i, LABEL %s, %i resistant mutations, %i susceptible mutations" % (n_sample, label, number_resistant, number_susceptible))
    if output == 'allele':
        sample_amino_acid_sequence = ''.join(i for i in sample_gene.amino_acid_sequence)
        print(sample_amino_acid_sequence)
    elif output == 'mutations':
        diff = reference_gene - sample_gene
        for i in diff.mutations:
            print(i)
    else:
        raise ValueError('output can only be one of allele or mutations!')
    
    break


SAMPLE 0, LABEL R, 2 resistant mutations, 2 susceptible mutations
A20G
I90T
K96Q
T160R
