# Notebook to generate expected mutants

## Import libraries

In [1]:
import pandas as pd
import itertools
import numpy as np

## Specify paths

In [None]:
# Every setting below is read straight from the `snakemake` object.
# Config entries used to be passed through the `params` statement of the rule, but a bug
# led to params not being imported (the snakemake object was available, its params
# attribute was not), so config entries are now fetched directly.

# Project-specific file containing the wild-type sequences
wtseq_path = snakemake.input.wtseqs
# Project-specific file containing the genetic code
codon_table_path = snakemake.config["codon"]["table"]
# Project-specific parameter to specify which degenerate codons were introduced
codon_mode = snakemake.config["codon"]["mode"]
# File to which the table of expected mutants is written at the end of the notebook
output_file = snakemake.output[0]

## Import codon table and codon mode

In [None]:
# Codon modes supported by this notebook: one degenerate codon ("NNN", "NNK")
# or a pair of them written as "<codon> x <codon>"
allowed_codon_modes = ["NNN", "NNK", "NNN x NNN", "NNK x NNK"]
if codon_mode not in allowed_codon_modes:
    # Report the offending value so that the config can be fixed without guessing
    raise ValueError(
        "Error.. check the codon mode specified in the config: "
        f"got {codon_mode!r}, expected one of {allowed_codon_modes}."
    )

# Degenerate codon introduced at each mutated position.
# For single modes the split leaves the value unchanged ("NNK" -> "NNK"),
# for pairwise modes it keeps the first member ("NNK x NNK" -> "NNK").
single_codon_mode = codon_mode.split(" x ")[0]

In [3]:
# Load the genetic code (first line of the file is the header) and
# normalise the codons to upper case
codon_table = pd.read_csv(codon_table_path, header=0)
codon_table = codon_table.assign(codon=codon_table["codon"].str.upper())
codon_table.head(3)

Unnamed: 0,codon,aminoacid,freq,number
0,TTT,F,26.26,76999
1,TTC,F,17.89,52459
2,TTA,L,26.31,77131


In [4]:
# Convert to dictionary: codon -> amino acid it encodes
codon_dic = {
    codon: aminoacid
    for codon, aminoacid in zip(codon_table["codon"], codon_table["aminoacid"])
}

## Define and test functions

In [5]:
def get_alt_codons(seq, codon_dic, mode="NNN"):
    """
    List, for every codon of a DNA sequence, the codons it can be mutated to.

    Parameters
    ----------
    seq : str
        DNA sequence, split into codons starting from its first nucleotide.
        Case is ignored when comparing its codons with the codon table.
    codon_dic : dict
        Codon table; only its keys (the codons) are used here.
    mode : str
        "NNN" (default) accepts every codon of the table,
        "NNK" only those whose third nucleotide is G or T.

    Returns
    -------
    tuple (list, list)
        1) All 0-based amino acid positions for the sequence
        2) For each of these positions, the acceptable codons other than the
           wild-type codon, in the order of the codon table

    Raises
    ------
    ValueError
        If mode is neither "NNN" nor "NNK".

    Notes
    -----
    If the sequence length is not a multiple of 3, the trailing incomplete
    codon still gets a position; nothing is excluded for it unless the table
    contains that exact shorter key.
    """

    if mode == "NNN":
        alt = list(codon_dic)
    elif mode == "NNK":
        alt = [codon for codon in codon_dic if codon[2].upper() in ("G", "T")]
    else:
        # Fail right away: continuing without a codon list would only crash further down
        raise ValueError(
            f"Please specify a correct mode: either NNN or NNK (got {mode!r})"
        )

    pos_l = []
    var_l = []

    for start in range(0, len(seq), 3):
        # Compare in upper case so that a lower-case sequence still excludes its own codon
        wt_codon = seq[start : start + 3].upper()
        pos_l.append(start // 3)  # 0-based position (aa)
        var_l.append(
            [codon for codon in alt if codon.upper() != wt_codon]
        )  # list of possible codons other than WT

    return pos_l, var_l

In [6]:
# Quick check on a three-codon sequence: expect positions [0, 1, 2] and, for each position,
# every codon of the table except the wild-type codon at that position
print(get_alt_codons("TCTCCTGTT", codon_dic, "NNN"))

([0, 1, 2], [['TTT', 'TTC', 'TTA', 'TTG', 'CTT', 'CTC', 'CTA', 'CTG', 'ATT', 'ATC', 'ATA', 'ATG', 'GTT', 'GTC', 'GTA', 'GTG', 'TCC', 'TCA', 'TCG', 'CCT', 'CCC', 'CCA', 'CCG', 'ACT', 'ACC', 'ACA', 'ACG', 'GCT', 'GCC', 'GCA', 'GCG', 'TAT', 'TAC', 'TAA', 'TAG', 'CAT', 'CAC', 'CAA', 'CAG', 'AAT', 'AAC', 'AAA', 'AAG', 'GAT', 'GAC', 'GAA', 'GAG', 'TGT', 'TGC', 'TGA', 'TGG', 'CGT', 'CGC', 'CGA', 'CGG', 'AGT', 'AGC', 'AGA', 'AGG', 'GGT', 'GGC', 'GGA', 'GGG'], ['TTT', 'TTC', 'TTA', 'TTG', 'CTT', 'CTC', 'CTA', 'CTG', 'ATT', 'ATC', 'ATA', 'ATG', 'GTT', 'GTC', 'GTA', 'GTG', 'TCT', 'TCC', 'TCA', 'TCG', 'CCC', 'CCA', 'CCG', 'ACT', 'ACC', 'ACA', 'ACG', 'GCT', 'GCC', 'GCA', 'GCG', 'TAT', 'TAC', 'TAA', 'TAG', 'CAT', 'CAC', 'CAA', 'CAG', 'AAT', 'AAC', 'AAA', 'AAG', 'GAT', 'GAC', 'GAA', 'GAG', 'TGT', 'TGC', 'TGA', 'TGG', 'CGT', 'CGC', 'CGA', 'CGG', 'AGT', 'AGC', 'AGA', 'AGG', 'GGT', 'GGC', 'GGA', 'GGG'], ['TTT', 'TTC', 'TTA', 'TTG', 'CTT', 'CTC', 'CTA', 'CTG', 'ATT', 'ATC', 'ATA', 'ATG', 'GTC', 'GTA', 'G

In [7]:
def get_nt_seq(seq, mut_dic):
    """
    Build the nucleotide sequence of a mutant from the wild-type sequence.

    seq is split into codons starting from its first nucleotide.
    mut_dic maps 0-based codon positions to the codon to place at that position.
    Codons whose position is not a key of mut_dic are kept as found in seq,
    and keys that match no codon position of seq have no effect.
    Returns the resulting sequence as a single string.
    """
    mutant_codons = []
    for codon_index, start in enumerate(range(0, len(seq), 3)):
        wt_codon = seq[start : start + 3]
        # Use the mutated codon when one is given for this position, else keep the wild type
        mutant_codons.append(mut_dic.get(codon_index, wt_codon))
    return "".join(mutant_codons)

In [8]:
# Quick check: codons 0 and 2 are replaced, codon 1 keeps its wild-type value
get_nt_seq("TCTCCTGTT", {0: "TTC", 2: "TTA"})

'TTCCCTTTA'

## Import sequences

In [9]:
# Load the wild-type sequences and store an upper-case copy of each one as `nt_seq`
wt_df = pd.read_csv(wtseq_path)
wt_df = wt_df.assign(nt_seq=wt_df["WT_seq"].str.upper())
wt_df

Unnamed: 0,Mutated_seq,WT_seq,nt_seq
0,CN_F1,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...


## Get single mutants

### Get all allowed codons

In [None]:
# Work on a copy so that wt_df itself is left untouched
wt_seq = wt_df.copy()

# One (positions, alternative codons) pair per wild-type sequence.
# The upper-cased `nt_seq` column is used rather than the raw `WT_seq`, so that the wild-type
# codon is recognised (and excluded) whatever the case used in the input file.
alt_codons_per_seq = [
    get_alt_codons(seq, codon_dic, single_codon_mode) for seq in wt_seq["nt_seq"]
]

# Filled column by column instead of unpacking zip(*...), which fails when the table is empty
wt_seq["pos"] = [positions for positions, _ in alt_codons_per_seq]
wt_seq["alt_codons"] = [codons for _, codons in alt_codons_per_seq]
wt_seq

Unnamed: 0,Mutated_seq,WT_seq,nt_seq,pos,alt_codons
0,CN_F1,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[[TTT, TTG, CTT, CTG, ATT, ATG, GTT, GTG, TCG,..."


### Reshape

In [11]:
# One row per (sequence, codon position): both list columns are exploded together,
# so each position stays paired with its own list of alternative codons
singles_compact = wt_seq.explode(column=["pos", "alt_codons"])
singles_compact.head(2)

Unnamed: 0,Mutated_seq,WT_seq,nt_seq,pos,alt_codons
0,CN_F1,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,0,"[TTT, TTG, CTT, CTG, ATT, ATG, GTT, GTG, TCG, ..."
0,CN_F1,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,1,"[TTT, TTG, CTT, CTG, ATT, ATG, GTT, GTG, TCT, ..."


In [12]:
# One row per (sequence, codon position, alternative codon)
singles_df = singles_compact.explode(column="alt_codons")
singles_df

Unnamed: 0,Mutated_seq,WT_seq,nt_seq,pos,alt_codons
0,CN_F1,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,0,TTT
0,CN_F1,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,0,TTG
0,CN_F1,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,0,CTT
0,CN_F1,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,0,CTG
0,CN_F1,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,0,ATT
...,...,...,...,...,...
0,CN_F1,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,69,CGG
0,CN_F1,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,69,AGT
0,CN_F1,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,69,AGG
0,CN_F1,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,69,GGT


In [13]:
# Describe each single mutant as a dictionary {codon position: alternative codon}
singles_df["mutations"] = [
    {pos: alt_codon}
    for pos, alt_codon in zip(singles_df["pos"], singles_df["alt_codons"])
]
singles_df

Unnamed: 0,Mutated_seq,WT_seq,nt_seq,pos,alt_codons,mutations
0,CN_F1,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,0,TTT,{0: 'TTT'}
0,CN_F1,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,0,TTG,{0: 'TTG'}
0,CN_F1,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,0,CTT,{0: 'CTT'}
0,CN_F1,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,0,CTG,{0: 'CTG'}
0,CN_F1,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,0,ATT,{0: 'ATT'}
...,...,...,...,...,...,...
0,CN_F1,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,69,CGG,{69: 'CGG'}
0,CN_F1,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,69,AGT,{69: 'AGT'}
0,CN_F1,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,69,AGG,{69: 'AGG'}
0,CN_F1,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,69,GGT,{69: 'GGT'}


## Get double mutants (if specified in the config)

In [None]:
# Double mutants are only generated for the pairwise codon modes
if codon_mode in ['NNN x NNN', 'NNK x NNK']:
    pairwise_df = wt_df.copy()

    # Get pairwise combinations of codons that will be mutated:
    # every pair (i, j) of 0-based codon positions with i < j, each pair listed twice in a row
    # so that, once exploded, there is one row per member of the pair
    pairwise_df['combination'] = pairwise_df.WT_seq.apply(lambda x: [x for x in list(itertools.combinations(range(0,len(x)//3), 2)) for _ in range(2)])
    doubles_compact_df = pairwise_df.explode('combination')
    # Label the rows 1, 2, 1, 2, ... to tell which member of the pair each row stands for
    # (relies on the two copies of a pair being consecutive, as built above)
    doubles_compact_df['mutated_codon'] = np.tile([1, 2], len(doubles_compact_df) // 2 + 1)[:len(doubles_compact_df)]
    # Codon position handled by this row: first or second element of the pair
    doubles_compact_df['pos'] = doubles_compact_df.apply(lambda row: row.combination[row.mutated_codon - 1], axis=1)
    
    # Refer to a previously built dataframe to get alternative codons for each mutated codon (out of 2) at each position
    # Note: .item() requires exactly one matching row in singles_compact and raises otherwise
    doubles_compact_df['alt_codons'] = doubles_compact_df.apply(lambda row: singles_compact.loc[(singles_compact.Mutated_seq == row.Mutated_seq) & (singles_compact.pos == row.pos), 'alt_codons'].item(), axis=1)

    # Dataframe is pivoted only to be able to use pd.explode(), then later on melted to go back to long format
    # After pivoting there is one row per (sequence, pair of positions), with the position and the
    # alternative codons of both members side by side; only the columns named here are carried over
    doubles_piv = doubles_compact_df.pivot_table(index = ['Mutated_seq', 'WT_seq', 'combination'], columns = 'mutated_codon', values = ['pos', 'alt_codons'], aggfunc = 'first').reset_index()
    # Flatten the two-level column labels: all but the last four columns keep their first-level name,
    # the last four are named by joining both levels (used below as alt_codons1, alt_codons2, pos1, pos2)
    doubles_piv.columns = [x[0] for x in doubles_piv.columns[:-4]] + [f"{x[0]}{x[1]}" for x in doubles_piv.columns[-4:]]
    
    # Reshape: exploding the alternative codons of the first, then of the second position
    # yields one row per combination of codons for each pair of positions
    doubles_exp1 = doubles_piv.explode('alt_codons1')
    doubles_exp2 = doubles_exp1.explode('alt_codons2')
    doubles_df = doubles_exp2.reset_index(drop=True)
    
    # Create dictionary of mutations, here we go with numpy to be as efficient as possible, even with very large datasets
    # Each row gives {pos1: alt_codons1, pos2: alt_codons2}
    keys = np.stack([doubles_df['pos1'].values, doubles_df['pos2'].values], axis=1)
    vals = np.stack([doubles_df['alt_codons1'].values, doubles_df['alt_codons2'].values], axis=1)
    doubles_df['mutations'] = [dict(zip(k, v)) for k, v in zip(keys, vals)]

## Concatenate single mutants and double mutants

In [15]:
# Double mutants only exist when a pairwise codon mode was requested: in the single-codon
# modes `doubles_df` is never created, so it must not be referenced here.
mutant_frames = [singles_df]
if codon_mode in ["NNN x NNN", "NNK x NNK"]:
    mutant_frames.append(doubles_df)
mutants_df = pd.concat(mutant_frames, ignore_index=True)

## Construct sequences

In [18]:
# Again, we need an efficient way to apply our function to get the nucleotide sequences
# (a plain comprehension over the two columns avoids a row-wise DataFrame.apply).
# Mutants are built on the upper-cased wild-type sequence so that their nt_seq uses the same
# case as the wild-type nt_seq and as the inserted codons, whatever the case of the input file.
mutants_df["nt_seq"] = [
    get_nt_seq(wt_nt_seq, mutations)
    for wt_nt_seq, mutations in zip(
        mutants_df["WT_seq"].str.upper(), mutants_df["mutations"]
    )
]
mutants_df

Unnamed: 0,Mutated_seq,WT_seq,nt_seq,pos,alt_codons,mutations,combination,alt_codons1,alt_codons2,pos1,pos2
0,CN_F1,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,TTTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,0,TTT,{0: 'TTT'},,,,,
1,CN_F1,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,TTGCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,0,TTG,{0: 'TTG'},,,,,
2,CN_F1,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,CTTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,0,CTT,{0: 'CTT'},,,,,
3,CN_F1,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,CTGCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,0,CTG,{0: 'CTG'},,,,,
4,CN_F1,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,ATTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,0,ATT,{0: 'ATT'},,,,,
...,...,...,...,...,...,...,...,...,...,...,...
2387615,CN_F1,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,,,"{68: 'GGG', 69: 'CGG'}","(68, 69)",GGG,CGG,68.0,69.0
2387616,CN_F1,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,,,"{68: 'GGG', 69: 'AGT'}","(68, 69)",GGG,AGT,68.0,69.0
2387617,CN_F1,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,,,"{68: 'GGG', 69: 'AGG'}","(68, 69)",GGG,AGG,68.0,69.0
2387618,CN_F1,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,,,"{68: 'GGG', 69: 'GGT'}","(68, 69)",GGG,GGT,68.0,69.0


## Add wild-type nucleotide sequence

In [None]:
# Note: information on mutated codons is dropped so that we can annotate more types of mutants in downstream steps
# The wild-type rows come first, followed by all mutants, under a fresh 0..n-1 index
expmut_df = pd.concat(
    [wt_df, mutants_df.drop(columns=["mutations", "pos", "alt_codons"])],
    ignore_index=True,
)

## Export

In [None]:
# Write the wild-type and expected mutant sequences to the workflow output file
# (default to_csv settings: comma-separated, index written as the first column)
expmut_df.to_csv(path_or_buf=output_file)