# Notebook to generate expected mutants

## Import libraries

In [1]:
import pandas as pd

## Specify paths

In [2]:
# Note: config entries used to be passed as "params" (i.e. listed in the params
# statement of the rule), but a bug led to params not being imported (the
# snakemake object is imported, but not its params attribute).
# All config entries are therefore read directly from the snakemake object.

# Project-specific file containing the wild-type sequences
wtseq_path = snakemake.input.wtseqs
# Project-specific file containing the genetic code
codon_table_path = snakemake.config["codon"]["table"]
# Project-specific parameter specifying which degenerate codons were introduced
codon_mode = snakemake.config["codon"]["mode"]
# First output declared for the rule
output_file = snakemake.output[0]

## Import codon table

In [3]:
# Load the genetic code and normalise the codons to upper case
codon_table = pd.read_csv(codon_table_path, header=0).assign(
    codon=lambda table: table["codon"].str.upper()
)
codon_table.head(3)

Unnamed: 0,codon,aminoacid,freq,number
0,TTT,F,26.26,76999
1,TTC,F,17.89,52459
2,TTA,L,26.31,77131


In [4]:
# Lookup table: codon -> amino acid
codon_dic = {
    codon: aminoacid
    for codon, aminoacid in zip(codon_table["codon"], codon_table["aminoacid"])
}

## Define and test functions

In [5]:
def get_alt_codons(seq, codon_dic, mode="NNN"):
    """
    List, for every codon of a DNA sequence, the codons that could replace it.

    Parameters
    ----------
    seq : str
        Nucleotide sequence, read from its first character in consecutive
        chunks of 3. A trailing chunk shorter than 3 nucleotides is still
        reported as its own position.
    codon_dic : dict
        Codon table; only its keys (the codons) are used here.
    mode : str, optional
        "NNN" (default) accepts every codon of the table, "NNK" only keeps the
        codons whose third nucleotide is G or T.

    Returns
    -------
    tuple of (list, list)
        1) the 0-based codon (amino acid) positions of the sequence
        2) for each position, the list of accepted codons other than the
           wild-type codon found at that position

    Raises
    ------
    ValueError
        If mode is neither "NNN" nor "NNK".
    """
    if mode == "NNN":
        alt = list(codon_dic)
    elif mode == "NNK":
        alt = [x for x in codon_dic if x[2].upper() in ("G", "T")]
    else:
        # Fail right away: merely printing a message left `alt` undefined and
        # crashed further down with an unrelated UnboundLocalError
        raise ValueError(
            f"Please specify a correct mode: either NNN or NNK (got {mode!r})"
        )

    pos_l = []
    var_l = []

    for i in range(0, len(seq), 3):
        # Case-insensitive comparison, so that a lower-case wild-type codon is
        # still excluded from its own list of alternatives
        wt_codon = seq[i : i + 3].upper()
        pos_l.append(i // 3)  # 0-based position (aa)
        var_l.append([x for x in alt if x.upper() != wt_codon])

    return pos_l, var_l

In [6]:
# Quick check on a three-codon sequence in NNN mode
print(get_alt_codons("TCTCCTGTT", codon_dic, mode="NNN"))

([0, 1, 2], [['TTT', 'TTC', 'TTA', 'TTG', 'CTT', 'CTC', 'CTA', 'CTG', 'ATT', 'ATC', 'ATA', 'ATG', 'GTT', 'GTC', 'GTA', 'GTG', 'TCC', 'TCA', 'TCG', 'CCT', 'CCC', 'CCA', 'CCG', 'ACT', 'ACC', 'ACA', 'ACG', 'GCT', 'GCC', 'GCA', 'GCG', 'TAT', 'TAC', 'TAA', 'TAG', 'CAT', 'CAC', 'CAA', 'CAG', 'AAT', 'AAC', 'AAA', 'AAG', 'GAT', 'GAC', 'GAA', 'GAG', 'TGT', 'TGC', 'TGA', 'TGG', 'CGT', 'CGC', 'CGA', 'CGG', 'AGT', 'AGC', 'AGA', 'AGG', 'GGT', 'GGC', 'GGA', 'GGG'], ['TTT', 'TTC', 'TTA', 'TTG', 'CTT', 'CTC', 'CTA', 'CTG', 'ATT', 'ATC', 'ATA', 'ATG', 'GTT', 'GTC', 'GTA', 'GTG', 'TCT', 'TCC', 'TCA', 'TCG', 'CCC', 'CCA', 'CCG', 'ACT', 'ACC', 'ACA', 'ACG', 'GCT', 'GCC', 'GCA', 'GCG', 'TAT', 'TAC', 'TAA', 'TAG', 'CAT', 'CAC', 'CAA', 'CAG', 'AAT', 'AAC', 'AAA', 'AAG', 'GAT', 'GAC', 'GAA', 'GAG', 'TGT', 'TGC', 'TGA', 'TGG', 'CGT', 'CGC', 'CGA', 'CGG', 'AGT', 'AGC', 'AGA', 'AGG', 'GGT', 'GGC', 'GGA', 'GGG'], ['TTT', 'TTC', 'TTA', 'TTG', 'CTT', 'CTC', 'CTA', 'CTG', 'ATT', 'ATC', 'ATA', 'ATG', 'GTC', 'GTA', 'G

In [7]:
def get_nt_seq(seq, mut_dic):
    """
    Rebuild a nucleotide sequence after swapping some of its codons.

    seq is cut in consecutive chunks of 3 nucleotides (codons); mut_dic maps
    0-based codon positions to the codon to place there. Positions missing from
    mut_dic keep the original codon, and keys of mut_dic that match no position
    of seq are ignored. Returns the resulting sequence as a single string.
    """
    mutated_codons = []
    for start in range(0, len(seq), 3):
        wt_codon = seq[start : start + 3]
        # Use the replacement codon when one is given for this position
        mutated_codons.append(mut_dic.get(start // 3, wt_codon))
    return "".join(mutated_codons)

In [8]:
# Quick check: replace the first and third codons, keep the second one
get_nt_seq("TCTCCTGTT", mut_dic={0: "TTC", 2: "TTA"})

'TTCCCTTTA'

## Import sequences

In [9]:
# Load the wild-type sequences and add an upper-cased copy as `nt_seq`
wt_seq = pd.read_csv(wtseq_path).assign(
    nt_seq=lambda table: table["WT_seq"].str.upper()
)
wt_seq

Unnamed: 0,Mutated_seq,WT_seq,nt_seq
0,CN_F1,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...


## Get mutated sequences

### Get all allowed codons

In [10]:
# Work from the upper-cased sequence (`nt_seq`) so that wild-type codons are
# compared in the same case as the (upper-cased) codons of the table
wt_seq["pos"], wt_seq["alt_codons"] = zip(
    *wt_seq["nt_seq"].apply(lambda x: get_alt_codons(x, codon_dic, codon_mode))
)
wt_seq

Unnamed: 0,Mutated_seq,WT_seq,nt_seq,pos,alt_codons
0,CN_F1,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[[TTT, TTC, TTA, TTG, CTT, CTC, CTA, CTG, ATT,..."


### Reshape

In [11]:
# One row per codon position: `pos` and `alt_codons` are unpacked together
singles_compact = wt_seq.explode(column=["pos", "alt_codons"])
singles_compact.head(n=2)

Unnamed: 0,Mutated_seq,WT_seq,nt_seq,pos,alt_codons
0,CN_F1,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,0,"[TTT, TTC, TTA, TTG, CTT, CTC, CTA, CTG, ATT, ..."
0,CN_F1,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,1,"[TTT, TTC, TTA, TTG, CTT, CTC, CTA, CTG, ATT, ..."


In [12]:
# One row per (position, alternative codon) pair
singles_df = singles_compact.explode(column="alt_codons")
singles_df

Unnamed: 0,Mutated_seq,WT_seq,nt_seq,pos,alt_codons
0,CN_F1,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,0,TTT
0,CN_F1,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,0,TTC
0,CN_F1,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,0,TTA
0,CN_F1,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,0,TTG
0,CN_F1,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,0,CTT
...,...,...,...,...,...
0,CN_F1,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,69,AGG
0,CN_F1,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,69,GGT
0,CN_F1,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,69,GGC
0,CN_F1,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,69,GGA


In [13]:
def _single_mutation(row):
    """Map the row's 0-based codon position to its replacement codon."""
    return {row["pos"]: row["alt_codons"]}


# One {position: codon} dictionary per row
singles_df["mutations"] = singles_df.apply(_single_mutation, axis="columns")
singles_df

Unnamed: 0,Mutated_seq,WT_seq,nt_seq,pos,alt_codons,mutations
0,CN_F1,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,0,TTT,{0: 'TTT'}
0,CN_F1,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,0,TTC,{0: 'TTC'}
0,CN_F1,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,0,TTA,{0: 'TTA'}
0,CN_F1,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,0,TTG,{0: 'TTG'}
0,CN_F1,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,0,CTT,{0: 'CTT'}
...,...,...,...,...,...,...
0,CN_F1,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,69,AGG,{69: 'AGG'}
0,CN_F1,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,69,GGT,{69: 'GGT'}
0,CN_F1,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,69,GGC,{69: 'GGC'}
0,CN_F1,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,69,GGA,{69: 'GGA'}


### Construct sequences

In [14]:
# Build each single-codon mutant from the upper-cased wild-type sequence, so
# that the result never mixes a lower-case template with upper-case codons
singles_df["nt_seq"] = singles_df.apply(
    lambda row: get_nt_seq(row.WT_seq.upper(), row.mutations), axis=1
)
# The helper column is no longer needed once the sequences are built
singles_df = singles_df.drop(columns="mutations")
singles_df

Unnamed: 0,Mutated_seq,WT_seq,nt_seq,pos,alt_codons
0,CN_F1,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,TTTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,0,TTT
0,CN_F1,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,TTCCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,0,TTC
0,CN_F1,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,TTACCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,0,TTA
0,CN_F1,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,TTGCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,0,TTG
0,CN_F1,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,CTTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,0,CTT
...,...,...,...,...,...
0,CN_F1,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,69,AGG
0,CN_F1,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,69,GGT
0,CN_F1,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,69,GGC
0,CN_F1,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,69,GGA


## Export

In [None]:
# Note: positions and mutated codons are left out on purpose, so that more
# types of mutants can be annotated in downstream steps
singles_df.drop(columns=["pos", "alt_codons"]).to_csv(output_file)