# Notebook to format sample layout with annotated expected mutants

This notebook takes as input a CSV file listing all expected mutants for each mutated locus. It annotates these mutants and merges the result onto the sample layout.

## Import libraries

In [None]:
import pandas as pd

## Specify paths

In [None]:
# Config entries used to be passed through the rule's "params" statement, but a bug
# left the params attribute unavailable even though the snakemake object itself was
# imported. Every entry is therefore read directly from the snakemake object.

# Project-specific file containing the sample layout
layout_path = snakemake.input.layout

# Project-specific file containing the expected mutated sequences
exp_mut_path = snakemake.input.exp_mut

# Project-specific file containing the genetic code
codon_table_path = snakemake.config["codon"]["table"]

# First output of the rule
output_file = snakemake.output[0]

## Import codon table

In [None]:
# Read the codon table (first line is the header) and upper-case the codons
# in a single chained expression.
codon_table = pd.read_csv(codon_table_path, header=0).assign(
    codon=lambda table: table["codon"].str.upper()
)
codon_table.head(3)

In [None]:
# Build the codon -> amino acid lookup from the two table columns
codon_dic = {
    codon: amino_acid
    for codon, amino_acid in zip(codon_table["codon"], codon_table["aminoacid"])
}

## Define and test functions

In [None]:
def get_aa_seq(seq, codon_dic):
    """
    Translate a nucleotide sequence into its amino acid sequence.

    The sequence is split into consecutive codons (triplets starting at the
    first nucleotide) and each codon is looked up in the provided dictionary.

    Parameters
    ----------
    seq : str
        Nucleotide sequence. Only the upper-case characters A, C, G and T are
        accepted and the length must be a multiple of 3.
    codon_dic : dict
        Mapping from codon to amino acid.

    Returns
    -------
    str
        The looked-up amino acids joined in codon order (empty string for an
        empty sequence).

    Raises
    ------
    ValueError
        If the length of seq is not a multiple of 3 or if seq contains a
        character other than A, C, G or T.
    KeyError
        If a codon of seq is missing from codon_dic.
    """
    if len(seq) % 3 != 0:
        raise ValueError(
            f"Error.. the length of the provided DNA sequence ({len(seq)}) is not a multiple of 3."
        )

    # Report every offending character at once rather than only the first one met
    unrecognized = sorted(set(seq) - {"A", "C", "G", "T"})
    if unrecognized:
        raise ValueError(
            "Error.. one of the provided nucleotide sequences contains an unrecognized character: "
            + ", ".join(repr(x) for x in unrecognized)
        )

    # Convert nucleotide sequence to codons, then translate each codon
    codons = (seq[i : i + 3] for i in range(0, len(seq), 3))

    return "".join(codon_dic[codon] for codon in codons)

In [None]:
# Quick check of the translation on a short three-codon sequence
get_aa_seq(seq="TTCCCTTAA", codon_dic=codon_dic)

In [None]:
def get_mutations(seq, wt, codon_dict):
    """
    Compare a mutated DNA sequence to the wild-type sequence, codon by codon.

    Mutations are reported as parallel lists with matching indexes (mutated
    codon rank / position / alternative codon / alternative amino acid) so the
    result can be quickly converted to 1 row per mutated codon.
    Alternative and wild-type codons are translated with the provided codon
    table dictionary, and the Hamming distances in codons, nucleotides and
    amino acids are calculated along the way.

    Parameters
    ----------
    seq : str
        Nucleotide sequence of the variant.
    wt : str
        Wild-type nucleotide sequence, of the same length as seq.
    codon_dict : dict
        Mapping from codon to amino acid. Codons missing from the mapping are
        translated to None.

    Returns
    -------
    tuple
        (is_wt, Nham_codons, Nham_nt, Nham_aa, mut_codons, list_alt_pos,
        list_alt_cod, list_alt_aa) where:
        - is_wt is True when seq equals wt,
        - Nham_codons / Nham_nt / Nham_aa are the numbers of differing codons,
          nucleotides (within differing codons) and amino acids,
        - mut_codons numbers the mutated codons from 1 to Nham_codons,
        - list_alt_pos holds the 0-based codon indexes of the mutated codons,
        - list_alt_cod / list_alt_aa hold the alternative codons and their
          amino acids.
        When no codon differs, mut_codons is [0] and the three other lists are
        ["not-applicable"].

    Raises
    ------
    ValueError
        If seq and wt do not have the same length.
    """
    if len(seq) != len(wt):
        raise ValueError(
            "Error.. Cannot annotate expected mutants because at least one sequence is of different length than wild-type."
        )

    is_wt = seq == wt

    list_alt_pos, list_alt_cod, list_alt_aa = [], [], []
    Nham_nt = 0
    Nham_aa = 0

    # Converting WT and variant nucleotide sequences to lists of codons
    wt_codons = [wt[i : i + 3] for i in range(0, len(wt), 3)]
    seq_codons = [seq[i : i + 3] for i in range(0, len(seq), 3)]

    for i, (wtc, c) in enumerate(zip(wt_codons, seq_codons)):  # Loop through codons
        if c == wtc:
            continue

        # Translate with the dictionary passed as argument (not a notebook global),
        # and only for codons that actually differ
        alt_aa = codon_dict.get(c)
        wt_aa = codon_dict.get(wtc)

        list_alt_pos.append(i)
        list_alt_cod.append(c)
        list_alt_aa.append(alt_aa)
        Nham_nt += sum(x != y for x, y in zip(wtc, c))
        if alt_aa != wt_aa:
            Nham_aa += 1

    Nham_codons = len(list_alt_pos)

    if Nham_codons > 0:
        mut_codons = list(range(1, Nham_codons + 1))
    else:
        # Placeholders keep the per-mutation lists non-empty for unmutated sequences
        mut_codons = [0]
        list_alt_pos = ["not-applicable"]
        list_alt_cod = ["not-applicable"]
        list_alt_aa = ["not-applicable"]

    return (
        is_wt,
        Nham_codons,
        Nham_nt,
        Nham_aa,
        mut_codons,
        list_alt_pos,
        list_alt_cod,
        list_alt_aa,
    )

In [None]:
# Quick check: the variant differs from the wild-type at the second codon only
get_mutations(seq="TTCCCTTTA", wt="TTCCTATTA", codon_dict=codon_dic)

## Import expected mutants

In [None]:
exp_mut = pd.read_csv(exp_mut_path)

# Upper-case whichever of the sequence-like columns are present in the file
for seq_col in ("WT_seq", "nt_seq", "barcode"):
    if seq_col in exp_mut.columns:
        exp_mut[seq_col] = exp_mut[seq_col].str.upper()

exp_mut

## Translate nucleotide sequences into amino acid sequences

In [None]:
# Translate every nucleotide sequence; the codon dictionary is forwarded as extra argument
exp_mut["aa_seq"] = exp_mut["nt_seq"].apply(get_aa_seq, args=(codon_dic,))

## Get mutations by comparing sequences to WT

In [None]:
# Column names, in the same order as the tuple returned by get_mutations:
# first the per-sequence values, then the per-mutation lists
per_seq_cols = ["WT", "Nham_codons", "Nham_nt", "Nham_aa"]
per_mut_cols = ["mutated_codon", "pos", "alt_codons", "alt_aa"]
new_cols = [*per_seq_cols, *per_mut_cols]

# One result tuple per expected mutant
collected_mutations = [
    get_mutations(variant_seq, wt_seq, codon_dic)
    for variant_seq, wt_seq in zip(exp_mut["nt_seq"], exp_mut["WT_seq"])
]

# Transpose the per-row tuples into per-column tuples and label them
mutations_dict = {
    col_name: col_values
    for col_name, col_values in zip(new_cols, zip(*collected_mutations))
}
exp_mut = exp_mut.assign(**mutations_dict)
exp_mut

In [None]:
# One row per mutated codon, with a fresh 0..n-1 index
exp_mut_long = exp_mut.explode(per_mut_cols, ignore_index=True)
exp_mut_long

## Import layout

In [None]:
# Load the sample layout
layout = pd.read_csv(filepath_or_buffer=layout_path)
layout

## Merge onto layout

In [None]:
# Layout without the read-file and N_forward / N_reverse columns
layout_trimmed = layout.drop(columns=["R1", "R2", "N_forward", "N_reverse"])

# Annotated mutants without the wild-type sequence column
mutants_without_wt = exp_mut_long.drop(columns="WT_seq")

# Attach the annotated mutants to the layout rows sharing the same Mutated_seq
withSeqs = layout_trimmed.merge(right=mutants_without_wt, on="Mutated_seq")
withSeqs

## Add position offset

In [None]:
def _offset_codon_position(row):
    """Add the row's Pos_start to its codon position, keeping the placeholder untouched."""
    if row["pos"] == "not-applicable":
        return "not-applicable"
    return row["pos"] + row["Pos_start"]


withSeqs["aa_pos"] = withSeqs.apply(_offset_codon_position, axis=1)
withSeqs

In [None]:
# Write the annotated layout (row index included) to the rule's output file
withSeqs.to_csv(path_or_buf=output_file)