# Notebook to format sample layout by generating all expected mutants

## Import libraries

In [None]:
import pandas as pd

## Specify paths

In [None]:
# All inputs and options are read from the `snakemake` object, which is not defined in
# this notebook (presumably injected by Snakemake when the notebook is run -- confirm).
layout_path = snakemake.params.layout  # sample layout, read further down with pd.read_csv
frag_seq_path = snakemake.params.seqs  # fragment sequences, read further down as a tab-separated file
codon_table_path = snakemake.params.codon_table  # codon table, read further down with pd.read_csv
codon_mode = snakemake.params.codon_mode  # forwarded as `mode` to get_alt_codons ('NNN' or 'NNK')
output_file = snakemake.output[0]  # destination of the final expected_df.to_csv call

## Import codon table

In [None]:
# Read the codon table (first line is the header) and upper-case the codons
# in a single chained expression, then preview the first rows
codon_table = (
    pd.read_csv(codon_table_path, header=0)
    .assign(codon=lambda table: table['codon'].str.upper())
)
codon_table.head(3)

In [None]:
# Build the codon -> amino acid lookup used by the functions below
codon_dic = {
    codon: aminoacid
    for codon, aminoacid in zip(codon_table['codon'], codon_table['aminoacid'])
}

## Define and test functions

In [None]:
def get_alt_codons(seq, codon_dic, mode='NNN'):
    '''
    Based on a DNA sequence, the function returns two lists:
    1) A list containing all 0-based amino acid positions for the sequence
    2) A list containing all possible alternative codons (other than WT codon) at the matching positions
    For list 2, the mode defines which codons are acceptable: NNN by default, or NNK
    Codons are fetched in the provided codon table (dictionary)

    Parameters:
        seq: DNA sequence (string), read in consecutive 3-nucleotide windows starting at index 0
        codon_dic: dictionary whose keys are the candidate codons
        mode: 'NNN' (every codon of the table) or 'NNK' (only codons whose 3rd nucleotide is G or T)

    Returns:
        (pos_l, var_l): positions and, for each position, the list of allowed codons
        that differ from the codon found in seq at that position

    Raises:
        ValueError: if mode is neither 'NNN' nor 'NNK'
    '''

    if mode == 'NNN':
        alt = list(codon_dic)
    elif mode == 'NNK':
        alt = [x for x in codon_dic if x[2] in ('G', 'T')]
    else:
        # Fail immediately with an explicit error instead of printing a message and
        # crashing later on the undefined list of allowed codons
        raise ValueError(f'Please specify a correct mode: either NNN or NNK (got {mode!r})')

    pos_l = []
    var_l = []

    for i in range(0, len(seq), 3):
        wt_codon = seq[i:i+3]
        pos_l.append(i//3) # 0-based position (aa)
        var_l.append([x for x in alt if x != wt_codon]) # list of possible codons other than WT

    return pos_l, var_l

In [None]:
# Quick check: print positions and alternative codons for a 3-codon sequence, all codons allowed
print(get_alt_codons('TCTCCTGTT', codon_dic, 'NNN'))

In [None]:
def get_nt_seq(seq, mut_dic):
    '''
    Rebuild a nucleotide sequence after substituting some of its codons.
    seq is split into consecutive 3-nucleotide codons; mut_dic maps 0-based codon
    positions to the replacement codon. Codons at positions absent from mut_dic are kept.
    Returns the resulting sequence as a single string.
    '''
    rebuilt = []
    for codon_idx, start in enumerate(range(0, len(seq), 3)):
        wt_codon = seq[start:start+3]
        if codon_idx in mut_dic:
            rebuilt.append(mut_dic[codon_idx])
        else:
            rebuilt.append(wt_codon)
    return ''.join(rebuilt)

In [None]:
# Quick check: replace codons 0 and 2 of TCT-CCT-GTT; the expected result is 'TTCCCTTTA'
get_nt_seq('TCTCCTGTT', {0: 'TTC', 2:'TTA'})

In [None]:
def get_aa_seq(seq, codon_dic):
    '''
    Translate a nucleotide sequence into an amino acid sequence.
    seq is split into consecutive codons, each looked up in codon_dic (codon -> amino acid).

    Returns:
        The translated sequence (string), or None (after emitting a warning)
        when the length of seq is not a multiple of 3.

    Raises:
        KeyError: if a codon of seq is not a key of codon_dic
    '''
    # Imported locally so the warning branch works without relying on a module-level import
    import warnings

    if len(seq) % 3 != 0:
        warnings.warn("Warning.. the length of the provided wild-type DNA sequence is not a multiple of 3.")
        return None

    # Walk the sequence codon by codon and translate each one
    return ''.join(codon_dic[seq[i:i+3]] for i in range(0, len(seq), 3))

In [None]:
# Quick check: translate a 3-codon sequence with the imported codon table
get_aa_seq('TTCCCTTTA', codon_dic)

In [None]:
def get_Hamming_distances(wt, alt_aaseq, pos, alt_c, alt_aa):
    '''
    Based on the WT DNA sequence and corresponding amino acid sequence,
    return Hamming distances in nucleotides and amino acids of a mutation.
    Mutation is defined by an alternative codon, potentially resulting in an alternative amino acid at a given position

    Parameters:
        wt: WT DNA sequence
        alt_aaseq: amino acid sequence indexed at pos to get the WT amino acid
                   (the notebook passes the WT amino acid sequence here)
        pos: 0-based codon position of the mutation, or 'non-applicable'
        alt_c: alternative codon, or 'non-applicable'
        alt_aa: amino acid encoded by the alternative codon, or 'non-applicable'

    Returns:
        (Nham_nt, Nham_a): number of differing nucleotides within the codon, and
        0/1 depending on whether the amino acid changes. (0, 0) is returned when
        any of pos, alt_c or alt_aa is 'non-applicable' (WT sequence).
        The distance in codons is not computed here.
    '''
    # WT rows carry the 'non-applicable' placeholder instead of mutation details
    if 'non-applicable' in (pos, alt_c, alt_aa):
        return 0, 0

    pos = int(pos)

    # Retrieve WT codon and WT aa
    wtc = wt[pos*3:(pos+1)*3]
    wta = alt_aaseq[pos]

    # Hamming distance in amino acids
    Nham_a = 0 if alt_aa == wta else 1

    # Hamming distance in nucleotides
    Nham_nt = sum(1 for x, y in zip(wtc, alt_c) if x != y)

    return Nham_nt, Nham_a

In [None]:
# Quick check: codon 1 (CCT) replaced by CTA -> 2 nucleotide differences;
# the amino acid distance is 1 provided the codon table does not map CTA to 'P'
get_Hamming_distances('TTCCCTTTA', 'FPL', 1, 'CTA', codon_dic['CTA'])

In [None]:
# Quick check: codon 1 (CCT) replaced by CCC -> 1 nucleotide difference;
# the amino acid distance is 0 provided the codon table maps CCC to 'P' (synonymous change)
get_Hamming_distances('TTCCCTTTA', 'FPL', 1, 'CCC', codon_dic['CCC'])

## Import layout and sequences

In [None]:
# Load the sample layout with pandas' default CSV settings and display it
layout = pd.read_csv(layout_path)
layout

In [None]:
# Load the tab-separated fragment sequences and add the translation of each WT sequence
frag_seq = pd.read_csv(frag_seq_path, sep='\t')
frag_seq['WT_aa'] = frag_seq['WT_seq'].apply(get_aa_seq, args=(codon_dic,))
frag_seq

In [None]:
# Attach the fragment sequences to the layout (joined on 'Mutated_seq'),
# after discarding the layout columns that are not needed downstream
withSeqs = pd.merge(
    layout.drop(columns=['R1', 'R2', 'N_forward', 'N_reverse']),
    frag_seq,
    on='Mutated_seq',
)
withSeqs

In [None]:
# One WT row per layout entry: the nucleotide sequence is the WT sequence itself,
# the row is flagged as WT, and the mutation-specific columns get a placeholder
WTdf = withSeqs.assign(nt_seq=withSeqs['WT_seq'], WT=True)
for x in ('pos', 'aa_pos', 'alt_codons', 'alt_aa'):
    WTdf[x] = 'non-applicable'
WTdf

## Generate expected variants

In [None]:
# For every WT sequence, get_alt_codons returns (positions, alternative codons per position).
# zip(*...) transposes these per-row tuples into two column-wise sequences, so each row of
# 'pos' / 'alt_codons' holds a list with one entry per codon of that row's WT sequence.
withSeqs['pos'], withSeqs['alt_codons'] = zip(*withSeqs.WT_seq.apply(lambda x: get_alt_codons(x, codon_dic, codon_mode)))
withSeqs.head(2)

In [None]:
# Explode both list columns together: one row per (sequence, codon position),
# with 'alt_codons' still holding the list of alternative codons for that position
singles_compact = withSeqs.explode(['pos','alt_codons'])
singles_compact.head(2)

In [None]:
# Second explode: one row per (sequence, codon position, alternative codon),
# i.e. one row per single-codon mutant
singles_df = singles_compact.explode('alt_codons')
singles_df

In [None]:
# Describe each mutant as a {codon position: alternative codon} dictionary,
# which is the format get_nt_seq takes as its second argument
singles_df['mutations'] = singles_df.apply(lambda row: {row[f'pos']: row[f'alt_codons']}, axis=1)
singles_df

In [None]:
# Build the mutant nucleotide sequence of every row, then discard the helper
# 'mutations' column that was only needed for this step
singles_df['nt_seq'] = singles_df.apply(
    lambda mutant: get_nt_seq(mutant['WT_seq'], mutant['mutations']),
    axis=1,
)
singles_df = singles_df.drop(columns='mutations')
singles_df

In [None]:
# Translate each alternative codon with the codon table (a codon missing from the table raises KeyError)
singles_df['alt_aa'] = singles_df['alt_codons'].map(lambda codon: codon_dic[codon])
singles_df

In [None]:
# Shift the 0-based position within the fragment by the fragment's 'Pos_start' offset
singles_df['aa_pos'] = singles_df['pos'].add(singles_df['Pos_start'])
singles_df

In [None]:
# Stack the WT rows and the single mutants into one table with a fresh 0..n-1 index.
# The 'WT' column is only set on WTdf, so it is missing for every mutant row after the concat.
expected_df = pd.concat([WTdf.convert_dtypes(), singles_df], ignore_index=True)
expected_df

In [None]:
# Translate the nucleotide sequence of every expected variant (WT rows included)
expected_df['aa_seq'] = expected_df['nt_seq'].apply(get_aa_seq, args=(codon_dic,))
expected_df

In [None]:
# Codon distance: 'WT' is True on WT rows and missing on mutant rows,
# so isnull() yields 1 for every single-codon mutant and 0 for WT rows
expected_df['Nham_codons'] = expected_df.WT.isnull().astype(int)
# Nucleotide and amino acid distances are computed row by row against the WT sequence;
# zip(*...) splits the returned (Nham_nt, Nham_a) tuples into two columns
expected_df['Nham_nt'], expected_df['Nham_aa'] = zip(*expected_df.apply(lambda row: get_Hamming_distances(row.WT_seq, row.WT_aa, row.pos, row.alt_codons, row.alt_aa), axis=1))
expected_df

In [None]:
# The WT reference columns were only needed to compute the distances; leave them out of the output
expected_df = expected_df.drop(columns=['WT_seq', 'WT_aa'])

In [None]:
# Write the table of expected variants to the output path.
# NOTE(review): to_csv is called with its defaults, so the row index is written as the
# first column -- confirm that downstream readers expect it.
expected_df.to_csv(output_file)

In [None]:
# Sanity check: number of distinct nucleotide and amino acid sequences per species/fragment
expected_df.groupby(['Species','Fragment'])[['nt_seq','aa_seq']].nunique().reset_index()