## NNK library bias calculation

### Problem definition
Given an NNK codon, what is the probability that it encodes each of the amino acids?

In [None]:
from Bio.Seq import Seq
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib

In [None]:
# Render every figure in this notebook at 125 DPI.
matplotlib.rcParams['figure.dpi'] = 125

In [None]:
# Allowed bases per degenerate position (IUPAC codes):
# N = any base, K = G or T, S = G or C.
n_nucleotide = ['A','T','G','C']
k_nucleotide = ['G', 'T']
s_nucleotide = ['G', 'C']


def _enumerate_codons(third_bases):
    """Return every codon whose first two bases are N and whose third is in *third_bases*.

    The first position varies slowest and the third fastest, so the result
    has ``len(n_nucleotide) ** 2 * len(third_bases)`` entries.
    """
    return [
        first + second + third
        for first in n_nucleotide
        for second in n_nucleotide
        for third in third_bases
    ]


# Both libraries share the same N-N prefix and differ only in the third base.
nnk_list = _enumerate_codons(k_nucleotide)
nns_list = _enumerate_codons(s_nucleotide)

# One row per codon; later cells add further columns to these frames.
df_nnk = pd.DataFrame({'codon': nnk_list})
df_nns = pd.DataFrame({'codon': nns_list})

In [None]:
def _codon_to_aa(codon):
    """Translate a single DNA codon string into its one-letter amino-acid string.

    Uses Biopython's ``Seq.translate`` with its default arguments.
    """
    return str(Seq(codon).translate())


# Add an 'aa' column to both codon tables holding the translated residue.
for codon_table in (df_nnk, df_nns):
    codon_table['aa'] = codon_table['codon'].apply(_codon_to_aa)

In [None]:
def _aa_frequency_table(codon_table):
    """Return the fraction of codons in *codon_table* that map to each 'aa' value.

    The result has columns 'aa' and 'freq' and is sorted by ascending 'freq'.
    """
    # Number of codons per amino acid (the count lands in the 'codon' column).
    codons_per_aa = codon_table.groupby(['aa']).agg('count')
    # Normalise by the total number of codons in the library.
    fraction_per_aa = codons_per_aa / codon_table.shape[0]
    table = fraction_per_aa.reset_index().rename(columns = {'codon': 'freq'})
    return table.sort_values(by = 'freq')


nnk_codon_freq = _aa_frequency_table(df_nnk)
nns_codon_freq = _aa_frequency_table(df_nns)

In [None]:
# One bar-chart panel per library, stacked vertically: NNS on top, NNK below.
fig, axs = plt.subplots(nrows = 2, figsize = (4,6))

for ax,df,title in zip(axs, [nns_codon_freq, nnk_codon_freq], ['NNS', 'NNK']):

    # NOTE(review): seaborn >= 0.13 warns when `palette` is passed without
    # `hue`; confirm the installed version before changing this call.
    sns.barplot(data = df, x = 'aa', y = 'freq', ax = ax,
               palette= 'Blues_d')

    # Same fixed y-range on both panels so the bars are directly comparable.
    ax.set_ylim(0,0.1)
    ax.set_ylabel('Frequency')
    ax.set_xlabel('')
    ax.set_title(title)

# tight_layout only adjusts for the artists present when it is called, so it
# must run after the titles and axis labels have been added.
fig.tight_layout(pad = 2)

plt.show()

In [None]:
# Display the NNK amino-acid frequency table (sorted by ascending 'freq').
nnk_codon_freq