GOAL: build the final codon table from the bespoke codon table

# keys:
- sele = selected missense variants
- syn = includes synonymous codon variant
- no_stop = removes missense stop codons
- no_stop_syn = no stop codons AND includes synonymous codon

# key columns:
- codons list
- aa string
- iupac_codon (for final dictionary and variant codons)

In [1]:
import pandas as pd
import numpy as np
from Bio.Seq import Seq

In [2]:
# Load the bespoke codon table and normalise its missing values.
df = pd.read_csv('bespoke_codon_table.csv')
# The free-text note columns are dropped before any derived column is built.
df.drop(columns=['sele_notes', 'syn_notes'], inplace=True)
# A missing syn_bool is treated as False. The result is assigned back to the
# frame: calling an inplace method on the `df.syn_bool` accessor is chained
# assignment, which pandas warns about and which leaves `df` unchanged when
# Copy-on-Write is active.
df['syn_bool'] = df['syn_bool'].fillna(False)
# Every remaining gap becomes an empty string so the codon columns can be
# handled as plain text below.
df.fillna('', inplace=True)

In [3]:
# general functions
# Maps a sorted string of nucleotides to its IUPAC ambiguity symbol.
iupac_dict = {
    'A': 'A',
    'C': 'C',
    'G': 'G',
    'T': 'T',
    'AC': 'M',
    'AG': 'R',
    'AT': 'W',
    'CG': 'S',
    'CT': 'Y',
    'GT': 'K',
    'ACG': 'V',
    'ACT': 'H',
    'AGT': 'D',
    'CGT': 'B',
    'ACGT': 'N',
}
# Reverse lookup: IUPAC symbol -> list of the single nucleotides it stands for.
rev_iupac_dict = {symbol: list(nucleotides) for nucleotides, symbol in iupac_dict.items()}

def get_iupac_symbol(nuc_set):
    """Look up the IUPAC symbol for a set of nucleotides.

    Scans ``iupac_dict`` in insertion order and returns the symbol of the
    first entry whose key letters, taken as a set, equal ``nuc_set``.
    Returns ``None`` when no entry matches.
    """
    matching_symbols = (
        symbol
        for letters, symbol in iupac_dict.items()
        if set(letters) == nuc_set
    )
    return next(matching_symbols, None)

# make a new aa column from codons list
def make_aa_col(missense_codons_col):
    """Translate a space-separated codon string into one amino-acid string.

    Each space-delimited token is translated on its own with Biopython's
    ``Seq.translate`` (default arguments) and the results are concatenated
    in the same order, with no separator.
    """
    return ''.join(
        str(Seq(triplet).translate())
        for triplet in missense_codons_col.split(' ')
    )

# make a new iupac_codon column from codons list
def make_iupac_codon_col(wt_col, pos_col, missense_codons_col):
    """Collapse a set of variant codons into a single IUPAC-coded codon.

    The nucleotides found at index ``pos_col`` across the space-separated
    codons in ``missense_codons_col`` are mapped to one IUPAC symbol, which
    then replaces the character at that index in the wild-type codon
    ``wt_col``.

    Returns an empty string when splitting yields any empty token (empty
    input, or leading/trailing/double spaces); in that case ``pos_col`` is
    never converted.
    """
    variant_codons = missense_codons_col.split(' ')
    if '' in variant_codons:
        return ''

    index = int(pos_col)
    wild_type = str(wt_col)

    observed = set()
    for variant in variant_codons:
        observed.add(variant[index])

    symbol = get_iupac_symbol(observed)
    # Plain concatenation on purpose: an unmatched set (symbol is None) fails
    # here instead of being silently formatted into the codon.
    return wild_type[:index] + symbol + wild_type[index + 1:]

In [4]:
# Derive the sele amino-acid string and the sele IUPAC codon for every row.
df['sele_aa'] = df.apply(
    lambda row: make_aa_col(row['sele_codons']),
    axis=1,
)
df['sele_iupac_codon'] = df.apply(
    lambda row: make_iupac_codon_col(
        row['codon'],
        row['position'],
        row['sele_codons'],
    ),
    axis=1,
)

In [5]:
# make syn columns
def make_syn_iupac_codon_col(syn_bool_column, syn_iupac_codon_column, sele_iupac_codon_column):
    """Choose the IUPAC codon to use for the syn variant of a row.

    Returns ``syn_iupac_codon_column`` when ``syn_bool_column`` is truthy,
    otherwise falls back to ``sele_iupac_codon_column``.
    """
    return syn_iupac_codon_column if syn_bool_column else sele_iupac_codon_column

# Overwrite syn_iupac_codon in place: rows flagged by syn_bool keep their
# existing value, every other row takes its sele_iupac_codon.
df['syn_iupac_codon'] = df.apply(
    lambda row: make_syn_iupac_codon_col(
        row['syn_bool'],
        row['syn_iupac_codon'],
        row['sele_iupac_codon'],
    ),
    axis=1,
)

def make_syn_codons_col(position_column, syn_bool_column, syn_iupac_codon_column):
    """Expand an IUPAC codon into its concrete codons for syn rows.

    When ``syn_bool_column`` is truthy, the IUPAC symbol at index
    ``position_column`` of ``syn_iupac_codon_column`` is expanded through
    ``rev_iupac_dict`` and one codon per nucleotide is returned,
    space-separated. Otherwise an empty string is returned.
    """
    # The position is converted up front, even for rows that end up returning ''.
    index = int(position_column)
    if not syn_bool_column:
        return ''

    nucleotides = rev_iupac_dict[syn_iupac_codon_column[index]]
    head = syn_iupac_codon_column[:index]
    tail = syn_iupac_codon_column[index + 1:]
    return ' '.join(head + nucleotide + tail for nucleotide in nucleotides)
    
# Expand each syn IUPAC codon into explicit codons, then translate them.
df['syn_codons'] = df.apply(
    lambda row: make_syn_codons_col(
        row['position'],
        row['syn_bool'],
        row['syn_iupac_codon'],
    ),
    axis=1,
)

df['syn_aa'] = df.apply(
    lambda row: make_aa_col(row['syn_codons']),
    axis=1,
)

In [6]:
# make no_stop columns
def make_no_stop_codons_col(sele_codons_col):
    """Return the space-separated codons with every stop codon removed.

    Each space-delimited token of ``sele_codons_col`` is translated with
    Biopython's ``Seq.translate``; tokens that translate to ``"*"`` are
    dropped and the remaining tokens are re-joined in their original order.

    The result is built as a new sequence rather than by removing items from
    the list being iterated: removing during iteration skips the element
    that follows each removal, so the second of two adjacent stop codons
    would be kept.
    """
    return ' '.join(
        codon
        for codon in sele_codons_col.split(' ')
        if str(Seq(codon).translate()) != "*"
    )

# Build the no_stop codon list, its translation and its IUPAC codon per row.
df['no_stop_codons'] = df.apply(
    lambda row: make_no_stop_codons_col(row['sele_codons']),
    axis=1,
)
df['no_stop_aa'] = df.apply(
    lambda row: make_aa_col(row['no_stop_codons']),
    axis=1,
)
df['no_stop_iupac_codon'] = df.apply(
    lambda row: make_iupac_codon_col(
        row['codon'],
        row['position'],
        row['no_stop_codons'],
    ),
    axis=1,
)

In [7]:
# make no_stop_syn columns
def make_no_stop_syn_codons_col(syn_bool_col, no_stop_codons_col, syn_codons_col):
    """Return the stop-free codon list for the no_stop_syn variant of a row.

    When ``syn_bool_col`` is truthy, the space-separated codons in
    ``syn_codons_col`` are translated with Biopython's ``Seq.translate`` and
    those that translate to ``"*"`` are dropped; the rest are re-joined in
    order. Otherwise ``no_stop_codons_col`` is returned unchanged.

    The filtered list is built fresh instead of removing items from the list
    being iterated, which would skip the element after each removal and let
    the second of two adjacent stop codons through.
    """
    if not syn_bool_col:
        return no_stop_codons_col
    return ' '.join(
        codon
        for codon in syn_codons_col.split(' ')
        if str(Seq(codon).translate()) != "*"
    )
    
# Build the no_stop_syn codon list, its translation and its IUPAC codon per row.
df['no_stop_syn_codons'] = df.apply(
    lambda row: make_no_stop_syn_codons_col(
        row['syn_bool'],
        row['no_stop_codons'],
        row['syn_codons'],
    ),
    axis=1,
)
df['no_stop_syn_aa'] = df.apply(
    lambda row: make_aa_col(row['no_stop_syn_codons']),
    axis=1,
)
df['no_stop_syn_iupac_codon'] = df.apply(
    lambda row: make_iupac_codon_col(
        row['codon'],
        row['position'],
        row['no_stop_syn_codons'],
    ),
    axis=1,
)

In [11]:
# Final column order: base columns, then the missense, sele, syn, no_stop and
# no_stop_syn groups. Columns not listed here are dropped by the selection.
col = [
    'codon',
    'aa',
    'position',
    'missense_nuc',
    'missense_codons',
    'missense_aa',
    'missense_iupac',
    'missense_iupac_codon',
    'sele_codons',
    'sele_aa',
    'sele_iupac_codon',
    'syn_bool',
    'syn_codons',
    'syn_aa',
    'syn_iupac_codon',
    'no_stop_codons',
    'no_stop_aa',
    'no_stop_iupac_codon',
    'no_stop_syn_codons',
    'no_stop_syn_aa',
    'no_stop_syn_iupac_codon',
]
df = df[col]

In [13]:
# Write the finished table to CSV without the row index.
df.to_csv(
    'final_codon_table.csv',
    index=False,
)