In [None]:
GOAL: go from handcrafted codon table to a final codon table

# keys:
- sele = selected missense variants
- syn = includes synonymous codon variant
- no_stop = removes missense stop codons
- no_stop_syn = no stop codons AND includes synonymous codon

# key columns:
- codons list
- aa string
- iupac_codon (for final dictionary and variant codons)

In [1]:
# add option for stop codon dictionary
import pandas as pd
from Bio.Seq import Seq

In [2]:
# IUPAC nucleotide codes, keyed by the (alphabetically ordered) bases each code stands for.
iupac_dict = {
    # single bases
    'A': 'A', 'C': 'C', 'G': 'G', 'T': 'T',
    # two-base ambiguity codes
    'AC': 'M', 'AG': 'R', 'AT': 'W', 'CG': 'S', 'CT': 'Y', 'GT': 'K',
    # three-base ambiguity codes
    'ACG': 'V', 'ACT': 'H', 'AGT': 'D', 'CGT': 'B',
    # any base
    'ACGT': 'N',
}

In [3]:
# Reverse lookup: IUPAC symbol -> string of bases it represents.
rev_iupac_dict = {symbol: bases for bases, symbol in iupac_dict.items()}

In [4]:
# Load the codon table and replace missing values (NaN) with empty strings.
df = pd.read_csv('dms_codon_table_v2.csv').fillna('')

In [5]:
# the sele_aa column is incorrect
def sele_aa(row):
    """Translate the space-separated codons in ``row['sele_missense_codons']``.

    Returns the concatenated one-letter translations in codon order, or ''
    as soon as an empty codon entry is encountered (e.g. an empty cell).
    """
    residues = ''
    for triplet in row['sele_missense_codons'].split(' '):
        if not triplet:
            return ''
        residues += str(Seq(triplet).translate())
    return residues

# Overwrite the sele_aa column with the translation computed row by row by sele_aa().
df['sele_aa'] = df.apply(sele_aa, axis=1)

In [6]:
def remove_stop_codons(row):
    """Drop every codon in ``row['sele_missense_codons']`` that translates to '*'.

    Returns the surviving codons, space-separated and in their original order.
    """
    candidates = row['sele_missense_codons'].split(' ')
    kept = [triplet for triplet in candidates if str(Seq(triplet).translate()) != "*"]
    return ' '.join(kept)

# Store the stop-free codon list for every row in a new no_stop_codons column.
df['no_stop_codons'] = df.apply(remove_stop_codons, axis=1)

In [7]:
def no_stop_iupac(row):
    """Return the IUPAC symbol covering the variant bases at ``row['position']``.

    Collects the base at the mutated position from every codon in
    ``row['no_stop_codons']`` and looks the resulting set up in the
    module-level ``iupac_dict``. Returns '' if any codon entry is empty and
    None if no IUPAC key matches the collected bases.
    """
    variant_codons = row['no_stop_codons'].split(' ')
    idx = int(row['position'])
    observed = set()
    for variant in variant_codons:
        if not variant:
            return ''
        observed.add(variant[idx])
    for bases, symbol in iupac_dict.items():
        if set(bases) == observed:
            return symbol
    return None

# Store the per-row IUPAC symbol of the stop-free variants in no_stop_iupac.
df['no_stop_iupac'] = df.apply(no_stop_iupac, axis=1)

In [8]:
def no_stop_iupac_codon(row):
    """Substitute ``row['no_stop_iupac']`` into ``row['codon']`` at ``row['position']``.

    Returns '' when the row has no IUPAC symbol (empty string).
    """
    symbol = row['no_stop_iupac']
    if symbol == '':
        return ''
    wild_type = row['codon']
    idx = int(row['position'])
    # Rebuild the codon as: prefix + degenerate base + suffix.
    return ''.join([wild_type[:idx], symbol, wild_type[idx + 1:]])

# Store the degenerate (IUPAC) codon built from the stop-free variants.
df['no_stop_iupac_codon'] = df.apply(no_stop_iupac_codon, axis=1)

In [9]:
def no_stop_nucleotides(row):
    """Concatenate the base found at ``row['position']`` in each stop-free codon.

    Bases are returned in the order the codons appear in
    ``row['no_stop_codons']``; '' is returned if any codon entry is empty.
    """
    variant_codons = row['no_stop_codons'].split(' ')
    idx = int(row['position'])
    bases = ''
    for variant in variant_codons:
        if variant == '':
            return ''
        bases += variant[idx]
    return bases

# Store the variant bases (at the mutated position) of the stop-free codons.
df['no_stop_nucleotides'] = df.apply(no_stop_nucleotides, axis=1)

In [10]:
def no_stop_aa(row):
    """Translate the space-separated codons in ``row['no_stop_codons']``.

    Returns the one-letter translations joined in codon order, or '' as soon
    as an empty codon entry is found.
    """
    translated = []
    for triplet in row['no_stop_codons'].split(' '):
        if len(triplet) == 0:
            return ''
        translated.append(str(Seq(triplet).translate()))
    return ''.join(translated)

# Store the amino acids encoded by the stop-free codons in no_stop_aa.
df['no_stop_aa'] = df.apply(no_stop_aa, axis=1)

In [11]:
# return difference between all selected aa and the aa left after removing stops
def stop_aa_dif(row):
    """Return the residues present in ``row['sele_aa']`` but not in ``row['no_stop_aa']``.

    The result is a string of the missing one-letter codes in sorted order
    ('' when nothing is missing). Sorting makes the string deterministic:
    joining a raw set yields an arbitrary order, so the same set of residues
    could otherwise produce different strings and be counted separately.
    """
    all_aa = set(row['sele_aa'])
    kept_aa = set(row['no_stop_aa'])
    return ''.join(sorted(all_aa - kept_aa))

# Count rows by which residues were lost when stops were removed ('' = nothing lost).
df.apply(stop_aa_dif, axis=1).value_counts()

     173
*     19
dtype: int64

In [37]:
# general functions
def get_iupac_symbol(nuc_set):
    """Return the IUPAC symbol whose bases are exactly ``nuc_set``.

    ``nuc_set`` is compared for equality against the set of characters of
    each key; None is returned when no key matches.
    """
    codes_by_bases = {
        'A': 'A', 'C': 'C', 'G': 'G', 'T': 'T',
        'AC': 'M', 'AG': 'R', 'AT': 'W', 'CG': 'S', 'CT': 'Y', 'GT': 'K',
        'ACG': 'V', 'ACT': 'H', 'AGT': 'D', 'CGT': 'B',
        'ACGT': 'N',
    }
    matches = (symbol for bases, symbol in codes_by_bases.items() if set(bases) == nuc_set)
    return next(matches, None)

# make a new aa column from codons list
def make_aa_col(missense_codons_col):
    """Translate a space-separated codon string into a string of one-letter codes.

    Each codon is translated with ``Seq(...).translate()`` and the results are
    concatenated in codon order.
    """
    return ''.join(
        str(Seq(triplet).translate())
        for triplet in missense_codons_col.split(' ')
    )

# make a new iupac_codon column from codons list
def make_iupac_codon_col(wt_col, pos_col, missense_codons_col):
    """Build the degenerate codon that covers every variant in ``missense_codons_col``.

    The base at ``pos_col`` of the wild-type codon ``wt_col`` is replaced by
    the IUPAC symbol (from ``get_iupac_symbol``) for the set of bases the
    variant codons carry at that position. Returns '' if the space-separated
    codon string contains an empty entry.
    """
    variants = missense_codons_col.split(' ')
    if any(not variant for variant in variants):
        return ''
    template = str(wt_col)
    idx = int(pos_col)
    symbol = get_iupac_symbol(set(variant[idx] for variant in variants))
    return ''.join((template[:idx], symbol, template[idx + 1:]))

In [30]:
# Preview the first rows of the table.
df.head()

Unnamed: 0,codon,aa,position,all_missense_aa,all_missense_nucleotides,all_missense_codons,all_iupac,all_iupac_codon,sele_missense_codons,syn_iupac_codon,...,sele_aa,syn_aa,syn_missense_codons,no_stop_codons,no_stop_iupac,no_stop_iupac_codon,no_stop_nucleotides,no_stop_aa,stop_syn_codons,stop_syn_aa
0,AAA,K,0,*QE,CGT,CAA GAA TAA,B,BAA,CAA GAA TAA,BAA,...,QE*,*QE,CAA GAA TAA,CAA GAA,S,SAA,CG,QE,CAA GAA,QE
1,AAA,K,1,TRI,CGT,ACA AGA ATA,B,ABA,ACA AGA ATA,ABA,...,TRI,RTI,ACA AGA ATA,ACA AGA ATA,B,ABA,CGT,TRI,ACA AGA ATA,TRI
2,AAA,K,2,N,CT,AAC AAT,Y,AAY,AAT,AAK,...,N,KN,AAG AAT,AAT,T,AAT,T,N,AAT,N
3,AAC,N,0,HDY,CGT,CAC GAC TAC,B,BAC,CAC GAC TAC,BAC,...,HDY,DYH,CAC GAC TAC,CAC GAC TAC,B,BAC,CGT,HDY,CAC GAC TAC,HDY
4,AAC,N,1,TSI,CGT,ACC AGC ATC,B,ABC,ACC AGC ATC,ABC,...,TSI,STI,ACC AGC ATC,ACC AGC ATC,B,ABC,CGT,TSI,ACC AGC ATC,TSI


In [39]:
# List every column name currently present in df.
df.columns.tolist()

['codon',
 'aa',
 'position',
 'all_missense_aa',
 'all_missense_nucleotides',
 'all_missense_codons',
 'all_iupac',
 'all_iupac_codon',
 'sele_missense_codons',
 'syn_iupac_codon',
 'syn_codon',
 'sele_missense_nucleotides',
 'sele_iupac',
 'sele_iupac_codon',
 'sele_aa',
 'syn_aa',
 'syn_missense_codons',
 'no_stop_codons',
 'no_stop_iupac',
 'no_stop_iupac_codon',
 'no_stop_nucleotides',
 'no_stop_aa',
 'stop_syn_codons',
 'stop_syn_aa']

In [38]:
# make new stop_syn_codons column
def stop_syn_codons(sele_codons_column, wt_aa_column):
    """Remove stop and synonymous codons from a space-separated codon string.

    Parameters
    ----------
    sele_codons_column : str
        Space-separated variant codons.
    wt_aa_column : object
        Wild-type amino acid; converted with ``str()`` before comparison.

    Returns
    -------
    str
        The codons whose translation is neither '*' nor the wild-type amino
        acid, space-separated and in their original order.
    """
    wt_aa = str(wt_aa_column)
    excluded = {wt_aa, '*'}
    # Filter into a new list instead of calling remove() on the list being
    # iterated: removing in place shifts the remaining items and makes the loop
    # skip the codon that follows each removed one (e.g. 'TAA TAG' kept 'TAG').
    kept = [
        codon
        for codon in sele_codons_column.split(' ')
        if str(Seq(codon).translate()) not in excluded
    ]
    return ' '.join(kept)

# Build the filtered codon list, then derive the amino-acid string and the
# IUPAC codon from that same freshly created column (previously these read the
# differently named 'stop_syn_codons' column, which this cell does not create).
df['no_stop_syn_codons'] = df.apply(lambda x: stop_syn_codons(x['sele_missense_codons'], x['aa']), axis=1)
df['no_stop_syn_aa'] = df.apply(lambda x: make_aa_col(x['no_stop_syn_codons']), axis=1)
# Displayed as the cell output only; the result is not stored in df.
df.apply(lambda x: make_iupac_codon_col(x['codon'], x['position'], x['no_stop_syn_codons']), axis=1)

0      SAA
1      ABA
2      AAT
3      BAC
4      ABC
      ... 
187    TSG
188    TTT
189    RTT
190    TVT
191    TTG
Length: 192, dtype: object

In [36]:
# Preview the first rows of the table.
df.head()

Unnamed: 0,codon,aa,position,all_missense_aa,all_missense_nucleotides,all_missense_codons,all_iupac,all_iupac_codon,sele_missense_codons,syn_iupac_codon,...,sele_aa,syn_aa,syn_missense_codons,no_stop_codons,no_stop_iupac,no_stop_iupac_codon,no_stop_nucleotides,no_stop_aa,stop_syn_codons,stop_syn_aa
0,AAA,K,0,*QE,CGT,CAA GAA TAA,B,BAA,CAA GAA TAA,BAA,...,QE*,*QE,CAA GAA TAA,CAA GAA,S,SAA,CG,QE,CAA GAA,QE
1,AAA,K,1,TRI,CGT,ACA AGA ATA,B,ABA,ACA AGA ATA,ABA,...,TRI,RTI,ACA AGA ATA,ACA AGA ATA,B,ABA,CGT,TRI,ACA AGA ATA,TRI
2,AAA,K,2,N,CT,AAC AAT,Y,AAY,AAT,AAK,...,N,KN,AAG AAT,AAT,T,AAT,T,N,AAT,N
3,AAC,N,0,HDY,CGT,CAC GAC TAC,B,BAC,CAC GAC TAC,BAC,...,HDY,DYH,CAC GAC TAC,CAC GAC TAC,B,BAC,CGT,HDY,CAC GAC TAC,HDY
4,AAC,N,1,TSI,CGT,ACC AGC ATC,B,ABC,ACC AGC ATC,ABC,...,TSI,STI,ACC AGC ATC,ACC AGC ATC,B,ABC,CGT,TSI,ACC AGC ATC,TSI


In [24]:


# Translate the stop_syn_codons column row by row and store the result as stop_syn_aa.
df['stop_syn_aa'] = df.apply(lambda x: make_aa_col(x['stop_syn_codons']), axis=1)

0       QE
1      TRI
2        N
3      HDY
4      TSI
      ... 
187     SW
188      F
189     IV
190    YSC
191      L
Length: 192, dtype: object

In [27]:
# make a new iupac_codon column from codons list
# NOTE: this reuses the name of the three-argument make_iupac_codon_col defined
# earlier in the notebook; whichever cell runs last is the one in effect.
def make_iupac_codon_col(missense_codons_col, pos_col):
    """Build the degenerate codon that covers every variant in ``missense_codons_col``.

    Parameters
    ----------
    missense_codons_col : str
        Space-separated variant codons.
    pos_col : int or str
        Zero-based index of the mutated base; converted with ``int()``.

    Returns
    -------
    str
        The first variant codon with the base at ``pos_col`` replaced by the
        IUPAC symbol for all bases seen at that position, or '' if the codon
        string contains an empty entry.
    """
    codons = missense_codons_col.split(' ')
    if '' in codons:
        # An empty cell splits to ['']; indexing it would raise IndexError.
        return ''
    pos = int(pos_col)
    nucs = {codon[pos] for codon in codons}
    iupac = get_iupac_symbol(nucs)
    # The comprehension variable is not visible here, so take the template
    # explicitly. Assumes the variants differ only at `pos`, so any of them
    # supplies the unchanged bases - TODO confirm against the input table.
    template = codons[0]
    return template[:pos] + iupac + template[pos + 1:]

In [23]:
# check number of missense stops
# Match the literal '*' (regex=False) rather than the pattern '\*' written in a
# normal string, which is an invalid escape sequence and triggers a warning.
df['sele_aa'].str.contains('*', regex=False).sum()

19

In [2]:
# Replace df with the table stored in 'dms_no_stops.csv' and preview its first rows.
df = pd.read_csv('dms_no_stops.csv')
df.head()

Unnamed: 0,codon,aa,position,all_missense_aa,all_missense_nucleotides,all_missense_codons,all_iupac,all_iupac_codon,sele_missense_codons,syn_iupac_codon,...,sele_iupac,sele_iupac_codon,sele_aa,syn_aa,syn_missense_codons,no_stop_codons,no_stop_iupac,no_stop_iupac_codon,no_stop_nucleotides,no_stop_aa
0,AAA,K,0,*QE,CGT,CAA GAA TAA,B,BAA,CAA GAA TAA,BAA,...,B,BAA,QE*,*QE,CAA GAA TAA,CAA GAA,S,SAA,CG,QE
1,AAA,K,1,TRI,CGT,ACA AGA ATA,B,ABA,ACA AGA ATA,ABA,...,B,ABA,TRI,RTI,ACA AGA ATA,ACA AGA ATA,B,ABA,CGT,TRI
2,AAA,K,2,N,CT,AAC AAT,Y,AAY,AAT,AAK,...,T,AAT,N,KN,AAG AAT,AAT,T,AAT,T,N
3,AAC,N,0,HDY,CGT,CAC GAC TAC,B,BAC,CAC GAC TAC,BAC,...,B,BAC,HDY,DYH,CAC GAC TAC,CAC GAC TAC,B,BAC,CGT,HDY
4,AAC,N,1,TSI,CGT,ACC AGC ATC,B,ABC,ACC AGC ATC,ABC,...,B,ABC,TSI,STI,ACC AGC ATC,ACC AGC ATC,B,ABC,CGT,TSI
