# Reformat the designed library mutations

This notebook reformats the CSV of designed library mutations for input into the analysis pipeline in the top-level directory.

In [1]:
import pandas as pd

In [2]:
# Read the table of designed library mutations from disk, then render it
# below the cell as a preview.
old_df = pd.read_csv(
    'results/IDT_library_df.csv',
)
display(old_df)

Unnamed: 0.1,Unnamed: 0,site,wildtype,mutant,mutation,effect,log2effect,is_stop,num_codons
0,11472,613,G,G,G624G,1.000000,0.000000,False,1
1,226,109,S,S,S110S,1.000000,0.000000,False,1
2,3168,249,V,V,V255V,1.000000,0.000000,False,1
3,2650,225,E,E,E231E,1.000000,0.000000,False,1
4,7174,427,P,P,P438P,1.000000,0.000000,False,1
...,...,...,...,...,...,...,...,...,...
7205,13833,95,W,R,W96R,0.251042,-1.994000,False,1
7206,13869,97,N,K,N98K,0.270381,-1.886934,False,1
7207,0,99,M,*,M100*,0.199444,-2.325945,True,1
7208,1995,194,A,*,A200*,0.217001,-2.204224,True,1


In [3]:
# Conversion table; the next cell reads its 'mutation' and 'mut_type' columns.
conversion_table = pd.read_csv(
    'results/conversion.csv',
)

In [4]:
# Map every mutation in the conversion table to the list of its mutation types.
#
# A single groupby pass replaces re-querying the whole table once per unique
# mutation. sort=False keeps the keys in order of first appearance, and each
# list keeps the row order of the conversion table.
# Note: groupby skips rows whose 'mutation' value is missing (its default
# dropna behaviour), so no key is created for them.
mut_type_dictionary = (
    conversion_table
    .groupby('mutation', sort=False)['mut_type']
    .agg(list)
    .to_dict()
)

In [5]:
# Expand the library table so that every mutation gets one row per mutation type.
#
# - A mutation whose first and last characters match (e.g. 'G624G') is labelled
#   'synonymous' and appears once.
# - Any other mutation is repeated once for each entry listed for it in
#   mut_type_dictionary.
#
# The result, new_df, is indexed by 'mutation', keeps all columns of old_df and
# gains a trailing 'mutation_type' column.

# The following cells expect old_df to be indexed by 'mutation'. The guard keeps
# that state while allowing this cell to be re-run without a KeyError.
if old_df.index.name != 'mutation':
    old_df = old_df.set_index('mutation')

# Unique mutations in order of first appearance.
mutations = old_df.index.unique().tolist()

# Report every non-synonymous mutation lacking a conversion entry at once,
# instead of failing on the first one with a bare KeyError.
missing_mutations = [
    mutation for mutation in mutations
    if mutation[0] != mutation[-1] and mutation not in mut_type_dictionary
]
if missing_mutations:
    raise KeyError(
        'mutations without an entry in the conversion table: {}'.format(missing_mutations)
    )

# Row positions of each mutation label in old_df (several if a label repeats).
row_positions = {}
for position, mutation in enumerate(old_df.index):
    row_positions.setdefault(mutation, []).append(position)

# Collect the row positions to copy and the type assigned to each copied row.
selected_positions = []
mutation_types = []
for mutation in mutations:
    if mutation[0] == mutation[-1]:
        mut_types = ['synonymous']
    else:
        mut_types = mut_type_dictionary[mutation]
    for mut_type in mut_types:
        selected_positions.extend(row_positions[mutation])
        mutation_types.extend([mut_type] * len(row_positions[mutation]))

# One positional selection instead of concatenating many one-row frames;
# .copy() makes new_df independent of old_df before the new column is added.
new_df = old_df.iloc[selected_positions].copy()
new_df['mutation_type'] = mutation_types

In [6]:
# Sanity check: keeping only the first row of each mutation in new_df and
# removing the 'mutation' / 'mutation_type' columns must give back old_df
# (old_df is indexed by 'mutation' at this point; both sides are re-indexed
# by 'Unnamed: 0' for the comparison).
expected_table = old_df.set_index('Unnamed: 0')

collapsed_table = new_df.reset_index()
collapsed_table = collapsed_table.set_index('Unnamed: 0')
collapsed_table = collapsed_table.drop_duplicates(subset='mutation')
collapsed_table = collapsed_table.drop(columns=['mutation', 'mutation_type'])

assert expected_table.equals(collapsed_table)

In [7]:
# Columns of the expanded table that are left out of the reformatted output.
dropped_columns = [
    'Unnamed: 0',
    'wildtype',
    'mutation',
    'effect',
    'log2effect',
    'is_stop',
    'num_codons',
]

# New names for the columns that are kept.
renamed_columns = {
    'mutant': 'amino_acid',
    'site': 'sequential_site',
}

# reset_index() turns the 'mutation' index back into a column so that it can be
# dropped together with the others.
flat_table = new_df.reset_index()
trimmed_table = flat_table.drop(columns=dropped_columns)
designed_mutations = trimmed_table.rename(columns=renamed_columns)
designed_mutations

Unnamed: 0,sequential_site,amino_acid,mutation_type
0,613,G,synonymous
1,109,S,synonymous
2,249,V,synonymous
3,225,E,synonymous
4,427,P,synonymous
...,...,...,...
10216,95,R,natural_sequence_mutations
10217,97,K,natural_sequence_mutations
10218,99,*,stop
10219,194,*,stop


In [8]:
# Rows labelled 'synonymous' are excluded; the rest are written to the results
# directory without the DataFrame index.
non_synonymous_mutations = designed_mutations.query('mutation_type!="synonymous"')
non_synonymous_mutations.to_csv('results/designed_mutations.csv', index=False)