# Convert to LevSeq format


LevSeq columns:
- experiment_id
- barcode_plate
- plate
- well
- nucleotide_mutation
- alignment_count
- average_mutation_frequency
- p_value
- p_adj_value
- amino_acid_substitutions
- nt_sequence
- aa_sequence  

Metadata columns:
- cif file  
- experiment_id
- experiment_name
- experiment_date
- substrate_smiles
- product_smiles
- assay_technique


In [75]:
import pandas as pd

# Load the cleaned protein/reaction table that every later cell transforms,
# then show its column names as the cell output.
df = pd.read_csv(
    filepath_or_buffer='output/protein-evolution-database_V4_proteins_reactions_clean.csv'
)
df.columns

Index(['culture_collection_entry', 'enzyme_name_from_paper',
       'Uniprot_ID(if applicable)', 'comment', 'reaction_smiles',
       'parent_DNA_sequence', 'parent_aminoacid_sequence',
       'aminoacid_mutations_from_parent', 'variant_DNA_sequence',
       'mutations_from_parent', 'cofactor', 'additive (if applicable)',
       'additive_CAS', 'enzyme_form', 'substrate_concentration',
       'activity_for_reaction_% (if applicable)', 'TTN (if applicable)',
       'selectivity(ee%),diastereo or chemo should be a separate smiles entry',
       'alternative_product_SMILES', 'failed_substrates (if available)',
       'date published ', 'first author', 'paper title', 'doi', 'SUBMITTED BY',
       'raw data name', 'cannonical_reactions', 'named_reactions', 'errors',
      dtype='object')

# First assign experiment ids based on the parent and the reaction


In [76]:
# Assign one experiment id per unique (parent protein, reaction) pair.
# Ids are numbered E-1, E-2, ... in order of first appearance in the table.
experiment_to_id = {}
experiment_labels = []
for parent_aa, cannonical_reactions in df[['parent_aa', 'cannonical_reactions']].values:
    label = f'{parent_aa}_{cannonical_reactions}'
    # setdefault stores the freshly numbered id only when the label is new and
    # always returns the id associated with the label.
    experiment_labels.append(
        experiment_to_id.setdefault(label, f'E-{len(experiment_to_id) + 1}')
    )

df['experiment_id'] = experiment_labels

# Every row receives the same fixed value for these LevSeq columns.
# NOTE(review): presumably placeholders because these entries are not real
# sequencing runs - confirm the chosen values with the LevSeq consumers.
df['barcode_plate'] = 1
df['well'] = 1
df['alignment_count'] = 100
df['average_mutation_frequency'] = 1
df['p_value'] = 0.01
df['p_adj_value'] = 0.01

# Use the variant DNA sequence when it is known; fall back to the parent
# sequence when the variant is marked unknown ('?') or is missing (NaN).
# The isinstance check matters: `'?' in <float nan>` raises TypeError.
df['nt_sequence'] = [
    variant if isinstance(variant, str) and '?' not in variant else parent
    for variant, parent in df[['variant_DNA_sequence', 'parent_DNA_sequence']].values
]
df['aa_sequence'] = df['variant_aa'].values


# Clean DNA mutations

In [77]:
# Normalise the free-text nucleotide mutation annotations into the
# '_'-joined, upper-case form stored in the `nucleotide_mutation` column.
#   '?' for both nt and aa mutations -> '#PARENT#'
#   '?' for nt only, or a non-string -> '#N.A.#'
#   nothing left after cleaning      -> '#N.A.#'
# Exactly one value is appended per row so the list always matches len(df).
nucleotide_mutations = []
for nt_to_change, aa_to_change in df[['mutations_from_parent', 'aminoacid_mutations_from_parent']].values:
    if nt_to_change == '?':
        nucleotide_mutations.append('#PARENT#' if aa_to_change == '?' else '#N.A.#')
    elif not isinstance(nt_to_change, str):
        # Typically NaN from an empty CSV field.
        print(nt_to_change)
        nucleotide_mutations.append('#N.A.#')
    else:
        # Check what separator was used (first match wins). A string without
        # any separator is a single mutation; do NOT call split(None), which
        # would split on whitespace instead.
        sep = next((s for s in ('_', ',', '-') if s in nt_to_change), None)
        raw_parts = nt_to_change.split(sep) if sep is not None else [nt_to_change]
        clean_nt = []
        for nt in raw_parts:
            if 'FAD' not in nt:
                nt = nt.split('+')[0]  # Remove any random domains...
                nt = nt.replace(' ', '')
                nt = nt.upper()
                clean_nt.append(nt)
        if len(clean_nt) == 0:
            print(raw_parts)
            nucleotide_mutations.append('#N.A.#')
        else:
            nucleotide_mutations.append('_'.join(clean_nt))
df['nucleotide_mutation'] = nucleotide_mutations

nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan


# Clean AA substitutions

In [78]:

# Clean the AA substitutions
# Normalise the free-text amino-acid mutation annotations into the
# '_'-joined, upper-case form stored in `amino_acid_substitutions`.
#   '?'                         -> '#PARENT#'
#   non-string (e.g. NaN)       -> '#N.A.#'
#   nothing left after cleaning -> '#N.A.#'
# Exactly one value is appended per row so the list always matches len(df).
amino_acid_substitutions = []
for aa_to_change in df['aminoacid_mutations_from_parent'].values:
    if aa_to_change == '?':
        amino_acid_substitutions.append('#PARENT#')
    elif not isinstance(aa_to_change, str):
        print(aa_to_change)
        amino_acid_substitutions.append('#N.A.#')
    else:
        # Check what separator was used (first match wins). A string without
        # any separator is a single substitution; do NOT call split(None),
        # which would split on whitespace instead.
        sep = next((s for s in ('_', ',', '-') if s in aa_to_change), None)
        raw_parts = aa_to_change.split(sep) if sep is not None else [aa_to_change]
        clean_aa = []
        for aa in raw_parts:
            if 'FAD' not in aa:
                aa = aa.split('+')[0]  # Remove any random domains...
                aa = aa.replace(' ', '')
                aa = aa.upper()
                clean_aa.append(aa)
        if len(clean_aa) == 0:
            print(raw_parts)
            amino_acid_substitutions.append('#N.A.#')
        else:
            amino_acid_substitutions.append('_'.join(clean_aa))

df['amino_acid_substitutions'] = amino_acid_substitutions


# Save as a newly formatted DF

In [79]:
# Columns written to the LevSeq-formatted CSV. The cleaned mutation columns
# ('nucleotide_mutation', 'amino_acid_substitutions') are part of the LevSeq
# column list and are computed by the cells above, so they are exported too.
# NOTE(review): the LevSeq column list also names 'plate', but no such column
# is created on `df` in this notebook, so it is not selected here - confirm
# whether it is required.
levseq_columns = [
    'id', 'experiment_id', 'barcode_plate', 'well', 'nucleotide_mutation',
    'alignment_count', 'average_mutation_frequency', 'p_value',
    'p_adj_value', 'amino_acid_substitutions', 'nt_sequence', 'aa_sequence',
]
levseq_df = df[levseq_columns]
levseq_df.to_csv('output/LevSeq_formatted_protein-evolution-database_V4.csv', index=False)

# Now also make a metadata df to go with each one

Metadata columns:
- cif file  
- experiment_id
- experiment_name
- experiment_date
- substrate_smiles
- product_smiles
- assay_technique

```
Index(['Unnamed: 0', 'culture_collection_entry', 'enzyme_name_from_paper',
       'Uniprot_ID(if applicable)', 'comment', 'reaction_smiles',
       'parent_DNA_sequence', 'parent_aminoacid_sequence',
       'aminoacid_mutations_from_parent', 'variant_DNA_sequence',
       'mutations_from_parent', 'cofactor', 'additive (if applicable)',
       'additive_CAS', 'enzyme_form', 'substrate_concentration',
       'activity_for_reaction_% (if applicable)', 'TTN (if applicable)',
       'selectivity(ee%),diastereo or chemo should be a separate smiles entry',
       'alternative_product_SMILES', 'failed_substrates (if available)',
       'date published ', 'first author', 'paper title', 'doi', 'SUBMITTED BY',
       'raw data name', 'cannonical_reactions', 'named_reactions', 'errors',
       'parent_aa', 'variant_aa', 'warnings', 'id'],
      dtype='object')

```

In [80]:
# Lookup table from DOI string to a date string in DD.MM.YYYY form.
# The metadata cell looks entries up with an exact-match `.get()` on the value
# of the `doi` column, so each key must be spelled exactly as it appears there
# (hence the mix of 'DOI: ', 'doi.org/', 'doi:' and 'https://doi.org/' forms).
# A DOI that is missing from this table yields `None` as the experiment date.
doi_to_date = {'DOI: 10.1126/science.aah6219': '25.11.2016',
 'DOI: 10.1126/science.adi5554': '25.01.2024',
 'doi.org/10.1002/anie.202002861': '24.03.2020',
 'doi.org/10.1002/anie.202106938': '15.07.2021',
 'doi.org/10.1002/anie.202110873': '17.09.2021',
 'doi.org/10.1002/anie.202303879': '01.06.2023',
 'doi.org/10.1002/cbic.201900497': '12.09.2019',
 'doi.org/10.1021/acscatal.0c01349': '20.04.2020',
 'doi.org/10.1021/acscatal.0c01888': '04.06.2020',
 'doi.org/10.1021/acscatal.3c05370': '16.12.2023',
 'doi.org/10.1021/acscatal.9b02089': '20.08.2019',
 'doi.org/10.1021/acscentsci.3c00516': '14.12.2023',
 'doi.org/10.1021/jacs.0c01313': '29.03.2020',
 'doi.org/10.1021/jacs.0c03428': '25.05.2020',
 'doi.org/10.1021/jacs.1c11340': '23.12.2021',
 'doi.org/10.1021/jacs.2c00251': '08.03.2022',
 'doi.org/10.1021/jacs.2c02723': '13.05.2022',
 'doi.org/10.1021/jacs.2c08285': '04.10.2022',
 'doi.org/10.1021/jacs.3c04870': '11.07.2023',
 'doi.org/10.1021/jacs.3c11722': '02.01.2024',
 'doi.org/10.1021/jacs.4c09989': '27.09.2024',
 'doi.org/10.1021/jacs.9b04344': '07.06.2019',
 'doi.org/10.1021/jacs.9b09864': '20.11.2019',
 'doi.org/10.1038/s41557-019-0343-5': '14.10.2019',
 'doi.org/10.1038/s41589-024-01619-z': '14.05.2024',
 'doi.org/10.1038/s41929-022-00908-x': '19.01.2023',
 'doi.org/10.1038/s41929-024-01149-w': '03.05.2024',
 'doi.org/10.1038/s44160-023-00431-2': '02.11.2023',
 'doi/10.1021/jacs.4c04190': '03.07.2024',
 'doi: 10.1055/s-0037-1611662': '14.01.2019',
 'doi:10.1038/nature24996': '29.11.2017',
 'https://doi.org/10.1002/anie.202208936': '19.12.2022',
 'https://doi.org/10.1021/jacs.3c08053': '06.09.2023',
 'https://doi.org/10.1021/jacs.9b02931': '09.05.2019',
 'https://doi.org/10.1021/jacs.9b11608': '02.12.2019',
 'https://doi.org/10.1038/s41557-021-00794-z': '18.10.2021'}

In [81]:
# Build one metadata row per experiment id. All rows of an experiment share
# the same parent/reaction label, so the first row of each group is used as
# its representative.
grped = df.groupby('experiment_id')
metadata_rows = []
for name, experiment in grped:
    # this is what makes each a row in the df (each should have the same reaction/products and substrates)
    reaction = experiment['cannonical_reactions'].values[0]
    # Skip experiments without a usable 'substrates>>products' string (e.g.
    # NaN). This is validated explicitly rather than with a bare `except:`
    # so that genuine problems such as a missing column still raise.
    if not isinstance(reaction, str) or '>>' not in reaction:
        print(f'{name}: skipped, no usable reaction SMILES ({reaction!r})')
        continue
    reaction_parts = reaction.split('>>')
    substrates, products = reaction_parts[0], reaction_parts[1]
    paper = experiment['paper title'].values[0]
    doi = experiment['doi'].values[0]
    date = doi_to_date.get(doi)
    if date is None:
        # The row is still kept; the date is simply left empty.
        print(f'{name}: no date found in doi_to_date for DOI {doi!r}')
    metadata_rows.append([name, paper, date, substrates, products, 'literature', f'{name}.cif'])
metadata_df = pd.DataFrame(metadata_rows, columns=['experiment_id', 'experiment_name', 'experiment_date',
                                          'substrate_smiles', 'product_smiles', 'assay_technique', 'cif_filename'])
metadata_df

Unnamed: 0,experiment_id,experiment_name,experiment_date,substrate_smiles,product_smiles,assay_technique,cif_filename
0,E-1,Dual-Function Enzyme Catalysis for Enantiosele...,18.10.2021,CNc1ccccc1.[N-]=[N+]=C1CCOC1=O,CN(c1ccccc1)[C@H]1CCOC1=O,literature,E-1.cif
1,E-10,Dual-Function Enzyme Catalysis for Enantiosele...,18.10.2021,Cc1cccc(N)c1.[N-]=[N+]=C1CCOC1=O,Cc1cccc(NC2CCOC2=O)c1,literature,E-10.cif
2,E-100,Tailoring Tryptophan Synthase TrpB for Selecti...,20.11.2019,CCC1C(=O)Nc2ccccc21.N[C@@H](CO)C(=O)O,CCC1(C[C@H](N)C(=O)O)C(=O)Nc2ccccc21,literature,E-100.cif
3,E-101,Tailoring Tryptophan Synthase TrpB for Selecti...,20.11.2019,CCCC1C(=O)Nc2ccccc21.N[C@@H](CO)C(=O)O,CCCC1(C[C@H](N)C(=O)O)C(=O)Nc2ccccc21,literature,E-101.cif
4,E-102,Tailoring Tryptophan Synthase TrpB for Selecti...,20.11.2019,CCCCC1C(=O)Nc2ccccc21.N[C@@H](CO)C(=O)O,CCCCC1(C[C@H](N)C(=O)O)C(=O)Nc2ccccc21,literature,E-102.cif
...,...,...,...,...,...,...,...
668,E-95,Tailoring Tryptophan Synthase TrpB for Selecti...,20.11.2019,CC1C(=O)Nc2ccccc21.N[C@@H](CO)C(=O)O,CC1(C[C@H](N)C(=O)O)C(=O)Nc2ccccc21,literature,E-95.cif
669,E-96,Tailoring Tryptophan Synthase TrpB for Selecti...,20.11.2019,CC1C(=O)Nc2cc(Cl)ccc21.N[C@@H](CO)C(=O)O,CC1(C[C@H](N)C(=O)O)C(=O)Nc2cc(Cl)ccc21,literature,E-96.cif
670,E-97,Tailoring Tryptophan Synthase TrpB for Selecti...,20.11.2019,CC1C(=O)Nc2ccc(F)cc21.N[C@@H](CO)C(=O)O,CC1(C[C@H](N)C(=O)O)C(=O)Nc2ccc(F)cc21,literature,E-97.cif
671,E-98,Tailoring Tryptophan Synthase TrpB for Selecti...,20.11.2019,CC1C(=O)Oc2ccccc21.N[C@@H](CO)C(=O)O,CC1(C[C@H](N)C(=O)O)C(=O)Oc2ccccc21,literature,E-98.cif


In [82]:
# Write the per-experiment metadata table next to the LevSeq-formatted CSV,
# without the DataFrame index.
metadata_df.to_csv(
    path_or_buf='output/LevSeq-metadata_formatted_protein-evolution-database_V4.csv',
    index=False,
)