# Enzyme analysis

1. Convert the enzyme variant to amino acids based on the AAs and be warned about 1 vs 0 so need to check
2. Calculate concentration rather than yield (if yield is given then yield * mmol concentration)
3. Normalize to parent for each column (TTN, Yield)
4. Embeddings sequences
5. Convert to LevSeq format

In [9]:
! export CUDA_VISIBLE_DEVICES=1

In [1]:
import pandas as pd
import os
import torch 

# CUDA setup
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="1"
cuda = True
DEVICE = torch.device("cuda" if cuda else "cpu")

data_dir = 'output'
df = pd.read_csv(os.path.join(data_dir, 'protein-evolution-database_V6_proteins_reactions_clean-YL.csv'))
df['aminoacid_mutations_from_parent'] = [a.replace(',', '_') for a in df['aminoacid_mutations_from_parent'].values]

# Iterate through the parent amino acid sequence and make the variant sequence 

Since sometimes people use 0 index and sometimes 1 index, first check the parent AA sequnece has the correct AA on the LHS.

In [2]:
from sciutil import SciUtil

u = SciUtil()

def convert_to_variant(parent, aa_to_change, verbose=False):
    warn = []
    error = []
    variant = []
    #try:
    aa_to_change = aa_to_change.replace(' ', '').strip()
    variant = list(parent)
    parent = list(parent)
    sep = '_'
    # check what separator was used
    if '_' in aa_to_change:
        sep = '_'
    elif ',' in aa_to_change:
        sep = ','
    for aa in aa_to_change.split(sep):
        if 'FAD' not in aa:
            aa = aa.split('+')[0] # Remove any random domains... 
            try:
                parent_aa = aa[0]
                variant_aa = aa[-1]
                position = int(aa[1:-1])
                # Check the parent at the position is correct
                if position < len(parent):
                    if parent[position] == parent_aa and parent[position - 1] == parent_aa : # Can't distinguish... since they are the same!
                        if verbose:
                            u.warn_p(['Defaulting to 1 index since could not distinguish... warning to check sequence'])
                        variant[position + 1] = variant_aa
                        warn.append([aa, 'Warning: position 0 and 1 index had same AA used 1 index'])
                    elif parent[position] == parent_aa: # 0 index
                        variant[position] = variant_aa
                    elif parent[position - 1] == parent_aa: # 1 index
                        variant[position - 1] = variant_aa
                    else:
                        error.append([aa, 'Error in parent AA != AA in seq']) # Failed at this one we have an idea about where it was
                elif parent[position - 1] == parent_aa: # 1 index
                        variant[position - 1] = variant_aa
                else:
                    error.append([aa, 'Error in parent AA != AA in seq']) # Failed at this one we have an idea about where it was
            except Exception as e:
                error.append([aa, str(e)])
        else:
            u.warn_p(['FAD domain, manually check'])

    return ''.join(variant), warn, error
    
# Check these are the same since this is what we need
assert convert_to_variant('MTAKEMPQPKTFGELKNLPLLNTD', 'A2K')[0] == 'MTKKEMPQPKTFGELKNLPLLNTD' # 0 indexed example
assert convert_to_variant('MATKEMPQPKTFGELKNLPLLNTD', 'A2K')[0] == 'MKTKEMPQPKTFGELKNLPLLNTD' # 1 indexed
assert len(convert_to_variant('MAAKEMPQPKTFGELKNLPLLNTD', 'A2K')[1]) > 0 # Should print an error too 

In [3]:
from tqdm import tqdm 
from Bio.Seq import translate
import re

aas = ['A', 'R', 'N', 'D', 'C', 'E', 'Q', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V']

reg = f"^[{''.join(aas)}]*$"

parent_aa = []
variant_aa = []
errors, warnings = [], []
changed = 0

for parent, parent_nt, aa_to_change, variant_nt in tqdm(df[['parent_aminoacid_sequence', 'parent_DNA_sequence', 'aminoacid_mutations_from_parent', 'variant_DNA_sequence']].values):
    if not parent or not isinstance(parent, str) or len(parent) < 2:
        # Try converting the parent nt
        if not parent_nt or not isinstance(parent_nt, str) or len(parent_nt) < 2:
            parent = None
            errors.append(['Parent was not string?'])
            warnings.append(['Parent was not string?'])
            parent_aa.append(None)
            continue
        parent_nt = parent_nt.replace(' ', '')
        parent = translate(parent_nt)
    parent = parent.replace(' ', '')
    parent = parent.strip()
    # We're replace his tags... 
    parent = parent.replace('LEHHHHHH', '').replace('HHHHHH', '').replace('*', '')
    # Check if the parent has any odd AAs
    if not re.match(reg, parent):
        # Parent has strange seqs..
        for ai, a in enumerate(parent):
            if a not in aas:
                print(a, ai)
        errors.append(['Parent had a non-correct AA in it.. check for numbers!'])
        variant_aa.append(None)
        warnings.append(None)
        parent_aa.append(parent)
        continue
    aa_to_change = aa_to_change.strip()
    if aa_to_change != '?' and parent is not None:
        variant, warn, err = convert_to_variant(parent, aa_to_change)
        # Check if it got truncated 
        variant = variant.split('*')[0] # Truncate 
        variant_aa.append(variant)
        # Ensure the variant only has amino acids
        if not re.match(reg, variant):
            # Parent has strange seqs..
            print('NO MATCH')
            break
        changed += 1
        if variant_aa == parent:
            print('Same?', aa_to_change)
        if len(warn) == 0: 
            warnings.append(None)
        else:
            warnings.append(warn)
        if len(err) == 0:
            errors.append(None)
        else:
            errors.append(err)
    else:
        variant_aa.append(None)
        errors.append(None)
        warnings.append(None)
    parent_aa.append(parent)


100%|███████████████████████████| 1342/1342 [00:00<00:00, 15563.62it/s]

[93m--------------------------------------------------------------------------------[0m
[93m                          FAD domain, manually check	                           [0m
[93m--------------------------------------------------------------------------------[0m
[93m--------------------------------------------------------------------------------[0m
[93m                          FAD domain, manually check	                           [0m
[93m--------------------------------------------------------------------------------[0m
[93m--------------------------------------------------------------------------------[0m
[93m                          FAD domain, manually check	                           [0m
[93m--------------------------------------------------------------------------------[0m
[93m--------------------------------------------------------------------------------[0m
[93m                          FAD domain, manually check	                           [0m
[93m-----




In [4]:
df['parent_aa'] = parent_aa
df['variant_aa'] = variant_aa
df['errors'] = errors
df['warnings'] = warnings
# Print out the number of errors
err_df = df[~df['errors'].isna()]
err_df['first author'].value_counts()

Andrew Zhou    20
Juner Zhang    16
Name: first author, dtype: int64

In [5]:
err_df

Unnamed: 0,culture_collection_entry,enzyme_name_from_paper,Uniprot_ID(if applicable),comment,reaction_smiles,parent_DNA_sequence,parent_aminoacid_sequence,aminoacid_mutations_from_parent,variant_DNA_sequence,mutations_from_parent,...,paper title,doi,SUBMITTED BY,raw data name,cannonical_reactions,named_reactions,errors,parent_aa,variant_aa,warnings
117,4866,P411-C10,?,express at 22 °C,CC1=CC=C(N(C)C)C=C1.O=C2OCCC2=[N+]=[N-]>>CC3=C...,ACAATTAAAGAAATGCCTCAGCCAAAAACGTTTGGAGAGCTTAAAA...,MTIKEMPQPKTFGELKNLPLLNTDKPVQALMKIADELGEIFKFEAP...,N70E_A74G_V78L_A82L_F87A_M118S_P142S_F162L_T17...,ATGACAATTAAAGAAATGCCTCAGCCAAAAACGTTTGGAGAGCTTA...,?,...,Enzymatic Lactone-Carbene C−H Insertion to Bui...,doi.org/10.1021/acscatal.0c01349,Kai Chen,Lactone C-H insertion 2020,Cc1ccc(N(C)C)cc1.[N-]=[N+]=C1CCOC1=O>>Cc1ccc(N...,"N,N,4-trimethylaniline + 3-diazooxolan-2-one =","[[N70E, Error in parent AA != AA in seq], [A74...",MTIKEMPQPKTFGELKNLPLLNTDKPVQALMKIADELGEIFKFEAP...,MTIKEMPQPKTFGELKNLPLLNTDKPVQALMKIADELGEIFKFEAP...,
127,4866,P411-C10,?,express at 24 °C,CN(C)C1=CC=C(OC)C=C1.O=C2OCCC2=[N+]=[N-]>>CN(C...,ACAATTAAAGAAATGCCTCAGCCAAAAACGTTTGGAGAGCTTAAAA...,MTIKEMPQPKTFGELKNLPLLNTDKPVQALMKIADELGEIFKFEAP...,N70E_A74G_V78L_A82L_F87A_M118S_P142S_F162L_T17...,ATGACAATTAAAGAAATGCCTCAGCCAAAAACGTTTGGAGAGCTTA...,?,...,Enzymatic Lactone-Carbene C−H Insertion to Bui...,doi.org/10.1021/acscatal.0c01349,Kai Chen,Lactone C-H insertion 2020,COc1ccc(N(C)C)cc1.[N-]=[N+]=C1CCOC1=O>>COc1ccc...,"4-methoxy-N,N-dimethylaniline + 3-diazooxolan-...","[[N70E, Error in parent AA != AA in seq], [A74...",MTIKEMPQPKTFGELKNLPLLNTDKPVQALMKIADELGEIFKFEAP...,MTIKEMPQPKTFGELKNLPLLNTDKPVQALMKIADELGEIFKFEAP...,
137,4866,P411-C10,?,express at 24 °C,ClC1=CC=C(N(C)C)C=C1.O=C2OCCC2=[N+]=[N-]>>ClC3...,ACAATTAAAGAAATGCCTCAGCCAAAAACGTTTGGAGAGCTTAAAA...,MTIKEMPQPKTFGELKNLPLLNTDKPVQALMKIADELGEIFKFEAP...,N70E_A74G_V78L_A82L_F87A_M118S_P142S_F162L_T17...,ATGACAATTAAAGAAATGCCTCAGCCAAAAACGTTTGGAGAGCTTA...,?,...,Enzymatic Lactone-Carbene C−H Insertion to Bui...,doi.org/10.1021/acscatal.0c01349,Kai Chen,Lactone C-H insertion 2020,CN(C)c1ccc(Cl)cc1.[N-]=[N+]=C1CCOC1=O>>CN(CC1C...,"4-chloro-N,N-dimethylaniline + 3-diazooxolan-2...","[[N70E, Error in parent AA != AA in seq], [A74...",MTIKEMPQPKTFGELKNLPLLNTDKPVQALMKIADELGEIFKFEAP...,MTIKEMPQPKTFGELKNLPLLNTDKPVQALMKIADELGEIFKFEAP...,
147,4866,P411-C10,?,express at 24 °C,CN(C)C1=CC(C)=CC=C1.O=C2OCCC2=[N+]=[N-]>>CN(CC...,ACAATTAAAGAAATGCCTCAGCCAAAAACGTTTGGAGAGCTTAAAA...,MTIKEMPQPKTFGELKNLPLLNTDKPVQALMKIADELGEIFKFEAP...,N70E_A74G_V78L_A82L_F87A_M118S_P142S_F162L_T17...,ATGACAATTAAAGAAATGCCTCAGCCAAAAACGTTTGGAGAGCTTA...,?,...,Enzymatic Lactone-Carbene C−H Insertion to Bui...,doi.org/10.1021/acscatal.0c01349,Kai Chen,Lactone C-H insertion 2020,Cc1cccc(N(C)C)c1.[N-]=[N+]=C1CCOC1=O>>Cc1cccc(...,"N,N,3-trimethylaniline + 3-diazooxolan-2-one =","[[N70E, Error in parent AA != AA in seq], [A74...",MTIKEMPQPKTFGELKNLPLLNTDKPVQALMKIADELGEIFKFEAP...,MTIKEMPQPKTFGELKNLPLLNTDKPVQALMKIADELGEIFKFEAP...,
157,4866,P411-C10,?,express at 24 °C,CN(C)C1=CC=CC=C1.O=C2OCCC2=[N+]=[N-]>>CN(CC3CC...,ACAATTAAAGAAATGCCTCAGCCAAAAACGTTTGGAGAGCTTAAAA...,MTIKEMPQPKTFGELKNLPLLNTDKPVQALMKIADELGEIFKFEAP...,N70E_A74G_V78L_A82L_F87A_M118S_P142S_F162L_T17...,ATGACAATTAAAGAAATGCCTCAGCCAAAAACGTTTGGAGAGCTTA...,?,...,Enzymatic Lactone-Carbene C−H Insertion to Bui...,doi.org/10.1021/acscatal.0c01349,Kai Chen,Lactone C-H insertion 2020,CN(C)c1ccccc1.[N-]=[N+]=C1CCOC1=O>>CN(CC1CCOC1...,"N,N-dimethylaniline + 3-diazooxolan-2-one =","[[N70E, Error in parent AA != AA in seq], [A74...",MTIKEMPQPKTFGELKNLPLLNTDKPVQALMKIADELGEIFKFEAP...,MTIKEMPQPKTFGELKNLPLLNTDKPVQALMKIADELGEIFKFEAP...,
167,4866,P411-C10,?,express at 24 °C,CN(C)C1=C(C)C=CC=C1.O=C2OCCC2=[N+]=[N-]>>CN(CC...,ACAATTAAAGAAATGCCTCAGCCAAAAACGTTTGGAGAGCTTAAAA...,MTIKEMPQPKTFGELKNLPLLNTDKPVQALMKIADELGEIFKFEAP...,N70E_A74G_V78L_A82L_F87A_M118S_P142S_F162L_T17...,ATGACAATTAAAGAAATGCCTCAGCCAAAAACGTTTGGAGAGCTTA...,?,...,Enzymatic Lactone-Carbene C−H Insertion to Bui...,doi.org/10.1021/acscatal.0c01349,Kai Chen,Lactone C-H insertion 2020,Cc1ccccc1N(C)C.[N-]=[N+]=C1CCOC1=O>>Cc1ccccc1N...,"N,N,2-trimethylaniline + 3-diazooxolan-2-one =","[[N70E, Error in parent AA != AA in seq], [A74...",MTIKEMPQPKTFGELKNLPLLNTDKPVQALMKIADELGEIFKFEAP...,MTIKEMPQPKTFGELKNLPLLNTDKPVQALMKIADELGEIFKFEAP...,
177,4866,P411-C10,?,express at 24 °C,CN(CC)C1=CC=CC=C1.O=C2OCCC2=[N+]=[N-]>>O=C3OCC...,ACAATTAAAGAAATGCCTCAGCCAAAAACGTTTGGAGAGCTTAAAA...,MTIKEMPQPKTFGELKNLPLLNTDKPVQALMKIADELGEIFKFEAP...,N70E_A74G_V78L_A82L_F87A_M118S_P142S_F162L_T17...,ATGACAATTAAAGAAATGCCTCAGCCAAAAACGTTTGGAGAGCTTA...,?,...,Enzymatic Lactone-Carbene C−H Insertion to Bui...,doi.org/10.1021/acscatal.0c01349,Kai Chen,Lactone C-H insertion 2020,CCN(C)c1ccccc1.[N-]=[N+]=C1CCOC1=O>>CCN(CC1CCO...,N-ethyl-N-methylaniline + 3-diazooxolan-2-one =,"[[N70E, Error in parent AA != AA in seq], [A74...",MTIKEMPQPKTFGELKNLPLLNTDKPVQALMKIADELGEIFKFEAP...,MTIKEMPQPKTFGELKNLPLLNTDKPVQALMKIADELGEIFKFEAP...,
187,4866,P411-C10,?,express at 24 °C,CN(CC)C1=CC=CC=C1.O=C2OCCC2=[N+]=[N-]>>CN(C(C)...,ACAATTAAAGAAATGCCTCAGCCAAAAACGTTTGGAGAGCTTAAAA...,MTIKEMPQPKTFGELKNLPLLNTDKPVQALMKIADELGEIFKFEAP...,N70E_A74G_V78L_A82L_F87A_M118S_P142S_F162L_T17...,ATGACAATTAAAGAAATGCCTCAGCCAAAAACGTTTGGAGAGCTTA...,?,...,Enzymatic Lactone-Carbene C−H Insertion to Bui...,doi.org/10.1021/acscatal.0c01349,Kai Chen,Lactone C-H insertion 2020,CCN(C)c1ccccc1.[N-]=[N+]=C1CCOC1=O>>CC(C1CCOC1...,N-ethyl-N-methylaniline + 3-diazooxolan-2-one =,"[[N70E, Error in parent AA != AA in seq], [A74...",MTIKEMPQPKTFGELKNLPLLNTDKPVQALMKIADELGEIFKFEAP...,MTIKEMPQPKTFGELKNLPLLNTDKPVQALMKIADELGEIFKFEAP...,
197,4866,P411-C10,?,express at 24 °C,CCN(CC)C1=CC=CC=C1.O=C2OCCC2=[N+]=[N-]>>O=C3OC...,ACAATTAAAGAAATGCCTCAGCCAAAAACGTTTGGAGAGCTTAAAA...,MTIKEMPQPKTFGELKNLPLLNTDKPVQALMKIADELGEIFKFEAP...,N70E_A74G_V78L_A82L_F87A_M118S_P142S_F162L_T17...,ATGACAATTAAAGAAATGCCTCAGCCAAAAACGTTTGGAGAGCTTA...,?,...,Enzymatic Lactone-Carbene C−H Insertion to Bui...,doi.org/10.1021/acscatal.0c01349,Kai Chen,Lactone C-H insertion 2020,CCN(CC)c1ccccc1.[N-]=[N+]=C1CCOC1=O>>CCN(c1ccc...,"N,N-diethylaniline + 3-diazooxolan-2-one =","[[N70E, Error in parent AA != AA in seq], [A74...",MTIKEMPQPKTFGELKNLPLLNTDKPVQALMKIADELGEIFKFEAP...,MTIKEMPQPKTFGELKNLPLLNTDKPVQALMKIADELGEIFKFEAP...,
207,4866,P411-C10,?,express at 24 °C,CCN(CC)C1=CC=CC=C1.O=C2OCCC2=[N+]=[N-]>>O=C3OC...,ACAATTAAAGAAATGCCTCAGCCAAAAACGTTTGGAGAGCTTAAAA...,MTIKEMPQPKTFGELKNLPLLNTDKPVQALMKIADELGEIFKFEAP...,N70E_A74G_V78L_A82L_F87A_M118S_P142S_F162L_T17...,ATGACAATTAAAGAAATGCCTCAGCCAAAAACGTTTGGAGAGCTTA...,?,...,Enzymatic Lactone-Carbene C−H Insertion to Bui...,doi.org/10.1021/acscatal.0c01349,Kai Chen,Lactone C-H insertion 2020,CCN(CC)c1ccccc1.[N-]=[N+]=C1CCOC1=O>>CCN(c1ccc...,"N,N-diethylaniline + 3-diazooxolan-2-one =","[[N70E, Error in parent AA != AA in seq], [A74...",MTIKEMPQPKTFGELKNLPLLNTDKPVQALMKIADELGEIFKFEAP...,MTIKEMPQPKTFGELKNLPLLNTDKPVQALMKIADELGEIFKFEAP...,


# Continue with data that has no errors

Ensure that the required libraries are installed:

Install `enzyme-tk`


In [15]:
#! pip install enzymetk

# Create ESM2 or ESM3 embeddings of each protein



In [7]:
# Do ESM embedding of the ones that look good!
import sys
sys.path.append('/disk1/ariane/vscode/enzyme-tk')
from enzymetk.embedprotein_esm_step import EmbedESM
from enzymetk.save_step import Save
import pandas as pd

df['id'] = [f'DEDB{i}' for i in range(0, len(df))]
df = df[df['errors'].isna()]
# Fill in variants with parent if there is no variant
df['variant_aa'] = [p if v is None else v for p, v in df[['parent_aa', 'variant_aa']].values]
variant_df = df[~df['variant_aa'].isna()] 
variant_df = df[~df['variant_aa'].isna()] # Remove any that didn't have variants
variant_df = variant_df.drop_duplicates(subset=['variant_aa']) # Remove any that didn't have variants

variant_df['variant_id'] = [f'VDEDB{i}' for i in range(0, len(variant_df))]
variant_df['variant_aa_nohis'] = [s.replace('LEHHHHHH', '').replace('HHHHHH', '').replace('*', '') for s in variant_df['variant_aa'].values]
# Save variant DF
variant_df.to_csv('output/protein-evolution-database_V4_proteins_reactions_clean_unique_variants.csv', index=False)

# Hmmm there are too many duplicates (need to check if this is from 
id_col = 'variant_id'
seq_col = 'variant_aa_nohis'

#embedding_df = (variant_df << (EmbedESM(id_col, seq_col, extraction_method='mean', tmp_dir='/disk1/ariane/vscode/DirectedEvolutionDB/analysis/tmpv2/') >> Save('output/protein-evolution-database_V4_embedded_proteins.pkl')))

In [8]:
len(variant_df)

361

# Save embedded file 

Save both only the parents and also the 

In [9]:
import pandas as pd
df.to_csv('output/protein-evolution-database_V6_proteins_reactions_clean.csv', index=False)
parents = df.drop_duplicates(subset=['parent_aa', 'reaction_smiles'])
parents['substrate_smiles'] = [x.split('>')[0] for x in parents['reaction_smiles'].values]
parents = parents[parents['aminoacid_mutations_from_parent'] == '?']
parents = parents.drop_duplicates(subset=['parent_aa', 'substrate_smiles'])
parents.to_csv('output/protein-evolution-database_DF6_proteins_reactions_clean_parents.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  parents['substrate_smiles'] = [x.split('>')[0] for x in parents['reaction_smiles'].values]


In [10]:
df.to_csv('output/protein-evolution-database_V6_proteins_reactions_clean.csv', index=False)


In [58]:
final_parents = parents.drop_duplicates(subset=['parent_aa', 'paper title'], keep='first')
final_parents.to_csv('parents_helen_thesis.csv')

In [59]:
df.to_csv('output/protein-evolution-database_V5_proteins_reactions_clean.csv', index=False)
first_parent = df.drop_duplicates(subset='parent_aa', keep='first')
# Then also do the last variant with the same chemistry for those ones
chemistries = set(first_parent['reaction_smiles'].values)
canonical_reaction = df[df['reaction_smiles'].isin(chemistries)]
# Get the "most mutated variant"
mutations = [len(aa.split('_')) for aa in canonical_reaction['aminoacid_mutations_from_parent'].values]
canonical_reaction['num_mutations'] = mutations
canonical_reaction = canonical_reaction.sort_values(by='num_mutations', ascending=False)
final_variants = canonical_reaction.drop_duplicates(subset=['parent_aa', 'paper title'], keep='first')
#final_variants

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  canonical_reaction['num_mutations'] = mutations


In [60]:
final_variants.to_csv('variants_helen_thesis.csv')
