In [1]:
import json
import os

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split

from common.bio.amino_acid import *
from common.bio.smiles import *
from common.preprocessing.dataframe import *

In [3]:
# Names of the four reaction SMILES columns (two products, two substrates)
# that are filtered, merged and exported further down.
SMILES_COLUMNS=['smiles_product_1','smiles_product_2','smiles_substrate_1','smiles_substrate_2']

In [23]:
# Per-label row-count threshold: the train selection keeps labels with at least
# this many rows, the test selection keeps labels with fewer.
MIN_NUM_EXAMPLES = 200

# Column holding the class label (EC number) after the rename at load time.
LABEL_COLUMN = "EC number"
# Free-form tag that becomes part of the output directory name.
VERSION = "Single"
# Upper bound (inclusive) on sequence length for a row to be selected.
MAX_SEQUENCE_LENGTH=256
# When True, 20% of the selected rows are split off as a validation set.
IS_VALIDATION_REQUIRED = False
# When True, reaction SMILES are merged onto the sequences and output goes to the cgan directory.
ADD_SMILES=True
# Default for the num_threads argument of save_sequences_as_tfrecords.
NUM_THREADS = 16
# Tab-separated source table of sequences and their EC class.
DATA_SOURCE = "../../data/protein/cgan/data_sources/augmented_2018_10_16.csv"

## Loading data

In [24]:
# Read the tab-separated source table, then normalise the two column names
# the rest of the notebook relies on.
column_renames = {"EC class": "EC number", "Sequence": "sequence"}
original_data = pd.read_csv(DATA_SOURCE, sep='\t', skipinitialspace=True)
original_data = original_data.rename(columns=column_renames)
original_data.shape

(409900, 2)

In [25]:
# Working alias for the loaded frame (same object, not a copy); it is rebound
# to the merged table below when ADD_SMILES is set.
data = original_data

In [26]:
# Preview the first rows of the loaded table.
data.head()

Unnamed: 0,sequence,EC number
0,MQGFKIARHFELPTAPSQFFAGSSLNRLSFLRSNREFLNKAFYDHT...,3.6.1.22
1,MDRIIEKLDHGWWVVSHEQKLWLPKGELPYGEAANFDLVGQRALQI...,3.6.1.22
2,MSARWTTAVLDPQITGGLAVARSPDGFLVDANGALFPRDWLKRQDL...,3.6.1.22
3,MDRIIEKLDHGWWVVSHEQKLWLPKGELPYGEAANFDLVGQRALQI...,3.6.1.22
4,MSARWTTAVLDPQMTGGLAVARSPEGFLVDANGALFPRDWLKRQDL...,3.6.1.22


## Adding SMILES components

In [27]:
def filter_character(selected_data, character, columns=None):
    """Drop every row in which any SMILES column contains ``character``.

    Args:
        selected_data: DataFrame holding the SMILES columns as strings.
        character: Substring to look for. It is matched literally, so regex
            metacharacters such as ".", "(" or "*" are safe to pass.
        columns: Optional iterable of column names to check. Defaults to the
            four reaction SMILES columns.

    Returns:
        A filtered DataFrame; the input frame is not modified.
    """
    if columns is None:
        columns = ('smiles_product_1', 'smiles_product_2',
                   'smiles_substrate_1', 'smiles_substrate_2')
    for column in columns:
        # regex=False: treat the character literally instead of as a pattern.
        # na=False: a missing SMILES cannot contain the character, so keep the row
        # (and avoid a NaN in the mask, which `~` cannot negate).
        has_character = selected_data[column].str.contains(character, regex=False, na=False)
        selected_data = selected_data[~has_character]
    return selected_data

In [28]:
def filter_smiles_characters(selected_data):
    """Drop rows whose SMILES columns contain any character on the exclusion list.

    The exclusion list is "s", "F", "R", "X", "6", "#", "l" and "@"; a row is
    removed as soon as one of its four SMILES columns contains one of them.

    Args:
        selected_data: DataFrame with the four SMILES columns as strings.

    Returns:
        The filtered DataFrame; the input frame is not modified.
    """
    # NOTE(review): an earlier revision replaced 'R' with '*' in all four
    # columns after this loop. Rows containing "R" are already dropped here, so
    # that replace never changed a value and only wrote to a filtered slice
    # (SettingWithCopyWarning). If the intent was to keep R-group rows and
    # rewrite them, remove "R" from this tuple instead - confirm with the author.
    for character in ("s", "F", "R", "X", "6", "#", "l", "@"):
        selected_data = filter_character(selected_data, character)
    return selected_data

In [29]:
if ADD_SMILES:
    # Tab-separated table of reaction SMILES keyed by 'EC class'.
    enzyme_smiles_map = pd.read_csv("../../data/protein/cgan/enzyme_with_smiles.csv", sep='\t', skipinitialspace=True)
    # Missing values become empty strings so the .str-based filters below work on every row.
    enzyme_smiles_map = enzyme_smiles_map.fillna("")
    enzyme_smiles_map = filter_smiles_characters(enzyme_smiles_map)
    # Keep only reactions where each SMILES string is at most 128 characters long.
    for col in SMILES_COLUMNS:
        enzyme_smiles_map = enzyme_smiles_map[enzyme_smiles_map[col].str.len() <= 128]
    # Merge from original_data rather than data: this cell rebinds `data`, so
    # merging from `data` would join the SMILES columns a second time on re-run.
    merged = pd.merge(original_data, enzyme_smiles_map, left_on=LABEL_COLUMN, right_on='EC class', how="inner")
    merged = merged.drop('EC class', axis=1)
    data = merged
else:
    data = original_data

In [30]:
# Preview the first rows after the SMILES merge step.
data.head()

Unnamed: 0,sequence,EC number,smiles_product_1,smiles_product_2,smiles_substrate_1,smiles_substrate_2
0,MQGFKIARHFELPTAPSQFFAGSSLNRLSFLRSNREFLNKAFYDHT...,3.6.1.22,Nc1ncnc2c1ncn2C1OC(COP(=O)(O)O)C(O)C1O,NC(=O)C1=CN(C2OC(COP(=O)(O)O)C(O)C2O)C=CC1,NC(=O)C1=CN(C2OC(COP(=O)(O)OP(=O)(O)OCC3OC(n4c...,O
1,MDRIIEKLDHGWWVVSHEQKLWLPKGELPYGEAANFDLVGQRALQI...,3.6.1.22,Nc1ncnc2c1ncn2C1OC(COP(=O)(O)O)C(O)C1O,NC(=O)C1=CN(C2OC(COP(=O)(O)O)C(O)C2O)C=CC1,NC(=O)C1=CN(C2OC(COP(=O)(O)OP(=O)(O)OCC3OC(n4c...,O
2,MSARWTTAVLDPQITGGLAVARSPDGFLVDANGALFPRDWLKRQDL...,3.6.1.22,Nc1ncnc2c1ncn2C1OC(COP(=O)(O)O)C(O)C1O,NC(=O)C1=CN(C2OC(COP(=O)(O)O)C(O)C2O)C=CC1,NC(=O)C1=CN(C2OC(COP(=O)(O)OP(=O)(O)OCC3OC(n4c...,O
3,MDRIIEKLDHGWWVVSHEQKLWLPKGELPYGEAANFDLVGQRALQI...,3.6.1.22,Nc1ncnc2c1ncn2C1OC(COP(=O)(O)O)C(O)C1O,NC(=O)C1=CN(C2OC(COP(=O)(O)O)C(O)C2O)C=CC1,NC(=O)C1=CN(C2OC(COP(=O)(O)OP(=O)(O)OCC3OC(n4c...,O
4,MSARWTTAVLDPQMTGGLAVARSPEGFLVDANGALFPRDWLKRQDL...,3.6.1.22,Nc1ncnc2c1ncn2C1OC(COP(=O)(O)O)C(O)C1O,NC(=O)C1=CN(C2OC(COP(=O)(O)O)C(O)C2O)C=CC1,NC(=O)C1=CN(C2OC(COP(=O)(O)OP(=O)(O)OCC3OC(n4c...,O


## Selecting data

In [31]:
# Switch between building the train set and the test set; the prefix is used
# in the names of the exported files.
is_test = False
prefix = "test" if is_test else "train"

In [32]:
# Show a bounded preview instead of handing the entire frame to the renderer.
data.head(10)

Unnamed: 0,sequence,EC number,smiles_product_1,smiles_product_2,smiles_substrate_1,smiles_substrate_2
0,MQGFKIARHFELPTAPSQFFAGSSLNRLSFLRSNREFLNKAFYDHT...,3.6.1.22,Nc1ncnc2c1ncn2C1OC(COP(=O)(O)O)C(O)C1O,NC(=O)C1=CN(C2OC(COP(=O)(O)O)C(O)C2O)C=CC1,NC(=O)C1=CN(C2OC(COP(=O)(O)OP(=O)(O)OCC3OC(n4c...,O
1,MDRIIEKLDHGWWVVSHEQKLWLPKGELPYGEAANFDLVGQRALQI...,3.6.1.22,Nc1ncnc2c1ncn2C1OC(COP(=O)(O)O)C(O)C1O,NC(=O)C1=CN(C2OC(COP(=O)(O)O)C(O)C2O)C=CC1,NC(=O)C1=CN(C2OC(COP(=O)(O)OP(=O)(O)OCC3OC(n4c...,O
2,MSARWTTAVLDPQITGGLAVARSPDGFLVDANGALFPRDWLKRQDL...,3.6.1.22,Nc1ncnc2c1ncn2C1OC(COP(=O)(O)O)C(O)C1O,NC(=O)C1=CN(C2OC(COP(=O)(O)O)C(O)C2O)C=CC1,NC(=O)C1=CN(C2OC(COP(=O)(O)OP(=O)(O)OCC3OC(n4c...,O
3,MDRIIEKLDHGWWVVSHEQKLWLPKGELPYGEAANFDLVGQRALQI...,3.6.1.22,Nc1ncnc2c1ncn2C1OC(COP(=O)(O)O)C(O)C1O,NC(=O)C1=CN(C2OC(COP(=O)(O)O)C(O)C2O)C=CC1,NC(=O)C1=CN(C2OC(COP(=O)(O)OP(=O)(O)OCC3OC(n4c...,O
4,MSARWTTAVLDPQMTGGLAVARSPEGFLVDANGALFPRDWLKRQDL...,3.6.1.22,Nc1ncnc2c1ncn2C1OC(COP(=O)(O)O)C(O)C1O,NC(=O)C1=CN(C2OC(COP(=O)(O)O)C(O)C2O)C=CC1,NC(=O)C1=CN(C2OC(COP(=O)(O)OP(=O)(O)OCC3OC(n4c...,O
5,MDRIIEKLDHGWWVVSHEQKLWLPKGELPYGEAANFDLVGQRALQI...,3.6.1.22,Nc1ncnc2c1ncn2C1OC(COP(=O)(O)O)C(O)C1O,NC(=O)C1=CN(C2OC(COP(=O)(O)O)C(O)C2O)C=CC1,NC(=O)C1=CN(C2OC(COP(=O)(O)OP(=O)(O)OCC3OC(n4c...,O
6,MSRPERWTTAVLDVEAPGGLAVVQGDQGFLLDANGVMFPRGWLAGQ...,3.6.1.22,Nc1ncnc2c1ncn2C1OC(COP(=O)(O)O)C(O)C1O,NC(=O)C1=CN(C2OC(COP(=O)(O)O)C(O)C2O)C=CC1,NC(=O)C1=CN(C2OC(COP(=O)(O)OP(=O)(O)OCC3OC(n4c...,O
7,MDRIIEKLDHGWWVVSHEQKLWLPKGELPYGEAANFDLVGQRALQI...,3.6.1.22,Nc1ncnc2c1ncn2C1OC(COP(=O)(O)O)C(O)C1O,NC(=O)C1=CN(C2OC(COP(=O)(O)O)C(O)C2O)C=CC1,NC(=O)C1=CN(C2OC(COP(=O)(O)OP(=O)(O)OCC3OC(n4c...,O
8,MDRIIEKLESGWWIVSHEQKLWLPYGELPHGLAANFDLVGQRALRI...,3.6.1.22,Nc1ncnc2c1ncn2C1OC(COP(=O)(O)O)C(O)C1O,NC(=O)C1=CN(C2OC(COP(=O)(O)O)C(O)C2O)C=CC1,NC(=O)C1=CN(C2OC(COP(=O)(O)OP(=O)(O)OCC3OC(n4c...,O
9,MDRIIEKLESGWWIVSHEQKLWLPYGELPHGLAANFDLVGQRALRI...,3.6.1.22,Nc1ncnc2c1ncn2C1OC(COP(=O)(O)O)C(O)C1O,NC(=O)C1=CN(C2OC(COP(=O)(O)O)C(O)C2O)C=CC1,NC(=O)C1=CN(C2OC(COP(=O)(O)OP(=O)(O)OCC3OC(n4c...,O


In [33]:
# Start from an empty module-level dict; get_properties() fills it in later.
properties = {}

# Keep sequences whose length lies in [64, MAX_SEQUENCE_LENGTH].
sequence_lengths = data.sequence.str.len()
not_too_long = sequence_lengths <= MAX_SEQUENCE_LENGTH
not_too_short = sequence_lengths >= 64
selected_data = data[not_too_long & not_too_short]

if ADD_SMILES:
    selected_data = filter_smiles_characters(selected_data)
    # Retain only labels that start with "3.", "4." or "6.".
    labels = selected_data[LABEL_COLUMN]
    in_kept_class = labels.str.startswith("3.") | labels.str.startswith("4.") | labels.str.startswith("6.")
    selected_data = selected_data[in_kept_class]

rows_by_label = selected_data.groupby(LABEL_COLUMN, group_keys=False)
if is_test:
    # Test set: labels with fewer than MIN_NUM_EXAMPLES rows (and at least one).
    selected_data = rows_by_label.filter(lambda group: len(group) < MIN_NUM_EXAMPLES)
    selected_data = selected_data.groupby(LABEL_COLUMN, group_keys=False).filter(lambda group: len(group) >= 1)
else:
    # Train set: labels with at least MIN_NUM_EXAMPLES rows.
    selected_data = rows_by_label.filter(lambda group: len(group) >= MIN_NUM_EXAMPLES)

In [37]:
# Restrict the selection to a single EC number. The value is named rather than
# inlined, and the column is addressed through LABEL_COLUMN like the other cells.
SELECTED_EC_NUMBER = "3.6.4.12"
# .copy() detaches the result from `data`, so later column writes go to an
# independent frame instead of a filtered slice.
selected_data = selected_data[selected_data[LABEL_COLUMN] == SELECTED_EC_NUMBER].copy()

In [38]:
# Number of rows left after restricting to the single EC number.
len(selected_data)

23319

In [39]:
# Per-label count of non-null values in every other column.
selected_data.groupby(LABEL_COLUMN).count()

Unnamed: 0_level_0,sequence,smiles_product_1,smiles_product_2,smiles_substrate_1,smiles_substrate_2
EC number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
3.6.4.12,23319,23319,23319,23319,23319


## Extra preprocessing steps

### Adding label

Map each unique EC number to an integer class id and store it in a new `Label` column.

In [40]:
# get_category_to_id_mapping comes from the project helpers; it returns both
# lookup directions for the values found in LABEL_COLUMN.
id_to_category, category_to_id = get_category_to_id_mapping(selected_data, LABEL_COLUMN)
# assign() builds a new frame rather than writing a column onto a filtered
# slice (SettingWithCopyWarning). Iterating the Series directly replaces
# Series.iteritems(), which no longer exists in pandas 2.0+.
selected_data = selected_data.assign(
    Label=[category_to_id[val] for val in selected_data[LABEL_COLUMN]]
)
selected_data.head()

There are 1 unique categories


Unnamed: 0,sequence,EC number,smiles_product_1,smiles_product_2,smiles_substrate_1,smiles_substrate_2,Label
98785,MFEYVTGYVEYVGPEYVVIDHNGIGYQIFTPNPYVFQRSKQEIRVY...,3.6.4.12,Nc1ncnc2c1ncn2C1OC(COP(=O)(O)OP(=O)(O)O)C(O)C1O,O=P([O-])([O-])O,Nc1ncnc2c1ncn2C1OC(COP(=O)(O)OP(=O)(O)OP(=O)(O...,O,0
98786,MIGKLTGTLLEKNPPEVLVDCHGVGYEVQVPMSTFYNLPAVGERVQ...,3.6.4.12,Nc1ncnc2c1ncn2C1OC(COP(=O)(O)OP(=O)(O)O)C(O)C1O,O=P([O-])([O-])O,Nc1ncnc2c1ncn2C1OC(COP(=O)(O)OP(=O)(O)OP(=O)(O...,O,0
98787,MFAFLRGELVTVSREEAVVEVSGIGYLLHISSGTSRRLPPEGSQVR...,3.6.4.12,Nc1ncnc2c1ncn2C1OC(COP(=O)(O)OP(=O)(O)O)C(O)C1O,O=P([O-])([O-])O,Nc1ncnc2c1ncn2C1OC(COP(=O)(O)OP(=O)(O)OP(=O)(O...,O,0
98788,MIGKLTGTLLEKNPPEVLVDCHGVGYEVQVSMSTFYNLPAVGERVS...,3.6.4.12,Nc1ncnc2c1ncn2C1OC(COP(=O)(O)OP(=O)(O)O)C(O)C1O,O=P([O-])([O-])O,Nc1ncnc2c1ncn2C1OC(COP(=O)(O)OP(=O)(O)OP(=O)(O...,O,0
98789,MIGRLRGVVIEKQPPEVLLEVGGVGYEVQMPMSCFYDLPEIGKEAT...,3.6.4.12,Nc1ncnc2c1ncn2C1OC(COP(=O)(O)OP(=O)(O)O)C(O)C1O,O=P([O-])([O-])O,Nc1ncnc2c1ncn2C1OC(COP(=O)(O)OP(=O)(O)OP(=O)(O...,O,0


### Train/test split

In [41]:
# Fixed seed so the train/validation split is identical on every run.
RANDOM_SEED = 42
if IS_VALIDATION_REQUIRED:
    train_data, val_data = train_test_split(selected_data, test_size=0.2, random_state=RANDOM_SEED)
else:
    train_data = selected_data
    # Pass the Index itself: wrapping it in a list would create MultiIndex columns.
    val_data = pd.DataFrame(columns=selected_data.columns)

In [42]:
# Shapes of the two splits; val_data has no rows when IS_VALIDATION_REQUIRED is False.
train_data.shape, val_data.shape

((23319, 7), (0, 7))

### CSV

In [44]:
# Output directory: the cgan tree when SMILES are included, the wgan tree otherwise.
PATH = (
    "../../data/protein/cgan/Length_{}_Examples_{}_{}/".format(MAX_SEQUENCE_LENGTH, MIN_NUM_EXAMPLES, VERSION)
    if ADD_SMILES
    else "../../data/protein/wgan/Length_{}_{}/".format(MAX_SEQUENCE_LENGTH, VERSION)
)

In [45]:
# Create the output directory; exist_ok=True makes this a no-op when it already exists.
os.makedirs(PATH, exist_ok=True)

In [46]:
def save_as_csv(is_test, data, val_prefix=""):
    """Write the sequence table and the de-duplicated reaction table as TSV files.

    Two files are written into PATH:
    ``<train|test><val_prefix>_sequences.csv`` and
    ``<train|test><val_prefix>_reactions.csv``.

    Args:
        is_test: Selects the "test" file prefix when truthy, "train" otherwise.
        data: DataFrame containing "Label", LABEL_COLUMN, "sequence" and the
            SMILES_COLUMNS.
        val_prefix: Extra text inserted between the prefix and the file suffix.
    """
    # Derive the prefix from the argument; previously the argument was ignored
    # and the notebook-level `prefix` variable was read instead.
    file_prefix = "test" if is_test else "train"

    # Sequence csv
    seq_data = data[["Label", LABEL_COLUMN, "sequence"]]
    seq_path = os.path.join(PATH, "{}{}_sequences.csv".format(file_prefix, val_prefix))
    seq_data.to_csv(seq_path, sep='\t', index=False)

    # Reaction csv: one row per distinct (label, reaction), ordered by label
    reaction_data = data[["Label", LABEL_COLUMN, *SMILES_COLUMNS]].drop_duplicates().sort_values(LABEL_COLUMN)
    reaction_path = os.path.join(PATH, "{}{}_reactions.csv".format(file_prefix, val_prefix))
    reaction_data.to_csv(reaction_path, sep='\t', index=False)

In [47]:
# Export the training rows; the validation rows are exported with a "val_" infix when present.
save_as_csv(is_test, train_data)
if IS_VALIDATION_REQUIRED:
    save_as_csv(is_test, val_data, "val_")

### Sequences

In [48]:
def save_sequences_as_tfrecords(original_data, prefix, num_threads = NUM_THREADS):
    """Encode the sequences and write ("Label", "sequence") records under PATH/prefix.

    Args:
        original_data: DataFrame with at least "Label" and "sequence" columns.
            It is left untouched; encoding happens on a copy.
        prefix: Name joined onto PATH to form the output location.
        num_threads: Currently unused.
            NOTE(review): it is not forwarded to save_as_tfrecords_multithreaded,
            whose signature is not visible here - confirm whether it should be.
    """
    # Work on a copy: overwriting "sequence" on the caller's frame would also
    # change train_data / selected_data and make this function non-re-runnable.
    data = original_data.copy()
    # from_amino_acid_to_id is a project helper; presumably it maps each residue
    # letter to an integer id - verify against common.bio.amino_acid.
    data["sequence"] = from_amino_acid_to_id(data, "sequence")
    data = data[["Label", "sequence"]]
    save_as_tfrecords_multithreaded(os.path.join(PATH, prefix), data)

In [49]:
# Write the training sequences under `prefix`; validation sequences go under "val" when present.
save_sequences_as_tfrecords(train_data, prefix)
if IS_VALIDATION_REQUIRED:
    save_sequences_as_tfrecords(val_data, "val")

Data was stored in ../../data/protein/cgan/Length_256_Examples_200_Single/train\0.tfrecords
Completed all threads in 26.041584014892578 seconds


### Reactions

In [50]:
def save_reaction_as_tfrecords(original_data, prefix):
    """Encode the distinct reactions character by character and save them as ``<prefix>.npy``.

    Despite the name, the output is a NumPy ``.npy`` file written into PATH,
    not a TFRecord file.

    Args:
        original_data: DataFrame with "Label" and the SMILES_COLUMNS.
        prefix: File name (without extension) for the saved array.

    Raises:
        KeyError: If a SMILES string contains a character that is not a key of
            SMILES_CHARACTER_TO_ID.
    """
    # One row per distinct (label, reaction), ordered by label id.
    data = original_data[["Label", *SMILES_COLUMNS]].drop_duplicates().sort_values("Label")
    for smiles_col in SMILES_COLUMNS:
        # Plain iteration over the column replaces Series.iteritems(), which no
        # longer exists in pandas 2.0+.
        data[smiles_col] = [
            [SMILES_CHARACTER_TO_ID[char] for char in smiles]
            for smiles in data[smiles_col]
        ]
    path = os.path.join(PATH, "{}.npy".format(prefix))
    # The frame holds Python lists, so it is stored as an object array; readers
    # need np.load(path, allow_pickle=True).
    np.save(path, data)

In [51]:
# Save the encoded training reactions; validation reactions go to "val_reactions" when present.
save_reaction_as_tfrecords(train_data, prefix + "_reactions")
if IS_VALIDATION_REQUIRED:
    save_reaction_as_tfrecords(val_data, "val_reactions")    

### Properties

In [52]:
def get_class_mapping(selected_data):
    """Return a dict that maps each integer ``Label`` to its LABEL_COLUMN value."""
    label_pairs = selected_data[[LABEL_COLUMN, "Label"]]
    label_pairs = label_pairs.sort_values(LABEL_COLUMN).drop_duplicates()
    # Index by the integer id so to_dict() yields {id: label}.
    return label_pairs.set_index("Label")[LABEL_COLUMN].to_dict()

In [53]:
def get_class_counts(selected_data):
    """Return a dict that maps each LABEL_COLUMN value to its number of rows with a non-null ``Label``."""
    rows_per_class = selected_data.groupby(LABEL_COLUMN)["Label"].count()
    return rows_per_class.to_dict()

In [54]:
def get_properties(data, validation_required=False):
    """Summarise a dataset split as a JSON-serialisable dict.

    Args:
        data: DataFrame with a "Label" column (and LABEL_COLUMN when
            ``validation_required`` is False).
        validation_required: When True, the per-class counts and the
            id-to-label mapping are left out.

    Returns:
        A new dict with "seq_length", "min_num_examples", "total_records" and
        "num_of_classes", plus "num_examples" and "class_mapping" unless
        ``validation_required`` is True.
    """
    # Build a fresh dict on every call: writing into the shared module-level
    # `properties` let a second call inherit keys from the first.
    props = {
        "seq_length": MAX_SEQUENCE_LENGTH,
        "min_num_examples": MIN_NUM_EXAMPLES,
        # Count the rows of the frame that was passed in, not of the global selection.
        "total_records": len(data),
        "num_of_classes": len(data.Label.unique()),
    }
    if not validation_required:
        props["num_examples"] = get_class_counts(data)
        props["class_mapping"] = get_class_mapping(data)
    return props

In [55]:
# Summarise the full selection and display the resulting dict.
properties = get_properties(selected_data)
properties

{'seq_length': 256,
 'min_num_examples': 200,
 'total_records': 23319,
 'num_of_classes': 1,
 'num_examples': {'3.6.4.12': 23319},
 'class_mapping': {0: '3.6.4.12'}}

In [56]:
# Persist the dataset summary next to the exported data.
filename = 'properties_test.json' if is_test else 'properties.json'
full_path = os.path.join(PATH, filename)
with open(full_path, 'w') as outfile:
    json.dump(properties, outfile)
print("Properties saved in {}".format(full_path))

if IS_VALIDATION_REQUIRED:
    # Summarise the held-out split. This previously referenced an undefined
    # name `test`; the validation frame in this notebook is `val_data`.
    # NOTE(review): this reuses the name 'properties_test.json', so it
    # overwrites the file written above when is_test is also True - confirm
    # the intended file name.
    val_path = os.path.join(PATH, 'properties_test.json')
    with open(val_path, 'w') as outfile:
        json.dump(get_properties(val_data, True), outfile)
    print("Test Properties saved in {}".format(val_path))

Properties saved in ../../data/protein/cgan/Length_256_Examples_200_Single/properties.json


 #### !!!!!!!!!!!!!!! That is all done !!!!!!!!!!!!!

# End of preprocessing