In [1]:
import pandas as pd
import numpy as np
import swifter

from common.preprocessing.dataframe import *
import json
from common.bio.constants import ID_TO_AMINO_ACID

Config

In [9]:
LEVEL = "Level_4"
SEQUENCE_LENGTH = 512
MIN_SEQUENCE_LENGTH = 80
ROOT = "../../data/protein/embedding/"
file_path = ROOT + "data_sources/unreviewed.gz"
save_path = ROOT + "unreviewed_" + str(SEQUENCE_LENGTH)
os.makedirs(save_path, exist_ok = True)
import glob

# Loading data

In [10]:
data = pd.read_csv(file_path, sep='\t', header=(0), skipinitialspace=True).drop("Entry", axis=1)
data = data.rename(columns={"Sequence" :"sequence"})
data.sequence = data.sequence.str.strip()

In [11]:
data.head()

Unnamed: 0,sequence
0,MITSLEMRILEQNSVAYGVSTRLLMENAGASIAKEIVNRVRPPAEV...
1,MDALPIFLKLRNXPALVVGGGSVAERKIDLLIRAGAKVTVVAPEIG...
2,MTAILERRESTSLWGRFCNWITSTENRLYIGWFGVLMIPTLLTATS...
3,MTAILERRESESLWGRFCNWVTSTENRLYIGWFGVLMIPTLLTATS...
4,MEYRKIQEALEALQKGRLVLVIDDKDRENEGDLICSAQAATTENVN...


# Data quality checks

In [12]:
data[data["sequence"].str.contains(" ")]

Unnamed: 0,sequence


In [13]:
len(data)

15610

Limit to length sequences (for efficiency) & Also remove sequences without labels

In [17]:
processed_data = data[(data["sequence"].str.len() <= SEQUENCE_LENGTH) & (data["sequence"].str.len() >= MIN_SEQUENCE_LENGTH) &
                      (data["EC number"].notnull()) & (data["sequence"].str.startswith("M"))]

In [18]:
def remove_classes(row, separator, to_remove):
    split_row = row.split(separator)
    new_column = separator.join([ec for ec in split_row if not to_remove in ec])
    return new_column
processed_data["EC number"] = processed_data["EC number"].apply(remove_classes, args=(";", "-"))
processed_data = processed_data[processed_data["EC number"] != ""]

KeyError: 'EC number'

In [19]:
len(processed_data)

15610

Splitting enzymes into multiple rows that belong to multiple classes 

In [10]:
#processed_data = split_dataframe_list_to_rows(processed_data, "EC number", ";")
processed_data["EC number"] = processed_data["EC number"].str.strip()

Filter out B,O,U,X,Z amino acids

In [20]:
for amino_acid in ['B','O','U','X','Z' ]:
    processed_data = processed_data[~processed_data["sequence"].str.contains(amino_acid)]

In [21]:
len(processed_data)

15451

In [22]:
processed_data.head()

Unnamed: 0,sequence
0,MITSLEMRILEQNSVAYGVSTRLLMENAGASIAKEIVNRVRPPAEV...
2,MTAILERRESTSLWGRFCNWITSTENRLYIGWFGVLMIPTLLTATS...
3,MTAILERRESESLWGRFCNWVTSTENRLYIGWFGVLMIPTLLTATS...
4,MEYRKIQEALEALQKGRLVLVIDDKDRENEGDLICSAQAATTENVN...
5,MRQVNRISPNINKIGKMILKELLKAIQPVQIAGDPNIEITGINIDS...


In [23]:
def countAminoAcid(data, amino_acid):
    return len([ 1 for index, val in data.sequence.iteritems() if amino_acid in val ])


In [24]:
countAminoAcid(processed_data, 'Z')

0

## Generating labels and features

Parsing labels

In [16]:
def generateLabel(item, index):
    return ".".join(item.split(".")[0:index])  

In [17]:
processed_data_level = processed_data[["sequence", "EC number"]]
levels = []
for i in range(1,5):
    levels.append("Level_"+str(i))
    processed_data["Level_"+str(i)] = [ generateLabel(item,i) for item in processed_data["EC number"] ]
processed_data = processed_data.drop("EC number", axis=1)
processed_data.head()

Unnamed: 0,sequence,Level_1,Level_2,Level_3,Level_4
12,MSGTRASNDRPPGAGGVKRGRLQQEAAATGSRVTVVLGAQWGDEGK...,6,6.3,6.3.4,6.3.4.4
13,MSGTRASNDRPPGTGGVKRGRLQQEAAATGSRVTVVLGAQWGDEGK...,6,6.3,6.3.4,6.3.4.4
14,MAFAETYPAASSLPNGDCGRPRARPGGNRVTVVLGAQWGDEGKGKV...,6,6.3,6.3.4,6.3.4.4
15,MSISESSPAATSLPNGDCGRPRARSGGNRVTVVLGAQWGDEGKGKV...,6,6.3,6.3.4,6.3.4.4
18,MGSKTEMMERDAMATVAPYAPVTYHRRARVDLDDRLPKPYMPRALQ...,1,1.11,1.11.2,1.11.2.3


## Level label processing

Converting unique label to int

In [18]:
def class_to_number(df, column):
    classes = df[column].sort_values().unique()
    print ("There are {} unique classes for level {}".format(len(classes), LEVEL))
    indexToClass = {i:classes[i] for i in range (len(classes))}
    classToIndex = {classes[i]:i for i in range (len(classes))}
    return indexToClass, classToIndex

Filterring out labels with little number of examples

In [19]:
import copy
levels_copy = copy.deepcopy(levels)
levels_copy.remove(LEVEL)

In [20]:
MIN_NUM_EXAMPLES = 25
processed_data_level = processed_data.groupby([LEVEL]).filter(lambda x: x[LEVEL].count() > MIN_NUM_EXAMPLES)

In [21]:
#processed_data_level = processed_data_level.drop(levels_copy, axis=1)
indexToClass, classToIndex = class_to_number(processed_data_level, LEVEL)

There are 769 unique classes for level Level_4


In [22]:
full_path = os.path.join(save_path, "classToIndex.json")
with open(full_path, 'w') as outfile:
    json.dump(classToIndex, outfile) 

In [23]:
processed_data_level["Label"] = [ classToIndex[val] for index, val in processed_data_level[LEVEL].iteritems()]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [24]:
path = os.path.join(save_path, "data.csv")
processed_data_level.to_csv(path, sep='\t', index=None)

In [25]:
processed_data_level = processed_data_level.drop(LEVEL, axis=1)

In [26]:
processed_data_level.head()

Unnamed: 0,sequence,Level_1,Level_2,Level_3,Label
12,MSGTRASNDRPPGAGGVKRGRLQQEAAATGSRVTVVLGAQWGDEGK...,6,6.3,6.3.4,758
13,MSGTRASNDRPPGTGGVKRGRLQQEAAATGSRVTVVLGAQWGDEGK...,6,6.3,6.3.4,758
14,MAFAETYPAASSLPNGDCGRPRARPGGNRVTVVLGAQWGDEGKGKV...,6,6.3,6.3.4,758
15,MSISESSPAATSLPNGDCGRPRARSGGNRVTVVLGAQWGDEGKGKV...,6,6.3,6.3.4,758
34,MQFKNILVVCIGNICRSPMAEYLLKQNYPQLTIHSAGISGMIGYSA...,3,3.1,3.1.3,441


Train/Test/Val

#### Train/test split

In [25]:
def train_val_test_split(data, val=0.2, test=0.1):
    train_split = int((1-val-test)*len(data))
    val_split = int((1-test)*len(data))
    train, val, test = np.split(data.sample(frac=1), [train_split, val_split])
    print("Full: {} | Train: {} | Validation: {} | Test: {}".format(len(data), len(train), len(val), len(test)))
    print("Train: {:.2f}% | Validation: {:.2f}% | Test: {:.2f}%".format(len(train)/len(data)*100, 
                                                                        len(val)/len(data)*100, 
                                                                        len(test)/len(data)*100))
    return train, val, test

In [27]:
data = processed_data_level.groupby("Label")
train, val, test = None, None, None
for group_id in range(len(data.groups)):
    train_g, val_g, test_g  = train_val_test_split(data.get_group(group_id))
    if train is None:
        train, val, test = train_g, val_g, test_g 
    else:
        train = train.append(train_g)
        val = val.append(val_g)
        test = test.append(test_g)
len(train), len(val), len(test)

AttributeError: 'DataFrame' object has no attribute 'groups'

In [29]:
train.to_csv(os.path.join(save_path, "train.csv"), sep='\t', index=None)
val.to_csv(os.path.join(save_path, "val.csv"), sep='\t', index=None)
test.to_csv(os.path.join(save_path, "test.csv"), sep='\t', index=None)

In [30]:
# train["sequence"] = train.sequence.str.ljust(SEQUENCE_LENGTH, '0')
# val["sequence"] = val.sequence.str.ljust(SEQUENCE_LENGTH, '0')
# test["sequence"] = test.sequence.str.ljust(SEQUENCE_LENGTH, '0')

### Sequence to embeddings ids

In [5]:
def get_indicies(val, kmer2index, kmer_size):
    return [kmer2index.get(val[i:i+kmer_size], 0) for i in range(len(val)-(kmer_size-1))]

def convert_to_index(data, file_name):
    embedding_path = os.path.join(ROOT, "data_sources", file_name)
    embeddings = pd.read_csv(embedding_path, delimiter="\t")
    all_words = embeddings["words"]
    index2kmer = all_words.to_dict()
    kmer_size = len(index2kmer[1])
    kmer2index = inv_map = {v: k for k, v in index2kmer.items()}
    print("The size of vocabulary is {}".format(max(list(index2kmer.keys()))))
    data["sequence"] = data.sequence.swifter.apply(lambda x: get_indicies(x, kmer2index, kmer_size))
    return data, kmer_size

In [17]:
def store_data(embedding_path):
    print("Reading original data")
    train = pd.read_csv(os.path.join(save_path, "train.csv"), sep='\t')
    val = pd.read_csv(os.path.join(save_path, "val.csv"), sep='\t')
    test = pd.read_csv(os.path.join(save_path, "test.csv"), sep='\t')
    print("Lengths. Max: {} Min: {}".format(max([len(x) for x in train.sequence]), min([len(x) for x in train.sequence])))
    train_emb, _ = convert_to_index(train, embedding_path)
    val_emb, _ = convert_to_index(val, embedding_path)
    test_emb, kmer_size = convert_to_index(test, embedding_path)
    print("Embedded sequence lengths. Max: {} Min: {}".format(max([len(x) for x in train_emb.sequence]), 
                                                              min([len(x) for x in train_emb.sequence])))
    
    print("Indicies used. Max: {} Min: {}".format(max([max(x) for x in train_emb.sequence]), 
                                                  min([min(x) for x in train_emb.sequence])))

    tf_records_path = os.path.join(save_path, "{}_kmers".format(kmer_size))
    store_fn = save_as_npy #save_as_tfrecords_multithreaded
    store_fn(os.path.join(tf_records_path, "train"), train[["Label", "sequence", "Level_2"]])#, 
                                    #group_by_col="Level_2") 
    store_fn(os.path.join(tf_records_path, "val"), val[["Label", "sequence", "Level_2"]]) #,
                                   #group_by_col="Level_2") 
    store_fn(os.path.join(tf_records_path, "test"), test[["Label", "sequence", "Level_2"]]) #, 
                                    #group_by_col="Level_2") 

Saving down results

In [18]:
store_data("protVec_100d_3grams.csv")

Reading original data
Lengths. Max: 512 Min: 80
The size of vocabulary is 9047


Pandas Apply: 100%|██████████| 113857/113857 [00:13<00:00, 8308.54it/s]


The size of vocabulary is 9047


Pandas Apply: 100%|██████████| 32640/32640 [00:03<00:00, 8552.42it/s]


The size of vocabulary is 9047


Pandas Apply: 100%|██████████| 16652/16652 [00:02<00:00, 7967.83it/s]


Embedded sequence lengths. Max: 510 Min: 78
Indicies used. Max: 8001 Min: 1
Data was stored in ../../data/protein/classification/sample_512\3_kmers\train\data.npy
Data was stored in ../../data/protein/classification/sample_512\3_kmers\val\data.npy
Data was stored in ../../data/protein/classification/sample_512\3_kmers\test\data.npy


In [None]:
store_data("2grams.csv")

Reading original data
Lengths. Max: 512 Min: 80
The size of vocabulary is 400


Pandas Apply: 100%|██████████| 113857/113857 [00:13<00:00, 8298.70it/s]


The size of vocabulary is 400


Pandas Apply: 100%|██████████| 32640/32640 [00:04<00:00, 8045.63it/s]


The size of vocabulary is 400


Pandas Apply: 100%|██████████| 16652/16652 [00:02<00:00, 7496.34it/s]


Embedded sequence lengths. Max: 511 Min: 79


In [None]:
store_data("1grams.csv")

# Preprocessing done

In [None]:
np.load("../../data/protein/classification/sample_512/1_kmers/train/data.npy")[0][0].shape