In [1]:
import pandas as pd
import numpy as np
import swifter

from common.preprocessing.dataframe import *
import json
from common.bio.constants import ID_TO_AMINO_ACID

Config

In [2]:
LEVEL = "Level_4"
SEQUENCE_LENGTH = 750
MIN_SEQUENCE_LENGTH = 80
ROOT = "../../data/protein/classification/"
file_path = ROOT + "data_sources/uniprot-reviewed%3Ayes.tab.gz"
save_path = ROOT + "full_" + str(SEQUENCE_LENGTH)
os.makedirs(save_path, exist_ok = True)

# Loading data

In [3]:
data = pd.read_csv(file_path, sep='\t', header=(0), skipinitialspace=True).drop("Entry", axis=1)
data = data.rename(columns={"Sequence" :"sequence"})
data.sequence = data.sequence.str.strip()

In [4]:
data.head()

Unnamed: 0,EC number,sequence
0,3.1.3.48,MARAGGNCGVWRSLVLLGLYGCSVVRAAGTSVTVDRHAPASSYEFS...
1,3.1.3.48,MKPAARETRTPPRSPGLRWALLPLLLLLRQGQVLCAGAAPNPIFDI...
2,3.1.3.48,MRTLGTCLVTLAGLLLTAAGETFSGGCLFDEPYSTCGYSQADEDDF...
3,,MRRPRRPGGPAGCGGSEGSGGLRLLVCLLLLSGRPGGCSAISAHGC...
4,3.1.3.48,MGHLPTGIHGARRLLPLLWLFVLFKNATAFHVTVQDDNNIVVSLEA...


# Data quality checks

In [5]:
data[data["sequence"].str.contains(" ")]

Unnamed: 0,EC number,sequence


In [6]:
len(data)

557713

Limit to length sequences (for efficiency) & Also remove sequences without labels

In [7]:
processed_data = data[(data["sequence"].str.len() <= SEQUENCE_LENGTH) & (data["sequence"].str.len() >= MIN_SEQUENCE_LENGTH) &
                      (data["EC number"].notnull()) & (data["sequence"].str.startswith("M"))]

In [8]:
def remove_classes(row, separator, to_remove):
    split_row = row.split(separator)
    new_column = separator.join([ec for ec in split_row if not to_remove in ec])
    return new_column
processed_data["EC number"] = processed_data["EC number"].apply(remove_classes, args=(";", "-"))
processed_data = processed_data[processed_data["EC number"] != ""]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [9]:
len(processed_data)

206644

Splitting enzymes into multiple rows that belong to multiple classes 

In [10]:
#processed_data = split_dataframe_list_to_rows(processed_data, "EC number", ";")
processed_data["EC number"] = processed_data["EC number"].str.strip()

Filter out B,O,U,X,Z amino acids

In [11]:
for amino_acid in ['B','O','U','X','Z' ]:
    processed_data = processed_data[~processed_data["sequence"].str.contains(amino_acid)]

In [12]:
len(processed_data)

206202

In [13]:
processed_data.head()

Unnamed: 0,EC number,sequence
5,3.1.3.48,MRRAVCFPALCLLLNLHAAGCFSGNNDHFLAINQKKSGKPVFIYKH...
12,6.3.4.4,MSGTRASNDRPPGAGGVKRGRLQQEAAATGSRVTVVLGAQWGDEGK...
13,6.3.4.4,MSGTRASNDRPPGTGGVKRGRLQQEAAATGSRVTVVLGAQWGDEGK...
14,6.3.4.4,MAFAETYPAASSLPNGDCGRPRARPGGNRVTVVLGAQWGDEGKGKV...
15,6.3.4.4,MSISESSPAATSLPNGDCGRPRARSGGNRVTVVLGAQWGDEGKGKV...


In [14]:
def countAminoAcid(data, amino_acid):
    return len([ 1 for index, val in data.sequence.iteritems() if amino_acid in val ])


In [15]:
countAminoAcid(processed_data, 'Z')

0

## Generating labels and features

Parsing labels

In [16]:
def generateLabel(item, index):
    return ".".join(item.split(".")[0:index])  

In [17]:
processed_data_level = processed_data[["sequence", "EC number"]]
levels = []
for i in range(1,5):
    levels.append("Level_"+str(i))
    processed_data["Level_"+str(i)] = [ generateLabel(item,i) for item in processed_data["EC number"] ]
processed_data = processed_data.drop("EC number", axis=1)
processed_data.head()

Unnamed: 0,sequence,Level_1,Level_2,Level_3,Level_4
5,MRRAVCFPALCLLLNLHAAGCFSGNNDHFLAINQKKSGKPVFIYKH...,3,3.1,3.1.3,3.1.3.48
12,MSGTRASNDRPPGAGGVKRGRLQQEAAATGSRVTVVLGAQWGDEGK...,6,6.3,6.3.4,6.3.4.4
13,MSGTRASNDRPPGTGGVKRGRLQQEAAATGSRVTVVLGAQWGDEGK...,6,6.3,6.3.4,6.3.4.4
14,MAFAETYPAASSLPNGDCGRPRARPGGNRVTVVLGAQWGDEGKGKV...,6,6.3,6.3.4,6.3.4.4
15,MSISESSPAATSLPNGDCGRPRARSGGNRVTVVLGAQWGDEGKGKV...,6,6.3,6.3.4,6.3.4.4


## Level label processing

Converting unique label to int

In [18]:
def class_to_number(df, column):
    classes = df[column].sort_values().unique()
    print ("There are {} unique classes for level {}".format(len(classes), LEVEL))
    indexToClass = {i:classes[i] for i in range (len(classes))}
    classToIndex = {classes[i]:i for i in range (len(classes))}
    return indexToClass, classToIndex

Filterring out labels with little number of examples

In [19]:
import copy
levels_copy = copy.deepcopy(levels)
levels_copy.remove(LEVEL)

In [20]:
MIN_NUM_EXAMPLES = 25
processed_data_level = processed_data.groupby([LEVEL]).filter(lambda x: x[LEVEL].count() > MIN_NUM_EXAMPLES)

In [21]:
#processed_data_level = processed_data_level.drop(levels_copy, axis=1)
indexToClass, classToIndex = class_to_number(processed_data_level, LEVEL)

There are 879 unique classes for level Level_4


In [22]:
full_path = os.path.join(save_path, "classToIndex.json")
with open(full_path, 'w') as outfile:
    json.dump(classToIndex, outfile) 

In [23]:
processed_data_level["Label"] = [ classToIndex[val] for index, val in processed_data_level[LEVEL].iteritems()]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [24]:
path = os.path.join(save_path, "data.csv")
processed_data_level.to_csv(path, sep='\t', index=None)

In [25]:
processed_data_level = processed_data_level.drop(LEVEL, axis=1)

In [26]:
processed_data_level.head()

Unnamed: 0,sequence,Level_1,Level_2,Level_3,Label
5,MRRAVCFPALCLLLNLHAAGCFSGNNDHFLAINQKKSGKPVFIYKH...,3,3.1,3.1.3,507
12,MSGTRASNDRPPGAGGVKRGRLQQEAAATGSRVTVVLGAQWGDEGK...,6,6.3,6.3.4,865
13,MSGTRASNDRPPGTGGVKRGRLQQEAAATGSRVTVVLGAQWGDEGK...,6,6.3,6.3.4,865
14,MAFAETYPAASSLPNGDCGRPRARPGGNRVTVVLGAQWGDEGKGKV...,6,6.3,6.3.4,865
15,MSISESSPAATSLPNGDCGRPRARSGGNRVTVVLGAQWGDEGKGKV...,6,6.3,6.3.4,865


Train/Test/Val

#### Train/test split

In [27]:
def train_val_test_split(data, val=0.2, test=0.1):
    train_split = int((1-val-test)*len(data))
    val_split = int((1-test)*len(data))
    train, val, test = np.split(data.sample(frac=1), [train_split, val_split])
    print("Full: {} | Train: {} | Validation: {} | Test: {}".format(len(data), len(train), len(val), len(test)))
    print("Train: {:.2f}% | Validation: {:.2f}% | Test: {:.2f}%".format(len(train)/len(data)*100, 
                                                                        len(val)/len(data)*100, 
                                                                        len(test)/len(data)*100))
    return train, val, test

In [28]:
data = processed_data_level.groupby("Label")
train, val, test = None, None, None
for group_id in range(len(data.groups)):
    train_g, val_g, test_g  = train_val_test_split(data.get_group(group_id))
    if train is None:
        train, val, test = train_g, val_g, test_g 
    else:
        train = train.append(train_g)
        val = val.append(val_g)
        test = test.append(test_g)
len(train), len(val), len(test)

Full: 187 | Train: 130 | Validation: 38 | Test: 19
Train: 69.52% | Validation: 20.32% | Test: 10.16%
Full: 45 | Train: 31 | Validation: 9 | Test: 5
Train: 68.89% | Validation: 20.00% | Test: 11.11%
Full: 187 | Train: 130 | Validation: 38 | Test: 19
Train: 69.52% | Validation: 20.32% | Test: 10.16%
Full: 41 | Train: 28 | Validation: 8 | Test: 5
Train: 68.29% | Validation: 19.51% | Test: 12.20%
Full: 31 | Train: 21 | Validation: 6 | Test: 4
Train: 67.74% | Validation: 19.35% | Test: 12.90%
Full: 159 | Train: 111 | Validation: 32 | Test: 16
Train: 69.81% | Validation: 20.13% | Test: 10.06%
Full: 58 | Train: 40 | Validation: 12 | Test: 6
Train: 68.97% | Validation: 20.69% | Test: 10.34%
Full: 38 | Train: 26 | Validation: 8 | Test: 4
Train: 68.42% | Validation: 21.05% | Test: 10.53%
Full: 86 | Train: 60 | Validation: 17 | Test: 9
Train: 69.77% | Validation: 19.77% | Test: 10.47%
Full: 44 | Train: 30 | Validation: 9 | Test: 5
Train: 68.18% | Validation: 20.45% | Test: 11.36%
Full: 220 | Trai

Train: 69.74% | Validation: 20.18% | Test: 10.09%
Full: 37 | Train: 25 | Validation: 8 | Test: 4
Train: 67.57% | Validation: 21.62% | Test: 10.81%
Full: 42 | Train: 29 | Validation: 8 | Test: 5
Train: 69.05% | Validation: 19.05% | Test: 11.90%
Full: 90 | Train: 63 | Validation: 18 | Test: 9
Train: 70.00% | Validation: 20.00% | Test: 10.00%
Full: 392 | Train: 274 | Validation: 78 | Test: 40
Train: 69.90% | Validation: 19.90% | Test: 10.20%
Full: 566 | Train: 396 | Validation: 113 | Test: 57
Train: 69.96% | Validation: 19.96% | Test: 10.07%
Full: 61 | Train: 42 | Validation: 12 | Test: 7
Train: 68.85% | Validation: 19.67% | Test: 11.48%
Full: 581 | Train: 406 | Validation: 116 | Test: 59
Train: 69.88% | Validation: 19.97% | Test: 10.15%
Full: 134 | Train: 93 | Validation: 27 | Test: 14
Train: 69.40% | Validation: 20.15% | Test: 10.45%
Full: 92 | Train: 64 | Validation: 18 | Test: 10
Train: 69.57% | Validation: 19.57% | Test: 10.87%
Full: 125 | Train: 87 | Validation: 25 | Test: 13
Train:

Full: 170 | Train: 119 | Validation: 34 | Test: 17
Train: 70.00% | Validation: 20.00% | Test: 10.00%
Full: 99 | Train: 69 | Validation: 20 | Test: 10
Train: 69.70% | Validation: 20.20% | Test: 10.10%
Full: 161 | Train: 112 | Validation: 32 | Test: 17
Train: 69.57% | Validation: 19.88% | Test: 10.56%
Full: 81 | Train: 56 | Validation: 16 | Test: 9
Train: 69.14% | Validation: 19.75% | Test: 11.11%
Full: 637 | Train: 445 | Validation: 128 | Test: 64
Train: 69.86% | Validation: 20.09% | Test: 10.05%
Full: 182 | Train: 127 | Validation: 36 | Test: 19
Train: 69.78% | Validation: 19.78% | Test: 10.44%
Full: 27 | Train: 18 | Validation: 6 | Test: 3
Train: 66.67% | Validation: 22.22% | Test: 11.11%
Full: 54 | Train: 37 | Validation: 11 | Test: 6
Train: 68.52% | Validation: 20.37% | Test: 11.11%
Full: 42 | Train: 29 | Validation: 8 | Test: 5
Train: 69.05% | Validation: 19.05% | Test: 11.90%
Full: 839 | Train: 587 | Validation: 168 | Test: 84
Train: 69.96% | Validation: 20.02% | Test: 10.01%
Full

Full: 125 | Train: 87 | Validation: 25 | Test: 13
Train: 69.60% | Validation: 20.00% | Test: 10.40%
Full: 54 | Train: 37 | Validation: 11 | Test: 6
Train: 68.52% | Validation: 20.37% | Test: 11.11%
Full: 35 | Train: 24 | Validation: 7 | Test: 4
Train: 68.57% | Validation: 20.00% | Test: 11.43%
Full: 571 | Train: 399 | Validation: 114 | Test: 58
Train: 69.88% | Validation: 19.96% | Test: 10.16%
Full: 461 | Train: 322 | Validation: 92 | Test: 47
Train: 69.85% | Validation: 19.96% | Test: 10.20%
Full: 43 | Train: 30 | Validation: 8 | Test: 5
Train: 69.77% | Validation: 18.60% | Test: 11.63%
Full: 212 | Train: 148 | Validation: 42 | Test: 22
Train: 69.81% | Validation: 19.81% | Test: 10.38%
Full: 36 | Train: 25 | Validation: 7 | Test: 4
Train: 69.44% | Validation: 19.44% | Test: 11.11%
Full: 968 | Train: 677 | Validation: 194 | Test: 97
Train: 69.94% | Validation: 20.04% | Test: 10.02%
Full: 191 | Train: 133 | Validation: 38 | Test: 20
Train: 69.63% | Validation: 19.90% | Test: 10.47%
Full

Full: 273 | Train: 191 | Validation: 54 | Test: 28
Train: 69.96% | Validation: 19.78% | Test: 10.26%
Full: 46 | Train: 32 | Validation: 9 | Test: 5
Train: 69.57% | Validation: 19.57% | Test: 10.87%
Full: 47 | Train: 32 | Validation: 10 | Test: 5
Train: 68.09% | Validation: 21.28% | Test: 10.64%
Full: 101 | Train: 70 | Validation: 20 | Test: 11
Train: 69.31% | Validation: 19.80% | Test: 10.89%
Full: 71 | Train: 49 | Validation: 14 | Test: 8
Train: 69.01% | Validation: 19.72% | Test: 11.27%
Full: 503 | Train: 352 | Validation: 100 | Test: 51
Train: 69.98% | Validation: 19.88% | Test: 10.14%
Full: 259 | Train: 181 | Validation: 52 | Test: 26
Train: 69.88% | Validation: 20.08% | Test: 10.04%
Full: 37 | Train: 25 | Validation: 8 | Test: 4
Train: 67.57% | Validation: 21.62% | Test: 10.81%
Full: 44 | Train: 30 | Validation: 9 | Test: 5
Train: 68.18% | Validation: 20.45% | Test: 11.36%
Full: 47 | Train: 32 | Validation: 10 | Test: 5
Train: 68.09% | Validation: 21.28% | Test: 10.64%
Full: 291 |

Full: 45 | Train: 31 | Validation: 9 | Test: 5
Train: 68.89% | Validation: 20.00% | Test: 11.11%
Full: 54 | Train: 37 | Validation: 11 | Test: 6
Train: 68.52% | Validation: 20.37% | Test: 11.11%
Full: 55 | Train: 38 | Validation: 11 | Test: 6
Train: 69.09% | Validation: 20.00% | Test: 10.91%
Full: 391 | Train: 273 | Validation: 78 | Test: 40
Train: 69.82% | Validation: 19.95% | Test: 10.23%
Full: 79 | Train: 55 | Validation: 16 | Test: 8
Train: 69.62% | Validation: 20.25% | Test: 10.13%
Full: 71 | Train: 49 | Validation: 14 | Test: 8
Train: 69.01% | Validation: 19.72% | Test: 11.27%
Full: 600 | Train: 420 | Validation: 120 | Test: 60
Train: 70.00% | Validation: 20.00% | Test: 10.00%
Full: 310 | Train: 217 | Validation: 62 | Test: 31
Train: 70.00% | Validation: 20.00% | Test: 10.00%
Full: 678 | Train: 474 | Validation: 136 | Test: 68
Train: 69.91% | Validation: 20.06% | Test: 10.03%
Full: 379 | Train: 265 | Validation: 76 | Test: 38
Train: 69.92% | Validation: 20.05% | Test: 10.03%
Full

Full: 32 | Train: 22 | Validation: 6 | Test: 4
Train: 68.75% | Validation: 18.75% | Test: 12.50%
Full: 59 | Train: 41 | Validation: 12 | Test: 6
Train: 69.49% | Validation: 20.34% | Test: 10.17%
Full: 33 | Train: 23 | Validation: 6 | Test: 4
Train: 69.70% | Validation: 18.18% | Test: 12.12%
Full: 487 | Train: 340 | Validation: 98 | Test: 49
Train: 69.82% | Validation: 20.12% | Test: 10.06%
Full: 37 | Train: 25 | Validation: 8 | Test: 4
Train: 67.57% | Validation: 21.62% | Test: 10.81%
Full: 40 | Train: 28 | Validation: 8 | Test: 4
Train: 70.00% | Validation: 20.00% | Test: 10.00%
Full: 64 | Train: 44 | Validation: 13 | Test: 7
Train: 68.75% | Validation: 20.31% | Test: 10.94%
Full: 537 | Train: 375 | Validation: 108 | Test: 54
Train: 69.83% | Validation: 20.11% | Test: 10.06%
Full: 139 | Train: 97 | Validation: 28 | Test: 14
Train: 69.78% | Validation: 20.14% | Test: 10.07%
Full: 96 | Train: 67 | Validation: 19 | Test: 10
Train: 69.79% | Validation: 19.79% | Test: 10.42%
Full: 100 | Tr

Full: 26 | Train: 18 | Validation: 5 | Test: 3
Train: 69.23% | Validation: 19.23% | Test: 11.54%
Full: 421 | Train: 294 | Validation: 84 | Test: 43
Train: 69.83% | Validation: 19.95% | Test: 10.21%
Full: 59 | Train: 41 | Validation: 12 | Test: 6
Train: 69.49% | Validation: 20.34% | Test: 10.17%
Full: 29 | Train: 20 | Validation: 6 | Test: 3
Train: 68.97% | Validation: 20.69% | Test: 10.34%
Full: 535 | Train: 374 | Validation: 107 | Test: 54
Train: 69.91% | Validation: 20.00% | Test: 10.09%
Full: 430 | Train: 301 | Validation: 86 | Test: 43
Train: 70.00% | Validation: 20.00% | Test: 10.00%
Full: 30 | Train: 21 | Validation: 6 | Test: 3
Train: 70.00% | Validation: 20.00% | Test: 10.00%
Full: 66 | Train: 46 | Validation: 13 | Test: 7
Train: 69.70% | Validation: 19.70% | Test: 10.61%
Full: 57 | Train: 39 | Validation: 12 | Test: 6
Train: 68.42% | Validation: 21.05% | Test: 10.53%
Full: 319 | Train: 223 | Validation: 64 | Test: 32
Train: 69.91% | Validation: 20.06% | Test: 10.03%
Full: 65 |

Train: 68.29% | Validation: 19.51% | Test: 12.20%
Full: 158 | Train: 110 | Validation: 32 | Test: 16
Train: 69.62% | Validation: 20.25% | Test: 10.13%
Full: 52 | Train: 36 | Validation: 10 | Test: 6
Train: 69.23% | Validation: 19.23% | Test: 11.54%
Full: 46 | Train: 32 | Validation: 9 | Test: 5
Train: 69.57% | Validation: 19.57% | Test: 10.87%
Full: 26 | Train: 18 | Validation: 5 | Test: 3
Train: 69.23% | Validation: 19.23% | Test: 11.54%
Full: 127 | Train: 88 | Validation: 26 | Test: 13
Train: 69.29% | Validation: 20.47% | Test: 10.24%
Full: 1825 | Train: 1277 | Validation: 365 | Test: 183
Train: 69.97% | Validation: 20.00% | Test: 10.03%
Full: 1031 | Train: 721 | Validation: 206 | Test: 104
Train: 69.93% | Validation: 19.98% | Test: 10.09%
Full: 39 | Train: 27 | Validation: 8 | Test: 4
Train: 69.23% | Validation: 20.51% | Test: 10.26%
Full: 706 | Train: 494 | Validation: 141 | Test: 71
Train: 69.97% | Validation: 19.97% | Test: 10.06%
Full: 83 | Train: 58 | Validation: 16 | Test: 9
T

Train: 69.95% | Validation: 19.70% | Test: 10.34%
Full: 525 | Train: 367 | Validation: 105 | Test: 53
Train: 69.90% | Validation: 20.00% | Test: 10.10%
Full: 713 | Train: 499 | Validation: 142 | Test: 72
Train: 69.99% | Validation: 19.92% | Test: 10.10%
Full: 171 | Train: 119 | Validation: 34 | Test: 18
Train: 69.59% | Validation: 19.88% | Test: 10.53%
Full: 63 | Train: 44 | Validation: 12 | Test: 7
Train: 69.84% | Validation: 19.05% | Test: 11.11%
Full: 110 | Train: 77 | Validation: 22 | Test: 11
Train: 70.00% | Validation: 20.00% | Test: 10.00%
Full: 42 | Train: 29 | Validation: 8 | Test: 5
Train: 69.05% | Validation: 19.05% | Test: 11.90%
Full: 48 | Train: 33 | Validation: 10 | Test: 5
Train: 68.75% | Validation: 20.83% | Test: 10.42%
Full: 200 | Train: 140 | Validation: 40 | Test: 20
Train: 70.00% | Validation: 20.00% | Test: 10.00%
Full: 78 | Train: 54 | Validation: 16 | Test: 8
Train: 69.23% | Validation: 20.51% | Test: 10.26%
Full: 617 | Train: 431 | Validation: 124 | Test: 62
T

Train: 69.93% | Validation: 20.05% | Test: 10.02%
Full: 141 | Train: 98 | Validation: 28 | Test: 15
Train: 69.50% | Validation: 19.86% | Test: 10.64%
Full: 743 | Train: 520 | Validation: 148 | Test: 75
Train: 69.99% | Validation: 19.92% | Test: 10.09%
Full: 170 | Train: 119 | Validation: 34 | Test: 17
Train: 70.00% | Validation: 20.00% | Test: 10.00%
Full: 745 | Train: 521 | Validation: 149 | Test: 75
Train: 69.93% | Validation: 20.00% | Test: 10.07%
Full: 699 | Train: 489 | Validation: 140 | Test: 70
Train: 69.96% | Validation: 20.03% | Test: 10.01%
Full: 286 | Train: 200 | Validation: 57 | Test: 29
Train: 69.93% | Validation: 19.93% | Test: 10.14%
Full: 436 | Train: 305 | Validation: 87 | Test: 44
Train: 69.95% | Validation: 19.95% | Test: 10.09%
Full: 697 | Train: 487 | Validation: 140 | Test: 70
Train: 69.87% | Validation: 20.09% | Test: 10.04%
Full: 486 | Train: 340 | Validation: 97 | Test: 49
Train: 69.96% | Validation: 19.96% | Test: 10.08%
Full: 202 | Train: 141 | Validation: 4

(131241, 37633, 19192)

In [29]:
train.to_csv(os.path.join(save_path, "train.csv"), sep='\t', index=None)
val.to_csv(os.path.join(save_path, "val.csv"), sep='\t', index=None)
test.to_csv(os.path.join(save_path, "test.csv"), sep='\t', index=None)

In [30]:
# train["sequence"] = train.sequence.str.ljust(SEQUENCE_LENGTH, '0')
# val["sequence"] = val.sequence.str.ljust(SEQUENCE_LENGTH, '0')
# test["sequence"] = test.sequence.str.ljust(SEQUENCE_LENGTH, '0')

### Sequence to embeddings ids

In [30]:
def get_indicies(val, kmer2index, kmer_size):
    return [kmer2index.get(val[i:i+kmer_size], 0) for i in range(len(val)-(kmer_size-1))]

def convert_to_index(data, file_name):
    embedding_path = os.path.join(ROOT, "data_sources", file_name)
    embeddings = pd.read_csv(embedding_path, delimiter="\t")
    all_words = embeddings["words"]
    index2kmer = all_words.to_dict()
    kmer_size = len(index2kmer[1])
    kmer2index = inv_map = {v: k for k, v in index2kmer.items()}
    print("The size of vocabulary is {}".format(max(list(index2kmer.keys()))))
    data["sequence"] = data.sequence.swifter.apply(lambda x: get_indicies(x, kmer2index, kmer_size))
    return data, kmer_size

In [31]:
def store_data(embedding_path):
    print("Reading original data")
    train = pd.read_csv(os.path.join(save_path, "train.csv"), sep='\t')
    val = pd.read_csv(os.path.join(save_path, "val.csv"), sep='\t')
    test = pd.read_csv(os.path.join(save_path, "test.csv"), sep='\t')
    print("Lengths. Max: {} Min: {}".format(max([len(x) for x in train.sequence]), min([len(x) for x in train.sequence])))
    train_emb, _ = convert_to_index(train, embedding_path)
    val_emb, _ = convert_to_index(val, embedding_path)
    test_emb, kmer_size = convert_to_index(test, embedding_path)
    print("Embedded sequence lengths. Max: {} Min: {}".format(max([len(x) for x in train_emb.sequence]), 
                                                              min([len(x) for x in train_emb.sequence])))
    
    print("Indicies used. Max: {} Min: {}".format(max([max(x) for x in train_emb.sequence]), 
                                                  min([min(x) for x in train_emb.sequence])))
    
    tf_records_path = os.path.join(save_path, "{}_kmers".format(kmer_size))
    store_fn = save_as_npy #save_as_tfrecords_multithreaded
    store_fn(os.path.join(tf_records_path, "train"), train[["Label", "sequence", "Level_2"]], columns=["sequence"])#, 
                                    #group_by_col="Level_2") 
    store_fn(os.path.join(tf_records_path, "val"), val[["Label", "sequence", "Level_2"]], columns=["sequence"]) #,
                                   #group_by_col="Level_2") 
    store_fn(os.path.join(tf_records_path, "test"), test[["Label", "sequence", "Level_2"]], columns=["sequence"]) #, 
                                    #group_by_col="Level_2") 

Saving down results

In [32]:
store_data("protVec_100d_3grams.csv")

Reading original data
Lengths. Max: 750 Min: 80
The size of vocabulary is 9047


Pandas Apply: 100%|██████████| 131241/131241 [00:23<00:00, 5596.79it/s]


The size of vocabulary is 9047


Pandas Apply: 100%|██████████| 37633/37633 [00:07<00:00, 4985.05it/s]


The size of vocabulary is 9047


Pandas Apply: 100%|██████████| 19192/19192 [00:02<00:00, 6462.97it/s]


Embedded sequence lengths. Max: 748 Min: 78
Indicies used. Max: 8001 Min: 1
Data was stored in ../../data/protein/classification/full_750\3_kmers\train\data.npy
Data was stored in ../../data/protein/classification/full_750\3_kmers\val\data.npy
Data was stored in ../../data/protein/classification/full_750\3_kmers\test\data.npy


In [33]:
store_data("2grams.csv")

Reading original data
Lengths. Max: 750 Min: 80
The size of vocabulary is 400


Pandas Apply: 100%|██████████| 131241/131241 [00:16<00:00, 7799.48it/s]


The size of vocabulary is 400


Pandas Apply: 100%|██████████| 37633/37633 [00:04<00:00, 7912.39it/s]


The size of vocabulary is 400


Pandas Apply: 100%|██████████| 19192/19192 [00:02<00:00, 6532.93it/s]


Embedded sequence lengths. Max: 749 Min: 79
Indicies used. Max: 400 Min: 0
Data was stored in ../../data/protein/classification/full_750\2_kmers\train\data.npy
Data was stored in ../../data/protein/classification/full_750\2_kmers\val\data.npy
Data was stored in ../../data/protein/classification/full_750\2_kmers\test\data.npy


In [34]:
store_data("1grams.csv")

Reading original data
Lengths. Max: 750 Min: 80
The size of vocabulary is 20


Pandas Apply: 100%|██████████| 131241/131241 [00:14<00:00, 9314.67it/s] 


The size of vocabulary is 20


Pandas Apply: 100%|██████████| 37633/37633 [00:03<00:00, 9525.44it/s] 


The size of vocabulary is 20


Pandas Apply: 100%|██████████| 19192/19192 [00:02<00:00, 9209.76it/s] 


Embedded sequence lengths. Max: 750 Min: 80
Indicies used. Max: 20 Min: 1
Data was stored in ../../data/protein/classification/full_750\1_kmers\train\data.npy
Data was stored in ../../data/protein/classification/full_750\1_kmers\val\data.npy
Data was stored in ../../data/protein/classification/full_750\1_kmers\test\data.npy


# Preprocessing done