In [1]:
import pandas as pd
import numpy as np
import os
import swifter
from common.preprocessing.dataframe import *
from common.bio.amino_acid import *

In [20]:
# Column holding the structure label strings; each character is mapped to a
# class index further down in this notebook.
STRUCTURE_COL = "sst8"
# Base data directory, relative to the notebook. It ends with a separator
# because other cells build file paths by plain string concatenation (ROOT + "...").
ROOT = "..\\..\\data\\protein\\structure\\secondary_structure\\"
# Alternative input file, kept for reference:
#DATA_SOURCES = ROOT + "data_sources\\2018-06-06-pdb-intersect-pisces.csv"
# Input CSV for the loading cell. NOTE: this file is written by the last cell of
# this notebook, so the cells at the bottom have to be run once before this path exists.
DATA_SOURCES = ROOT + "data_sources\\sst_with_clusters.csv"
# Directory that receives train.csv / val.csv / test.csv and the encoded splits.
SAVE_PATH = ROOT

In [21]:
data = pd.read_csv(DATA_SOURCES)

In [23]:
# Longest sequence (in characters, after trimming) that is kept.
MAX_SEQ_LEN = 729

# Trim surrounding whitespace *before* measuring the length so that stray
# padding does not count towards the cutoff.
seq_clean = data["seq"].str.strip()

# Build the filtered frame in one expression: `.loc` selects rows and columns at
# once and `.assign` returns a new frame, so no column is ever set on a slice of
# `data` (which triggered SettingWithCopyWarning in the original cell).
# Rows whose `seq` is missing compare as False and are dropped, as before.
filtered_data = (
    data.loc[seq_clean.str.len() <= MAX_SEQ_LEN, ["seq", STRUCTURE_COL, "cluster"]]
    .assign(seq=seq_clean)
)

In [24]:
filtered_data = filter_non_standard_amino_acids(filtered_data, "seq")

In [25]:
filtered_data.shape

(8958, 2)

#### Gathering labels

In [26]:
def class_to_number(df, column):
    """Build index <-> label lookup tables from the characters of a string column.

    Every distinct character found in ``df[column]`` becomes one class. The
    classes are sorted and the literal label "0" is placed in front of them, so
    index 0 is always that extra label. The number of classes (including the
    extra one) is printed.

    Returns:
        (indexToClass, classToIndex): two dicts that are inverse to each other.
    """
    seen = set()
    for value in df[column]:
        # Iterating a string yields its characters one by one.
        seen.update(value)

    classes = ["0"] + sorted(seen)
    print ("There are {} unique classes".format(len(classes)))
    indexToClass = dict(enumerate(classes))
    classToIndex = {label: position for position, label in enumerate(classes)}
    return indexToClass, classToIndex

In [27]:
indexToClass, classToIndex = class_to_number(filtered_data, STRUCTURE_COL)

There are 9 unique classes


In [28]:
indexToClass

{0: '0', 1: 'B', 2: 'C', 3: 'E', 4: 'G', 5: 'H', 6: 'I', 7: 'S', 8: 'T'}

#### Train/test split

In [29]:
def train_val_test_split(data, val=0.2, test=0.1, seed=None):
    """Shuffle ``data`` and cut it into train / validation / test partitions.

    Args:
        data: DataFrame to split.
        val: fraction of the rows that goes to the validation partition.
        test: fraction of the rows that goes to the test partition.
        seed: optional ``random_state`` for the shuffle. Pass an int to make the
            split reproducible; ``None`` (default) keeps the unseeded shuffle.

    Returns:
        (train, val, test): three DataFrames that are consecutive slices of the
        shuffled frame, so together they contain every row exactly once.
    """
    total = len(data)
    train_split = int((1-val-test)*total)
    val_split = int((1-test)*total)

    shuffled = data.sample(frac=1, random_state=seed)
    # Positional slicing gives the same three pieces as np.split(..., [a, b])
    # without going through the deprecated DataFrame.swapaxes path.
    train_part = shuffled.iloc[:train_split]
    val_part = shuffled.iloc[train_split:val_split]
    test_part = shuffled.iloc[val_split:]

    print("Full: {} | Train: {} | Validation: {} | Test: {}".format(
        total, len(train_part), len(val_part), len(test_part)))
    # Guard the percentage report against an empty input frame.
    denominator = total if total else 1
    print("Train: {:.2f}% | Validation: {:.2f}% | Test: {:.2f}%".format(
        len(train_part)/denominator*100,
        len(val_part)/denominator*100,
        len(test_part)/denominator*100))
    return train_part, val_part, test_part

In [11]:
# Split by cluster size so that a cluster never ends up in more than one split:
# clusters with a single member go to test, clusters with 2-10 members go to
# validation and clusters with 11 or more members go to train.
by_cluster = filtered_data.groupby("cluster")
test = by_cluster.filter(lambda members: len(members) < 2)
val = by_cluster.filter(lambda members: 1 < len(members) < 11)
train = by_cluster.filter(lambda members: len(members) >= 11)

In [30]:
#train, val, test = train_val_test_split(filtered_data.groupby(STRUCTURE_COL).filter(lambda x: len(x) > 1), 0, 0)

Full: 14 | Train: 14 | Validation: 0 | Test: 0
Train: 100.00% | Validation: 0.00% | Test: 0.00%


In [31]:
#unique = filtered_data.groupby(STRUCTURE_COL).filter(lambda x: len(x) == 1)

In [32]:
#extra_train, val, test  = train_val_test_split(unique, val=0.09, test=0.01)

Full: 8944 | Train: 8049 | Validation: 805 | Test: 90
Train: 89.99% | Validation: 9.00% | Test: 1.01%


In [33]:
#train = train.append(extra_train)

In [34]:
# Write each split as a tab-separated file without the index column into SAVE_PATH.
for split_frame, csv_name in ((train, "train.csv"), (val, "val.csv"), (test, "test.csv")):
    split_frame.to_csv(os.path.join(SAVE_PATH, csv_name), sep='\t', index=None)

### To indices

In [35]:
def get_indicies(val, kmer2index, kmer_size):
    """Translate a string into the indices of its overlapping k-mers.

    A window of ``kmer_size`` characters is moved over ``val`` one position at a
    time. Each window is looked up in ``kmer2index``; windows that are not in the
    mapping get index 0. Strings shorter than ``kmer_size`` give an empty list.
    """
    indices = []
    window_count = len(val) - (kmer_size - 1)
    for start in range(window_count):
        kmer = val[start:start + kmer_size]
        indices.append(kmer2index.get(kmer, 0))
    return indices

def convert_to_index(data, file_name):
    """Replace the ``seq`` column of ``data`` by a ``sequence`` column of k-mer indices.

    The vocabulary is read from the tab-separated file ``file_name`` inside the
    classification ``data_sources`` directory; its ``words`` column supplies the
    k-mers and the row number of each word is used as its index.

    Args:
        data: DataFrame with a ``seq`` column of strings. It is not modified.
        file_name: name of the vocabulary file, e.g. "1grams.csv".

    Returns:
        (converted, kmer_size): a new DataFrame without ``seq`` and with the
        index lists in ``sequence`` as last column, plus the k-mer length.
    """
    embedding_path = os.path.join("..\\..\\data\\protein\\classification\\data_sources", file_name)
    # NOTE(review): read_csv parses the text "NA" as a missing value by default;
    # if a vocabulary contains the k-mer "NA" it would be lost here - confirm
    # against the multi-gram vocabulary files.
    embeddings = pd.read_csv(embedding_path, delimiter="\t")
    index2kmer = embeddings["words"].to_dict()
    # The k-mer length is taken from the word at index 1 (not index 0);
    # presumably index 0 is a special entry - verify against the vocabulary file.
    kmer_size = len(index2kmer[1])
    kmer2index = {kmer: index for index, kmer in index2kmer.items()}
    # Reports the highest index, which is one less than the number of rows.
    print("The size of vocabulary is {}".format(max(index2kmer)))
    sequence = data.seq.swifter.apply(lambda x: get_indicies(x, kmer2index, kmer_size))
    # Build the result as a new frame so the caller's DataFrame does not gain
    # a "sequence" column as a side effect.
    converted = data.drop("seq", axis=1).assign(sequence=sequence)
    return converted, kmer_size

In [36]:
def store_data(embedding_path):
    """Encode the saved train/val/test CSVs and store them with ``save_as_npy``.

    Reads ``train.csv``, ``val.csv`` and ``test.csv`` from ``SAVE_PATH``, turns
    the sequences into index lists with ``convert_to_index`` and the structure
    strings into class indices with the module-level ``classToIndex`` mapping.
    Summary statistics of the training split are printed and every split is
    written below ``SAVE_PATH/<kmer_size>_kmers/<split name>``.

    Args:
        embedding_path: handed to ``convert_to_index`` as its ``file_name``
            argument (a vocabulary file name such as "1grams.csv").
    """
    split_files = (("train", "train.csv"), ("val", "val.csv"), ("test", "test.csv"))

    print("Reading original data")
    raw_splits = {
        name: pd.read_csv(os.path.join(SAVE_PATH, csv_name), sep='\t')
        for name, csv_name in split_files
    }
    raw_lengths = [len(x) for x in raw_splits["train"].seq]
    print("Lengths. Max: {} Min: {}".format(max(raw_lengths), min(raw_lengths)))

    # Encode the splits in the order train, val, test; the k-mer size reported
    # by the last conversion is the one used for the output directory.
    encoded = {}
    kmer_size = None
    for name, _ in split_files:
        frame, kmer_size = convert_to_index(raw_splits[name], embedding_path)
        frame[STRUCTURE_COL] = frame[STRUCTURE_COL].apply(lambda labels: [classToIndex[c] for c in labels])
        encoded[name] = frame

    train_emb = encoded["train"]
    encoded_lengths = [len(x) for x in train_emb.sequence]
    print("Embedded sequence lengths. Max: {} Min: {}".format(max(encoded_lengths), min(encoded_lengths)))

    highest_index = max([max(x) for x in train_emb.sequence])
    lowest_index = min([min(x) for x in train_emb.sequence])
    print("Indicies used. Max: {} Min: {}".format(highest_index, lowest_index))

    tf_records_path = os.path.join(SAVE_PATH, "{}_kmers".format(kmer_size))
    store_fn = save_as_npy #save_as_tfrecords_multithreaded
    for name, _ in split_files:
        store_fn(os.path.join(tf_records_path, name),
                 encoded[name][["sequence", STRUCTURE_COL]],
                 columns=["sequence", STRUCTURE_COL])

In [37]:
store_data("1grams.csv")

Reading original data
Lengths. Max: 729 Min: 20
The size of vocabulary is 20


Pandas Apply: 100%|██████████| 8063/8063 [00:00<00:00, 15222.56it/s]


The size of vocabulary is 20


Pandas Apply: 100%|██████████| 805/805 [00:00<00:00, 13424.89it/s]


The size of vocabulary is 20


Pandas Apply: 100%|██████████| 90/90 [00:00<00:00, 7505.61it/s]


Embedded sequence lengths. Max: 729 Min: 20
Indicies used. Max: 20 Min: 0
Data was stored in ..\..\data\protein\structure\secondary_structure\sample_1_kmers\train\data.npy
Data was stored in ..\..\data\protein\structure\secondary_structure\sample_1_kmers\val\data.npy
Data was stored in ..\..\data\protein\structure\secondary_structure\sample_1_kmers\test\data.npy


In [5]:
# NOTE(review): the array loaded here is not used by any cell below, and `data`
# is rebound to a list when ss.txt is parsed - confirm whether this cell is still needed.
data = np.load(ROOT + "data_sources\\CB513.npy")

## Preprocessing ss.txt

In [116]:
from Bio import SeqIO  # NOTE(review): not used in this cell; candidate for the import cell or removal
# Parse ss.txt into rows of [id, sequence, structure].
#
# The file is read as alternating records: a header ">ID:CHAIN:sequence" followed
# by sequence lines, then a header ">ID:CHAIN:secstr" followed by structure lines.
# Parsing stops at a "> END" sentinel line or at the end of the file, so a file
# without the sentinel no longer makes the loop spin forever.
# Only the part before the first ":" is kept as id, so the chain field is dropped.
data = []
i = 0
with open(ROOT + "data_sources\\ss.txt") as f:
    raw = f.readline()  # raw lines keep their "\n", so "" reliably means end of file

    while raw and raw.replace("\n", "") != "> END":
        number, _, data_type = raw.replace("\n", "").split(":")

        # Collect the sequence lines up to the next header.
        seq_parts = []
        raw = f.readline()
        while raw and ">" not in raw:
            seq_parts.append(raw.replace("\n", ""))
            raw = f.readline()
        if not raw:
            raise ValueError("Unexpected end of file: no secstr record after {}".format(number))

        n, _, d = raw.replace("\n", "").split(":")
        assert number == n
        assert "secstr" == d

        # Collect the structure lines; only the newline is removed because
        # blanks inside the line are meaningful (they are turned into "C" below).
        structure_parts = []
        raw = f.readline()
        while raw and ">" not in raw:
            structure_parts.append(raw.replace("\n", ""))
            raw = f.readline()

        data.append([number.replace(">", ""),
                     "".join(seq_parts),
                     "".join(structure_parts).replace(" ", "C")])
        if not raw or raw.replace("\n", "") == "> END":
            break
        i += 1
        if i % 100000 == 0:
            print("Already processed {} rows".format(i))
    

In [117]:
len(data)

408332

In [118]:
df = pd.DataFrame(data, columns=["id","seq","sst8"])

In [119]:
# ROOT already ends with a path separator, so the relative part must not start
# with one; this also matches the path used when the file is read back.
df.to_csv(ROOT + "data_sources\\ss.csv", index=None)

In [120]:
for_cluster = pd.read_csv(ROOT + "data_sources\\ss.csv")

In [121]:
for_cluster[["id", "seq"]].to_csv(ROOT + "data_sources\\ss_to_cluster.csv", index=None, sep="\t")

## Including cluster

In [6]:
from Bio import SeqIO  # NOTE(review): not used in this cell; candidate for the import cell or removal
# Parse the renamed FASTA file into rows of [new_id, original_id, sequence].
#
# A header line is ">NEW_ID ORIGINAL_ID" (or just ">ID", which is then used for
# both fields); the lines up to the next header form the sequence.
# Parsing stops at a "> END" sentinel line or at the end of the file, so a file
# without the sentinel no longer makes the loop spin forever.
data = []
i = 0
with open(ROOT + "data_sources\\ss_to_cluster_renamed.fasta") as f:
    raw = f.readline()  # raw lines keep their "\n", so "" reliably means end of file

    while raw and raw.replace("\n", "") != "> END":
        header = raw.replace("\n", "").replace(">", "")
        if " " in header:
            # Split on the first blank only, so extra blanks stay in original_id
            # instead of raising "too many values to unpack".
            new_id, original_id = header.split(" ", 1)
        else:
            new_id, original_id = header, header

        # Collect the sequence lines up to the next header.
        seq_parts = []
        raw = f.readline()
        while raw and ">" not in raw:
            seq_parts.append(raw.replace("\n", ""))
            raw = f.readline()

        data.append([new_id, original_id, "".join(seq_parts)])
        if not raw or raw.replace("\n", "") == "> END":
            break
        i += 1
        if i % 100000 == 0:
            print("Already processed {} rows".format(i))

100000
200000
300000
400000


In [7]:
cluster = pd.read_csv(ROOT + "data_sources\\ss_to_cluster_renamedDB_clu.tsv", names=["cluster", "id"], sep="\t")

In [8]:
df = pd.DataFrame(data, columns=["new_id","original_id","seq"])

In [9]:
# Attach the cluster label to each sequence by matching `new_id` against the `id`
# column of the cluster table. The inner join drops sequences that have no row in
# the cluster table; the redundant `id` key column is removed afterwards.
df_with_cluster = df.merge(cluster, left_on='new_id', right_on='id', how='inner').drop("id", axis=1)

In [10]:
df_with_cluster.to_csv(ROOT + "data_sources\\seq_with_clusters.csv", index=None)

In [12]:
df_with_cluster = pd.read_csv(ROOT + "data_sources\\seq_with_clusters.csv")

In [13]:
structures = pd.read_csv(ROOT + "data_sources\\ss.csv")

In [14]:
# `join` aligns the two frames on their row index, i.e. purely by position after
# both were re-read from CSV. That is only valid when both files list the same
# records in the same order, so fail loudly on a row-count mismatch instead of
# silently pairing sequences with the wrong structure strings.
if len(df_with_cluster) != len(structures):
    raise ValueError(
        "Cannot align sequences and structures by position: {} clustered rows vs {} structure rows".format(
            len(df_with_cluster), len(structures)))
final = df_with_cluster.join(structures[STRUCTURE_COL]).drop(["new_id", "original_id"], axis=1)

In [16]:
final.to_csv(ROOT + "data_sources\\sst_with_clusters.csv", index=None)