In [1]:
import pandas as pd
import numpy as np
import os
import swifter
from common.preprocessing.dataframe import *
from common.bio.amino_acid import *

In [2]:
# Name of the 8-state secondary-structure column (not referenced by the cells visible in this notebook).
STRUCTURE_COL = "sst8"
# Paths are assembled with os.path.join so they resolve on any OS, not only Windows.
# The trailing "" keeps a trailing separator on ROOT, so code that concatenates
# onto ROOT with `+` keeps working.
ROOT = os.path.join("..", "..", "data", "protein", "structure", "secondary_structure", "")
#DATA_SOURCES = ROOT + "data_sources\\2018-06-06-pdb-intersect-pisces.csv"
DATA_SOURCES = os.path.join(ROOT, "data_sources", "cullpdb+profile_6133.npy")
# Processed splits are written underneath the same root directory.
SAVE_PATH = ROOT

In [3]:
# Read the raw array stored at DATA_SOURCES into memory.
data = np.load(file=DATA_SOURCES)

In [4]:
# Restore the 3-D layout: 700 rows of 57 features per record.
# -1 lets NumPy infer the record count, so the cell also works for
# files that do not hold exactly 6133 records.
data = data.reshape(-1, 700, 57)
data.shape

(6133, 700, 57)

In [5]:
# Collapse feature columns 22..30 to a single integer per position:
# the index of the largest entry (presumably a one-hot structure label - TODO confirm).
labels = data[:, :, 22:31].argmax(axis=2)

In [6]:
# Collapse feature columns 0..21 to a single integer per position:
# the index of the largest entry (presumably a one-hot residue code - TODO confirm).
sequences = data[:, :, :22].argmax(axis=2)

In [7]:
# Move both code arrays into the 100+ range so the in-place remapping
# done later cannot confuse an already-remapped value with a source code.
sequences, labels = sequences + 100, labels + 100

In [8]:
# Shifted source code (100..121) -> target amino-acid id (0..21).
# Keys are consecutive, so only the target ids are listed, in key order.
acid_mapping = dict(zip(
    range(100, 122),
    (1, 2, 4, 3, 6, 5, 8, 7, 9, 11, 10, 12,
     14, 13, 16, 15, 17, 19, 18, 20, 21, 0),
))



In [9]:
# Rewrite every shifted code in place with its target amino-acid id.
for shifted_code, acid_id in acid_mapping.items():
    sequences[sequences == shifted_code] = acid_id

In [10]:
# Spot check: remapped id of the first position next to its raw argmax index.
sequences[0, 0], data[:, :, :22].argmax(axis=2)[0, 0]

(10, 10)

In [11]:
# Shifted source code (100..108) -> target structure id (0..8).
# Keys are consecutive, so only the target ids are listed, in key order.
structure_mapping = dict(zip(range(100, 109), (2, 1, 3, 4, 6, 5, 7, 8, 0)))

In [12]:
# Legend for display only: structure id -> one-character label.
dict(enumerate("0BCEGHIST"))

{0: '0', 1: 'B', 2: 'C', 3: 'E', 4: 'G', 5: 'H', 6: 'I', 7: 'S', 8: 'T'}

In [13]:
# Rewrite every shifted code in place with its target structure id.
for shifted_code, structure_id in structure_mapping.items():
    labels[labels == shifted_code] = structure_id

In [14]:
# Spot check: remapped id of the first position next to its raw argmax index.
labels[0, 0], data[:, :, 22:31].argmax(axis=2)[0, 0]

(2, 0)

In [15]:
#filtered_data = filter_non_standard_amino_acids(filtered_data, "seq")

In [239]:
# Take the 20 feature columns 35..54 for every position
# (presumably the per-residue profile scores - TODO confirm).
mutations = data[..., 35:55]

In [240]:
# Raise every score to the power 2.5; this produces a new array.
mutations = mutations ** 2.5

In [241]:
# Normalise each position's 20 powered scores so they sum to 1.
# Positions whose scores sum to 0 (presumably padding - TODO confirm) are left
# as all zeros; a plain division would compute 0/0 there, emit a
# RuntimeWarning and store NaN in the saved array.
row_totals = np.sum(mutations, axis=2, keepdims=True)
mutations = np.divide(
    mutations,
    row_totals,
    out=np.zeros_like(mutations),
    where=row_totals != 0,
)

  """Entry point for launching an IPython kernel.


In [242]:
# Turn each position's normalised scores into a running total along the last axis.
mutations = np.cumsum(mutations, axis=2)

In [243]:
# Show the raw columns 35..54 for record 3, position 33.
data[3, 33][35:55]

array([0.23866731, 0.07242649, 0.15709548, 0.41338244, 0.13354172,
       0.0772722 , 0.159762  , 0.87974316, 0.97996432, 0.40371731,
       0.67480522, 0.22793643, 0.13705128, 0.55477923, 0.72111517,
       0.2592251 , 0.28905049, 0.56954622, 0.04269665, 0.26894143])

In [244]:
# Reference running total from the raw (un-powered) scores of record 3,
# position 33. The row is divided by its own sum, computed here rather than
# pasted in as a constant, so the check stays valid if the data changes.
raw_profile = data[3, 33, 35:55]
(raw_profile / raw_profile.sum()).cumsum(axis=0)

array([0.03287103, 0.04284614, 0.06448249, 0.12141658, 0.13980893,
       0.15045143, 0.17245503, 0.29361976, 0.4285877 , 0.48419063,
       0.57712979, 0.60852288, 0.6273986 , 0.7038069 , 0.80312421,
       0.83882661, 0.87863678, 0.9570789 , 0.9629594 , 1.        ])

In [245]:
# Show the processed cumulative row for the same record and position.
mutations[3, 33]

array([0.00821882, 0.00863576, 0.01152469, 0.04397415, 0.04589888,
       0.04638909, 0.04940217, 0.26379839, 0.54456988, 0.57515575,
       0.68563315, 0.69295906, 0.69501276, 0.76271886, 0.89313729,
       0.90324191, 0.91650857, 0.98881048, 0.98892174, 1.        ])

In [237]:
# Length of each sequence = index of its first 0 (the id that code 121 is
# remapped to; presumably padding - TODO confirm).
# argmax on an all-False row returns 0, so rows with no 0 at all must be
# handled explicitly: they span the full row width.
is_padding = sequences == 0
lengths = np.where(
    is_padding.any(axis=1),
    is_padding.argmax(axis=1),
    sequences.shape[1],
)
# Column vector, one length per record; -1 avoids hard-coding the record count.
lengths = lengths.reshape(-1, 1)

In [247]:
# Shapes of the four arrays about to be split and saved.
tuple(arr.shape for arr in (sequences, labels, lengths, mutations))

((6133, 700), (6133, 700), (6133, 1), (6133, 700, 20))

In [137]:
# Draw one id from 1..20, weighted by the raw columns 35..54 of
# record 3, position 153 (unseeded, so the result varies between runs).
profile_row = data[3, 153, 35:55]
np.random.choice(np.arange(1, 21), p=profile_row / sum(profile_row))

11

## Save to numpy

In [99]:
# Split boundaries used by the slicing cells below:
#   train = [0, train_idx), test = [train_idx + 5, test_idx), val = [test_idx, val_idx)
train_idx, test_idx, val_idx = 5600, 5877, 6133

In [175]:
# For each split, pair the sequence rows with their label rows along a new axis 1,
# giving arrays of shape (records, 2, positions).
train, val, test = (
    np.stack([sequences[rows], labels[rows]], axis=1)
    for rows in (
        slice(0, train_idx),
        slice(test_idx, val_idx),
        slice(train_idx + 5, test_idx),
    )
)

In [103]:
# Confirm the stacked layout of the training split.
np.shape(train)

(5600, 2, 700)

In [75]:
# Create an output directory for every split. The save cells write into
# train/, val/ and test/, so all three must exist on a fresh run.
for split_name in ("train", "val", "test"):
    os.makedirs(os.path.join(SAVE_PATH, "cullpdb", split_name), exist_ok=True)

In [104]:
# Write each split's stacked (sequence, label) array to cullpdb/<split>/data.npy.
for split_name, split_array in (("train", train), ("val", val), ("test", test)):
    np.save(os.path.join(SAVE_PATH, "cullpdb", split_name, "data.npy"), split_array)

In [105]:
# Read the training file back and show the shape of its second record.
train_data_path = os.path.join(SAVE_PATH, "cullpdb", "train", "data.npy")
np.load(train_data_path)[1].shape

(2, 700)

## Lengths

In [176]:
# Slice the length column with the same boundaries as the data splits.
# Note: this reuses the names train/val/test, replacing the stacked data arrays.
train, val, test = (
    lengths[:train_idx],
    lengths[test_idx:val_idx],
    lengths[train_idx + 5:test_idx],
)

In [177]:
# Write each split's length column to cullpdb/<split>/lengths.npy.
for split_name, split_array in (("train", train), ("val", val), ("test", test)):
    np.save(os.path.join(SAVE_PATH, "cullpdb", split_name, "lengths.npy"), split_array)

## Mutations

In [248]:
# Slice the cumulative mutation array with the same boundaries as the data splits.
# Note: this reuses the names train/val/test, replacing the length columns.
train, val, test = (
    mutations[rows]
    for rows in (
        slice(0, train_idx),
        slice(test_idx, val_idx),
        slice(train_idx + 5, test_idx),
    )
)

In [249]:
# Write each split's cumulative mutation array to cullpdb/<split>/mutations_cumsum.npy.
for split_name, split_array in (("train", train), ("val", val), ("test", test)):
    np.save(os.path.join(SAVE_PATH, "cullpdb", split_name, "mutations_cumsum.npy"), split_array)

In [250]:
# Read back the file written by the save cell above ("mutations_cumsum.npy")
# and show the shape of its second record. The previous name, "mutations.npy",
# is not written anywhere in this notebook, so the check read a different file.
np.load(os.path.join(SAVE_PATH, "cullpdb", "train", "mutations_cumsum.npy"))[1].shape

(700, 20)