In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import json
import fastai

  return f(*args, **kwds)
  return f(*args, **kwds)


In [3]:
ROOT = "../../data/protein/classification/full_750/"
DATA_PATH = ROOT+"3_kmers"
EMBEDDING_PATH = "../../data/protein/classification/data_sources/protVec_100d_3grams.csv"
MODEL_PATH = "../../weights/protein/classification/full_750/3_kmers"
SEQUENCE_LENGTH=748

In [4]:
with open(ROOT+"classToIndex.json") as f:
    data = json.load(f)
NUM_CLASSES = max(data.values())+1

In [5]:
class EnzymeDataSet(Dataset):
    """Face Landmarks dataset."""

    def __init__(self, path, seq_length, transform=None):
        self.data = np.load(path)
        self.seq_length = seq_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data[idx]
        end_padding = self.seq_length-len(row[0])
        seq = np.pad(row[0], mode="constant", pad_width=(0,end_padding))
        return seq, row[1]

In [6]:
embeddings = np.loadtxt(open(EMBEDDING_PATH, "rb"), delimiter="\t", skiprows=1, usecols=[i for i in range(1,101)])


In [7]:
enzymeDataSet = EnzymeDataSet(ROOT+"3_kmers/test/data.npy", 748)

In [8]:
iterator = torch.utils.data.DataLoader(dataset=enzymeDataSet, batch_size=2, shuffle=True)

In [9]:
class ResnetIdentityBlock(nn.Module):

    def __init__(self, num_inputs, num_outputs, kernel_size, strides, dilation_rate=1, dropout=0.2, downsample = True, 
                 act=F.relu):
        super(ResnetIdentityBlock, self).__init__()
        self.act = act
        self.conv1 = nn.Conv1d(num_inputs, num_outputs, kernel_size, stride=strides, dilation =1)
        self.conv2 = nn.Conv1d(num_outputs, num_outputs, kernel_size, stride=1, dilation =dilation_rate)
        self.conv3 = nn.Conv1d(num_inputs, num_outputs, kernel_size, stride=strides, dilation =1)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.padding = nn.ReflectionPad1d((2,2))
        self.bn1 = nn.BatchNorm1d(num_outputs)
        self.bn2 = nn.BatchNorm1d(num_outputs)
        if downsample:            
            self.downsample = nn.Conv1d(num_inputs, num_outputs, kernel_size, stride=strides, dilation =1)
        
    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.act(out)
        out = self.dropout1(out)
        out = self.padding(out)
        out = self.conv2(out)
        out = self.bn2(out)
        out = self.dropout2(out)
        if self.downsample is not None:
            residual = self.downsample(residual)

        out += residual
        out = self.act(out)
        
        return out

In [10]:
class EnzymeClassifier(nn.Module):
    def __init__(self):
        super(EnzymeClassifier, self).__init__()
        self.embedding = nn.Embedding(embeddings.shape[0], embeddings.shape[1])
        self.embedding.from_pretrained(torch.from_numpy(embeddings)) 
        self.resnet_block1 = ResnetIdentityBlock(embeddings.shape[1], 128, 5, 3)
        self.resnet_block2 = ResnetIdentityBlock(128, 256, 5, 3)   
        self.resnet_block3 = ResnetIdentityBlock(256, 512, 5, 3)  
        self.resnet_block4 = ResnetIdentityBlock(512, 1024, 5, 3)
        self.out = torch.nn.Linear(1024, NUM_CLASSES)
        self.softmax = nn.Softmax(1)

    def forward(self, x):
        x = x.long().cuda()
        embedded_seq = self.embedding(x)
        embedded_seq.transpose_(1, 2)
        h1 = self.resnet_block1(embedded_seq)
        h2 = self.resnet_block2(h1)    
        h3 = self.resnet_block3(h2)   
        h4 = self.resnet_block4(h3)
        flat = torch.sum(h4, dim=2)
        out = self.out(flat)
        return self.softmax(out)

In [11]:
enzymeClassifier = EnzymeClassifier().cuda()

In [12]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(enzymeClassifier.parameters(), lr=0.001)

In [13]:
data = fastai.basic_data.DataBunch.create(enzymeDataSet, enzymeDataSet, test_ds=None, bs=16, num_workers=0)
learner = fastai.basic_train.Learner(data, enzymeClassifier, 
        loss_func=criterion, 
        metrics=None, 
        path=None, 
        model_dir='models')

In [17]:
learner.fit(1)

Total time: 12:12
epoch  train_loss  valid_loss
1      6.944083    6.943697    (12:12)



In [14]:
learner