In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import json
import fastai
import math
from functools import partial

  return f(*args, **kwds)
  return f(*args, **kwds)


In [7]:
# --- Filesystem locations (all relative to the notebook's working directory) ---
ROOT = "../../data/protein/structure/secondary_structure/"
DATA_PATH = ROOT + "sample_1_kmers"
MODEL_PATH = "../../weights/protein/structure/secondary_structure/1_kmers"
BERT_CONFIG_FILE = "../../../bert/config/bert_config_file.json"
BERT_WEIGHTS = "../../../bert_pytorch/weights/tpu"

# --- Problem dimensions ---
SEQUENCE_LENGTH = 512  # passed to EnzymeDataSet as seq_length
VOCAB_SIZE = 20        # passed to EnzymeDataSet as vocab_size
NUM_CLASSES = 9        # passed to BertEnzymeClassification as num_labels

In [8]:
# Training-loop settings shared by the optimizer schedule and the data loaders.
batch_size = 64
epochs = 1
num_workers = 8  # On cloud 8

In [14]:
class EnzymeDataSet(Dataset):
    """Per-position sequence dataset backed by a ``.npy`` file.

    Every row of the loaded array is read as ``row[0]`` (the per-position
    labels) and ``row[1]`` (the encoded sequence). An item is truncated to
    ``seq_length``, optionally corrupted with random token substitutions, and
    zero-padded so that all three returned arrays have exactly ``seq_length``
    entries.

    Args:
        path: File handed to ``np.load``.
        seq_length: Length every returned array is truncated/padded to.
        is_test: When ``False``, ``int(len(seq) * 0.05)`` positions of the
            sequence are overwritten with random token ids (augmentation).
        vocab_size: Exclusive upper bound of the random replacement ids, which
            are drawn from ``[1, vocab_size)``.
        random: When ``True`` the padding is split at random between the start
            and the end of the sequence; otherwise all padding goes at the end.
    """

    # Fraction of positions replaced by a random token when ``is_test`` is False.
    MUTATION_RATE = 0.05

    def __init__(self, path, seq_length, is_test=True, vocab_size=20, random=False):
        self.data = np.load(path)
        self.seq_length = seq_length
        self.is_test = is_test
        self.vocab_size = vocab_size
        self.random = random

    def __len__(self):
        """Return the number of rows in the loaded array."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``((token_ids, mask), labels)`` for row ``idx``.

        ``token_ids`` is int64, ``mask`` holds 1 for real positions and 0 for
        padding, and ``labels`` is padded with 0; each has ``seq_length`` items.
        """
        row = self.data[idx]
        # np.array (not asarray) copies, so the in-place mutation below can
        # never leak back into self.data and accumulate over epochs.
        seq = np.array(row[1])[: self.seq_length]
        label = np.asarray(row[0])[: self.seq_length]
        # Built from the *truncated* sequence so the mask always lines up with it.
        mask = np.ones(len(seq))

        if not self.is_test:
            mutations = int(len(seq) * self.MUTATION_RATE)
            if mutations > 0:
                # randint's upper bound is exclusive: len(seq) covers every position.
                positions = np.random.randint(0, len(seq), mutations)
                replacements = np.random.randint(1, self.vocab_size, mutations)
                np.put(seq, positions, replacements)

        to_pad = self.seq_length - len(seq)
        if self.random:
            # +1 because the upper bound is exclusive; 0..to_pad are all valid splits.
            end_padding = np.random.randint(0, to_pad + 1)
            begin_padding = to_pad - end_padding
        else:
            begin_padding, end_padding = 0, to_pad

        pad_width = (begin_padding, end_padding)
        seq = np.pad(seq, mode="constant", pad_width=pad_width)
        mask = np.pad(mask, mode="constant", pad_width=pad_width)
        label = np.pad(label, mode="constant", pad_width=pad_width)
        return (seq.astype(np.int64), mask), label

In [15]:
# One dataset per split under DATA_PATH. Only the training split is created
# with is_test=False, which switches on the random token substitutions.
train_ds = EnzymeDataSet(
    DATA_PATH + "/train/data.npy",
    SEQUENCE_LENGTH,
    is_test=False,
    vocab_size=VOCAB_SIZE,
)
val_ds = EnzymeDataSet(DATA_PATH + "/val/data.npy", SEQUENCE_LENGTH)
test_ds = EnzymeDataSet(DATA_PATH + "/test/data.npy", SEQUENCE_LENGTH)

In [11]:
from modeling import *
class BertEnzymeClassification(nn.Module):
    """BERT encoder with a per-position classification head.

    The head is a kernel-size-1 ``nn.Conv1d`` mapping ``config.hidden_size``
    channels to ``num_labels`` channels, i.e. the same linear projection is
    applied independently at every sequence position.

    Args:
        config: Configuration object handed to ``BertModel``; must expose
            ``hidden_size`` and ``initializer_range``.
        num_labels: Number of output classes per position.
    """

    def __init__(self, config, num_labels):
        super(BertEnzymeClassification, self).__init__()
        self.bert = BertModel(config)
        self.conv = nn.Conv1d(config.hidden_size, num_labels, 1)

        def init_weights(module):
            if isinstance(module, (nn.Conv1d)):
                # Slightly different from the TF version which uses truncated_normal for initialization
                # cf https://github.com/pytorch/pytorch/pull/5617
                module.weight.data.normal_(mean=0.0, std=config.initializer_range)
            elif isinstance(module, BERTLayerNorm):
                module.beta.data.normal_(mean=0.0, std=config.initializer_range)
                module.gamma.data.normal_(mean=0.0, std=config.initializer_range)
        # apply() walks every sub-module, including those inside self.bert.
        self.apply(init_weights)

    def forward(self, input_ids, attention_mask, labels=None):
        """Return per-position logits of shape ``(batch, num_labels, seq_len)``.

        ``labels`` is accepted for interface compatibility but is not used;
        the loss is computed by the caller.
        """
        all_encoder_layers, pooled_output = self.bert(input_ids, attention_mask=attention_mask)
        # The encoder may hand back one tensor per layer; the head only
        # consumes the final layer. A bare tensor is passed through unchanged.
        if isinstance(all_encoder_layers, (list, tuple)):
            sequence_output = all_encoder_layers[-1]
        else:
            sequence_output = all_encoder_layers
        # nn.Conv1d expects (batch, channels, length).
        # Assumes the encoder output is (batch, seq_len, hidden_size) — TODO confirm
        # against modeling.BertModel.
        logits = self.conv(sequence_output.transpose(1, 2))
        return logits

In [12]:
from modeling import BertConfig
# Build the BertConfig from the JSON file at BERT_CONFIG_FILE; it is passed to
# BertEnzymeClassification when the model is constructed.
bert_config = BertConfig.from_json_file(BERT_CONFIG_FILE)

In [13]:
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# cell does not crash on a CPU-only machine.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

enzymeClassifier = BertEnzymeClassification(bert_config, NUM_CLASSES)
# Only the encoder weights are restored; the classification head keeps its
# fresh initialisation. map_location='cpu' lets the checkpoint load even when
# it was saved from another device.
# NOTE: torch.load unpickles the file — only load checkpoints from a trusted source.
enzymeClassifier.bert.load_state_dict(torch.load(BERT_WEIGHTS, map_location='cpu'))
enzymeClassifier.to(device)

FileNotFoundError: [Errno 2] No such file or directory: '../../../bert_pytorch/weights/tpu'

In [None]:
# Display one scalar (row 0, column 0) of the word-embedding weight matrix —
# presumably a quick check that the pretrained weights were loaded; confirm.
enzymeClassifier.bert.embeddings.word_embeddings.weight[0][0]

In [None]:
# Freeze the whole BERT encoder: none of its parameters will receive gradients,
# leaving only the parameters outside `enzymeClassifier.bert` trainable.
for encoder_param in enzymeClassifier.bert.parameters():
    encoder_param.requires_grad = False

In [None]:
from optimization import BERTAdam
param_optimizer = list(enzymeClassifier.named_parameters())
# Name fragments that mark parameters excluded from weight decay.
no_decay = ['bias', 'gamma', 'beta']


def _skips_weight_decay(param_name):
    """Return True when the dotted parameter name contains a ``no_decay`` fragment.

    Parameter names are fully qualified (e.g. ``conv.bias``), so the check has
    to be a substring test; comparing the whole name against ``no_decay`` never
    matches and would put every parameter in the decayed group.
    """
    return any(fragment in param_name for fragment in no_decay)


optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not _skips_weight_decay(n)], 'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if _skips_weight_decay(n)], 'weight_decay_rate': 0.0}
    ]
# train_ds was built from the same file, so reuse it instead of loading the
# array a second time just to count rows.
train_examples = len(train_ds)
# Total optimizer steps = batches per epoch * number of epochs.
num_train_steps = int(train_examples / batch_size * epochs)
# A partial, not an optimizer instance: BERTAdam is only constructed when
# `optimizer` is eventually called.
optimizer = partial(BERTAdam, params = optimizer_grouped_parameters,
                     lr=5e-5,
                     warmup=0.1,
                     t_total=num_train_steps)

In [None]:
# Per-class loss weights: one entry per class (9 in total). Class 0 — the value
# the dataset pads labels with — is weighted at 0.0001, every other class at 1.
# The tensor is created on the GPU when one is available and on the CPU
# otherwise, so this cell no longer fails on a machine without CUDA.
criterion = nn.CrossEntropyLoss(
    weight=torch.tensor(
        [0.0001, 1, 1, 1, 1, 1, 1, 1, 1],
        device="cuda" if torch.cuda.is_available() else "cpu",
    )
)

In [None]:
# Total number of scalar values across the parameters that still have
# requires_grad=True (i.e. the ones that will be trained).
sum(p.numel() for p in enzymeClassifier.parameters() if p.requires_grad)

In [None]:
# Wrap the three datasets in a fastai DataBunch using the batch size and worker
# count from the settings cell.
data = fastai.basic_data.DataBunch.create(
    train_ds=train_ds,
    valid_ds=val_ds,
    test_ds=test_ds,
    bs=batch_size,
    num_workers=num_workers,
)

# opt_func is not passed (it is left commented out below), so the Learner uses
# its default optimizer rather than the BERTAdam partial bound to `optimizer`.
# NOTE(review): the model emits one prediction per sequence position — confirm
# that fastai.accuracy reduces over the class axis for that output shape.
learner = fastai.basic_train.Learner(
    data,
    enzymeClassifier,
    loss_func=criterion,
    metrics=fastai.accuracy,
    # opt_func=optimizer,
    path=None,
    model_dir='models',
)

In [None]:
# Train with fastai's one-cycle schedule for `epochs` cycles (set in the settings cell).
learner.fit_one_cycle(epochs)

In [None]:
# Continue training the same learner with a plain fit call; the literal 2 is
# independent of the `epochs` setting used in the one-cycle cell.
learner.fit(2)

In [None]:
# Collect the model outputs and targets for the test split.
# NOTE(review): whether get_preds accepts `is_test` depends on the installed
# fastai version — confirm against the pinned release.
preds, truth =learner.get_preds(is_test=True)

In [None]:
# Score the collected test predictions against their targets with the same
# accuracy function used as the training metric.
fastai.accuracy(preds, truth)

In [None]:
# Materialise every (name, parameter) pair of the model for inspection.
list(enzymeClassifier.named_parameters())

In [None]:
# Names of the parameters that are still trainable (requires_grad is True).
[name for name, param in enzymeClassifier.named_parameters() if param.requires_grad]