In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import json
import fastai
import math
from functools import partial

In [2]:
# --- Model / tokenisation constants -----------------------------------------
SEQUENCE_LENGTH = 512   # every sequence is padded to this many tokens
VOCAB_SIZE = 20         # upper bound used when sampling random replacement tokens

# --- Locations of data and weights (relative to this notebook) --------------
ROOT = "../../data/protein/classification/sample_512/"
DATA_PATH = f"{ROOT}1_kmers"
MODEL_PATH = "../../weights/protein/classification/sample_512/1_kmers"
BERT_CONFIG_FILE = "../../../bert/config/bert_config_file.json"
BERT_WEIGHTS = "../../../bert_pytorch/weights/tpu"

In [3]:
# Training hyper-parameters shared by the cells below.
batch_size = 64
num_workers = 8  # data-loading workers; 8 on the cloud machine
epochs = 1

In [4]:
# The JSON file maps each class name to an integer index; the number of output
# classes is therefore the largest index plus one.
with open(ROOT + "classToIndex.json") as class_index_file:
    data = json.load(class_index_file)
NUM_CLASSES = 1 + max(data.values())

In [5]:
class EnzymeDataSet(Dataset):
    """Map-style dataset of labelled, integer-encoded sequences.

    Every row of the array loaded from ``path`` is read as ``row[0]`` (the
    label) and ``row[1]`` (the token sequence). Items are returned as
    ``((tokens, mask), label)``: ``tokens`` is an ``int64`` array and ``mask``
    an array of ones over the real tokens, both zero-padded to ``seq_length``.

    Args:
        path: File handed to ``np.load``.
        seq_length: Length every returned sequence and mask is padded to.
        is_test: When False, about ``MUTATION_RATE`` of the positions of each
            sequence are overwritten with random tokens (augmentation).
        vocab_size: Exclusive upper bound of the random replacement tokens,
            which are drawn from ``[1, vocab_size)``.
        random: When True, the padding is split at random between the start
            and the end of the sequence; otherwise it all goes at the end.
    """

    # Fraction of positions replaced by random tokens when ``is_test`` is False.
    MUTATION_RATE = 0.05

    def __init__(self, path, seq_length, is_test=True, vocab_size=20, random=False):
        # NOTE(review): if the rows are stored as a pickled object array, recent
        # NumPy versions need ``allow_pickle=True`` here -- confirm against the data.
        self.data = np.load(path)
        self.seq_length = seq_length
        self.is_test = is_test
        self.vocab_size = vocab_size
        self.random = random

    def __len__(self):
        """Return the number of rows in the loaded array."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``((tokens, mask), label)`` for row ``idx``.

        Raises:
            ValueError: If the stored sequence is longer than ``seq_length``.
        """
        row = self.data[idx]
        label = row[0]
        # Copy the sequence: np.put below works in place, and without a copy the
        # random mutations would be written back into self.data and pile up
        # every time the same row is fetched.
        seq = np.array(row[1])
        seq_len = len(seq)
        if seq_len > self.seq_length:
            raise ValueError(
                "sequence of length %d at index %r exceeds seq_length=%d"
                % (seq_len, idx, self.seq_length)
            )
        mask = np.ones(seq_len)

        if not self.is_test:
            mutations = int(seq_len * self.MUTATION_RATE)
            if mutations > 0:
                # randint's upper bound is exclusive, so ``seq_len`` (not
                # ``seq_len - 1``) is needed for the last position to be eligible.
                positions = np.random.randint(0, seq_len, mutations)
                # NOTE(review): replacements never equal ``vocab_size`` itself;
                # confirm that tokens are meant to lie in [1, vocab_size).
                replacements = np.random.randint(1, self.vocab_size, mutations)
                np.put(seq, positions, replacements)

        to_pad = self.seq_length - seq_len
        if self.random:
            # Inclusive draw from [0, to_pad] using NumPy's RNG (the bare
            # ``randint`` used previously was never imported).
            end_padding = int(np.random.randint(0, to_pad + 1))
            begin_padding = to_pad - end_padding
        else:
            begin_padding, end_padding = 0, to_pad
        pad_width = (begin_padding, end_padding)
        seq = np.pad(seq, mode="constant", pad_width=pad_width)
        mask = np.pad(mask, mode="constant", pad_width=pad_width)
        return (seq.astype(np.int64), mask), label

In [6]:
# Training split: augmentation enabled (is_test=False).
train_ds = EnzymeDataSet(
    DATA_PATH + "/train/data.npy",
    SEQUENCE_LENGTH,
    is_test=False,
    vocab_size=VOCAB_SIZE,
)
# Validation and test splits use the dataset defaults (no augmentation).
val_ds, test_ds = (
    EnzymeDataSet(DATA_PATH + "/" + split + "/data.npy", SEQUENCE_LENGTH)
    for split in ("val", "test")
)

In [7]:
from modeling import *
class BertEnzymeClassification(nn.Module):
    """BERT encoder followed by dropout and a linear classification head.

    Args:
        config: Configuration object; this class reads ``hidden_dropout_prob``,
            ``hidden_size`` and ``initializer_range`` from it and passes the
            whole object to ``BertModel``.
        num_labels: Output size of the linear classifier.
    """

    def __init__(self, config, num_labels):
        super().__init__()
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, num_labels)

        std = config.initializer_range

        def _reset_parameters(layer):
            # Slightly different from the TF version, which uses truncated_normal
            # for initialization (cf https://github.com/pytorch/pytorch/pull/5617).
            if isinstance(layer, nn.Linear):
                layer.weight.data.normal_(mean=0.0, std=std)
                layer.bias.data.zero_()
            elif isinstance(layer, nn.Embedding):
                layer.weight.data.normal_(mean=0.0, std=std)
            elif isinstance(layer, BERTLayerNorm):
                layer.beta.data.normal_(mean=0.0, std=std)
                layer.gamma.data.normal_(mean=0.0, std=std)

        # nn.Module.apply runs the initialiser on every sub-module and on self.
        self.apply(_reset_parameters)

    def forward(self, input_ids, attention_mask, labels=None):
        """Return the classifier logits for a batch.

        ``labels`` is accepted but not used by this method.
        """
        _, pooled = self.bert(input_ids, attention_mask=attention_mask)
        return self.classifier(self.dropout(pooled))

In [8]:
from modeling import BertConfig
# Build the BERT configuration object from the JSON file at BERT_CONFIG_FILE.
bert_config = BertConfig.from_json_file(BERT_CONFIG_FILE)

In [10]:
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# notebook also runs on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

enzymeClassifier = BertEnzymeClassification(bert_config, NUM_CLASSES)
# Only the encoder receives the weights stored at BERT_WEIGHTS; the classifier
# head keeps its random initialisation.
# NOTE: torch.load unpickles the file -- only load weights from a trusted source.
enzymeClassifier.bert.load_state_dict(torch.load(BERT_WEIGHTS, map_location='cpu'))
enzymeClassifier.to(device)

BertEnzymeClassification(
  (bert): BertModel(
    (embeddings): BERTEmbeddings(
      (word_embeddings): Embedding(22, 512)
      (position_embeddings): Embedding(512, 512)
      (token_type_embeddings): Embedding(16, 512)
      (LayerNorm): BERTLayerNorm()
      (dropout): Dropout(p=0.1)
    )
    (encoder): BERTEncoder(
      (layer): ModuleList(
        (0): BERTLayer(
          (attention): BERTAttention(
            (self): BERTSelfAttention(
              (query): Linear(in_features=512, out_features=512, bias=True)
              (key): Linear(in_features=512, out_features=512, bias=True)
              (value): Linear(in_features=512, out_features=512, bias=True)
              (dropout): Dropout(p=0.1)
            )
            (output): BERTSelfOutput(
              (dense): Linear(in_features=512, out_features=512, bias=True)
              (LayerNorm): BERTLayerNorm()
              (dropout): Dropout(p=0.1)
            )
          )
          (intermediate): BERTIntermediate(


In [11]:
# Display a single word-embedding weight (row 0, column 0); presumably a quick
# sanity check that the weights loaded above are in place.
enzymeClassifier.bert.embeddings.word_embeddings.weight[0][0]

tensor(-0.0131, device='cuda:0', grad_fn=<SelectBackward>)

In [12]:
# Freeze every parameter of the BERT encoder so that only the layers outside
# `enzymeClassifier.bert` remain trainable.
for bert_param in enzymeClassifier.bert.parameters():
    bert_param.requires_grad = False

In [13]:
from optimization import BERTAdam
# Split the parameters into two groups: those that receive weight decay and
# those (biases and layer-norm gamma/beta) that do not.
param_optimizer = list(enzymeClassifier.named_parameters())
no_decay = ['bias', 'gamma', 'beta']


def _is_no_decay(param_name):
    """True when the dotted parameter name contains one of the no-decay tags.

    Names look like 'classifier.bias', so the tags have to be matched as
    substrings; comparing the full name against the list never matches and
    would apply weight decay to every parameter.
    """
    return any(tag in param_name for tag in no_decay)


optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not _is_no_decay(n)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if _is_no_decay(n)],
     'weight_decay_rate': 0.0},
]

# Reuse the already loaded training set instead of reading the file again.
train_examples = len(train_ds)
# Total optimisation steps = batches per epoch * number of epochs
# (dividing by `epochs` would shrink the schedule as epochs grow).
num_train_steps = int(train_examples / batch_size * epochs)

# NOTE(review): the Learner created later in this notebook has `opt_func`
# commented out, so this factory is currently not used for training.
optimizer = partial(BERTAdam, params=optimizer_grouped_parameters,
                    lr=5e-5,
                    warmup=0.1,
                    t_total=num_train_steps)

In [14]:
# Cross-entropy loss; handed to the Learner below as `loss_func`.
criterion = nn.CrossEntropyLoss()

In [15]:
# Total number of scalar values in parameters that still require gradients.
sum(p.numel() for p in enzymeClassifier.parameters() if p.requires_grad)

394497

In [16]:
# Bundle the three datasets into a fastai DataBunch.
data = fastai.basic_data.DataBunch.create(
    train_ds=train_ds,
    valid_ds=val_ds,
    test_ds=test_ds,
    bs=batch_size,
    num_workers=num_workers,
)

# Learner around the classifier. `opt_func` is left commented out, so the
# BERTAdam factory defined earlier is not passed to the Learner.
learner = fastai.basic_train.Learner(
    data,
    enzymeClassifier,
    loss_func=criterion,
    metrics=fastai.accuracy,
    # opt_func=optimizer,
    path=None,
    model_dir='models',
)

In [None]:
# Train with the Learner's fit_one_cycle, passing the `epochs` constant set near
# the top of the notebook. No metrics were captured in the saved output below.
learner.fit_one_cycle(epochs)

epoch,train_loss,valid_loss,accuracy
,,,


In [22]:
# Additional training via learner.fit(2); the saved output lists two epochs.
# NOTE(review): this cell's execution count (22) is higher than that of the
# cells after it (18-21), so the results shown further down were likely produced
# before this run -- restart the kernel and run all cells to confirm.
learner.fit(2)

Total time: 36:00
epoch  train_loss  valid_loss  accuracy
1      4.676698    4.226252    0.224173  (18:00)
2      4.427995    3.824543    0.316728  (18:00)



In [18]:
# Collect predictions and targets with is_test=True (presumably the test split
# passed to the DataBunch as `test_ds` -- confirm against the fastai version in use).
preds, truth =learner.get_preds(is_test=True)

In [19]:
# Accuracy of the predictions collected above against their targets.
fastai.accuracy(preds, truth)

tensor(0.2042)

In [20]:
# Show every (name, parameter) pair of the model.
list(enzymeClassifier.named_parameters())

[('bert.embeddings.word_embeddings.weight', Parameter containing:
  tensor([[ 0.0406, -0.0241, -0.0395,  ...,  0.0361, -0.0143,  0.0142],
          [-0.0097, -0.0146,  0.0551,  ..., -0.0228,  0.0357, -0.0681],
          [-0.0211, -0.0125,  0.0431,  ...,  0.0410, -0.0876,  0.0886],
          ...,
          [ 0.0125,  0.0202,  0.0176,  ...,  0.0742, -0.0242,  0.0116],
          [-0.0877, -0.0584,  0.0307,  ..., -0.0108, -0.0005,  0.0087],
          [ 0.0154,  0.0143,  0.0031,  ..., -0.0186, -0.0120,  0.0114]],
         device='cuda:0')),
 ('bert.embeddings.position_embeddings.weight', Parameter containing:
  tensor([[-0.0445, -0.0227, -0.0819,  ..., -0.0561,  0.0720,  0.0309],
          [-0.0076,  0.0056,  0.0226,  ...,  0.0060,  0.0601, -0.0187],
          [ 0.0234,  0.0312, -0.0096,  ...,  0.0118, -0.0712, -0.0095],
          ...,
          [-0.0116, -0.0197, -0.0123,  ...,  0.0076,  0.0034,  0.0024],
          [-0.0209, -0.0399,  0.0403,  ...,  0.0108, -0.0335,  0.0178],
          [-0

In [21]:
# Names of the parameters that are still trainable.
[name for name, param in enzymeClassifier.named_parameters() if param.requires_grad]

['classifier.weight', 'classifier.bias']