<h1>LLM - Using DistilBert for Protein Function Classification</h1>
<h2> By Edwin Tembo - 2023</h2>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!nvidia-smi

Tue Jan  9 02:57:28 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla V100-SXM2-16GB           Off | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P0              25W / 300W |      0MiB / 16384MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [3]:
! pip install transformers



In [None]:
# Importing stock ml libraries
import warnings
warnings.simplefilter('ignore')
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import DistilBertTokenizer, DistilBertModel
import logging
import ast
import datetime
import os
import sys

from torch.utils.tensorboard import SummaryWriter

# Root logger: only errors from third-party libraries are surfaced.
logging.basicConfig(level=logging.ERROR)
torch.cuda.empty_cache()

# Notebook logger: verbose, written to stdout so messages land in the cell output.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
# Without this every record is also handed to the root logger's handlers and
# shows up twice (once bare, once as "INFO:__main__:...").
logger.propagate = False
# Re-running this cell must not stack an extra handler on each execution.
if not logger.handlers:
    logger.addHandler(logging.StreamHandler(sys.stdout))

In [None]:
# TensorBoard writer; train() logs the running loss and hyper-parameters through it.
writer = SummaryWriter(log_dir='/content/drive/MyDrive/protein/CAFA_TORCH_RUNS')

In [None]:
# Select the compute device: the GPU when CUDA is available, otherwise the CPU.

from torch import cuda
# `device` is a plain string; later cells pass it to `.to(device)` for the model and for each batch.
device = 'cuda' if cuda.is_available() else 'cpu'

In [None]:
def hamming_score(y_true, y_pred, normalize=True, sample_weight=None):
    """Mean per-sample overlap between true and predicted label sets.

    For every row the indices of the non-zero entries are treated as a set of
    labels; the row score is |true & pred| / |true | pred|, or 1 when both sets
    are empty. The function returns the mean of the row scores.

    Args:
        y_true: 2-D array-like indexable by row, exposing ``.shape[0]``.
        y_pred: Array of predictions with the same layout as ``y_true``.
        normalize: Accepted for signature compatibility; not used.
        sample_weight: Accepted for signature compatibility; not used.

    Returns:
        The mean row score as computed by ``np.mean``.
    """
    def _row_score(true_row, pred_row):
        true_labels = set(np.where(true_row)[0])
        pred_labels = set(np.where(pred_row)[0])
        if not true_labels and not pred_labels:
            # Nothing to predict and nothing predicted counts as a perfect match.
            return 1
        return len(true_labels & pred_labels) / float(len(true_labels | pred_labels))

    row_scores = [_row_score(y_true[row], y_pred[row]) for row in range(y_true.shape[0])]
    return np.mean(row_scores)

In [None]:
# Read the training table, discard the identifier column and expose the sequence
# under the generic "text" column name that the dataset classes read.
data = (
    pd.read_csv('/content/drive/MyDrive/protein/CAFA_SEQ_DATA/train_data.csv')
    .drop(['id'], axis=1)
    .rename(columns={"sequence": "text"})
)
# `new_df` is another name for the same frame, not a copy.
new_df = data
# The CSV stores each label vector as the text of a Python list; parse it back into a list.
new_df["labels"] = new_df["labels"].apply(ast.literal_eval)
new_df.head()

Unnamed: 0,text,labels
0,M N S V T V S H A P Y T I T Y H D D W E P V M ...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,M T E Y R N F L L L F I T S L S V I Y P C T G ...,"[0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, ..."
2,M R L S S S P P R G P Q Q L S S F G S V D W L ...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,M G G E A G A D G P R G R V K S L G L V F E D ...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,M V E T N S P P A G Y T L K R S P S D L G E Q ...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [None]:
# Sections of config

# Defining some key variables that will be used later on in the training
MAX_LEN = 1024  # passed as max_length to the tokenizer by the dataset classes
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 4
EPOCHS = 1
LEARNING_RATE = 1e-05
PRE_TRAINED_MODEL ='Rostlab/prot_bert'  # checkpoint name handed to DistilBertModel.from_pretrained
MODEL_VOCAB = '/content/drive/MyDrive/protein/CAFA_TORCH_MODELS/vocab_distilbert_protBert.bin'

NUM_LABELS        = 500  # output width of the classifier layer
# NOTE(review): loading emits a warning that this vocabulary was saved by BertTokenizer
# while DistilBertTokenizer is used here — confirm the tokenization is as expected.
# NOTE(review): do_lower_case=True presumably lower-cases the text before vocabulary
# lookup, while the sequences shown in the data preview are upper-case — verify
# against the vocabulary file.
tokenizer = DistilBertTokenizer.from_pretrained(MODEL_VOCAB, truncation=True, do_lower_case=True)
SEED = 567  # random_state of the train/test split

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'DistilBertTokenizer'.


In [None]:
class MultiLabelDataset(Dataset):
    """Map-style dataset pairing each tokenised sequence with its label vector.

    Args:
        dataframe: Frame with a ``text`` column (the sequence string) and a
            ``labels`` column (one numeric list per row).
        tokenizer: Object exposing ``encode_plus``; its result must provide
            ``input_ids``, ``attention_mask`` and ``token_type_ids``.
        max_len: Length every encoded sequence is padded or truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.targets = self.data.labels
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        """Return the tensors for the row at position ``index``.

        Returns:
            Dict with long tensors ``ids``, ``mask``, ``token_type_ids`` and a
            float tensor ``targets``.
        """
        # Samplers hand out positions in [0, len); positional access keeps the
        # dataset correct even when the frame's index is not 0..n-1.
        text = str(self.text.iloc[index])

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`;
            # truncation is requested explicitly so over-long sequences are cut
            # to max_len instead of relying on an implicit default.
            padding='max_length',
            truncation=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float)
        }

class InferenceDataset(Dataset):
    """Map-style dataset of tokenised sequences without labels, for prediction.

    Args:
        dataframe: Frame with a ``text`` column holding the sequence strings.
        tokenizer: Object exposing ``encode_plus``; its result must provide
            ``input_ids``, ``attention_mask`` and ``token_type_ids``.
        max_len: Length every encoded sequence is padded or truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        """Return long tensors ``ids``, ``mask`` and ``token_type_ids`` for the row at position ``index``."""
        # Samplers hand out positions in [0, len); positional access keeps the
        # dataset correct even when the frame's index is not 0..n-1.
        text = str(self.text.iloc[index])

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`;
            # truncation is requested explicitly so over-long sequences are cut
            # to max_len instead of relying on an implicit default.
            padding='max_length',
            truncation=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long)
        }


In [None]:
# Split the frame 80/20 into train and test rows and wrap each part in a dataset.

train_size = 0.8
# Seeded random sample for training; every remaining row becomes test data.
train_data=new_df.sample(frac=train_size,random_state=SEED)
test_data=new_df.drop(train_data.index).reset_index(drop=True)
# Reset only after test_data has been derived: the drop above needs the original row labels.
train_data = train_data.reset_index(drop=True)

print("FULL Dataset: {}".format(new_df.shape))
print("TRAIN Dataset: {}".format(train_data.shape))
print("TEST Dataset: {}".format(test_data.shape))

training_set = MultiLabelDataset(train_data, tokenizer, MAX_LEN)
testing_set = MultiLabelDataset(test_data, tokenizer, MAX_LEN)

FULL Dataset: (142246, 2)
TRAIN Dataset: (113797, 2)
TEST Dataset: (28449, 2)


In [None]:
# Loader options for the training and the held-out split; both shuffle and
# use num_workers=0.
train_params = dict(batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)
test_params = dict(batch_size=VALID_BATCH_SIZE, shuffle=True, num_workers=0)

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [None]:
# Creating the customized model, by adding a drop out and a dense layer on top of distil bert to get the final output for the model.

class DistilBERTClass(torch.nn.Module):
    """Encoder loaded from ``PRE_TRAINED_MODEL`` plus a linear head emitting ``NUM_LABELS`` logits.

    The head takes the hidden state of the first token and applies
    ``pre_classifier -> lin1 -> lin2 -> lin3 -> tanh -> dropout -> classifier``.
    The output is raw logits; no sigmoid is applied here.

    NOTE(review): there is no non-linearity between ``pre_classifier`` and
    ``lin3``; left as is so existing checkpoints keep producing the same
    outputs — confirm whether that is intended.
    NOTE(review): the captured load output reports checkpoint weights that
    were not used when initialising DistilBertModel — confirm the encoder
    actually starts from pretrained weights.
    """

    def __init__(self):
        super(DistilBERTClass, self).__init__()
        self.l1 = DistilBertModel.from_pretrained(PRE_TRAINED_MODEL)
        # The head consumes one hidden-state vector, so its width is the
        # encoder's hidden size. Sizing it from MAX_LEN (a sequence length)
        # only worked because both happened to be 1024.
        hidden_size = self.l1.config.hidden_size
        self.pre_classifier = torch.nn.Linear(hidden_size, hidden_size)
        self.dropout = torch.nn.Dropout(0.1)
        self.lin1 = torch.nn.Linear(hidden_size, 512)
        self.lin2 = torch.nn.Linear(512, 256)
        self.lin3 = torch.nn.Linear(256,64)
        self.classifier = torch.nn.Linear(64, NUM_LABELS)

    def forward(self, input_ids, attention_mask, token_type_ids, num_pre_classifiers=1):
        """Return logits for a batch.

        Args:
            input_ids: Token ids, forwarded to the encoder.
            attention_mask: Attention mask, forwarded to the encoder.
            token_type_ids: Accepted for call compatibility; not used.
            num_pre_classifiers: Accepted for call compatibility; not used.
        """
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        # Hidden state of the first token of every sequence.
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = self.lin1(pooler)
        pooler = self.lin2(pooler)
        pooler = self.lin3(pooler)
        pooler = torch.nn.Tanh()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

# Build the network and move its parameters to the selected device.
model = DistilBERTClass()
model.to(device)

Downloading (…)lve/main/config.json:   0%|          | 0.00/361 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.68G [00:00<?, ?B/s]

Some weights of the model checkpoint at Rostlab/prot_bert were not used when initializing DistilBertModel: ['bert.encoder.layer.2.attention.self.key.bias', 'bert.encoder.layer.8.attention.self.value.bias', 'bert.encoder.layer.7.output.dense.bias', 'bert.encoder.layer.7.attention.self.key.bias', 'bert.encoder.layer.3.output.dense.bias', 'bert.encoder.layer.13.attention.self.query.weight', 'bert.encoder.layer.18.attention.output.LayerNorm.weight', 'bert.encoder.layer.12.attention.output.dense.bias', 'bert.encoder.layer.19.attention.self.value.bias', 'bert.encoder.layer.7.output.LayerNorm.bias', 'bert.encoder.layer.20.attention.self.value.weight', 'bert.encoder.layer.9.attention.self.value.bias', 'bert.encoder.layer.29.attention.self.query.weight', 'bert.encoder.layer.29.intermediate.dense.bias', 'bert.encoder.layer.25.attention.output.dense.bias', 'bert.encoder.layer.20.attention.self.query.weight', 'bert.encoder.layer.2.intermediate.dense.bias', 'bert.encoder.layer.12.output.dense.bias'

DistilBERTClass(
  (l1): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30, 1024, padding_idx=0)
      (position_embeddings): Embedding(40000, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-29): 30 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=1024, out_features=1024, bias=True)
            (k_lin): Linear(in_features=1024, out_features=1024, bias=True)
            (v_lin): Linear(in_features=1024, out_features=1024, bias=True)
            (out_lin): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (sa_layer_norm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin

In [None]:
def loss_fn(outputs, targets):
    """Binary cross-entropy on raw logits, using ``BCEWithLogitsLoss`` with its default settings.

    Args:
        outputs: Logits produced by the model.
        targets: Float tensor of the same shape holding the target values.

    Returns:
        The loss tensor computed by the criterion.
    """
    criterion = torch.nn.BCEWithLogitsLoss()
    loss = criterion(outputs, targets)
    return loss

In [None]:
# Adam over every parameter of `model` (encoder and head) at the configured learning rate.
# train() reads this notebook-level name directly.
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [None]:
import os
def load_model_ckp(checkpoint_path, model, optimizer, map_location=None):
  """Restore model and optimizer state from a checkpoint written by ``save_model_ckp``.

  Args:
    checkpoint_path: Path of the checkpoint file.
    model: Module whose ``load_state_dict`` receives ``model_state_dict``.
    optimizer: Optimizer whose ``load_state_dict`` receives ``optimizer_state_dict``.
    map_location: Forwarded to ``torch.load``. When omitted and CUDA is not
      available, tensors are mapped to the CPU.

  Returns:
    Tuple ``(model, optimizer, epoch, loss)`` with the two stored scalars.
  """
  if map_location is None and not torch.cuda.is_available():
    # Checkpoints written during GPU training hold CUDA tensors; without a
    # remap torch.load fails on a CPU-only runtime.
    map_location = 'cpu'
  # NOTE: torch.load unpickles the file — only open checkpoints from a trusted source.
  checkpoint = torch.load(checkpoint_path, map_location=map_location)
  model.load_state_dict(checkpoint['model_state_dict'])
  optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
  return model, optimizer, checkpoint['epoch'], checkpoint['loss']

def save_model_ckp(epoch, model, optimizer,loss, save_path):
  """Write a training checkpoint to ``save_path``.

  The file holds a dict with the keys ``epoch``, ``model_state_dict``,
  ``optimizer_state_dict`` and ``loss``.

  Args:
    epoch: Epoch number to record.
    model: Module whose ``state_dict()`` is stored.
    optimizer: Optimizer whose ``state_dict()`` is stored.
    loss: Loss to record; a tensor or a plain number.
    save_path: Destination file; missing parent folders are created.
  """
  if isinstance(loss, torch.Tensor):
    # Store a plain value rather than the live tensor so the checkpoint does
    # not depend on the device the loss was computed on.
    loss = loss.item() if loss.numel() == 1 else loss.detach().cpu()
  parent_dir = os.path.dirname(save_path)
  if parent_dir:
    os.makedirs(parent_dir, exist_ok=True)
  torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': loss,
            }, save_path)

def save_model(model, model_dir, model_name='model.pth'):
    """Write ``model``'s state dict to ``model_dir/model_name`` and log the destination.

    Args:
        model: Module providing ``state_dict()``.
        model_dir: Folder the file is written to.
        model_name: File name inside ``model_dir``.
    """
    path = os.path.join(model_dir, model_name)
    # Only the state dict is serialised, not the module object itself
    # (see http://pytorch.org/docs/master/notes/serialization.html).
    weights = model.state_dict()
    torch.save(weights, path)
    logger.info(f"Saving model: {path} \n")



In [None]:
# Folder and file-name prefix of the checkpoints written by train().
# NOTE(review): this folder ('5minit_prot') differs from MODEL_DIR ('prot'), where
# checkpoints are later loaded from — confirm which location is intended.
CHECKPOINT_DIR = '/content/drive/MyDrive/5minit_prot/CAFA_TORCH_MODELS'
CHECKPOINT_MODEL_PREFIX = 'distillBert_from_protbert'
def train(epoch, model, optimizer=None):
    """Run one pass over ``training_loader`` and update ``model`` in place.

    Every 5000 steps (step 0 included) the current loss is printed, a
    checkpoint is written to ``CHECKPOINT_DIR`` and the running mean loss and
    hyper-parameters are logged to TensorBoard.

    Args:
        epoch: Epoch number; used in the printed message and checkpoint name.
        model: Network to train.
        optimizer: Optimizer owning ``model``'s parameters. Defaults to the
            notebook-level ``optimizer``.

    Returns:
        Mean of the per-step losses, or ``nan`` if the loader yielded nothing.

    Raises:
        ValueError: If the optimizer holds none of ``model``'s parameters, in
            which case the loop would run without changing any weight.
    """
    if optimizer is None:
        # The parameter shadows the notebook-level name, hence the explicit lookup.
        optimizer = globals()['optimizer']
    owned = {id(p) for group in optimizer.param_groups for p in group['params']}
    if not any(id(p) in owned for p in model.parameters()):
        raise ValueError(
            "The optimizer does not hold any parameter of the model being trained; "
            "build the optimizer from this model's parameters."
        )

    os.makedirs(CHECKPOINT_DIR, exist_ok=True)
    model.train()
    losses = []
    # Wrapping the loader (not the enumerate object) lets tqdm know the total.
    for step, data in enumerate(tqdm(training_loader)):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        optimizer.zero_grad()
        outputs = model(ids, mask, token_type_ids)
        loss = loss_fn(outputs, targets)
        losses.append(loss.item())

        if step % 5000 == 0:
            running_loss = np.mean(losses)
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')
            now = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
            model_name = f"{CHECKPOINT_MODEL_PREFIX}_epoch{epoch}_{now}.pth"
            save_path = os.path.join(CHECKPOINT_DIR, model_name)
            # Pass the scalar value so the checkpoint does not carry the live loss tensor.
            save_model_ckp(epoch, model, optimizer, loss.item(), save_path)

            # NOTE(review): the tag says 'Val' but the value is the running
            # training loss; tag kept so existing TensorBoard runs stay comparable.
            writer.add_scalar('Loss/Val', running_loss, step)
            writer.add_hparams(hparam_dict = {'lr': LEARNING_RATE, 'bsize': TRAIN_BATCH_SIZE} ,
                       metric_dict = {'Loss/Val': running_loss,
                                      'Step': step},
                       hparam_domain_discrete=None,
                       run_name=None)

        loss.backward()
        optimizer.step()

    return float(np.mean(losses)) if losses else float('nan')



In [None]:
# Train `model` for the configured number of epochs.
for epoch in range(EPOCHS):
    train(epoch, model)

In [None]:
# Scratch check of the MaxPool1d output length: window of 20, stride of 2.
m = torch.nn.MaxPool1d(20, stride=2)
# MaxPool1d pools along the last dimension, so the 1024 values must sit there
# (batch, channels, length). A (1024, 1) tensor has length 1, which is shorter
# than the window and makes the call fail.
input = torch.randn(1, 1, 1024)
output = m(input)

In [None]:
output.shape

In [None]:
def validation(testing_loader, val_model):
    """Run ``val_model`` over ``testing_loader`` and collect probabilities and targets.

    Args:
        testing_loader: Iterable of batches with the keys ``ids``, ``mask``,
            ``token_type_ids`` and ``targets``.
        val_model: Network to evaluate; it is switched to eval mode.

    Returns:
        Tuple ``(fin_outputs, fin_targets)``: two lists with one entry per
        sample, holding the sigmoid of the logits and the targets respectively.
    """
    val_model.eval()
    fin_targets = []
    fin_outputs = []
    with torch.no_grad():
        for data in tqdm(testing_loader):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            # Evaluate the model that was passed in, not the notebook-level `model`.
            outputs = val_model(ids, mask, token_type_ids)
            fin_targets.extend(targets.cpu().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().numpy().tolist())
    return fin_outputs, fin_targets

In [None]:
# Folder that model_fn() loads checkpoints from.
# NOTE(review): train() writes checkpoints to CHECKPOINT_DIR, which is a different folder — confirm.
MODEL_DIR = '/content/drive/MyDrive/prot/CAFA_TORCH_MODELS'

def model_fn(model_dir,
             model_name,
             num_classes=500):
    """Build a ``DistilBERTClass`` and load its weights from a checkpoint.

    Args:
        model_dir: Folder containing the checkpoint.
        model_name: Checkpoint file name; joined to ``model_dir`` with ``os.path.join``.
        num_classes: Expected number of output labels. The network is always
            built with ``NUM_LABELS`` outputs; a different value only triggers
            a warning.

    Returns:
        The loaded model, moved to CUDA when available and to the CPU otherwise.
    """
    logger.info('model_fn')
    print('Loading the trained model...')
    if num_classes != NUM_LABELS:
        logger.warning(
            f"num_classes={num_classes} is ignored; the model is built with NUM_LABELS={NUM_LABELS} outputs"
        )
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = DistilBERTClass()
    # load_model_ckp needs an optimizer to restore into; it is discarded afterwards,
    # as are the stored epoch and loss.
    scratch_optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)
    model, _, _, _ = load_model_ckp(checkpoint_path=os.path.join(model_dir, model_name),
                                    model=model,
                                    optimizer=scratch_optimizer)
    return model.to(device)
# Reload an earlier checkpoint into a fresh network so training can continue from it.
new_model = model_fn(model_dir=MODEL_DIR,
                  model_name='distillBert_from_protbert_epoch0_20230608222539.pth',
                  num_classes=500
                   )

model_fn


INFO:__main__:model_fn


Loading the trained model...


Some weights of the model checkpoint at Rostlab/prot_bert were not used when initializing DistilBertModel: ['bert.encoder.layer.2.attention.self.key.bias', 'bert.encoder.layer.8.attention.self.value.bias', 'bert.encoder.layer.7.output.dense.bias', 'bert.encoder.layer.7.attention.self.key.bias', 'bert.encoder.layer.3.output.dense.bias', 'bert.encoder.layer.13.attention.self.query.weight', 'bert.encoder.layer.18.attention.output.LayerNorm.weight', 'bert.encoder.layer.12.attention.output.dense.bias', 'bert.encoder.layer.19.attention.self.value.bias', 'bert.encoder.layer.7.output.LayerNorm.bias', 'bert.encoder.layer.20.attention.self.value.weight', 'bert.encoder.layer.9.attention.self.value.bias', 'bert.encoder.layer.29.attention.self.query.weight', 'bert.encoder.layer.29.intermediate.dense.bias', 'bert.encoder.layer.25.attention.output.dense.bias', 'bert.encoder.layer.20.attention.self.query.weight', 'bert.encoder.layer.2.intermediate.dense.bias', 'bert.encoder.layer.12.output.dense.bias'

KeyboardInterrupt: ignored

In [None]:
# train() steps the notebook-level `optimizer`, which was built over `model`'s
# parameters. Rebuild it over `new_model` first; otherwise the loop runs without
# updating any weight of `new_model`.
optimizer = torch.optim.Adam(params=new_model.parameters(), lr=LEARNING_RATE)
for epoch in range(EPOCHS):
    train(epoch, new_model)

In [None]:
# Save the weights of `new_model` and the tokenizer vocabulary into a fresh, timestamped folder.
now = datetime.datetime.now()
now = now.strftime("%Y%m%d%H%M%S")
output_dir = f'/content/drive/MyDrive/prot/{now}_CAFA_TORCH_MODELS'
pathExists = os.path.exists(output_dir)
if not pathExists:
  os.makedirs(output_dir)

WEIGHTS_NAME = f'{now}_distillBert_protBert.pth'
CONFIG_NAME= f'{now}_distillBert_protBert.bin'

# Unwrap the network if it is held inside a wrapper exposing `.module`.
model_to_save = new_model.module if hasattr(new_model, 'module') else new_model
# Only the state dict is written. The config path is computed but nothing is
# saved to it, because the config export below is commented out.
output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
output_config_file = os.path.join(output_dir, CONFIG_NAME)

torch.save(model_to_save.state_dict(), output_model_file)
##model_to_save.config.to_json_file(output_config_file)
tokenizer.save_vocabulary(output_dir)

print('Saved')

In [None]:
##save_model_ckp(epoch = 0, model = new_model, optimizer = ,loss, save_path):


In [None]:
# Load a saved checkpoint and score it on the held-out split.
val_model_path = 'distillBert_from_protbert_epoch0_20230609164638.pth'

val_model = model_fn(model_dir=MODEL_DIR,
                  model_name=val_model_path,
                  num_classes=500
                   )
outputs, targets = validation(testing_loader, val_model =val_model )

# Turn probabilities into 0/1 decisions at a 0.5 threshold.
final_outputs = np.array(outputs) >=0.5

In [None]:
final_outputs = np.array(outputs) >=0.3

In [None]:
# Multi-label metrics on the thresholded predictions: sklearn's Hamming loss and
# the notebook's own hamming_score (mean per-sample overlap of the label sets).
val_hamming_loss = metrics.hamming_loss(targets, final_outputs)
val_hamming_score = hamming_score(np.array(targets), np.array(final_outputs))

print(f"Hamming Score = {val_hamming_score}")
print(f"Hamming Loss = {val_hamming_loss}")

In [None]:

# validation() names its second parameter `val_model`; passing `model=` raises a TypeError.
outputs, targets = validation(testing_loader, val_model=new_model)

# Turn probabilities into 0/1 decisions at a 0.5 threshold.
final_outputs = np.array(outputs) >=0.5

In [None]:
final_outputs = np.array(outputs) >=0.5

In [None]:
val_hamming_loss = metrics.hamming_loss(targets, final_outputs)
val_hamming_score = hamming_score(np.array(targets), np.array(final_outputs))
# The targets are multi-label, so f1_score needs an explicit averaging mode; its
# default ('binary') raises a ValueError. 'micro' pools every (sample, label)
# decision into a single F1 value.
val_f1_score      = metrics.f1_score(np.array(targets), np.array(final_outputs), average='micro')
print(f"Hamming Score = {val_hamming_score}")
print(f"Hamming Loss = {val_hamming_loss}")
print(f"f1 Score = {val_f1_score}")

In [None]:
# Table of the sequences to predict for the submission.
submission_targets = pd.read_csv('/content/drive/MyDrive/prot/CAFA_SEQ_DATA/targets.csv')


In [None]:
submission_targets.head()

In [None]:
# Model used for the submission predictions.
# model_name is an absolute path here; os.path.join inside model_fn discards
# model_dir when the second component is absolute, so this path is used as given.
inf_model = model_fn(model_dir=MODEL_DIR,
                  model_name='/content/drive/MyDrive/prot/CAFA_TORCH_MODELS/distillBert_from_protbert_epoch0_20230609164638.pth',
                  num_classes=500
                   )

In [None]:

# Keep "id": a later cell reads submission_targets["id"] to label the prediction
# rows, and InferenceDataset only reads the "text" column. Reassigning instead
# of mutating in place, with errors='ignore', makes the cell safe to re-run.
submission_targets = (
    submission_targets
    .drop(columns=['sequence_length', 'taxonomyID'], errors='ignore')
    .rename(columns={"sequence": "text"})
)
submission_targets.head()


In [None]:
# Label-free dataset over the submission sequences (reads the "text" column only).
targ_set = InferenceDataset(submission_targets, tokenizer, MAX_LEN)


In [None]:
# Inference must visit the rows in their original order: the predictions are
# later matched to submission_targets["id"] row by row, so shuffling would
# attach the scores to the wrong ids.
targ_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': False,
                'num_workers': 0
                }

targ_loader = DataLoader(targ_set, **targ_params)


In [None]:
def doInference(targ_loader, targ_model):
    """Run ``targ_model`` over ``targ_loader`` and return per-label probabilities.

    Args:
        targ_loader: Iterable of batches with the keys ``ids``, ``mask`` and
            ``token_type_ids``.
        targ_model: Network used for prediction; it is switched to eval mode.

    Returns:
        List with one entry per sample, each a list holding the sigmoid of the
        model's logits.
    """
    targ_model.eval()

    fin_outputs = []
    with torch.no_grad():
        for data in tqdm(targ_loader):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            # Predict with the model that was passed in, not the notebook-level `model`.
            outputs = targ_model(ids, mask, token_type_ids)

            # Sigmoid, as in validation(): the model is trained with a per-label
            # binary cross-entropy, so every label gets an independent
            # probability. Softmax would force the scores of one sample to sum to 1.
            fin_outputs.extend(torch.sigmoid(outputs).cpu().numpy().tolist())
    return fin_outputs

In [None]:
inference_out = doInference(targ_loader = targ_loader, targ_model=inf_model)

In [None]:
inference_out= np.array(inference_out)

In [None]:
inference_out.shape

In [None]:
# Column names for the prediction matrix.
# NOTE: allow_pickle=True lets np.load unpickle objects stored in the file; only load files from a trusted source.
class_map = np.load('/content/drive/MyDrive/prot/CAFA_SEQ_DATA/class_map.npy', allow_pickle=True)

In [None]:
class_map = class_map.tolist()

In [None]:
# One row per predicted sequence, one column per entry of class_map.
submission_df = pd.DataFrame(inference_out, columns = class_map)

In [None]:
# Attach the ids; the assignment aligns on the row index, so the prediction rows
# must be in the same order as submission_targets.
submission_df["id"]= submission_targets["id"]

In [None]:
submission_df.set_index("id", inplace=True)

In [None]:
submission_df.head()

In [None]:
df_melted = submission_df.reset_index()

In [None]:
df_melted.head()

In [None]:
# Wide -> long: one row per (id, label column) pair, with the score in "value".
df_melted = df_melted.melt(["id"])
df_melted.head()

In [None]:
# Replace "_GO" with "GO" in the label names.
df_melted['variable'] = df_melted['variable'].str.replace("_GO","GO")

In [None]:
# Write the long table as a tab-separated file without header or index column.
df_melted.to_csv('/content/drive/MyDrive/prot/CAFA_SUBMISSION/submission_2.tsv', header=False, index=False, sep='\t')