In [None]:
!pip install -U transformers
!pip install accelerate
!pip install -U datasets
!pip install scikit-learn
!pip install peft
!pip install -U bitsandbytes

In [None]:
import pandas as pd
import numpy as np
import torch

# Choose the compute device once; later cells refer to `device`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("GPU is available" if device.type == "cuda" else "GPU is not available, using CPU")

GPU is available


In [None]:
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from tokenizers import Tokenizer
import torch
import torch.nn.functional as F
from tokenizers.processors import BertProcessing
from peft import get_peft_config, get_peft_model, LoraConfig, TaskType

# Load the ProGen2 base model with 4-bit NF4 quantization (bfloat16 compute,
# double quantization) so it can be fine-tuned with LoRA adapters.
modelOne = AutoModelForCausalLM.from_pretrained(
    "hugohrban/progen2-base",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type='nf4',
    ),
)

# LoRA adapters on the attention and MLP linear layers.
# The fused attention projection is named `qkv_proj`; the previous value
# 'qkj_proj' was a typo, which left that layer without an adapter.
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=['qkv_proj', 'out_proj', 'fc_in', 'fc_out'],
)
model = get_peft_model(modelOne, peft_config)
model.print_trainable_parameters()

# The tokenizer is a raw `tokenizers.Tokenizer` loaded from a local JSON file.
tokenizer = Tokenizer.from_file('/workspace/tokenizer.json')
#tokenizer = Tokenizer.from_file('tokenizer.json')

In [None]:
# Move the PEFT-wrapped model to the device chosen in the device-selection cell.
# As the last expression in the cell, the returned module is also displayed.
# NOTE(review): the base model was loaded with a bitsandbytes 4-bit
# quantization config — confirm this explicit move is needed/supported
# with the installed transformers version.
model.to(device)

In [None]:
# Read the polymerase dataset; only its first column is used as the raw sequences.
df = pd.read_csv("/workspace/SynthDNA Polymerase Dataset Update.csv")
# Alternative when the CSV sits next to the notebook:
#df = pd.read_csv("SynthDNA Polymerase Dataset Update.csv")
sequences = df[df.columns[0]].tolist()

In [None]:
import torch
import torch.nn as nn
import torch.distributed as dist
from torch.utils.data import DataLoader, Dataset
from torch.nn.parallel import DistributedDataParallel as DDP
from tokenizers import Tokenizer
import os
import pandas as pd
from sklearn.model_selection import train_test_split

import numpy as np
import subprocess
from itertools import combinations

class ProteinDataset(Dataset):
    """Map-style dataset that stores one list of token ids per input sequence.

    Args:
        sequences: iterable of raw sequence strings.
        tokenizer: object whose ``encode(text, add_special_tokens=False)``
            returns an encoding exposing an ``ids`` attribute.
        begin_token_id: accepted for interface compatibility; not used.
        end_token_id: accepted for interface compatibility; not used.
    """

    def __init__(self, sequences, tokenizer, begin_token_id, end_token_id):
        # NOTE(review): begin_token_id / end_token_id are never applied, so the
        # stored ids are exactly what the tokenizer produced — confirm whether
        # the sequences were meant to be wrapped with these tokens.
        self.tokenized_sequences = [
            tokenizer.encode(sequence, add_special_tokens=False).ids
            for sequence in sequences
        ]

    def __len__(self):
        """Return the number of tokenized sequences."""
        return len(self.tokenized_sequences)

    def __getitem__(self, idx):
        """Return the list of token ids stored at position ``idx``."""
        return self.tokenized_sequences[idx]

def collate_fn(batch):
    """Right-pad a batch of token-id lists into tensors for causal-LM training.

    Args:
        batch: non-empty list of lists of int token ids (one list per sequence).

    Returns:
        dict with three ``torch.long`` tensors of shape (len(batch), max_len):
          - "input_ids": token ids, right-padded with 0.
          - "attention_mask": 1 on real tokens, 0 on padding.
          - "labels": copy of the token ids with padding set to -100, the
            default ``ignore_index`` of cross-entropy, so padded positions do
            not contribute to the loss.
    """
    pad_token_id = 0
    ignore_index = -100

    max_length = max(len(sequence) for sequence in batch)
    shape = (len(batch), max_length)
    input_ids = torch.full(shape, pad_token_id, dtype=torch.long)
    labels = torch.full(shape, ignore_index, dtype=torch.long)
    attention_mask = torch.zeros(shape, dtype=torch.long)

    # Mask by sequence length rather than by value, so a genuine token whose
    # id happens to equal the pad id is still supervised.
    for row, sequence in enumerate(batch):
        length = len(sequence)
        ids = torch.tensor(sequence, dtype=torch.long)
        input_ids[row, :length] = ids
        labels[row, :length] = ids
        attention_mask[row, :length] = 1

    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}

In [None]:
# Hold out 10% of the sequences for evaluation; the fixed seed keeps the split reproducible.
train_sequences, test_sequences = train_test_split(
    sequences,
    test_size=0.1,
    random_state=42,
)

# Tokenize each split up front.
train_dataset = ProteinDataset(train_sequences, tokenizer, begin_token_id=1, end_token_id=2)
test_dataset = ProteinDataset(test_sequences, tokenizer, begin_token_id=1, end_token_id=2)

# Report split sizes: train first, then test.
print(len(train_dataset))
print(len(test_dataset))

50182
5576


In [None]:
from transformers import Trainer, TrainingArguments

# Training configuration for the LoRA fine-tune.
training_args = TrainingArguments(
    output_dir='/workspace/results',      # where checkpoints are written
    eval_strategy="epoch",                # evaluate at the end of each epoch
    num_train_epochs=10,                  # total number of training epochs
    per_device_train_batch_size=2,        # batch size per device during training
    per_device_eval_batch_size=2,         # batch size per device during evaluation
    warmup_steps=500,                     # warmup steps for the learning-rate scheduler
    gradient_accumulation_steps=4,        # accumulate 4 batches per optimizer step
    weight_decay=0.01,                    # strength of weight decay
    logging_dir='/workspace/logs',        # directory for storing logs
    logging_steps=10,                     # log every 10 steps
    learning_rate=1e-5,                   # learning rate
    save_total_limit=1,                   # limit the total number of checkpoints kept
    auto_find_batch_size=False,           # keep the batch size fixed (no automatic adjustment)
    # The model is loaded in bfloat16 with a bfloat16 bitsandbytes compute
    # dtype, so mixed precision uses bf16 as well instead of fp16.
    bf16=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=collate_fn,             # pads each batch and builds the labels
)

In [None]:
# Run the fine-tuning loop configured by `training_args`; as the last
# expression in the cell, the returned training output is displayed.
trainer.train()

In [None]:
# Save the fine-tuned PEFT model; save_pretrained() creates the directory if needed.
model.save_pretrained('/workspace/finetuned_progen2')
# `tokenizer` is a raw `tokenizers.Tokenizer`, which has no save_pretrained();
# it serializes to a single JSON file via save().
tokenizer.save('/workspace/finetuned_progen2/tokenizer.json')

In [None]:
# Reload a model from a checkpoint directory under the Trainer's output_dir.
# NOTE(review): the step number in 'checkpoint-1220' is hard-coded — confirm
# that this checkpoint exists for the current run.
# NOTE(review): the trained model was PEFT-wrapped, so the checkpoint presumably
# holds adapter weights rather than a full model — verify that this call
# restores the adapter on top of the base model as intended.
model_finetuned = AutoModelForCausalLM.from_pretrained('/workspace/results/checkpoint-1220', trust_remote_code=True, torch_dtype='auto')

In [None]:
# Evaluate the trainer's current model on the held-out `eval_dataset`; the
# returned metrics are displayed as the cell output.
# Note this uses `trainer`'s model, not `model_finetuned`.
trainer.evaluate()