<a href="https://colab.research.google.com/github/charleszhang418/SpaceX/blob/main/code/dnabert_finetune_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

+ Data: https://osdr.nasa.gov/bio/repo/data/studies/OSD-466
+ Sample:
  1. RR10_FCS_FLT_KO_F19: p21-null, Space Flight
  2. RR10_FCS_FLT_WT_F16: Wild Type, Space Flight
  3. RR10_FCS_GC_WT_G3: Wild Type, Ground Control
  4. RR10_FCS_GC_KO_G4: p21-null, Ground Control
  5. RR10_FCS_VIV_WT_V1: Wild Type, Vivarium Control
  6. RR10_FCS_VIV_KO_V13: p21-null,	Vivarium Control

In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
# %cd gdrive/MyDrive/NASA
%cd gdrive/MyDrive/Project/data_science/nasa-space-app-2023/
!ls

/content/gdrive/MyDrive/Project/data_science/nasa-space-app-2023
code  data  model


In [3]:
!pip install transformers
!pip install torch
!pip install einops
!pip install transformers[torch]
!pip install evaluate

Collecting transformers
  Downloading transformers-4.34.0-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m55.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.17.3-py3-none-any.whl (295 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m35.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m106.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m85.3 MB/s[0m eta [36m0:00:00[0m
Inst

In [None]:
import pandas as pd
import torch
from transformers import AutoTokenizer, BertModel
from transformers import BertForSequenceClassification, TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
import evaluate
import numpy as np

In [None]:
dna_data = pd.read_csv('data/dna_data.csv')
print(dna_data.shape)
dna_data.head()

In [None]:
tokenizer = AutoTokenizer.from_pretrained('zhihan1996/DNABERT-2-117M', trust_remote_code=True)
# model = BertModel.from_pretrained('zhihan1996/DNABERT-2-117M', trust_remote_code=True)
model = BertForSequenceClassification.from_pretrained('zhihan1996/DNABERT-2-117M', num_labels=3)

In [None]:
# # Create random label for testing
# import random
# N = len(dna_data)
# random_label_list = [random.randint(0, 2) for _ in range(N)]
# dna_data['label'] = random_label_list

In [None]:
# Label encoding

from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(list(dna_data['label']))
dna_data['label'] = encoded_labels

# original_labels = label_encoder.inverse_transform(encoded_labels)

In [9]:
train_data = dna_data.sample(frac=0.8, random_state=524)
test_eval_data = dna_data.drop(train_data.index)
eval_data = test_eval_data.sample(frac=0.5, random_state=524)
test_data = test_eval_data.drop(eval_data.index)

print(train_data.shape, eval_data.shape, test_data.shape)

train_dna = list(train_data['DNA'])
train_labels = list(train_data['label'])

val_dna = list(eval_data['DNA'])
val_labels = list(eval_data['label'])


test_dna = list(test_data['DNA'])
test_labels = list(test_data['label'])

train_encodings = tokenizer(train_dna, truncation=True, padding=True, return_tensors='pt')
val_encodings = tokenizer(val_dna, truncation=True, padding=True, return_tensors='pt')
test_encodings = tokenizer(test_dna, truncation=True, padding=True, return_tensors='pt')

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


(28800, 6) (3600, 6) (3600, 6)


In [10]:
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = CustomDataset(train_encodings, train_labels)
val_dataset = CustomDataset(val_encodings, val_labels)
test_dataset = CustomDataset(test_encodings, test_labels)

In [11]:
metric = evaluate.load('accuracy')

def compute_metrics(eval_pred):
  predictions, labels = eval_pred
  predictions = np.argmax(predictions, axis=1)
  acc = metric.compute(predictions=predictions, references=labels)['accuracy']
  return {
      'accuracy': acc
  }

training_args = TrainingArguments(
    output_dir='model/results',
    evaluation_strategy='epoch',
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    learning_rate=2e-5,
    logging_dir='model/logs'
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

trainer.train()
results = trainer.evaluate(test_dataset)

# model.save_pretrained("./your_finetuned_model")
# tokenizer.save_pretrained("./your_finetuned_tokenizer")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

TypeError: ignored

In [25]:
results

{'eval_loss': 2.5745129585266113,
 'eval_accuracy': 0.336,
 'eval_runtime': 0.199,
 'eval_samples_per_second': 628.021,
 'eval_steps_per_second': 40.193,
 'epoch': 10.0}