In [None]:
# Install pinned versions of the libraries this notebook imports.
!pip install transformers==4.29.2 torch==1.13.1 accelerate==0.20.3 scikit-learn==1.4.2
# Force a fresh NumPy 1.x install (anything below 2.0), bypassing pip's cache.
!pip install "numpy<2.0" --force-reinstall --no-cache-dir
# Remove flash-attn and triton if they are present in the runtime.
# NOTE(review): presumably so the DNABERT-2 remote code does not pick up a
# Triton-based attention path that is incompatible with these pins - confirm.
!pip uninstall flash-attn -y
!pip uninstall triton -y

In [None]:
import os
# Send signal 9 (SIGKILL on POSIX) to the current process, which terminates
# the running kernel immediately.
# NOTE(review): presumably done to force the runtime to restart so the
# packages installed in the previous cell are picked up - confirm. The
# remaining cells likely need to be run again by hand after the restart.
os.kill(os.getpid(), 9)

In [None]:
from google.colab import drive
# Clone the tutorial repository and make it the working directory; the
# `from utils import ...` line below and the dataset path used later both
# refer to files inside this checkout.
# NOTE(review): re-running this cell in the same session will attempt the
# clone again from inside the checkout - confirm whether a guard is wanted.
!git clone https://github.com/fabianagoes/ismb_tutorial8.git
%cd ismb_tutorial8
# Mount Google Drive at /content/drive.
# NOTE(review): nothing in the visible cells reads from the mounted drive.
drive.mount('/content/drive')
import torch
from transformers import AutoTokenizer, AutoModel
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from utils import get_all_subdirs
import ipywidgets as widgets
from IPython.display import display

In [None]:
root_dir = "/content/ismb_tutorial8/datasets"
# get_all_subdirs is a project helper (source: utils.py); its result is used
# as the list of choices offered by the dropdown.
subdirs = get_all_subdirs(root_dir)

# Mutable holder for the current choice, read by later cells.
# It starts at the first entry of `subdirs`.
selected_path = {'value': subdirs[0]}


def on_change(change):
    """Copy the dropdown's new value into `selected_path`.

    Events that are not a change of the `value` trait are ignored.
    """
    if change['type'] != 'change' or change['name'] != 'value':
        return
    selected_path['value'] = change['new']


dropdown = widgets.Dropdown(options=subdirs, description='Select:', disabled=False)
dropdown.observe(on_change)

display(dropdown)

In [None]:
# Inference is pinned to the CPU. To use a GPU when one is available, swap in:
#   device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device("cpu")

# NOTE: trust_remote_code=True lets transformers execute Python code shipped
# in the model repository; only use it with repositories you trust.
model_name = "zhihan1996/DNABERT-2-117M"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
# Switch to evaluation mode and place the model on the chosen device.
model = model.eval().to(device)

In [None]:
# Generate embeddings
def get_cls_embedding(sequence):
    inputs = tokenizer(sequence, return_tensors='pt')
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
        last_hidden_state = outputs[0]  # Pega o primeiro item da tupla
        cls_embedding = last_hidden_state[:, 0, :]

        return cls_embedding.squeeze().cpu().numpy()

In [None]:
# Generate embeddings for the datasets
def embed_dataframe(df):
    X = np.array([get_cls_embedding(seq) for seq in df['sequence']])
    y = df['label'].values
    return X, y

In [None]:
# Load the train/dev/test splits of the dataset chosen in the dropdown and
# turn each split into an embedding matrix plus a label vector.
dataset_dir = selected_path['value']
print('Dataset Selected: ',selected_path['value'])

# Each dataset directory is expected to contain train.csv, dev.csv and test.csv.
train_data = pd.read_csv(f"{dataset_dir}/train.csv")
val_data = pd.read_csv(f"{dataset_dir}/dev.csv")
test_data = pd.read_csv(f"{dataset_dir}/test.csv")
data = {'Train': train_data, 'Val': val_data, 'Test': test_data}

# A bare expression is rendered only when it is the last line of a cell, so
# the preview of the first 5 rows is shown explicitly with display().
display(train_data.head(5))

# One model forward pass per row; this is the slow step of the notebook.
X_train, y_train = embed_dataframe(train_data)
X_val, y_val = embed_dataframe(val_data)
X_test, y_test = embed_dataframe(test_data)

print(f"Train shape: {X_train.shape}")
print(f"Val shape: {X_val.shape}")
print(f"Test shape: {X_test.shape}")

In [None]:
# Train MLP
clf = MLPClassifier(hidden_layer_sizes=(256,), max_iter=500, random_state=42)
clf.fit(X_train, y_train)

In [None]:
# Evaluating on test data
y_pred = clf.predict(X_test)

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

acc = accuracy_score(y_test, y_pred)
prec, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted')

print(f"Accuracy: {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")