In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
!pip install evaluate
!pip install huggingface
!pip install huggingface_hub[hf_xet]  # sth to do with offline support
!pip install python-dotenv
!pip install wandb

Collecting evaluate


  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
















Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h

Installing collected packages: evaluate


Successfully installed evaluate-0.4.3


Collecting huggingface


  Downloading huggingface-0.0.1-py3-none-any.whl.metadata (2.9 kB)


Downloading huggingface-0.0.1-py3-none-any.whl (2.5 kB)


Installing collected packages: huggingface


Successfully installed huggingface-0.0.1








Collecting python-dotenv


  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB)


Downloading python_dotenv-1.1.0-py3-none-any.whl (20 kB)


Installing collected packages: python-dotenv


Successfully installed python-dotenv-1.1.0








In [3]:
""" import dependencies """
import logging
import os

import evaluate
import huggingface_hub
from huggingface_hub import HfApi
import numpy as np
import wandb
from datasets import load_dataset, Dataset
from dotenv import load_dotenv
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score
from torch.utils.data import IterableDataset
from transformers import BertTokenizer, BatchEncoding, AutoTokenizer, \
    AutoModelForSequenceClassification, AutoConfig, TrainingArguments, Trainer, DataCollatorWithPadding
import torch

# Configure PyTorch's CUDA allocator through PYTORCH_CUDA_ALLOC_CONF; the author notes HyenaDNA needs this.
# NOTE(review): presumably this has to be set before the first CUDA allocation to take effect — confirm.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"  # hyena dna requires this
print("import dependencies completed")

import dependencies completed


In [4]:
""" load_env does not work in kaggle. a simple hack to reconcile the issue """
import os
from kaggle_secrets import UserSecretsClient

# Copy the Kaggle-managed secrets into environment variables so the rest of the
# notebook can read them with os.getenv(), exactly as it would after load_dotenv().
user_secrets = UserSecretsClient()
for secret_name in ("HF_TOKEN", "WAND_DB_API_KEY"):
    os.environ[secret_name] = user_secrets.get_secret(secret_name)

print("Reconcile my code with kaggle")

Reconcile my code with kaggle


In [5]:
""" Common codes """
# ANSI escape sequences that select a foreground colour; kept as plain strings so they
# can be written to the log for visual convenience.
red = "\u001b[31m"
green = "\u001b[32m"
yellow = "\u001b[33m"
blue = "\u001b[34m"

# Root logger used throughout the notebook.
timber = logging.getLogger()
# INFO is the default verbosity of this notebook.
logging.basicConfig(level=logging.INFO)  # change to level=logging.DEBUG to print more logs...


def getDynamicGpuDevice():
    """
    Choose the torch device to run on.

    Preference order: CUDA (NVIDIA GPUs), then MPS (Apple Silicon Macs), then CPU.
    The MPS check is only performed when CUDA is unavailable.

    Returns:
        torch.device: the selected device.
    """
    if torch.cuda.is_available():
        device_name = "cuda"
    elif torch.backends.mps.is_available():
        device_name = "mps"
    else:
        device_name = "cpu"
    return torch.device(device_name)

def getDynamicBatchSize():
    """
    Pick a per-device batch size from the name and memory size of CUDA device 0.

    Known cards are matched by name first (A100, V100, P100, T4, RTX 3090); any other
    card is bucketed by its memory size. Without CUDA the function returns 4.

    The memory reported by ``torch.cuda.get_device_properties(0).total_memory`` is
    slightly below the marketed size of a card (a "32 GB" V100 reports about
    31.75 GiB), so every tier accepts a 10% shortfall against its nominal size.
    Any value that met a tier's threshold before still meets it now.

    Returns:
        int: the batch size to use per device.
    """
    if not torch.cuda.is_available():
        return 4  # CPU mode, keep it small

    gpu_name = torch.cuda.get_device_name(0).lower()
    vramGiB = torch.cuda.get_device_properties(0).total_memory / (1024 ** 3)  # bytes -> GiB

    def hasNominalVram(nominalGiB):
        # Tolerate the gap between marketed capacity and what the driver reports.
        return vramGiB >= nominalGiB * 0.9

    if "a100" in gpu_name:   # A100 (40GB+ VRAM)
        return 128
    if "v100" in gpu_name:   # V100 (16GB/32GB VRAM)
        return 64 if hasNominalVram(32) else 32
    if "p100" in gpu_name:   # P100 (16GB VRAM)
        return 32
    if "t4" in gpu_name:     # Tesla T4 (16GB VRAM, common in Colab/Kaggle)
        return 32  # Maybe try 64 if no OOM
    if "rtx 3090" in gpu_name or hasNominalVram(24):  # RTX 3090 (24GB VRAM) and larger cards
        return 64
    if hasNominalVram(16):   # Any other 16GB-class GPUs
        return 32
    if hasNominalVram(8):    # 8GB-class GPUs (e.g., RTX 2080, 3060, etc.)
        return 16
    if hasNominalVram(6):    # 6GB-class GPUs (e.g., RTX 2060)
        return 8
    return 4  # Safe fallback for smaller GPUs

def getGpuName():
    """Return the lower-cased name that torch reports for CUDA device 0."""
    return torch.cuda.get_device_name(0).lower()

# for hyenaDna. its tokenizer can process longer sequences...
def sequenceEncodePlusForHyenaDna(
    tokenizer: BertTokenizer,
    sequence: str,
    label: int
) -> BatchEncoding:
    """
    Encode one sequence as a single, unsplit example.

    Args:
        tokenizer: callable tokenizer; ``tokenizer(sequence)["input_ids"]`` is used.
        sequence: the raw sequence string.
        label: the class label of the sequence.

    Returns:
        BatchEncoding with "input_ids" (long tensor) and "labels" (tensor built from
        ``label``). No "attention_mask" is produced (per the original author's note,
        hyenaDNA does not have an attention layer).
    """
    token_ids = torch.Tensor(tokenizer(sequence)["input_ids"]).long()
    return BatchEncoding({
        "input_ids": token_ids,
        "labels": torch.tensor(label),
    })

# for dnaBert. it cannot process longer sequences...
def sequenceEncodePlusWithSplitting(
        tokenizer: BertTokenizer,
        sequence: str,
        label: int
) -> BatchEncoding:
    """
    Tokenize ``sequence`` and cut it into fixed windows of 512 token ids.

    The sequence is tokenized without special tokens and split into chunks of at most
    510 ids. Each chunk is wrapped as ``[CLS] chunk [SEP]`` and right-padded to 512
    positions; the attention mask is 1 on real/special tokens and 0 on padding.

    The special-token ids are read from the tokenizer (``cls_token_id``,
    ``sep_token_id``, ``pad_token_id``) instead of being hard-coded, because 101/102/0
    are only correct for the stock BERT vocabulary. Those legacy values are still used
    when the tokenizer does not define an id.

    Args:
        tokenizer: tokenizer providing ``encode_plus``.
        sequence: the raw sequence string.
        label: the class label of the sequence.

    Returns:
        BatchEncoding with
        - "input_ids": long tensor of shape (num_chunks, 512), or (512,) for one chunk;
        - "attention_mask": int tensor with the same shape as "input_ids";
        - "labels": tensor built from ``label``.
    """
    max_size = 512
    chunk_size = max_size - 2  # leave room for [CLS] and [SEP]

    def specialTokenId(attributeName: str, legacyDefault: int) -> int:
        # Prefer the tokenizer's own id; fall back to the previously hard-coded value.
        tokenId = getattr(tokenizer, attributeName, None)
        return legacyDefault if tokenId is None else tokenId

    cls_id = specialTokenId("cls_token_id", 101)
    sep_id = specialTokenId("sep_token_id", 102)
    pad_id = specialTokenId("pad_token_id", 0)

    tempMap: BatchEncoding = tokenizer.encode_plus(
        sequence,
        add_special_tokens=False,  # the special tokens are added manually per chunk below
        return_attention_mask=True,
        return_tensors="pt"
    )

    tokenIds = tempMap["input_ids"][0]        # shape = N , N = number of tokens
    tokenMask = tempMap["attention_mask"][0]

    cls = torch.tensor([cls_id], dtype=torch.long)
    sep = torch.tensor([sep_id], dtype=torch.long)
    isTokenUnitTensor = torch.tensor([1], dtype=torch.long)

    inputIdsList = []
    masksList = []
    for idsChunk, maskChunk in zip(tokenIds.split(chunk_size), tokenMask.split(chunk_size)):
        idsChunk = torch.cat([cls, idsChunk.long(), sep])
        maskChunk = torch.cat([isTokenUnitTensor, maskChunk.long(), isTokenUnitTensor])

        pad_len: int = max_size - idsChunk.shape[0]
        if pad_len > 0:
            # only the last chunk can be short; pad ids with the pad token, mask with 0
            idsChunk = torch.cat([idsChunk, torch.full((pad_len,), pad_id, dtype=torch.long)])
            maskChunk = torch.cat([maskChunk, torch.zeros(pad_len, dtype=torch.long)])

        inputIdsList.append(idsChunk)
        masksList.append(maskChunk)

    # every item has length 512; squeeze() drops the chunk axis when there is a single
    # chunk, and is applied to both tensors so their shapes always agree
    input_ids: torch.Tensor = torch.stack(inputIdsList).squeeze()
    attention_mask: torch.Tensor = torch.stack(masksList).squeeze()
    label_tensor = torch.tensor(label)

    encoded_map: dict = {
        "input_ids": input_ids.long(),
        "attention_mask": attention_mask.int(),
        "labels": label_tensor
    }

    batchEncodingDict: BatchEncoding = BatchEncoding(encoded_map)
    return batchEncodingDict

def sequenceEncodePlusCompact(
        splitSequence: bool,
        tokenizer: BertTokenizer,
        sequence: str,
        label: int
) -> BatchEncoding:
    """
    Encode one labelled sequence with the encoder selected by ``splitSequence``.

    A truthy ``splitSequence`` delegates to ``sequenceEncodePlusWithSplitting``;
    otherwise ``sequenceEncodePlusForHyenaDna`` is used. The remaining arguments are
    forwarded unchanged and the chosen encoder's result is returned.
    """
    encode = sequenceEncodePlusWithSplitting if splitSequence else sequenceEncodePlusForHyenaDna
    return encode(tokenizer, sequence, label)


class PagingMQTLDataset(IterableDataset):
    """
    Iterable dataset that encodes rows of an underlying dataset on the fly.

    Each row is expected to expose the keys "sequence" and "label". Rows whose
    sequence length differs from ``seqLength`` are skipped.
    """

    def __init__(
            self,
            someDataset: Dataset,
            bertTokenizer: BertTokenizer,
            seqLength: int,
            splitSequenceRequired: bool
        ):
        # Plain attribute storage; nothing is read from the dataset until iteration.
        self.someDataset = someDataset
        self.bertTokenizer = bertTokenizer
        self.seqLength = seqLength
        self.splitSequenceRequired = splitSequenceRequired

    def __iter__(self):
        """Yield the encoded form of every row that ``preprocess`` does not reject."""
        encodedRows = map(self.preprocess, self.someDataset)
        yield from (encoded for encoded in encodedRows if encoded is not None)

    def preprocess(self, row: dict):
        """Encode a single row, or return None when its sequence has the wrong length."""
        sequence, label = row["sequence"], row["label"]

        if len(sequence) == self.seqLength:
            return sequenceEncodePlusCompact(self.splitSequenceRequired, self.bertTokenizer, sequence, label)

        return None  # skip a few problematic rows

def isMyLaptop() -> bool:
    """Report whether the code runs on the author's laptop, detected by a local dataset file."""
    marker_file = "/home/gamegame/PycharmProjects/mqtl-classification/src/datageneration/dataset_4000_train_binned.csv"
    return os.path.isfile(marker_file)


def signInToHuggingFaceAndWandbToUploadModelWeightsAndBiases():
    """
    Best-effort login to the Hugging Face Hub and to Weights & Biases.

    Credentials are read from the environment variables ``HF_TOKEN`` and
    ``WAND_DB_API_KEY`` (optionally populated from a local ``.env`` file).
    Every step is independent: a failure is printed and the function carries on,
    so this never raises to the caller.

    The secret values themselves are never printed or logged.
    """
    # Load the .env file, but don't crash if it's not found (e.g., in Hugging Face Space)
    try:
        # load_dotenv() returns False when no variable was set (e.g. there is no .env file)
        if load_dotenv():
            print(".env file loaded successfully.")
        else:
            print("No .env file loaded; relying on existing environment variables.")
    except Exception as e:
        print(f"Warning: Could not load .env file. Exception: {e}")

    # Hugging Face Hub login
    try:
        token = os.getenv("HF_TOKEN")

        if not token:
            raise ValueError("HF_TOKEN not found. Make sure to set it in the environment variables or .env file.")

        huggingface_hub.login(token)
        print("Logged in to Hugging Face Hub successfully.")

    except Exception as e:
        print(f"Error during Hugging Face login: {e}")
        # Handle the error appropriately (e.g., exit or retry)

    # wand db login
    try:
        api_key = os.getenv("WAND_DB_API_KEY")
        # Log only whether the key exists; logging the key would leak the secret
        # into the notebook output and any shared logs.
        timber.info(f"WAND_DB_API_KEY is set: {bool(api_key)}")

        if not api_key:
            raise ValueError(
                "WAND_DB_API_KEY not found. Make sure to set it in the environment variables or .env file.")

        wandb.login(key=api_key)
        print("Logged in to wand db successfully.")

    except Exception as e:
        print(f"Error during wand db login: {e}")

def createPagingTrainValTestDatasets(tokenizer, window, splitSequenceRequired) -> (PagingMQTLDataset, PagingMQTLDataset, PagingMQTLDataset):
    """
    Build the streaming train / validation / test datasets for one window size.

    On the author's laptop (see ``isMyLaptop``) the splits are read from local CSV
    files; everywhere else they are streamed from the Hugging Face dataset
    "fahimfarhan/mqtl-classification-datasets". The splits named
    ``train_binned_{window}``, ``validate_binned_{window}`` and
    ``test_binned_{window}`` are then wrapped in ``PagingMQTLDataset``.

    Args:
        tokenizer: tokenizer handed to every ``PagingMQTLDataset``.
        window: sequence length; selects the split names and is used as ``seqLength``.
        splitSequenceRequired: forwarded to ``PagingMQTLDataset``.

    Returns:
        (train_dataset, val_dataset, test_dataset)
    """
    prefix = "/home/gamegame/PycharmProjects/mqtl-classification/"

    # One CSV per (size, split). Generating the mapping guarantees that each
    # validate/test entry points at its own file; the hand-written table pointed the
    # 1000 and 4000 validate/test entries at the *train* CSV, which would evaluate on
    # training data. Key order: small, medium, large samples; train, validate, test.
    data_files = {
        f"{split}_binned_{size}": f"{prefix}src/datageneration/dataset_{size}_{split}_binned.csv"
        for size in (200, 1000, 4000)
        for split in ("train", "validate", "test")
    }

    if isMyLaptop():
        dataset_map = load_dataset("csv", data_files=data_files, streaming=True)
    else:
        dataset_map = load_dataset("fahimfarhan/mqtl-classification-datasets", streaming=True)

    def wrapSplit(split: str) -> PagingMQTLDataset:
        # Wrap one named split of the requested window size.
        return PagingMQTLDataset(
            someDataset=dataset_map[f"{split}_binned_{window}"],
            bertTokenizer=tokenizer,
            seqLength=window,
            splitSequenceRequired=splitSequenceRequired
        )

    train_dataset = wrapSplit("train")
    val_dataset = wrapSplit("validate")
    test_dataset = wrapSplit("test")
    return train_dataset, val_dataset, test_dataset


# Load metrics
# Module-level metric objects, created once and reused by computeMetricsUsingTorchEvaluate.
# They are global variables, which the original author already flags as bad practice.
accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")
roc_auc_metric = evaluate.load("roc_auc")
precision_metric = evaluate.load("precision")
recall_metric = evaluate.load("recall")

def computeMetricsUsingTorchEvaluate(args):
    """
    Compute classification metrics with the module-level ``evaluate`` metric objects.

    Args:
        args: a ``(logits, labels)`` pair; ``logits`` is indexed as a 2-D array whose
            column 1 holds the positive-class scores.

    Returns:
        dict with the keys "accuracy", "roc_auc", "precision", "recall" and "f1".
        Precision, recall and f1 use ``average="weighted"``.
    """
    logits, labels = args
    predicted = np.argmax(logits, axis=1)  # predicted class per row
    positive_scores = logits[:, 1]         # ROC AUC needs scores, not hard predictions

    def weighted(metric, key):
        # Shared call shape of the three weighted-average metrics.
        return metric.compute(predictions=predicted, references=labels, average="weighted")[key]

    accuracy = accuracy_metric.compute(predictions=predicted, references=labels)["accuracy"]
    f1 = weighted(f1_metric, "f1")
    roc_auc = roc_auc_metric.compute(prediction_scores=positive_scores, references=labels)["roc_auc"]
    precision = weighted(precision_metric, "precision")
    recall = weighted(recall_metric, "recall")

    return dict(
        accuracy=accuracy,
        roc_auc=roc_auc,
        precision=precision,
        recall=recall,
        f1=f1,
    )

# use sklearn cz torchmetrics.classification gave array index out of bound exception :/ (whatever it is called in python)
def computeMetricsUsingSkLearn(args):
    """
    Compute classification metrics with scikit-learn.

    Args:
        args: a ``(logits, labels)`` pair; ``logits`` is indexed as a 2-D array whose
            column 1 holds the positive-class scores (binary classification assumed).

    Returns:
        dict with the keys "accuracy", "roc_auc", "precision", "recall" and "f1".
        Exceptions raised by the scikit-learn scorers propagate to the caller.
    """
    logits, labels = args
    predicted = np.argmax(logits, axis=1)  # predicted class labels
    positive_scores = logits[:, 1]         # scores of the positive class

    # Scorers that take hard predictions, evaluated in this order.
    label_scorers = (
        ("accuracy", accuracy_score),
        ("recall", recall_score),
        ("precision", precision_score),
        ("f1", f1_score),
    )
    scores = {name: scorer(y_true=labels, y_pred=predicted) for name, scorer in label_scorers}
    scores["roc_auc"] = roc_auc_score(y_true=labels, y_score=positive_scores)

    # Report the metrics in a fixed key order.
    return {key: scores[key] for key in ("accuracy", "roc_auc", "precision", "recall", "f1")}
print("init common completed")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.79k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/9.54k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.56k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.38k [00:00<?, ?B/s]

init common completed


In [6]:
""" dynamic section. may be some consts,  changes based on model, etc. Try to keep it as small as possible """
""" THIS IS THE MOST IMPORTANT PART """

# Run name (used for TrainingArguments.run_name and in the save locations below).
# Alternative used with DNA-BERT: "dna-bert-6-mqtl-classifier"
RUN_NAME = "hyena-dna-mqtl-classifier" # "dna-bert-6-mqtl-classifier" #
# Pretrained checkpoint to fine-tune. Alternative: "zhihan1996/DNA_bert_6"
MODEL_NAME = "LongSafari/hyenadna-small-32k-seqlen-hf" # "zhihan1996/DNA_bert_6" # 
# True selects sequenceEncodePlusWithSplitting (the DNA-BERT path); False selects the HyenaDNA encoder.
SPLIT_SEQUENCE_REQUIRED=False          # False
# Sequence length; selects the "*_binned_{WINDOW}" dataset splits.
WINDOW = 4000  # use 200 on your local pc.

# Where the fine-tuned model is written locally and pushed remotely.
SAVE_MODEL_IN_LOCAL_DIRECTORY= f"fine-tuned-{RUN_NAME}-{WINDOW}"
SAVE_MODEL_IN_REMOTE_REPOSITORY = f"fahimfarhan/{RUN_NAME}-{WINDOW}"


# Row count used to turn epochs into steps; the datasets are streamed, so it is given by hand.
NUM_ROWS = 40_000    # hardcoded value
PER_DEVICE_BATCH_SIZE = getDynamicBatchSize()
EPOCHS = 10
NUM_GPUS = max(torch.cuda.device_count(), 1)  # fallback to 1 if no GPU

# Steps are derived from the batch size across all devices (floor division drops a partial last batch).
effective_batch_size = PER_DEVICE_BATCH_SIZE * NUM_GPUS
STEPS_PER_EPOCH = NUM_ROWS // effective_batch_size
MAX_STEPS = EPOCHS * STEPS_PER_EPOCH

print("init arguments completed")

init arguments completed


In [7]:
""" main """
# def start():
# Fine-tune MODEL_NAME as a 2-class sequence classifier on the streamed datasets,
# evaluating / saving / logging once per (approximate) epoch.
timber.info(green)
timber.info("---Inside start function---")
timber.info(f"{PER_DEVICE_BATCH_SIZE = }")

os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"  # to prevent out of memory error

# wandb.init(mode="offline")  # Logs only locally
signInToHuggingFaceAndWandbToUploadModelWeightsAndBiases()

config = AutoConfig.from_pretrained(MODEL_NAME, trust_remote_code=True)
print("Model architecture:", config.architectures)

mainTokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
mainModel = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, trust_remote_code=True, num_labels=2)

isGpuAvailable = torch.cuda.is_available()
if isGpuAvailable:
    mainModel = mainModel.to("cuda")  # not sure if it is necessary in the kaggle / huggingface virtual-machine


train_dataset, val_dataset, test_dataset = createPagingTrainValTestDatasets(tokenizer=mainTokenizer, window=WINDOW, splitSequenceRequired=SPLIT_SEQUENCE_REQUIRED)


trainingArgs = TrainingArguments(
    run_name=RUN_NAME,
    output_dir="output_checkpoints",
    eval_strategy="steps",
    save_strategy="steps",
    logging_strategy="steps",
    # evaluate, checkpoint and log once per epoch-equivalent number of steps
    eval_steps=STEPS_PER_EPOCH,
    save_steps=STEPS_PER_EPOCH,
    logging_steps=STEPS_PER_EPOCH,
    per_device_train_batch_size=PER_DEVICE_BATCH_SIZE,
    per_device_eval_batch_size=PER_DEVICE_BATCH_SIZE,
    # the datasets are iterable (no length), so training length is given in steps
    max_steps=MAX_STEPS,
    weight_decay=0.01,
    learning_rate=1e-3,
    logging_dir="./logs",
    save_safetensors=False,
    gradient_checkpointing=True,  # to prevent out of memory error
    # fp16 mixed precision is only enabled with a CUDA device, so the CPU fallback
    # used elsewhere in this notebook keeps working
    fp16=isGpuAvailable
)

dataCollator = DataCollatorWithPadding(tokenizer=mainTokenizer)


print("create trainer")
trainer = Trainer(
    model=mainModel,
    args=trainingArgs,
    train_dataset=train_dataset,  # train
    eval_dataset=val_dataset,  # validate
    data_collator=dataCollator,
    compute_metrics=computeMetricsUsingTorchEvaluate
)


# train, and validate
# Any training failure propagates; the summary is only printed after a successful run.
result = trainer.train()
print("-------Training completed. Results--------\n")
print(result)
    

.env file loaded successfully.


Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


Logged in to Hugging Face Hub successfully.


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


[34m[1mwandb[0m: Currently logged in as: [33mfahimfarhan[0m ([33mnotredamians[0m). Use [1m`wandb login --relogin`[0m to force relogin




[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Logged in to wand db successfully.


config.json:   0%|          | 0.00/981 [00:00<?, ?B/s]

configuration_hyena.py:   0%|          | 0.00/3.09k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/LongSafari/hyenadna-small-32k-seqlen-hf:
- configuration_hyena.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


Model architecture: ['HyenaDNAForCausalLM']


tokenizer_config.json:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

tokenization_hyena.py:   0%|          | 0.00/4.06k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/LongSafari/hyenadna-small-32k-seqlen-hf:
- tokenization_hyena.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


special_tokens_map.json:   0%|          | 0.00/971 [00:00<?, ?B/s]

modeling_hyena.py:   0%|          | 0.00/22.6k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/LongSafari/hyenadna-small-32k-seqlen-hf:
- modeling_hyena.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/16.3M [00:00<?, ?B/s]

Some weights of HyenaDNAForSequenceClassification were not initialized from the model checkpoint at LongSafari/hyenadna-small-32k-seqlen-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


README.md:   0%|          | 0.00/1.78k [00:00<?, ?B/s]

create trainer


[34m[1mwandb[0m: Tracking run with wandb version 0.19.1


[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20250409_041743-pmm3u931[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.


[34m[1mwandb[0m: Syncing run [33mhyena-dna-mqtl-classifier[0m


[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/notredamians/huggingface[0m


[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/notredamians/huggingface/runs/pmm3u931[0m




Step,Training Loss,Validation Loss


In [None]:
# Evaluate the fine-tuned model on the held-out test split.
# The former try/except only wrapped the print calls, while the call that can fail
# (trainer.evaluate) was outside it; errors now simply propagate.
test_results = trainer.evaluate(eval_dataset=test_dataset)
print("-------Test completed. Results--------\n")
print(test_results)


In [None]:
# Persist the fine-tuned weights locally first, then publish them to the Hub.
mainModel.save_pretrained(save_directory=SAVE_MODEL_IN_LOCAL_DIRECTORY, safe_serialization=False)

# push to the hub; the commit message records where the push originated
is_my_laptop = isMyLaptop()
commit_message = (
    f":tada: Push model for window size {WINDOW} from my laptop"
    if is_my_laptop
    else f":tada: Push model for window size {WINDOW} from huggingface space"
)

mainModel.push_to_hub(
    repo_id=SAVE_MODEL_IN_REMOTE_REPOSITORY,
    # subfolder=f"my-awesome-model-{WINDOW}", subfolder didn't work :/
    commit_message=commit_message,
    safe_serialization=False,
)
# pass