In [1]:
!pip install -q --upgrade transformers peft trl datasets accelerate bitsandbytes scikit-learn

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.5/10.5 MB[0m [31m79.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m366.3/366.3 kB[0m [31m21.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m22.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.0/67.0 MB[0m [31m34.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m84.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m85.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
import json
from datasets import load_dataset
from peft import LoraConfig
from trl import SFTTrainer
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
import torch

In [8]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
path = "/content/drive/MyDrive/"
csv_path =  path + "data/dataset_convert.csv"
# Cargar el archivo CSV
df = pd.read_csv(csv_path)

In [20]:
# Estratificación manual por quantiles del label (para regresión)
df['label_bin'] = pd.qcut(df['label'], q=10, duplicates='drop')
train_df, temp_df = train_test_split(df, test_size=0.19, stratify=df['label_bin'], random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=10/19, stratify=temp_df['label_bin'], random_state=42)

# Eliminar columna auxiliar
for d in [train_df, val_df, test_df]:
    d.drop(columns='label_bin', inplace=True)

In [None]:
from huggingface_hub import notebook_login
notebook_login()

In [21]:
# Guardar como .jsonl para usar con HuggingFace Datasets
def save_jsonl(df, filename):
    with open(filename, 'w') as f:
        for _, row in df.iterrows():
            prompt = f"Given the peptide sequence '{row['sequence']}', predict its activity value."
            response = str(row['label'])
            json.dump({"prompt": prompt, "response": response}, f)
            f.write('\n')

save_jsonl(train_df, path + "model_Llama3_2_trans/train.jsonl")
save_jsonl(val_df,  path + "model_Llama3_2_trans/val.jsonl")
save_jsonl(test_df, path + "model_Llama3_2_trans/test.jsonl")

In [22]:
model_id = "meta-llama/Llama-3.2-1B"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto"
)

train_dataset = load_dataset("json", data_files=path + "model_Llama3_2_trans/train.jsonl", split="train")
val_dataset = load_dataset("json", data_files=path + "model_Llama3_2_trans/val.jsonl", split="train")

def formatting_func(example):
    return {
        "prompt": example['prompt'],
        "completion": example['response']
    }

train_dataset = train_dataset.map(formatting_func)
val_dataset = val_dataset.map(formatting_func)

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

#entrenamiento
training_args = TrainingArguments(
    output_dir="./llama3-regression",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=1e-5,
    num_train_epochs=50,
    logging_dir="./logs",
    save_strategy="epoch",
    fp16=True,
    report_to="none"
)

trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    args=training_args,
    peft_config=lora_config
)


trainer.train()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/383 [00:00<?, ? examples/s]

Map:   0%|          | 0/43 [00:00<?, ? examples/s]

Converting train dataset to ChatML:   0%|          | 0/383 [00:00<?, ? examples/s]

Adding EOS to train dataset:   0%|          | 0/383 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/383 [00:00<?, ? examples/s]



Truncating train dataset:   0%|          | 0/383 [00:00<?, ? examples/s]

Converting eval dataset to ChatML:   0%|          | 0/43 [00:00<?, ? examples/s]

Adding EOS to eval dataset:   0%|          | 0/43 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/43 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/43 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
500,4.1897
1000,3.5343
1500,3.5009
2000,3.4775


TrainOutput(global_step=2400, training_loss=3.6404798380533854, metrics={'train_runtime': 1608.6062, 'train_samples_per_second': 11.905, 'train_steps_per_second': 1.492, 'total_flos': 7880460677591040.0, 'train_loss': 3.6404798380533854})

In [23]:
#Evaluación sobre test
test_dataset = load_dataset("json", data_files=path + "model_Llama3_2_trans/test.jsonl", split="train")
import numpy as np

def predict(sequence):
    prompt = f"Given the peptide sequence '{sequence}', predict its activity value."
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    outputs = model.generate(**inputs, max_new_tokens=10)
    result = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return float(result.replace(prompt, "").strip())

preds = []
truth = []

for row in test_dataset:
    try:
        y_pred = predict(row['prompt'].split("'")[1])
        y_true = float(row['response'])
        preds.append(y_pred)
        truth.append(y_true)
    except:
        continue



Generating train split: 0 examples [00:00, ? examples/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for

In [24]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
print("Test RMSE:", np.sqrt(mean_squared_error(truth, preds)))
print("Test MAE:", mean_absolute_error(truth, preds))
print("R^2 Score:", r2_score(truth, preds))

Test RMSE: 0.46143615750193034
Test MAE: 0.3731255220416667
R^2 Score: 0.48142771351604385


In [None]:
output_model_dir = path + "model_Llama3_2_trans/model/llama3-regression-model"

# Guardar el modelo entrenado con LoRA
trainer.save_model(output_model_dir)

# Guardar el tokenizador
tokenizer.save_pretrained(output_model_dir)