<a href="https://colab.research.google.com/github/frank-morales2020/MLxDL/blob/main/FINAL_LLM_JEPA_MISTRAL_FT_BTC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!nvidia-smi

Wed Oct  8 05:48:17 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA L4                      Off |   00000000:00:03.0 Off |                    0 |
| N/A   37C    P8             11W /   72W |       0MiB /  23034MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
# --- 1. ENVIRONMENT SETUP AND DEPENDENCY INSTALLATION ---
print("\n--- Installing dependencies and cleaning environment ---")
!pip cache purge
!rm -rf /root/.cache/huggingface/* /root/.cache/pip/* /tmp/*
!pip uninstall -y transformers peft bitsandbytes accelerate trl datasets pandas pandas_ta huggingface_hub quanto triton numpy scipy scikit-learn torch
!pip install -q -U numpy==1.26.4
!pip install -q -U scipy==1.14.1
!pip install -q -U scikit-learn==1.5.1
!pip install -q -U torch==2.3.0
!pip install -q -U transformers==4.44.2
!pip install -q -U peft==0.12.0
!pip install -q -U bitsandbytes==0.43.3
!pip install -q -U accelerate==0.34.2
!pip install -q -U trl==0.11.1
!pip install -q -U huggingface_hub==0.25.1
!pip install -q -U quanto==0.2.0
!pip install -q -U datasets==3.0.1
!pip install -q -U triton==2.3.0
!pip install -q -U pandas pandas_ta

In [3]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [4]:
!ls /content/gdrive/MyDrive/CryptoFT/dataset/

btc_instruction_dataset.jsonl  temp_tokenized


In [5]:
!df -h /
!du -sh /root/.cache/huggingface/

Filesystem      Size  Used Avail Use% Mounted on
overlay         236G   49G  188G  21% /
du: cannot access '/root/.cache/huggingface/': No such file or directory


In [6]:
#--- CLEAN UP TEMPORARY STORAGE ---
print("\n--- Cleaning up temporary storage ---")
!rm -rf /root/.cache/huggingface/*
!rm -rf /tmp/*
print("✅ Temporary storage cleared.")


--- Cleaning up temporary storage ---
rm: cannot remove '/tmp/colab_runtime.sock': Device or resource busy
✅ Temporary storage cleared.


In [7]:
from google.colab import userdata
access_token_write = userdata.get('HF_TOKEN')
from huggingface_hub import login

login(
  token=access_token_write,
  add_to_git_credential=True
)

Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


## MODEL-BASE

In [None]:
# --- FULL CODE: TRAINING + DEPLOYMENT + EVALUATION FOR MISTRAL-7B-BTC-JEPA-LLM-EXPERT ---

# --- 1. IMPORTS AND CUSTOM CLASSES ---
import os
import torch
import torch.nn.functional as F
import re
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer
)
from torch.utils.data import DataLoader
from peft import LoraConfig, get_peft_model, PeftModel
from huggingface_hub import login, HfApi, create_repo
from huggingface_hub.utils import RepositoryNotFoundError
from google.colab import drive
from tqdm import tqdm
import shutil

# Mount Google Drive
drive.mount('/content/gdrive')

# --- LLM-JEPA CONFIGURATION ---
LLM_JEPA_CONFIG = {
    "lbd": 2.0,               # Lambda (JEPA loss weight)
    "gamma": 1.0,             # Gamma (LLM loss weight)
    "predictors": 3,          # k (Number of predictor tokens)
    "last_token": -1,         # Index of last token for embedding (typically EOS/</s> or -1)
}
SPECIAL_PREDICTOR_TOKENS = [f"<|predictor_{i}|>" for i in range(1, LLM_JEPA_CONFIG["predictors"] + 1)]

# --- FILE PATHS ---
MODEL_NAME = "mistralai/Mistral-7B-v0.1"
DATASET_PATH = "/content/gdrive/MyDrive/CryptoFT/dataset/btc_instruction_dataset.jsonl"
OUTPUT_DIR = "/content/gdrive/MyDrive/CryptoFT/models/results_btc_jepa"
FINAL_MODEL_DIR = "/content/gdrive/MyDrive/CryptoFT/models/Mistral-7B-BTC-JEPA_FINAL"
HUB_MODEL_ID = "frankmorales2020/Mistral-7B-BTC-JEPA-LLM-Expert"
MAX_SEQ_LENGTH = 1024
TEMP_DATASET_DIR = "/content/gdrive/MyDrive/CryptoFT/dataset/temp_tokenized"

# --- UTILITIES ---
def get_messages_from_sample(sample_text):
    """Parses the Mistral template: '<s>[INST] Instruction [/INST] Response</s>'"""
    inst_match = re.search(r"\[INST\](.*?)\[/INST\]", sample_text, re.DOTALL)
    user_content = inst_match.group(1).strip() if inst_match else ""
    resp_match = re.search(r"\[/INST\](.*?)\s*</s>", sample_text, re.DOTALL)
    assistant_content = resp_match.group(1).strip() if resp_match else ""
    return {"user_content": user_content, "assistant_content": assistant_content}

# --- CUSTOM DATA COLLATOR ---
class CustomJEPABatchCollator:
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer
        self.keys = [
            "input_ids", "labels", "attention_mask",
            "input_ids_user", "labels_user", "attention_mask_user",
            "input_ids_assistant", "labels_assistant", "attention_mask_assistant"
        ]

    def __call__(self, features):
        batch = {key: [] for key in self.keys}
        if not features or not any(features):
            return None
        for feature in features:
            if all(key in feature for key in self.keys):
                for key in self.keys:
                    batch[key].append(feature[key])
        if not any(batch.values()):
            return None
        for key in self.keys:
            if not batch[key]:
                raise ValueError(f"No valid samples for key '{key}' in batch. Check dataset tokenization.")
            batch[key] = torch.LongTensor(batch[key])
        return batch

# --- CUSTOM TRAINER ---
class RepresentationTrainer(Trainer):
    def __init__(self, *args, **kwargs):
        self.lbd = kwargs.pop('lbd', 1.0)
        self.gamma = kwargs.pop('gamma', 1.0)
        self.last_token = kwargs.pop('last_token', -1)
        self._last_metrics = {}  # Store metrics for logging
        super().__init__(*args, **kwargs)

    def get_train_dataloader(self):
        return DataLoader(
            self.train_dataset,
            batch_size=self.args.per_device_train_batch_size,
            collate_fn=self.data_collator,
            shuffle=False,
            num_workers=0
        )

    def get_eval_dataloader(self, eval_dataset=None):
        dataset = eval_dataset if eval_dataset is not None else self.eval_dataset
        return DataLoader(
            dataset,
            batch_size=self.args.per_device_eval_batch_size,
            collate_fn=self.data_collator,
            shuffle=False,
            num_workers=0
        )

    def _last_token_index(self, input_ids, attention_mask):
        last_indices = torch.sum(attention_mask, dim=1) + self.last_token
        return torch.clamp(last_indices, min=0)

    def forward(self, model, inputs):
        batch_size = inputs["input_ids"].shape[0]
        llm_inputs = {
            "input_ids": torch.cat([inputs["input_ids"], inputs["input_ids_user"], inputs["input_ids_assistant"]], dim=0),
            "labels": torch.cat([inputs["labels"], inputs["labels_user"], inputs["labels_assistant"]], dim=0),
            "attention_mask": torch.cat([inputs["attention_mask"], inputs["attention_mask_user"], inputs["attention_mask_assistant"]], dim=0),
        }
        with torch.enable_grad():
            outputs = model(**llm_inputs, output_hidden_states=True)
        hidden_states = outputs.hidden_states[-1].requires_grad_(True)
        user_hidden_states = hidden_states[batch_size: batch_size * 2]
        assistant_hidden_states = hidden_states[batch_size * 2:]
        return {
            'main_outputs': outputs,
            'user_hidden_states': user_hidden_states,
            'assistant_hidden_states': assistant_hidden_states,
        }

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        batch_size = inputs["input_ids"].shape[0]
        index_user = self._last_token_index(inputs["input_ids_user"], inputs["attention_mask_user"])
        index_assistant = self._last_token_index(inputs["input_ids_assistant"], inputs["attention_mask_assistant"])
        forward_results = self.forward(model, inputs)
        lm_loss = forward_results['main_outputs'].loss
        user_hidden_states = forward_results['user_hidden_states']
        assistant_hidden_states = forward_results['assistant_hidden_states']
        user_embedding = user_hidden_states[range(batch_size), index_user, :]
        assistant_embedding = assistant_hidden_states[range(batch_size), index_assistant, :]
        cosine_similarity = F.cosine_similarity(user_embedding, assistant_embedding, dim=-1)
        jepa_loss = 1.0 - torch.mean(cosine_similarity)
        total_loss = self.gamma * lm_loss + self.lbd * jepa_loss
        self._last_metrics = {'jepa_loss': jepa_loss, 'lm_loss': lm_loss}

        # FIXED: Return only loss tensor for training, metrics are stored separately
        if return_outputs:
            return (total_loss, forward_results['main_outputs'])
        return total_loss

    def log(self, logs):
        if self._last_metrics:
            logs["jepa_loss"] = self._nested_gather(self._last_metrics["jepa_loss"]).mean().item()
            logs["lm_loss"] = self._nested_gather(self._last_metrics["lm_loss"]).mean().item()
        super().log(logs)

# --- 2. MODEL AND DATASET SETUP ---
print("\n--- Setting up model and dataset ---")
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(FINAL_MODEL_DIR, exist_ok=True)
os.makedirs(TEMP_DATASET_DIR, exist_ok=True)

# Clear any existing dataset cache
if os.path.exists(TEMP_DATASET_DIR):
    shutil.rmtree(TEMP_DATASET_DIR)
os.makedirs(TEMP_DATASET_DIR, exist_ok=True)

# PEFT (LoRA) Config
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
)


# Training Arguments - OPTIMIZED FOR JEPA MONITORING
training_arguments = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=1,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    optim="paged_adamw_8bit",
    save_steps=0,  # Disable saving during demo
    logging_steps=50,  # See JEPA metrics every 10 steps
    max_steps=500,  # Just enough to see JEPA loss decreasing
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=True,
    bf16=False,
    max_grad_norm=0.3,
    warmup_ratio=0.03,
    lr_scheduler_type="cosine",
    disable_tqdm=False,
    report_to="none",
    # ↓↓↓ CRITICAL CHANGES FOR DEMO ↓↓↓
    evaluation_strategy="no",  # Disable evaluation during training
    eval_steps=None,  # No evaluation steps
    metric_for_best_model=None,  # Not needed for demo
    # ↑↑↑ CRITICAL CHANGES FOR DEMO ↑↑↑
    dataloader_drop_last=True,
    dataloader_num_workers=0,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    resume_from_checkpoint=True
)

# Load Model with Quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False,
)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto"
)
model.config.use_cache = False
model.config.pretraining_tp = 1

# Load Tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
tokenizer.add_special_tokens({"additional_special_tokens": SPECIAL_PREDICTOR_TOKENS})
model.resize_token_embeddings(len(tokenizer))
model = get_peft_model(model, peft_config)
for name, param in model.named_parameters():
    if "lora" in name:
        param.requires_grad = True
print("✅ PEFT (LoRA) configuration applied")

# Load and Prepare Dataset
print("\n--- Loading and Splitting Custom Dataset ---")
dataset = load_dataset("json", data_files=DATASET_PATH, split="train")

# Filter malformed samples
def filter_valid_samples(example):
    text = example["text"]
    parsed = get_messages_from_sample(text)
    valid = bool(parsed["user_content"] and parsed["assistant_content"])
    if not valid:
        print(f"Skipping malformed sample: {text[:50]}...")
    return valid

filtered_dataset = dataset.filter(filter_valid_samples, desc="Filtering valid samples")
if len(filtered_dataset) < 3000:
    raise ValueError(f"Filtered dataset has {len(filtered_dataset)} samples, need at least 3000 for 2500 train + 500 eval.")
print(f"Filtered dataset size: {len(filtered_dataset)} samples")

# Select 2500 for train, 500 for eval
train_dataset = filtered_dataset.select(range(2500))
test_dataset = filtered_dataset.select(range(2500, 3000))
tokenized_dataset = {"train": train_dataset, "test": test_dataset}

def tokenize_jepa_views(example):
    sample_text = example["text"]
    parsed = get_messages_from_sample(sample_text)
    predictor_str = "".join(SPECIAL_PREDICTOR_TOKENS)
    user_text_with_pred = f"<s>[INST] {parsed['user_content']} {predictor_str} [/INST]"
    assistant_text = f"{parsed['assistant_content']}</s>"
    tokenized_full = tokenizer(sample_text, truncation=True, max_length=MAX_SEQ_LENGTH, padding="max_length", return_tensors="np")
    tokenized_user = tokenizer(user_text_with_pred, truncation=True, max_length=MAX_SEQ_LENGTH, padding="max_length", return_tensors="np")
    tokenized_assistant = tokenizer(assistant_text, truncation=True, max_length=MAX_SEQ_LENGTH, padding="max_length", return_tensors="np")

    return {
        "input_ids": tokenized_full["input_ids"][0].tolist(),
        "attention_mask": tokenized_full["attention_mask"][0].tolist(),
        "labels": tokenized_full["input_ids"][0].copy().tolist(),
        "input_ids_user": tokenized_user["input_ids"][0].tolist(),
        "attention_mask_user": tokenized_user["attention_mask"][0].tolist(),
        "labels_user": [-100] * MAX_SEQ_LENGTH,
        "input_ids_assistant": tokenized_assistant["input_ids"][0].tolist(),
        "attention_mask_assistant": tokenized_assistant["attention_mask"][0].tolist(),
        "labels_assistant": [-100] * MAX_SEQ_LENGTH
    }

# Tokenize and save to disk to avoid caching issues
tokenized_dataset["train"] = tokenized_dataset["train"].map(
    tokenize_jepa_views,
    batched=False,
    remove_columns=["text"],
    desc="Tokenizing train samples for LLM-JEPA views",
    load_from_cache_file=False
)
tokenized_dataset["test"] = tokenized_dataset["test"].map(
    tokenize_jepa_views,
    batched=False,
    remove_columns=["text"],
    desc="Tokenizing test samples for LLM-JEPA views",
    load_from_cache_file=False
)
tokenized_dataset["train"].save_to_disk(os.path.join(TEMP_DATASET_DIR, "train"))
tokenized_dataset["test"].save_to_disk(os.path.join(TEMP_DATASET_DIR, "test"))

# Reload dataset to ensure consistency
from datasets import load_from_disk
tokenized_dataset = {
    "train": load_from_disk(os.path.join(TEMP_DATASET_DIR, "train")),
    "test": load_from_disk(os.path.join(TEMP_DATASET_DIR, "test"))
}


## DATASET

In [2]:
# Validate all samples in dataset
print("\n--- Validating Dataset Keys ---")
required_keys = [
    "input_ids", "labels", "attention_mask",
    "input_ids_user", "labels_user", "attention_mask_user",
    "input_ids_assistant", "labels_assistant", "attention_mask_assistant"
]
for split in ["train", "test"]:
    print(f"\nChecking {split} split...")
    missing_samples = []
    for idx, sample in tqdm(enumerate(tokenized_dataset[split]), total=len(tokenized_dataset[split]), desc=f"Validating {split} split"):
        missing_keys = [key for key in required_keys if key not in sample]
        if missing_keys:
            missing_samples.append((idx, missing_keys))
    if missing_samples:
        print(f"Found {len(missing_samples)} problematic samples in {split} split:")
        for idx, missing_keys in missing_samples[:5]:
            print(f"Sample {idx} missing keys: {missing_keys}")
        raise ValueError(f"Dataset validation failed in {split} split.")
    print(f"{split} split: All {len(tokenized_dataset[split])} samples validated.")

COLUMNS_TO_KEEP = required_keys
try:
    tokenized_dataset["train"] = tokenized_dataset["train"].select_columns(COLUMNS_TO_KEEP)
    tokenized_dataset["test"] = tokenized_dataset["test"].select_columns(COLUMNS_TO_KEEP)
    print(f"Dataset columns filtered to: {COLUMNS_TO_KEEP}")
except Exception as e:
    print(f"Error during column selection: {e}")
    print(f"Available columns in train: {tokenized_dataset['train'].column_names}")
    print(f"Available columns in test: {tokenized_dataset['test'].column_names}")
    raise



--- Validating Dataset Keys ---

Checking train split...


Validating train split: 100%|██████████| 2500/2500 [00:10<00:00, 242.73it/s]


train split: All 2500 samples validated.

Checking test split...


Validating test split: 100%|██████████| 500/500 [00:02<00:00, 239.19it/s]

test split: All 500 samples validated.
Dataset columns filtered to: ['input_ids', 'labels', 'attention_mask', 'input_ids_user', 'labels_user', 'attention_mask_user', 'input_ids_assistant', 'labels_assistant', 'attention_mask_assistant']





## TRAINING

In [3]:
# Debug dataset before training
print("\n--- Debugging Dataset Before Training ---")
for split in ["train", "test"]:
    print(f"\n{split} split sample keys:")
    for idx in range(min(5, len(tokenized_dataset[split]))):
        sample = tokenized_dataset[split][idx]
        print(f"Sample {idx} keys: {list(sample.keys())}")

# --- 3. TRAINER INITIALIZATION AND EXECUTION ---
print("\n--- Initializing and Starting RepresentationTrainer (LLM-JEPA) ---")
custom_collator = CustomJEPABatchCollator(tokenizer)

# Log GPU memory before training
def log_gpu_memory():
    if torch.cuda.is_available():
        print(f"GPU Memory Allocated: {torch.cuda.memory_allocated() / 1024**2:.2f} MiB")
        print(f"GPU Memory Reserved: {torch.cuda.memory_reserved() / 1024**2:.2f} MiB")
        print(f"GPU Memory Free: {(torch.cuda.get_device_properties(0).total_memory - torch.cuda.memory_reserved()) / 1024**2:.2f} MiB")

print("\n--- GPU Memory Before Training ---")
log_gpu_memory()

# ADD FIXED JEPA MONITORING CALLBACK HERE

from transformers import TrainerCallback, TrainingArguments, TrainerState, TrainerControl

# --- CALLBACK 1: MONITORING AND LOGGING (Your Original Role) ---
class JEPAMonitorCallback(TrainerCallback):
    def on_step_end(self, args, state, control, **kwargs):
        # Force JEPA metrics display every 10 steps (matching logging_steps=10)
        if state.global_step % 50 == 0 and hasattr(trainer, '_last_metrics'):
            metrics = trainer._last_metrics
            jepa_loss = metrics.get('jepa_loss')
            lm_loss = metrics.get('lm_loss')
            if jepa_loss is not None and lm_loss is not None:
                print(f"\n🎯 JEPA Metrics [Step {state.global_step}]:")
                print(f"   JEPA Loss: {jepa_loss.item():.4f}")
                print(f"   LM Loss: {lm_loss.item():.4f}")
                print(f"   Cosine Sim: {1 - jepa_loss.item():.4f}")
                print(f"   Total Loss: {trainer.gamma * lm_loss + trainer.lbd * jepa_loss:.4f}")

    def on_log(self, args, state, control, logs=None, **kwargs):
        # Also add to official logs for potential WandB/etc.
        if logs is not None and hasattr(trainer, '_last_metrics'):
            jepa_loss = trainer._last_metrics.get('jepa_loss')
            lm_loss = trainer._last_metrics.get('lm_loss')
            if jepa_loss is not None:
                logs["jepa_loss"] = jepa_loss.item()
                logs["lm_loss"] = lm_loss.item()

# --- CALLBACK 2: CONTROL AND EARLY STOPPING (The New Role) ---
# This class handles the conditional stopping logic.
class JEPALossStabilityCallback(TrainerCallback):
    """
    Stops training when the JEPA Loss on the training batch drops below a threshold
    and stabilizes for a specified number of logging steps.
    """
    def __init__(self, jepa_threshold=0.0001, stability_steps=3):
        self.jepa_threshold = jepa_threshold
        self.stability_steps = stability_steps
        self.stable_count = 0

    def on_step_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
        trainer = kwargs.get('trainer')

        # Check only at logging steps (every 50 steps)
        if state.global_step > 0 and state.global_step % args.logging_steps == 0 and hasattr(trainer, '_last_metrics'):
            metrics = trainer._last_metrics
            jepa_loss = metrics.get('jepa_loss')

            if jepa_loss is not None:
                current_jepa_loss = jepa_loss.item()

                # Check for Early Stop Condition
                if current_jepa_loss <= self.jepa_threshold:
                    self.stable_count += 1

                    if self.stable_count >= self.stability_steps:
                        # SET THE CONTROL FLAG TO STOP
                        control.should_stop = True
                else:
                    self.stable_count = 0

        return control

# Ensure both classes are defined in a single block before proceeding.
# --- RE-INITIALIZE TRAINER WITH TWO CUSTOM CALLBACKS ---

# Ensure the custom collator is ready
custom_collator = CustomJEPABatchCollator(tokenizer)

# Pass both callback instances in a list
trainer = RepresentationTrainer(
    model=model,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"], # Keep this here for structure, even if eval_strategy="no" is in TrainingArguments
    args=training_arguments,
    data_collator=custom_collator,
    lbd=LLM_JEPA_CONFIG["lbd"],
    gamma=LLM_JEPA_CONFIG["gamma"],
    last_token=LLM_JEPA_CONFIG["last_token"],
    callbacks=[
        JEPAMonitorCallback(),                     # 1. For Logging (Monitors progress)
        JEPALossStabilityCallback(jepa_threshold=0.0001, stability_steps=3) # 2. For Control (Stops the process)
    ]
)

print("\n--- Trainer Initialized with Separate Logging and Control Callbacks ---")
# You can now call trainer.train() in the next cell

max_steps is given, it will override any value given in num_train_epochs



--- Debugging Dataset Before Training ---

train split sample keys:
Sample 0 keys: ['input_ids', 'labels', 'attention_mask', 'input_ids_user', 'labels_user', 'attention_mask_user', 'input_ids_assistant', 'labels_assistant', 'attention_mask_assistant']
Sample 1 keys: ['input_ids', 'labels', 'attention_mask', 'input_ids_user', 'labels_user', 'attention_mask_user', 'input_ids_assistant', 'labels_assistant', 'attention_mask_assistant']
Sample 2 keys: ['input_ids', 'labels', 'attention_mask', 'input_ids_user', 'labels_user', 'attention_mask_user', 'input_ids_assistant', 'labels_assistant', 'attention_mask_assistant']
Sample 3 keys: ['input_ids', 'labels', 'attention_mask', 'input_ids_user', 'labels_user', 'attention_mask_user', 'input_ids_assistant', 'labels_assistant', 'attention_mask_assistant']
Sample 4 keys: ['input_ids', 'labels', 'attention_mask', 'input_ids_user', 'labels_user', 'attention_mask_user', 'input_ids_assistant', 'labels_assistant', 'attention_mask_assistant']

test split

In [4]:
print(f"✅ JEPA Monitoring Enabled - will show metrics every {training_arguments.logging_steps}/{training_arguments.max_steps} steps")
print("\n--- Starting LLM-JEPA Fine-Tuning with Monitoring ---")
trainer.train()
print("\n--- Fine-Tuning Complete! ---")

# Clean up old checkpoints
print("\n--- Cleaning Up Old Checkpoints ---")
shutil.rmtree(OUTPUT_DIR, ignore_errors=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Save to Google Drive
print("\n--- Saving Final Adapter Weights to Google Drive ---")
trainer.model.save_pretrained(FINAL_MODEL_DIR)
tokenizer.save_pretrained(FINAL_MODEL_DIR)
print(f"\n✅ LoRA adapter and tokenizer saved to: '{FINAL_MODEL_DIR}'")
print(f"Checkpoints and logs saved to '{OUTPUT_DIR}'")

✅ JEPA Monitoring Enabled - will show metrics every 50/500 steps

--- Starting LLM-JEPA Fine-Tuning with Monitoring ---


Step,Training Loss
50,0.8764
100,0.2354
150,0.2245
200,0.2292
250,0.2313



🎯 JEPA Metrics [Step 50]:
   JEPA Loss: 0.0001
   LM Loss: 0.2534
   Cosine Sim: 0.9999
   Total Loss: 0.2535

🎯 JEPA Metrics [Step 100]:
   JEPA Loss: 0.0000
   LM Loss: 0.2319
   Cosine Sim: 1.0000
   Total Loss: 0.2320

🎯 JEPA Metrics [Step 150]:
   JEPA Loss: 0.0000
   LM Loss: 0.2438
   Cosine Sim: 1.0000
   Total Loss: 0.2438

🎯 JEPA Metrics [Step 200]:
   JEPA Loss: 0.0000
   LM Loss: 0.2384
   Cosine Sim: 1.0000
   Total Loss: 0.2384

🎯 JEPA Metrics [Step 250]:
   JEPA Loss: 0.0000
   LM Loss: 0.2216
   Cosine Sim: 1.0000
   Total Loss: 0.2217


Step,Training Loss
50,0.8764
100,0.2354
150,0.2245
200,0.2292
250,0.2313
300,0.2215
350,0.2253
400,0.2293
450,0.2204
500,0.2242



🎯 JEPA Metrics [Step 300]:
   JEPA Loss: 0.0000
   LM Loss: 0.1579
   Cosine Sim: 1.0000
   Total Loss: 0.1580

🎯 JEPA Metrics [Step 350]:
   JEPA Loss: 0.0000
   LM Loss: 0.2287
   Cosine Sim: 1.0000
   Total Loss: 0.2287

🎯 JEPA Metrics [Step 400]:
   JEPA Loss: 0.0000
   LM Loss: 0.2151
   Cosine Sim: 1.0000
   Total Loss: 0.2152

🎯 JEPA Metrics [Step 450]:
   JEPA Loss: 0.0000
   LM Loss: 0.1994
   Cosine Sim: 1.0000
   Total Loss: 0.1994

🎯 JEPA Metrics [Step 500]:
   JEPA Loss: 0.0000
   LM Loss: 0.2188
   Cosine Sim: 1.0000
   Total Loss: 0.2189





--- Fine-Tuning Complete! ---

--- Cleaning Up Old Checkpoints ---

--- Saving Final Adapter Weights to Google Drive ---





✅ LoRA adapter and tokenizer saved to: '/content/gdrive/MyDrive/CryptoFT/models/Mistral-7B-BTC-JEPA_FINAL'
Checkpoints and logs saved to '/content/gdrive/MyDrive/CryptoFT/models/results_btc_jepa'


## DEPLOYMENT

In [None]:
from google.colab import userdata

# --- 4. DEPLOYMENT TO HUGGING FACE ---
print("\n--- Authenticating with Hugging Face Hub ---")
try:
    access_token_write = userdata.get('HF_TOKEN')
    if not access_token_write:
        raise ValueError("HF_TOKEN secret not found or is empty.")
    login(token=access_token_write, add_to_git_credential=True)
    api = HfApi(token=access_token_write)
    print("✅ Successfully logged into Hugging Face Hub.")
except Exception as e:
    print(f"FATAL ERROR during login: {e}")
    raise

print(f"\n--- Checking/Creating Repository: {HUB_MODEL_ID} ---")
try:
    api.repo_info(repo_id=HUB_MODEL_ID, repo_type="model")
    print(f"✅ Repository {HUB_MODEL_ID} already exists.")
except RepositoryNotFoundError:
    print(f"⚠️ Repository {HUB_MODEL_ID} not found. Creating it now...")
    create_repo(repo_id=HUB_MODEL_ID, repo_type="model", private=False, token=access_token_write)
    print(f"✅ Repository {HUB_MODEL_ID} created successfully.")

print(f"\n--- Pushing LoRA Adapter and Tokenizer ---")
if not os.path.exists(FINAL_MODEL_DIR):
    raise FileNotFoundError(f"Adapter directory not found at {FINAL_MODEL_DIR}.")
try:
    trainer.model.push_to_hub(
        repo_id=HUB_MODEL_ID,
        commit_message="Initial QLoRA adapter for Bitcoin price prediction",
        private=False
    )
    tokenizer.push_to_hub(HUB_MODEL_ID)
    print("\n✅ SUCCESS: Adapter and tokenizer uploaded to Hugging Face Hub.")
except Exception as e:
    print(f"\n⚠️ WARNING: Trainer push failed. Using HfApi fallback. Error: {e}")
    api.upload_folder(
        folder_path=FINAL_MODEL_DIR,
        repo_id=HUB_MODEL_ID,
        commit_message="LoRA adapter and tokenizer pushed from Google Drive checkpoint",
        ignore_patterns=["*.pt", "*.bin", "optimizer.pt", "scheduler.pt", "rng_state.pth"]
    )
    print("\n✅ SUCCESS (Fallback): Adapter and tokenizer uploaded to Hugging Face Hub.")
print(f"\nDeployment Complete. Model available at: https://huggingface.co/{HUB_MODEL_ID}")

## EVALUATION

In [None]:
# 🚨 FIRST: COMPLETELY RESET PEFT INSTALLATION
!pip uninstall -y peft
!pip install peft==0.10.0  # Use stable version

# Restart runtime after this command

In [None]:
# --- FILE PATHS ---
MODEL_NAME = "mistralai/Mistral-7B-v0.1"
DATASET_PATH = "/content/gdrive/MyDrive/CryptoFT/dataset/btc_instruction_dataset.jsonl"
OUTPUT_DIR = "/content/gdrive/MyDrive/CryptoFT/models/results_btc_jepa"
FINAL_MODEL_DIR = "/content/gdrive/MyDrive/CryptoFT/models/Mistral-7B-BTC-JEPA_FINAL"
HUB_MODEL_ID = "frankmorales2020/Mistral-7B-BTC-JEPA-LLM-Expert"
MAX_SEQ_LENGTH = 1024
TEMP_DATASET_DIR = "/content/gdrive/MyDrive/CryptoFT/dataset/temp_tokenized"


In [None]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer
)
import torch
from peft import LoraConfig, get_peft_model, PeftModel

In [None]:
# Install necessary libraries for QLoRA and SFTTrainer
!pip install -q -U bitsandbytes transformers peft accelerate trl datasets -q

In [None]:
# --- FILE PATHS ---
MODEL_NAME = "mistralai/Mistral-7B-v0.1"
FINAL_MODEL_DIR = "/content/gdrive/MyDrive/CryptoFT/models/Mistral-7B-BTC-JEPA_FINAL"
HUB_MODEL_ID = "frankmorales2020/Mistral-7B-BTC-JEPA-LLM-Expert"

In [None]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
)
import torch
from peft import PeftModel
import peft
import os

# --- FILE PATHS (Replicated from original code) ---
MODEL_NAME = "mistralai/Mistral-7B-v0.1"
HUB_MODEL_ID = "frankmorales2020/Mistral-7B-BTC-JEPA-LLM-Expert"

# --- 1. SETUP ---
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_TYPE = "FINE_TUNED"

# --- 2. MODEL AND TOKENIZER LOADING (Single block for efficiency) ---
print("\n--- Model and Tokenizer Setup ---")
print(f"--- Loading Model onto {DEVICE} ---")
print(f'BASE MODEL: {MODEL_NAME}\nFINE TUNE MODEL: {HUB_MODEL_ID}')

try:
    # 4-bit Quantization Config
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
    )

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    tokenizer.pad_token = tokenizer.eos_token

    # Add special tokens and resize embeddings (essential for fine-tuned LoRA)
    SPECIAL_PREDICTOR_TOKENS = ["<pred>", "<targ>", "<jepa>"]
    tokenizer.add_special_tokens({"additional_special_tokens": SPECIAL_PREDICTOR_TOKENS})

    # Load base model
    base_model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True,
    )
    base_model.resize_token_embeddings(len(tokenizer))

    # Load fine-tuned adapter weights
    model = PeftModel.from_pretrained(base_model, HUB_MODEL_ID).eval()
    print("🎉 SUCCESS: Loaded fine-tuned JEPA model!")

except Exception as e:
    print(f"❌ Model loading failed: {e}")
    # Fallback to base model logic is removed for this test since the specific fine-tuned
    # model must be used, so we raise the error.
    raise

# --- 3. INFERENCE FUNCTION (Encapsulating fixed parameters) ---

def run_inference_test(btc_data_input, tokenizer, model, device):
    """Runs a single inference test with the fixed strict extraction logic."""

    # 1. Create Strict Prompt
    user_prompt_content = f"Current BTC data: {btc_data_input}. Give ONLY the 12-hour direction (UP, DOWN, or FLAT). The output MUST be a single word: UP, DOWN, or FLAT."
    input_text = f"<s>[INST] {user_prompt_content} [/INST]"

    # 2. Tokenize and Set Fixed Generation Params
    inputs = tokenizer(input_text, return_tensors="pt").to(device)

    # Fixed generation parameters proven to work with robust extraction
    generation_params = {
        "max_new_tokens": 15,
        "do_sample": False,
        "temperature": 0.1,
        "top_p": 1.0,
        "repetition_penalty": 1.0,
    }

    # 3. Generate Output
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            **generation_params,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    response_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # 4. Robust Extraction Logic (Final Corrected Logic)
    prediction_output = "❌ PREDICTION NOT FOUND"
    target_words = ["UP", "DOWN", "FLAT"]

    if "[/INST]" in response_text:
        prediction_output_raw = response_text.split("[/INST]")[-1].strip()

        # Iterate through the expected target words
        for target in target_words:
            if target in prediction_output_raw.upper():
                prediction_output = target
                break
    else:
        prediction_output_raw = response_text.strip()
        for target in target_words:
            if target in prediction_output_raw.upper():
                prediction_output = target
                break

    return input_text, prediction_output_raw, prediction_output

# --- 4. MULTI-INPUT EXECUTION ---

# Define the 3 test cases to check for all directions
TEST_CASES = [
    {
        "name": "Test 1: Downward Reversal (Original Input)",
        "input": "[O:30000, H:30500, C:30200]",
        "expected_logic": "DOWN"
    },
]


In [13]:
for i, test in enumerate(TEST_CASES):
    print(f"\n[{test['name']} - Expecting {test['expected_logic']}]")

    # Run the test
    input_text, raw_output, cleaned_prediction = run_inference_test(
        test['input'], tokenizer, model, DEVICE
    )

    # Print results
    print("=" * 70)
    print(f"📤 INPUT: {test['input']}")
    print(f"🤖 RAW OUTPUT: {raw_output}")
    print(f"✅ CLEANED PREDICTION: **{cleaned_prediction}**")
    print("=" * 70)


[Test 1: Downward Reversal (Original Input) - Expecting DOWN]
📤 INPUT: [O:30000, H:30500, C:30200]
🤖 RAW OUTPUT: The 12-hour prediction is **DOWN**. The final prediction is
✅ CLEANED PREDICTION: **DOWN**
