<a href="https://colab.research.google.com/github/frank-morales2020/MLxDL/blob/main/FPU_MISTRAL_LAMBDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import json
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling, BitsAndBytesConfig
import torch
from rouge_score import rouge_scorer
from tqdm import tqdm
from peft import LoraConfig, get_peft_model
from evaluate import load
import numpy as np
import nltk
import re


import gc

# Run a garbage-collection pass and release cached CUDA memory at the top of
# the cell, before anything large is loaded.
# NOTE(review): presumably here to reclaim memory left over from an earlier
# run of this cell in the same session -- confirm.
gc.collect()
torch.cuda.empty_cache() # Specifically for CUDA memory


# Download NLTK sentence-tokenizer data; nltk.sent_tokenize is used later
# when the evaluation metrics are computed.
nltk.download('punkt_tab')

import warnings
# Suppress the UserWarning about `save_embedding_layers` (presumably emitted
# when the adapter is saved -- confirm the source of the warning).
warnings.filterwarnings("ignore", message="Setting `save_embedding_layers` to `True` as embedding layers found in `target_modules`.", category=UserWarning)

# ---------------------- Configuration ----------------------
# Base checkpoint to fine-tune and the JSONL files used for training/evaluation.
model_checkpoint = "mistralai/Mistral-7B-Instruct-v0.1"
train_dataset_path = "/home/ubuntu/work/cmapss_FD004_train_text.jsonl"
validation_dataset_path = "/home/ubuntu/work/cmapss_FD004_test_text.jsonl"
# Directory that receives checkpoints and the final saved model.
output_dir = "./fine-tuned-mistral-peft-lambda"
# 4 sequences per device x 4 accumulation steps = 16 sequences per optimizer step per device.
per_device_train_batch_size = 4
gradient_accumulation_steps = 4
num_train_epochs = 10
learning_rate = 2e-5
weight_decay = 0.01
warmup_steps = 100
# Maximum number of tokens kept per example when tokenizing.
max_seq_length = 512
# Logging, checkpointing and evaluation all happen every 10 optimizer steps.
logging_steps = 10
save_steps = 10
eval_steps = 10
evaluation_strategy = "steps"
# Upper bound on the number of checkpoints kept on disk.
save_total_limit = 2
# Request fp16 mixed precision whenever a CUDA device is present.
fp16 = torch.cuda.is_available()
gradient_checkpointing = False

# ---------------------- 1. Load Datasets ----------------------
def load_jsonl_dataset(path):
    """Load a JSON Lines file into a `datasets.Dataset`.

    Args:
        path: Path to a UTF-8 text file holding one JSON object per line.

    Returns:
        A `Dataset` built from the parsed records, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(path, 'r', encoding='utf-8') as f:
        # Blank lines (e.g. an extra newline at the end of the file) are
        # skipped instead of crashing json.loads.
        records = [json.loads(line) for line in f if line.strip()]
    return Dataset.from_list(records)

# Parse both JSONL files eagerly into in-memory datasets. Note that the
# "validation" path configured above points at the FD004 *test* file.
train_dataset = load_jsonl_dataset(train_dataset_path)
eval_dataset = load_jsonl_dataset(validation_dataset_path)

# ---------------------- 2. Load Tokenizer ----------------------
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
# Reuse the unknown token as the padding token (string and id are both set).
# NOTE(review): presumably chosen instead of EOS so that padding stays
# distinguishable from the end-of-sequence marker -- confirm.
tokenizer.pad_token = tokenizer.unk_token
tokenizer.pad_token_id = tokenizer.unk_token_id

# ---------------------- 3. Preprocess Data ----------------------
def _extract_prompt_and_response(item):
    """Return ``(prompt, response)`` for one `contents` entry, or None if it is malformed.

    A valid entry is a two-element sequence: a 'user' turn followed by a
    'model' turn, each with a non-empty ``parts[0]['text']``.
    """
    try:
        if not item or len(item) != 2:
            return None
        user_turn, model_turn = item[0], item[1]
        if user_turn['role'] != 'user' or model_turn['role'] != 'model':
            return None
        if not user_turn['parts'] or not model_turn['parts']:
            return None
        user_text = user_turn['parts'][0]['text']
        model_text = model_turn['parts'][0]['text']
    except (KeyError, IndexError, TypeError):
        return None
    if not user_text or not model_text:
        return None

    # str.replace is a no-op when the prefix is absent, so no membership
    # test is needed first.
    sensor_data = user_text.replace("Engine sensor readings over time: ", "")
    prompt = f"Predict the remaining useful life for this engine with sensor readings: {sensor_data}"

    # The training target is only the first integer in the model turn.
    rul_match = re.search(r'(\d+)', model_text)
    response = rul_match.group(1) if rul_match else "0"  # Default if no number found
    return prompt, response


def tokenize_function(examples):
    """Turn a batch of chat-style records into causal-LM training features.

    Each valid record becomes the token sequence of
    ``prompt + EOS + response + EOS``. Invalid records are reported with a
    printed message and skipped, so the output may contain fewer rows than
    the input batch.

    The prompt is truncated (never the response) so that the whole sequence
    fits in ``max_seq_length`` tokens; truncating the concatenated text
    instead would cut off the answer whenever the sensor readings are long.

    Args:
        examples: Batch dict with a 'contents' column; each entry is expected
            to be ``[user_turn, model_turn]``.

    Returns:
        Dict with 'input_ids', 'attention_mask' and 'labels' (unpadded lists
        of token ids, one per kept record). 'labels' is a copy of 'input_ids'.
    """
    pairs = []
    for item in examples['contents']:
        pair = _extract_prompt_and_response(item)
        if pair is None:
            print(f"Skipping invalid data point: {item}")
            continue
        pairs.append(pair)

    # Loop invariants.
    eos = tokenizer.eos_token
    vocab_size = tokenizer.vocab_size

    input_ids = []
    attention_mask = []
    labels = []

    for i, (prompt, response) in enumerate(pairs):
        # Separator + answer + terminator, tokenized without an extra BOS.
        tail_ids = tokenizer.encode(eos + response + eos, add_special_tokens=False)
        # Reserve room for the tail and give the prompt whatever is left.
        prompt_budget = max(1, max_seq_length - len(tail_ids))
        prompt_ids = tokenizer.encode(prompt, max_length=prompt_budget, truncation=True)
        current_input_ids = prompt_ids + tail_ids

        # Sanity check: every id must index into the embedding table.
        max_index = max(current_input_ids) if current_input_ids else -1
        min_index = min(current_input_ids) if current_input_ids else float('inf')

        if max_index >= vocab_size or min_index < 0:
            print(f"Warning: Out-of-bounds index found in tokenized input (example {i}):")
            print(f"  Max index: {max_index}, Vocabulary size: {vocab_size}")
            print(f"  Min index: {min_index}")

        input_ids.append(current_input_ids)
        attention_mask.append([1] * len(current_input_ids))
        labels.append(current_input_ids.copy())

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }

# Tokenize both splits in batches across 4 worker processes. The raw
# 'contents' column is dropped so that only the columns returned by
# tokenize_function remain.
tokenized_train_datasets = train_dataset.map(tokenize_function, batched=True, num_proc=4, remove_columns=['contents'])
tokenized_eval_datasets = eval_dataset.map(tokenize_function, batched=True, num_proc=4, remove_columns=['contents'])

# ---------------------- 4. Data Collator ----------------------
# mlm=False selects causal (next-token) language modelling instead of
# masked-token prediction.
# NOTE(review): this collator presumably pads each batch and rebuilds the
# labels from input_ids -- confirm against the installed transformers version.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

# ---------------------- 5. Load Model with PEFT ----------------------
# 4-bit NF4 quantization with double quantization; computation runs in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Load the quantized base model; device_map="auto" delegates device placement
# to the library.
model = AutoModelForCausalLM.from_pretrained(
    model_checkpoint,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config
)

# LoRA adapter settings: rank 8, alpha 32, 5% dropout, no bias training,
# applied to the listed attention and MLP projection modules.
config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
)

# Wrap the base model with the LoRA adapters and report how many parameters
# will actually be trained.
model = get_peft_model(model, config)
model.print_trainable_parameters()

# ---------------------- 6. Training Arguments ----------------------
# The model is loaded and computes in bfloat16, so use bf16 autocast when the
# GPU supports it and fall back to the configured fp16 setting otherwise.
# Enabling both at once is rejected by TrainingArguments.
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

# Every hyper-parameter is taken from the configuration section so that
# editing a value there actually changes the run.
training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    warmup_steps=warmup_steps,
    num_train_epochs=num_train_epochs,
    logging_steps=logging_steps,
    save_steps=save_steps,
    eval_steps=eval_steps,
    eval_strategy=evaluation_strategy,
    save_total_limit=save_total_limit,
    fp16=fp16 and not use_bf16,
    bf16=use_bf16,
    gradient_checkpointing=gradient_checkpointing,
    report_to="tensorboard",
    label_names=["labels"],
    lr_scheduler_type="cosine",
)

# ---------------------- Evaluation Metric Setup (ROUGE) ----------------------
from evaluate import load
import numpy as np
import torch
import nltk
import re

# Sentence-tokenizer data for nltk.sent_tokenize, which compute_metrics uses
# to put one sentence per line before scoring.
nltk.download('punkt_tab')

# ROUGE scorer loaded through the `evaluate` library.
rouge_metric = load("rouge")

def compute_metrics(eval_pred):
    """Compute ROUGE scores and mean prediction length for an evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``. ``logits`` has shape
            (batch, seq, vocab), or is a tuple whose first element does;
            ``labels`` has shape (batch, seq) with -100 at ignored positions.

    Returns:
        Dict of ROUGE scores scaled to percentages (rounded to 4 decimals)
        plus 'gen_len', the mean number of non-padding predicted tokens.
    """
    logits, labels = eval_pred
    if isinstance(logits, tuple):
        logits = logits[0]

    # A causal LM's output at position t predicts token t + 1, so drop the
    # last prediction and the first label to line the two up.
    predictions = np.argmax(logits, axis=-1)[:, :-1]
    labels = np.asarray(labels)[:, 1:]

    # Positions ignored by the loss (padding) carry no meaningful prediction;
    # blank them in both arrays so they are neither decoded nor counted.
    ignored = labels == -100
    predictions = np.where(ignored, tokenizer.pad_token_id, predictions)
    labels = np.where(ignored, tokenizer.pad_token_id, labels)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # One sentence per line before scoring.
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result = {k: round(v * 100, 4) for k, v in result.items()}

    prediction_lens = np.sum(predictions != tokenizer.pad_token_id, axis=1)
    result["gen_len"] = float(np.mean(prediction_lens))
    return result

# ---------------------- 7. Trainer ----------------------

# Wire together the LoRA-wrapped model, the training arguments, the tokenized
# splits, the collator and the ROUGE-based metric function.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_datasets,
    eval_dataset=tokenized_eval_datasets,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
# ---------------------- 8. Train ----------------------
trainer.train()

# ---------------------- 9. Save Model ----------------------
trainer.save_model(output_dir)
# The Trainer is constructed without the tokenizer, so save it explicitly:
# the evaluation cell reloads the tokenizer from this same directory.
tokenizer.save_pretrained(output_dir)

print(f"Fine-tuning complete! Model saved to: {output_dir}")

## Evaluation

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import json
from tqdm import tqdm
from rouge_score import rouge_scorer
import re
import numpy as np

# Load the fine-tuned model and tokenizer
# NOTE(review): model_path is the training output directory. If it holds only
# a LoRA adapter, from_pretrained presumably relies on the transformers PEFT
# integration to fetch the base model as well -- confirm.
model_path = "./fine-tuned-mistral-peft-lambda"
tokenizer = AutoTokenizer.from_pretrained(model_path)
# Place the model on the GPU when one is available, otherwise on the CPU.
inference_model = AutoModelForCausalLM.from_pretrained(model_path).to("cuda" if torch.cuda.is_available() else "cpu")

# Explicitly set pad_token_id if it's not already set
# (falls back to the end-of-sequence id).
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

# Load your evaluation dataset (JSON Lines: one record per line)
validation_dataset_path = "/home/ubuntu/work/cmapss_FD004_test_text.jsonl"  # Use your actual path
# Decode explicitly as UTF-8, and skip blank lines so a stray empty line does
# not crash json.loads.
with open(validation_dataset_path, 'r', encoding='utf-8') as f:
    eval_dataset = [json.loads(line) for line in f if line.strip()]

# Number of evaluation prompts to use
num_eval_prompts = 2
predicted_ruls = []
ground_truth_ruls = []

# Token budget for the prompt, including the trailing separator token.
max_prompt_tokens = 512

# Never ask for more examples than the dataset holds.
num_to_evaluate = min(num_eval_prompts, len(eval_dataset))

for i in tqdm(range(num_to_evaluate), desc="Evaluating Inference with MAE"):
    try:
        prompt_data = eval_dataset[i]['contents'][0]['parts'][0]['text']
        ground_truth_text = eval_dataset[i]['contents'][1]['parts'][0]['text']
    except (IndexError, KeyError, TypeError):
        print(f"Skipping invalid data point at index {i}")
        continue

    # Build the prompt exactly as it was built for training: strip the
    # dataset's own prefix before inserting the sensor readings.
    sensor_data = prompt_data.replace("Engine sensor readings over time: ", "")
    prompt = f"Predict the remaining useful life for this engine with sensor readings: {sensor_data}"

    # Leave one position free for the separator appended below.
    encoded = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=max_prompt_tokens - 1,
    )

    # Training sequences are "prompt + EOS + response + EOS", so the prompt
    # must end with the EOS separator for the model to continue with the
    # answer. Appending the id directly keeps it safe from truncation.
    separator = torch.tensor([[tokenizer.eos_token_id]], dtype=encoded['input_ids'].dtype)
    input_ids = torch.cat([encoded['input_ids'], separator], dim=1).to(inference_model.device)
    attention_mask = torch.ones_like(input_ids)

    with torch.no_grad():
        output = inference_model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=512,
            temperature=0.7,
            top_p=0.95,
            do_sample=True,
            num_return_sequences=1,
            pad_token_id=tokenizer.pad_token_id
        )

    # generate() returns the prompt followed by the continuation. Decode only
    # the continuation; otherwise the first number found below would come
    # from the sensor readings in the prompt, not from the prediction.
    prompt_length = input_ids.shape[1]
    generated_text = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True)

    print(f"\n--- Example {i+1} ---")
    print(f"Prompt: {prompt}")
    print(f"Generated Text: {generated_text}")
    print(f"Ground Truth Data: {ground_truth_text}")

    # Extract numerical RUL from generated text
    predicted_rul_match = re.search(r'(\d+)', generated_text)
    predicted_rul = int(predicted_rul_match.group(1)) if predicted_rul_match else None

    # Extract numerical RUL from ground truth
    ground_truth_rul_match = re.search(r'(\d+)', ground_truth_text)
    ground_truth_rul = int(ground_truth_rul_match.group(1)) if ground_truth_rul_match else None

    # Only score examples where both numbers could be parsed.
    if predicted_rul is not None and ground_truth_rul is not None:
        predicted_ruls.append(predicted_rul)
        ground_truth_ruls.append(ground_truth_rul)
    else:
        print("Could not extract numerical RUL from generated or ground truth text.")

# Report the Mean Absolute Error over every example that yielded both a
# predicted and a ground-truth RUL.
if not (predicted_ruls and ground_truth_ruls):
    print("\nCould not calculate MAE due to missing predictions or ground truth.")
else:
    absolute_errors = np.abs(np.array(predicted_ruls) - np.array(ground_truth_ruls))
    mae = np.mean(absolute_errors)
    print(f"\n--- Mean Absolute Error (MAE) on Inference ---")
    print(f"MAE: {mae}")