<a href="https://colab.research.google.com/github/frank-morales2020/Cloud_curious/blob/master/cmapss.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## installation

In [None]:
# Install Pytorch & other libraries
!pip install torch tensorboard --quiet

# Install Hugging Face libraries
!pip install  --upgrade transformers datasets accelerate evaluate bitsandbytes --quiet

#FlashAttention only supports Ampere GPUs or newer. #NEED A100 OR L4 IN GOOGLE COLAB
#!pip install -U transformers
!pip install -U flash-attn --no-build-isolation --quiet


! pip install peft --quiet
! pip install datasets trl ninja packaging --quiet

# Uncomment only if you're using A100 GPU
#!pip install flash-attn --no-build-isolation
!pip install diffusers safetensors  --quiet
!pip install colab-env --quiet

!pip install accelerate --quiet

## Environment

In [2]:
# Dynamically check for sliding window support in flash_attn
_flash_supports_window_size = False  # Initialize to False
try:
    import flash_attn  # Try to import flash_attn

    if hasattr(flash_attn, "flash_attn_func"):
        from flash_attn.flash_attn_interface import _flash_supports_window_size
    else:
        from flash_attn.flash_attention import _flash_supports_window_size
except ImportError:
    pass  # If flash_attn is not installed, keep _flash_supports_window_size as False

In [None]:
import os
from transformers import TrainingArguments
import accelerate

# Initialize the Accelerator
accelerator = accelerate.Accelerator()

#!pip install diffusers safetensors  --quiet
#!pip install colab-env --quiet

import colab_env
import os

access_token = os.getenv("HUGGINGFACE_ACCESS_TOKEN")
access_token_write = os.getenv("HUGGINGFACE_ACCESS_TOKEN_WRITE")

from huggingface_hub import login

login(
  token=access_token_write,
  add_to_git_credential=True
)

In [None]:
import torch
import os
import sys
import json
import IPython
from datetime import datetime
from datasets import load_dataset
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    AutoTokenizer,
    TrainingArguments,
)
from trl import SFTTrainer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Device: {device}')

## data

In [None]:
import os
import pandas as pd
import json
import zipfile
from google.cloud import storage
from google.colab import auth


# --- Data Loading from Google Drive ---
zip_path = '/content/gdrive/MyDrive/datasets/CMAPSSData.zip'
extract_dir = 'data/cmapss'
os.makedirs(extract_dir, exist_ok=True)

!mkdir -p /content/gdrive/MyDrive/datasets/CMAPSSData/

if not os.path.exists(zip_path):
    print(f"Error: CMAPSSData.zip not found at {zip_path}. Please ensure the file is correctly located in your Google Drive.")
    raise FileNotFoundError(f"CMAPSSData.zip not found at {zip_path}")

try:
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        if zip_ref.testzip() is None:  # Check for ZIP file integrity
            zip_ref.extractall(extract_dir)
            print(f"Extracted dataset files to: {extract_dir}")
        else:
            print("Error: ZIP file integrity check failed. The file may not be a valid ZIP file.")
            raise zipfile.BadZipFile("ZIP file integrity check failed.")

except zipfile.BadZipFile as e:
    print(f"Error extracting ZIP file: {e}")
    print(
        "The uploaded file may not be a valid or complete ZIP file. "
        "Please ensure you have uploaded the correct file, that it is not corrupted, "
        "and that it is a standard ZIP archive."
    )
    raise  # Stop execution if extraction fails

# --- Prepare NASA CMAPSS Data and Save to JSONL in GCS ---
extract_dir = 'data/cmapss'
os.makedirs(extract_dir, exist_ok=True)

# Process all four subsets
data_subsets = ['FD001', 'FD002', 'FD003', 'FD004']

for data_subset in data_subsets:
    train_file = os.path.join(extract_dir, f'train_{data_subset}.txt')
    test_file = os.path.join(extract_dir, f'test_{data_subset}.txt')
    rul_file = os.path.join(extract_dir, f'RUL_{data_subset}.txt')

    SENSOR_COLUMNS = ['sensor' + str(i).zfill(2) for i in range(1, 22)]
    OP_SETTING_COLUMNS = ['op_setting_' + str(i) for i in range(1, 4)]
    DATA_COLUMNS = ['unit_nr', 'time_cycles'] + OP_SETTING_COLUMNS + SENSOR_COLUMNS

    # Load training data
    try:
        train_df = pd.read_csv(train_file, names=DATA_COLUMNS, delim_whitespace=True, header=None)
        test_df = pd.read_csv(test_file, names=DATA_COLUMNS, delim_whitespace=True, header=None)
        rul_df = pd.read_csv(rul_file, names=['RUL'], delim_whitespace=True, header=None)

        train_df.columns = DATA_COLUMNS
        test_df.columns = DATA_COLUMNS

        print(f"\nProcessing data subset: {data_subset}")
        print("Shape of train_df after loading:", train_df.shape)
        print("train_df head after loading:\n", train_df.head())
        print("Shape of test_df:", test_df.shape)
        print("test_df head after loading:\n", test_df.head())
        print("Shape of RUL data:", rul_df.shape)

    except FileNotFoundError as e:
        print(f"Error loading data files for subset {data_subset}: {e}")
        raise  # Stop execution if a file is missing

    def create_jsonl(df, rul_df, output_path, sequence_length=30, is_test=False):
        grouped_data = df.groupby('unit_nr')
        rul_values = rul_df.values.tolist()  # Convert RUL DataFrame to list
        engine_count = 0  # To track which RUL value to use

        with open(output_path, 'w') as f:
            for unit_nr, unit_data in grouped_data:
                num_cycles = len(unit_data)
                data_values = unit_data.drop(['unit_nr'], axis=1).values.tolist()
                json_data = []  # Initialize an empty list to hold JSON objects

                for i in range(max(0, num_cycles - sequence_length + 1)):
                    sequence = data_values[i:i + sequence_length]
                    rul = num_cycles - (i + sequence_length)

                    # Ensure RUL is not out of bounds
                    if engine_count < len(rul_values):
                        current_rul = rul_values[engine_count][0]  # Get the RUL value
                    else:
                        current_rul = 0  # Or some default value if RUL data is exhausted

                    if len(sequence) == sequence_length:
                        json_record = {"sequence": sequence, "sequence_length": len(sequence), "rul": current_rul}  # Include sequence length
                        json_data.append(json_record)

                # Write all JSON objects to the file at once
                with open(output_path, 'w') as f:
                    for json_record in json_data:
                        f.write(json.dumps(json_record) + '\n')

                engine_count += 1  # Increment engine counter

    local_train_jsonl_path = f"cmapss_{data_subset}_train_sequences.jsonl"
    local_test_jsonl_path = f"cmapss_{data_subset}_test_sequences.jsonl"

    # Create JSONL for training
    create_jsonl(train_df, rul_df, local_train_jsonl_path, is_test=False)
    print(f"Created {local_train_jsonl_path}")

    # Create JSONL for testing
    create_jsonl(test_df, rul_df, local_test_jsonl_path, is_test=True)
    print(f"Created {local_test_jsonl_path}")

!cp *.jsonl /content/gdrive/MyDrive/datasets/CMAPSSData/
print("JSONL files created and uploaded.")

In [None]:
import json
import numpy as np

def create_textual_dataset(input_file, output_file):
    with open(input_file, 'r') as infile, open(output_file, 'w') as outfile:
        for line in infile:
            try:
                data = json.loads(line)
                sequence = data.get("sequence")
                rul = data.get("rul") # Assuming your data has an RUL

                if sequence:
                    # Create a simple textual description (you can make this more sophisticated)
                    description = f"Engine sensor readings over time: {np.array(sequence).flatten().tolist()}"
                    if rul is not None:
                        output_data = {"contents": [{"role": "user", "parts": [{"text": description}]}, {"role": "model", "parts": [{"text": f"Remaining Useful Life: {rul}"}]}]}
                        outfile.write(json.dumps(output_data) + '\n')
                    else:
                        output_data = {"contents": [{"role": "user", "parts": [{"text": description}]}, {"role": "model", "parts": [{"text": "RUL prediction needed."}]}]}
                        outfile.write(json.dumps(output_data) + '\n')
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON: {e}")
            except Exception as e:
                print(f"An error occurred: {e}")

# Define your input and output file paths
input_train_file = "cmapss_FD004_train_sequences.jsonl"
output_train_file_text = "cmapss_FD004_train_text.jsonl"

input_test_file = "cmapss_FD004_test_sequences.jsonl"
output_test_file_text = "cmapss_FD004_test_text.jsonl"

# Create the textual datasets
create_textual_dataset(input_train_file, output_train_file_text)
create_textual_dataset(input_test_file, output_test_file_text)

print(f"Textual training data created: {output_train_file_text}")
print(f"Textual testing data created: {output_test_file_text}")

In [None]:
import json

def transform_jsonl_to_prompt_completion(input_file_path, output_file_path):
    """Transforms chat-style JSONL to prompt-completion JSONL."""
    with open(input_file_path, 'r') as infile, open(output_file_path, 'w') as outfile:
        for line in infile:
            try:
                data = json.loads(line)
                # Extract prompt and completion from 'contents'
                prompt = "".join([part["text"] for part in data["contents"][0]["parts"]])  # Assumes user role is first
                completion = str(data.get("completion", "")) # Handle if completion is missing

                # Construct prompt-completion dictionary
                prompt_completion_data = {"prompt": prompt, "completion": completion}

                # Write to output file
                outfile.write(json.dumps(prompt_completion_data) + "\n")

            except (json.JSONDecodeError, KeyError, IndexError) as e:
                print(f"Skipping invalid or unprocessable line: {line.strip()}, Error: {e}")

# Example usage:
input_file_path = "cmapss_FD004_train_text.jsonl"
output_file_path = "cmapss_FD004_train_text_transformed.jsonl"

transform_jsonl_to_prompt_completion(input_file_path, output_file_path)
print(f"Transformed data written to: {output_file_path}")

input_file_path = "cmapss_FD004_test_text.jsonl"
output_file_path = "cmapss_FD004_test_text_transformed.jsonl"

transform_jsonl_to_prompt_completion(input_file_path, output_file_path)
print(f"Transformed data written to: {output_file_path}")

In [None]:
{
  "contents": [
    {
      "role": "user",
      "parts": [
        {
          "text": "Engine sensor readings over time: [1.0, 41.9993, 0.8409, 100.0, 445.0, 548.68, 1343.85, 1111.03, 3.91, 5.69, 137.26, 2211.96, 8296.96, ..., 8054.65, 9.2728, 0.02, 331.0, 2223.0, 100.0, 14.78, 8.8922]"
        }
      ]
    },
    {
      "role": "model",
      "parts": [
        {
          "text": "Remaining Useful Life: 0"
        }
      ]
    }
  ]
}

## model

In [None]:
# Hugging Face model id
model_id = "mistralai/Mistral-7B-Instruct-v0.1"
#model_id = "meta-llama/Llama-3.1-8B-Instruct"


# BitsAndBytesConfig int-4 config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
)

# Load model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config
)

tokenizer = AutoTokenizer.from_pretrained(model_id,use_fast=True)
tokenizer.padding_side = 'right' # to prevent warnings

# Set padding token if not present (common requirement for training)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Instead of using the unk_token, add a dedicated padding token
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model.resize_token_embeddings(len(tokenizer)) #Important: update the model's embedding layer to accommodate the new padding token.
print("Model and tokenizer loaded.")

## Fine tuning

In [None]:
from datasets import load_dataset # Example for loading data
from trl import SFTTrainer # Simplified Fine-tuning Trainer
from peft import LoraConfig # For LoRA efficient tuning
from transformers import TrainingArguments
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import gc # Import the garbage collector


# Ignore all future warnings
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
simplefilter(action='ignore', category=DeprecationWarning)
simplefilter(action='ignore', category=UserWarning)
simplefilter(action='ignore', category=RuntimeWarning)
simplefilter(action='ignore', category=Warning)
simplefilter(action='ignore', category=ResourceWarning)
simplefilter(action='ignore')
simplefilter(action='ignore', category=UnicodeWarning)

# 1. --- Prepare your Dataset ---
# Needs to be in a format the trainer understands (e.g., instruction/response pairs)
# Example: Load a dataset from Hugging Face Hub
# dataset = load_dataset("your_dataset_name", split="train")
# Or create your own Dataset object
# Formatted dataset usually has a 'text' column with structured prompts/responses
print("Preparing dataset...")


#Load your datasets
train_dataset = load_dataset("json", data_files="/content/cmapss_FD004_train_text.jsonl", split="train")
test_dataset = load_dataset("json", data_files="/content/cmapss_FD004_test_text.jsonl", split="train") # Using 'train' split for test as well for simplicity

def preprocess_function(examples):
    # Process each example individually within the batch
    processed_examples = []  # Store processed examples here

    for example in examples['contents']:
        prompt = "".join([part["text"] for part in example[0]["parts"]])
        completion = "".join([part["text"] for part in example[1]["parts"]])
        #Combine the prompt and completion and then tokenize
        inputs = tokenizer(prompt + completion, padding="max_length", truncation=True, max_length=256, return_tensors="pt")
        # Extract RUL information from the completion instead of the prompt
        # Assume the completion has the format "Remaining Useful Life: {RUL}"
        try:
            rul = int(completion.split("Remaining Useful Life: ")[-1])
        except ValueError:
            # If the completion format is incorrect or RUL is missing, set rul to 0
            rul = 0

        #Add the RUL to the inputs to be passed to the model
        inputs['labels'] = inputs['input_ids'].clone()
        inputs['rul'] = rul

        processed_examples.append(inputs)  # Append the dictionary

    # Stack tensors to get correct dimensions
    input_ids = torch.stack([d['input_ids'] for d in processed_examples]).squeeze(1)
    labels = torch.stack([d['labels'] for d in processed_examples]).squeeze(1)
    ruls = torch.tensor([d['rul'] for d in processed_examples])

    return {'input_ids': input_ids, 'labels': labels, 'rul': ruls}  # Return the final dictionary
# Apply preprocessing
train_dataset = train_dataset.map(preprocess_function, batched=True)
test_dataset = test_dataset.map(preprocess_function, batched=True)

# 2. --- Configure LoRA (Optional but recommended for efficiency) ---
lora_config = LoraConfig(
     r=16, # Rank
     lora_alpha=32,
     target_modules=["q_proj", "k_proj", "v_proj", "o_proj"], # Adapt these layers
     lora_dropout=0.05,
     bias="none",
     task_type="CAUSAL_LM"
)

# 3. --- Define Training Arguments ---
output_dir = "./llama3-8b-finetuned"
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=5,            # Adjust as needed
    per_device_train_batch_size=1, # Adjust based on your GPU memory
    gradient_accumulation_steps=8, # Increased gradient accumulation steps
    gradient_checkpointing=True,   # Enabled gradient checkpointing
    learning_rate=2e-4,            # Adjust as needed
    max_grad_norm=0.3,             # Gradient clipping
    weight_decay= 0.01,           # Regularization
    logging_steps=10,
    optim="paged_adamw_8bit",      # Optimizer for quantized models
    save_strategy="steps",
    eval_strategy="steps",  # Evaluate at specified intervals
    eval_steps=10,             # Evaluate every 100 steps (adjust as needed)
    #eval_strategy="no",  # Disable evaluation during training to save memory
    load_best_model_at_end=True,  # Load the best model based on validation
    metric_for_best_model="loss",  # Metric to use for selecting the best model
    report_to="none",
    # Add more arguments as needed (fp16, etc.)
)

# 4. --- Create the Trainer ---
# Using SFTTrainer for supervised fine-tuning on conversational/instruction data
trainer = SFTTrainer(
    model=model,
    args=training_args,
    peft_config=lora_config,  # Pass LoRA config if using PEFT/LoRA
    train_dataset=train_dataset, # Your formatted training dataset
    eval_dataset=test_dataset,   # Your formatted testing dataset (for evaluation)
)

# 5. --- Run Fine-Tuning ---
print("Starting fine-tuning...")
# This is the core training step
trainer.train()
print("Fine-tuning finished (Conceptual - train() call commented out).")

# Delete unused variables to free up memory
del train_dataset, test_dataset
gc.collect()  # Run garbage collection


# 6. --- Save the Fine-Tuned Model (Adapter or Full Model) ---
print(f"Saving fine-tuned model to {output_dir}...")
# trainer.save_model(output_dir) # Saves adapter config (if LoRA) & weights
# Alternatively, if not using LoRA or want to merge weights:
# merged_model = model.merge_and_unload() # Merge LoRA weights back if needed
# merged_model.save_pretrained(output_dir)
# tokenizer.save_pretrained(output_dir)
print("Model saved.")

Preparing dataset...


Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/226 [00:00<?, ? examples/s]

Map:   0%|          | 0/252 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/226 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/252 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Starting fine-tuning...


The input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in torch.bfloat16.


Step,Training Loss,Validation Loss
10,1.3808,1.063199


## evaluation

In [None]:
from datasets import load_dataset
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Load the test dataset
test_dataset = load_dataset("json", data_files="/content/cmapss_FD004_test_text.jsonl", split="train")
test_dataset = test_dataset.map(preprocess_function, batched=True)

# Get predictions
predictions = trainer.predict(test_dataset)

# Get actual RUL values
actual_rul = test_dataset['rul']

# Get predicted RUL values (logits for the last token)
predicted_rul = predictions.predictions[0][:, -1]
predicted_rul = predicted_rul[:len(actual_rul)]  # Truncate to match actual RUL length

# Get predicted token IDs using argmax
predicted_token_ids = np.argmax(predictions.predictions[0], axis=-1)

# Extract the last token ID as the predicted RUL
#predicted_rul_decoded = [seq[-1] for seq in predicted_token_ids]  # Removed this line
predicted_rul_decoded = predicted_token_ids[-1]  # Get the last element

# Decode the predicted token ID to get the RUL value as a string
predicted_rul_decoded = tokenizer.decode(predicted_rul_decoded, skip_special_tokens=True)

# Convert the decoded RUL string to an integer
try:
    predicted_rul_decoded = int(predicted_rul_decoded)
except ValueError:
    # Handle cases where the decoded string is not a valid integer (e.g., empty string)
    predicted_rul_decoded = 0  # Or any other default value you prefer

# Duplicate the single predicted RUL to match the length of actual RUL
predicted_rul_decoded = [predicted_rul_decoded] * len(actual_rul) # Modified this line to to fill predicted_rul_decoded using predicted_rul

# Calculate metrics
mse = mean_squared_error(actual_rul, predicted_rul_decoded)
rmse = np.sqrt(mse)
r2 = r2_score(actual_rul, predicted_rul_decoded)

print(f"MSE: {mse}, RMSE: {rmse}, R2: {r2}")