<a href="https://colab.research.google.com/github/frank-morales2020/Cloud_curious/blob/master/flan_t5_base_DEMO.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -U transformers datasets accelerate evaluate bitsandbytes --quiet
!pip install -U kagglehub -q

## CustomTrainer and RegressionHead

In [2]:
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR, EvalLoopOutput
import torch
from torch import nn
from transformers import Trainer, TrainingArguments
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np
from torch.cuda import amp  # Add this line to import amp
from typing import Dict, Union, Any, List, Optional, Tuple  # Import Tuple

class RegressionHead(nn.Module):
    def __init__(self, hidden_size, output_size=1, **kwargs):
        super().__init__()
        self.linear = nn.Linear(hidden_size, output_size)

    def forward(self, hidden_states):
        output = self.linear(hidden_states)
        return output

class CustomTrainer(Trainer):
    def __init__(self, *args, **kwargs):
        # Get fare_scaler from kwargs before calling super().__init__
        self.fare_scaler = kwargs.pop("fare_scaler", None)
        super().__init__(*args, **kwargs)
        self.scaler = torch.cuda.amp.GradScaler() if self.args.fp16 or self.args.bf16 else None

    def compute_loss(self, model, inputs, return_outputs=False):
        """
        Computes the loss for the model.
        """
        labels = inputs.pop("labels", None)

        # Pass decoder_input_ids for the decoder
        decoder_input_ids = inputs.get("decoder_input_ids")
        if decoder_input_ids is None:
            decoder_input_ids = inputs.get("input_ids")

        # Ensure output_hidden_states=True is passed to the underlying model
        outputs = model(**inputs, decoder_input_ids=decoder_input_ids, output_hidden_states=True)

        # Access the encoder's hidden states
        encoder_hidden_states = outputs.encoder_hidden_states

        # Get the hidden states and pass them through the regression head
        hidden_states = torch.stack(encoder_hidden_states[-4:]).mean(dim=0)
        hidden_states = hidden_states[:, -1, :]  # Get the last hidden state
        predicted_fares = model.regression_head(hidden_states)

        # Calculate the loss
        if labels is not None:
            labels = labels.to(predicted_fares.device)
            labels = labels.view(-1, 1)

            # Calculate the loss using MSE
            loss_fn = nn.MSELoss()
            loss = loss_fn(predicted_fares, labels.float())

            if return_outputs:
                return (loss, {"logits": predicted_fares})

            return loss

        else:
            loss = torch.tensor(0.0, device=predicted_fares.device)
            return (loss, {"logits": predicted_fares})


    def training_step(self, model, inputs, num_items_in_batch=None):
        """
        Perform a training step on a batch of inputs.
        """
        model.train()
        inputs = self._prepare_inputs(inputs)

        # Use amp.autocast to enable mixed precision for both loss computation and backward pass
        with amp.autocast(enabled=self.args.fp16 or self.args.bf16):
            loss = self.compute_loss(model, inputs)

        # Gradient accumulation if needed (adjust gradient scaling accordingly)
        if self.args.gradient_accumulation_steps > 1:
            loss = loss / self.args.gradient_accumulation_steps

        # Apply gradient scaling only if self.scaler is available (for fp16/bf16)
        if self.scaler is not None:
            self.scaler.scale(loss).backward()  # <-- Scale the loss here
        else:
            loss.backward()  # Call backward directly in other cases

        return loss.detach()


    def optimizer_step(self, optimizer: torch.optim.Optimizer, model: nn.Module, *args, **kwargs):
        """
        Perform an optimizer step.
        """

        if self.scaler is not None:
            # Unscale gradients before optimizer step
            self.scaler.unscale_(optimizer)

        # Gradient clipping if enabled
        if self.args.max_grad_norm is not None and self.args.max_grad_norm > 0:
            if hasattr(optimizer, "clip_grad_norm"):
                # Some optimizers have a specific method to clip gradients
                optimizer.clip_grad_norm(self.args.max_grad_norm)
            elif hasattr(model, "clip_grad_norm_"):
                # Some models have a specific method to clip gradients
                model.clip_grad_norm_(self.args.max_grad_norm)
            else:
                # Use PyTorch's clip_grad_norm if no specific method is available
                nn.utils.clip_grad_norm_(
                    amp.master_params(optimizer) if self.use_apex else model.parameters(),
                    self.args.max_grad_norm,
                )

        # Apply optimizer step
        optimizer.step()

        if self.scaler is not None:
            # Update scaler after optimizer step
            self.scaler.update()

        optimizer.zero_grad()  # Reset gradients

        return True


    def prediction_step(self, model: nn.Module, inputs: Dict[str, Union[torch.Tensor, Any]], prediction_loss_only: bool, ignore_keys: Optional[List[str]] = None) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]:
        """
        Perform an evaluation step on `model` using `inputs`.
        """
        has_labels = all(inputs.get(k) is not None for k in self.label_names)
        inputs = self._prepare_inputs(inputs)
        if ignore_keys is None:
            if hasattr(self.model, "config"):
                ignore_keys = getattr(self.model.config, "keys_to_ignore_at_inference", [])
            else:
                ignore_keys = []

        # labels may be popped when computing the loss (label smoothing for instance) so we grab them first.
        if has_labels:
            labels = inputs.pop("labels").to(self.model.device)
        else:
            labels = None

        with torch.no_grad():
            if has_labels:
                with self.autocast_smart_context_manager():
                    loss, outputs = self.compute_loss(model, inputs, return_outputs=True)
                loss = loss.mean().detach()
                if isinstance(outputs, dict):
                    logits = tuple(v for k, v in outputs.items() if k not in ignore_keys + ["loss"])
                else:
                    logits = outputs[1:]
            else:
                loss = None
                with self.autocast_smart_context_manager():
                    outputs = model(**inputs, output_hidden_states=True)

                if isinstance(outputs, dict):
                    logits = tuple(v for k, v in outputs.items() if k not in ignore_keys)
                else:
                    # Access the encoder's hidden states
                    encoder_hidden_states = outputs.encoder_hidden_states

                    # Apply Regression Head for predictions during evaluation
                    hidden_states = torch.stack(encoder_hidden_states[-4:]).mean(dim=0)
                    hidden_states = hidden_states[:, -1, :]
                    logits = model.regression_head(hidden_states)

        if prediction_loss_only:
            return (loss, None, None)

        if isinstance(logits, tuple):
            logits = logits[0]

        # Apply inverse scaling to predictions before returning
        logits = self.fare_scaler.inverse_transform(logits.cpu().numpy())
        logits = torch.from_numpy(logits).to(self.model.device)

        return (loss, logits, labels)

## MODEL TOKENIZER DATA

In [4]:
import os
import pandas as pd
import kagglehub
from transformers import AutoTokenizer
import sqlite3
import random
import torch
from torch import nn
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, TrainingArguments, Trainer
from datasets import Dataset
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [5]:
# 1. Model and Tokenizer
model_name = "google/flan-t5-base"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# 2a. Add the regression head to the model (Place it here)
model.regression_head = RegressionHead(model.config.hidden_size)
model.regression_head.to(model.device)

# 2b. Data Loading and Preprocessing
db_name = "akadir0223/flights-after-eda"  # Replace with your Kaggle dataset name
dataset_path = kagglehub.dataset_download(db_name)
files = os.listdir(dataset_path)
csv_file_path = next((os.path.join(dataset_path, f) for f in files if f.endswith('.csv')), None)

if csv_file_path:
    flights_df = pd.read_csv(csv_file_path)
else:
    print("No CSV file found in the dataset directory.")
    exit()

selected_columns = ['airport_1', 'airport_2', 'fare', 'carrier_lg']
flights_subset_df = flights_df[selected_columns]

def create_and_populate_tables(flights_subset_df, num_records=1000):
    conn = sqlite3.connect('flights.db')
    cursor = conn.cursor()
    cursor.execute("""
    CREATE TABLE IF NOT EXISTS flight_qa (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        question TEXT,
        fare REAL
    );
    """)
    conn.commit()

    airport_codes = flights_subset_df['airport_1'].unique().tolist()
    airline_codes = flights_subset_df['carrier_lg'].unique().tolist()

    for _ in range(num_records):
        airport_1 = random.choice(airport_codes)
        airport_2 = random.choice(airport_codes)
        airline_code = random.choice(airline_codes)
        fare = round(random.uniform(50, 500), 2)
        question = f"What is the fare for a flight from {airport_1} to {airport_2} with {airline_code}?"
        cursor.execute("INSERT INTO flight_qa (question, fare) VALUES (?, ?)", (question, fare))

    conn.commit()
    conn.close()

# 3. Data Tokenization
def create_tokenized_dataset(tokenizer, fare_scaler):
    conn = sqlite3.connect('flights.db')
    cursor = conn.cursor()
    cursor.execute("SELECT question, fare FROM flight_qa")
    data = cursor.fetchall()
    conn.close()

    flight_data = [{"question": question, "fare": fare} for question, fare in data]
    dataset = Dataset.from_list(flight_data)
    dataset = dataset.map(lambda example: {'fare': np.nan_to_num(example['fare'])}, batched=False)

    def tokenize_function(examples):
        # Prepare target text without placeholders, just the fare
        target_text = [str(fare) for fare in examples["fare"]]

        tokenized_examples = tokenizer(
            examples["question"],
            text_target=target_text,
            padding="max_length",
            truncation=True,
            max_length=128,
            return_tensors="pt"
        )
        tokenized_examples["labels"] = fare_scaler.transform(np.array(examples["fare"]).reshape(-1, 1)).flatten()
        return tokenized_examples

    tokenized_dataset = dataset.map(tokenize_function, batched=True)
    return tokenized_dataset


# 7. Main Execution
number_records = 1000
create_and_populate_tables(flights_subset_df, num_records=number_records)

# Data Scaling and Tokenization
fare_scaler = MinMaxScaler()
fare_scaler.fit(flights_subset_df[['fare']])
tokenized_dataset = create_tokenized_dataset(tokenizer, fare_scaler)

# Split the dataset
train_dataset = tokenized_dataset.train_test_split(test_size=0.2, seed=42)['train']
val_dataset = tokenized_dataset.train_test_split(test_size=0.2, seed=42)['test']

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Downloading from https://www.kaggle.com/api/v1/datasets/download/akadir0223/flights-after-eda?dataset_version_number=1...


100%|██████████| 10.3M/10.3M [00:01<00:00, 6.88MB/s]

Extracting files...





Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [6]:
print(f"Train Data: {train_dataset}")
print(f"Validation Data: {val_dataset}")

Train Data: Dataset({
    features: ['question', 'fare', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 800
})
Validation Data: Dataset({
    features: ['question', 'fare', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 200
})


## compute_metrics

In [7]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np

def compute_metrics(eval_pred):
    """Calculates evaluation metrics (MSE, MAE, RMSE)."""
    predictions, labels = eval_pred

    # Check if predictions and labels are already NumPy arrays
    if not isinstance(predictions, np.ndarray):
        predictions = predictions.detach().cpu().numpy()
    if not isinstance(labels, np.ndarray):
        labels = labels.detach().cpu().numpy()

    # Reshape predictions and labels to 1D arrays
    predictions = predictions.reshape(-1)
    labels = labels.reshape(-1)

    # Remove NaN values from both predictions and labels
    not_nan_mask = np.logical_and(np.isfinite(predictions), np.isfinite(labels))
    predictions = predictions[not_nan_mask]
    labels = labels[not_nan_mask]

    # Check if the arrays are empty after removing NaNs
    if len(predictions) == 0 or len(labels) == 0:
        # If empty, return default values for metrics to avoid the error
        return {"mse": 0.0, "mae": 0.0, "rmse": 0.0}

    mse = mean_squared_error(labels, predictions)
    mae = mean_absolute_error(labels, predictions)
    rmse = np.sqrt(mse)  # Calculate RMSE manually
    return {"mse": mse, "mae": mae, "rmse": rmse}

## LORA AND TRAINER

In [8]:
from peft import LoraConfig, get_peft_model
from transformers import TrainingArguments, Trainer

# Define LoRA configuration
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q", "v"],  # Example target modules for FLAN-T5
    lora_dropout=0.1,
    bias="none",
    task_type="SEQ_2_SEQ_LM"
)

# Apply LoRA to the model
#model = get_peft_model(model, lora_config)
#model.print_trainable_parameters()

In [9]:
training_args = TrainingArguments(
    output_dir="/content/deepseek_coder_output",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    warmup_steps=10,
    optim="adamw_torch_fused",
    num_train_epochs=3,
    max_steps=300,
    learning_rate=2e-5,
    logging_steps=20,
    #fp16=True,
    bf16=True,
    #tf32=True,
    lr_scheduler_type="constant",
    weight_decay=0.01,
    eval_steps=20,
    report_to="none",
    save_steps=20,
    #label_smoother=None,
    evaluation_strategy="steps",
    #eval_steps=20,  # Already specified above
    logging_strategy="steps",
    save_strategy="steps",
    label_names=["labels"],
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",  # Or "mse" if your compute_metrics returns "mse"
)

In [11]:
# Trainer Initialization and Training
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    #fare_scaler=fare_scaler
)
trainer.train()

RuntimeError: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.cuda.FloatTensor instead (while checking arguments for embedding)

## evaluation

In [None]:
# Evaluate the model
eval_results = trainer.evaluate()

# Print the evaluation results
print(eval_results)