<a href="https://colab.research.google.com/github/frank-morales2020/MLxDL/blob/main/FP_POC_FINAL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install/upgrade the Hugging Face training stack and the Kaggle download helper.
# NOTE(review): `peft` and scikit-learn are imported later in this notebook but are
# not installed here — presumably they are preinstalled in the runtime; verify.
!pip install -U transformers datasets accelerate evaluate bitsandbytes --quiet
!pip install -U kagglehub -q

## Class Definitions

In [None]:
from transformers import TrainerCallback

class EarlyStoppingCallback(TrainerCallback):
    """Stop training when the monitored metric stops improving.

    The monitored metric is ``args.metric_for_best_model`` and its direction is
    taken from ``args.greater_is_better``. Patience is counted in evaluation
    rounds (one per ``on_evaluate`` call), not in epochs.

    Args:
        early_stopping_patience: Number of consecutive evaluations without
            improvement after which ``control.should_training_stop`` is set.
    """

    def __init__(self, early_stopping_patience=3):
        self.early_stopping_patience = early_stopping_patience
        self.best_metric = None      # best value seen so far (None until the first evaluation)
        self.patience_counter = 0    # evaluations since the last improvement

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        """Update the patience counter from the latest evaluation metrics."""
        metric_to_check = args.metric_for_best_model
        if metric_to_check is None or metrics is None:
            return

        # Look the metric up under the configured name first, then under the
        # "eval_"-prefixed name, so that e.g. "loss" still finds "eval_loss".
        current_metric = metrics.get(metric_to_check)
        if current_metric is None and not metric_to_check.startswith("eval_"):
            current_metric = metrics.get(f"eval_{metric_to_check}")

        if current_metric is None:
            # Comparing None with a number would raise TypeError; skip this round instead.
            print(f"Early stopping skipped: metric '{metric_to_check}' not found in evaluation metrics.")
            return

        if self.best_metric is None:
            improved = True
        elif args.greater_is_better:
            improved = current_metric > self.best_metric
        else:
            improved = current_metric < self.best_metric

        if improved:
            self.best_metric = current_metric
            self.patience_counter = 0  # Reset patience
            return

        self.patience_counter += 1
        if self.patience_counter >= self.early_stopping_patience:
            print(f"Early stopping triggered after {self.patience_counter} evaluations without improvement.")
            control.should_training_stop = True

In [None]:
import torch.nn as nn

# Define the RegressionHead
class RegressionHead(nn.Module):
    """A single linear layer mapping a hidden vector to the regression output.

    Args:
        hidden_size: Size of the incoming feature dimension.
        output_size: Size of the produced feature dimension (1 by default).
        **kwargs: Accepted and ignored.
    """

    def __init__(self, hidden_size, output_size=1, **kwargs):
        super().__init__()
        self.linear = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, hidden_states):
        """Return the linear projection of ``hidden_states``."""
        return self.linear(hidden_states)

# Define the FarePredictionModel
class FarePredictionModel(nn.Module):
    """Wrap an encoder with a ``RegressionHead`` to predict one value per input.

    Args:
        base_model: Encoder whose ``config.hidden_size`` sizes the head and whose
            output exposes ``last_hidden_state``.
    """

    def __init__(self, base_model):
        super().__init__()
        self.base_model = base_model
        # Keep the encoder's config reachable from the wrapper as well.
        self.config = base_model.config
        self.regression_head = RegressionHead(base_model.config.hidden_size)
        use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if use_cuda else "cpu")

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        labels=None,
        **kwargs
    ):
        """Run the encoder and head; compute an MSE loss when ``labels`` is given.

        Returns:
            dict with ``"loss"`` (``None`` without labels) and ``"logits"``
            (the head's output).
        """
        encoder_out = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        # Take position 0 along the second axis (the original comment marks this as the BERT case).
        first_token_state = encoder_out.last_hidden_state[:, 0, :]
        predicted_fare = self.regression_head(first_token_state)

        if labels is None:
            return {"loss": None, "logits": predicted_fare}

        # Labels are reshaped to a column and cast to float before the loss.
        targets = labels.view(-1, 1).float()
        mse_loss = nn.MSELoss()(predicted_fare, targets)
        return {"loss": mse_loss, "logits": predicted_fare}


## Fine Tuning

In [None]:
import os
import random

import numpy as np
import pandas as pd
import sqlite3
import torch
from datasets import Dataset
from peft import LoraConfig, get_peft_model
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from torch import nn
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, TrainingArguments, Trainer

from transformers import AutoModel, AutoTokenizer, TrainingArguments, Trainer
import kagglehub

import warnings
warnings.filterwarnings("ignore")


# Pick the GPU when PyTorch reports one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Model and Tokenizer: both are loaded from the same checkpoint name.
model_name = "bert-base-uncased"  # Use a standard BERT model
tokenizer = AutoTokenizer.from_pretrained(model_name)
base_model = AutoModel.from_pretrained(model_name) # Instantiate the base model using from_pretrained
# FarePredictionModel (defined in an earlier cell) adds the regression head on top of the encoder.
model = FarePredictionModel(base_model) # Initialize your custom model with the base model
model.to(device)  # Move the model to the device


# Data Loading and Preprocessing
db_name = "akadir0223/flights-after-eda"  # Replace with your Kaggle dataset name
dataset_path = kagglehub.dataset_download(db_name)
# Sort the listing so the same CSV is selected on every run (os.listdir order is arbitrary).
files = sorted(os.listdir(dataset_path))
csv_file_path = next((os.path.join(dataset_path, f) for f in files if f.endswith('.csv')), None)

# Raise instead of calling exit(): an exception stops this cell with a clear
# message, and the cells below cannot run with flights_df undefined.
if csv_file_path is None:
    raise FileNotFoundError(f"No CSV file found in the dataset directory: {dataset_path}")
flights_df = pd.read_csv(csv_file_path)

# Keep only the columns used below (airport codes, carrier code and fare).
selected_columns = ['airport_1', 'airport_2', 'fare', 'carrier_lg']
flights_subset_df = flights_df[selected_columns]

# Function to create and populate SQLite tables
def create_and_populate_tables(flights_subset_df, num_records=10000, db_path='flights.db', seed=None):
    """Append synthetic question/fare rows to the ``flight_qa`` SQLite table.

    Each row pairs two airport codes and one carrier code, all drawn at random
    from the values present in ``flights_subset_df``, with a fare drawn
    uniformly from [50, 500] and rounded to two decimals. The table is created
    if missing; existing rows are kept, so repeated calls accumulate rows.

    Args:
        flights_subset_df: DataFrame with ``airport_1`` and ``carrier_lg`` columns.
        num_records: Number of rows to insert.
        db_path: Path of the SQLite database file.
        seed: Optional seed for a private random generator. When ``None`` the
            module-level ``random`` state is used.
    """
    airport_codes = flights_subset_df['airport_1'].unique().tolist()
    airline_codes = flights_subset_df['carrier_lg'].unique().tolist()
    rng = random if seed is None else random.Random(seed)

    def _synthetic_rows():
        # Lazily yield (question, fare) pairs so executemany can stream them.
        for _ in range(num_records):
            airport_1 = rng.choice(airport_codes)
            airport_2 = rng.choice(airport_codes)
            airline_code = rng.choice(airline_codes)
            fare = round(rng.uniform(50, 500), 2)
            question = f"What is the fare for a flight from {airport_1} to {airport_2} with {airline_code}?"
            yield question, fare

    conn = sqlite3.connect(db_path)
    try:
        # One transaction for the whole batch: committed on success, rolled back
        # on error, instead of one commit per inserted row.
        with conn:
            conn.execute("""
    CREATE TABLE IF NOT EXISTS flight_qa (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        question TEXT,
        fare REAL
    );
    """)
            conn.executemany(
                "INSERT INTO flight_qa (question, fare) VALUES (?, ?)",
                _synthetic_rows(),
            )
    finally:
        # Always release the connection, even if the insert fails.
        conn.close()

# Create and populate tables
# Number of synthetic question/fare rows to append to flights.db.
number_records = 200000
create_and_populate_tables(flights_subset_df, num_records=number_records)

# Data Scaling and Tokenization
# The scaler is fitted on the 'fare' column of the Kaggle data.
# NOTE(review): the synthetic fares are drawn from [50, 500] in create_and_populate_tables,
# so scaled labels can fall outside [0, 1] if the fitted range is narrower — confirm this is intended.
fare_scaler = MinMaxScaler()
fare_scaler.fit(flights_subset_df[['fare']])

# Function to tokenize the dataset
def create_tokenized_dataset(tokenizer, fare_scaler, db_path='flights.db'):
    """Build a ``Dataset`` of tokenized questions and scaled fares from SQLite.

    Args:
        tokenizer: Callable tokenizer; called with the list of questions and
            ``return_tensors="pt"``, ``padding="max_length"``, ``truncation=True``,
            ``max_length=128``.
        fare_scaler: Fitted scaler whose ``transform`` is applied to the fare column.
        db_path: Path of the SQLite database holding the ``flight_qa`` table.

    Returns:
        A ``Dataset`` with ``input_ids``, ``attention_mask`` and ``labels`` columns.

    Raises:
        ValueError: If the ``flight_qa`` table contains no rows.
    """
    conn = sqlite3.connect(db_path)
    try:
        data = conn.execute("SELECT question, fare FROM flight_qa").fetchall()
    finally:
        # Release the connection even if the query fails.
        conn.close()

    if not data:
        raise ValueError(f"Table flight_qa in {db_path!r} is empty; populate it before tokenizing.")

    # Convert data to DataFrame
    df = pd.DataFrame(data, columns=['question', 'fare'])

    # Tokenize the questions
    tokenized_data = tokenizer(
        df['question'].tolist(),
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=128
    )

    # Scale the fares and convert to PyTorch tensor
    fares = torch.tensor(fare_scaler.transform(df[['fare']]), dtype=torch.float32)

    # Create a Dataset from the tokenized data and labels
    return Dataset.from_dict({
        "input_ids": tokenized_data["input_ids"],
        "attention_mask": tokenized_data["attention_mask"],
        "labels": fares,
    })

# Create the tokenized dataset
tokenized_dataset = create_tokenized_dataset(tokenizer, fare_scaler)

# Split the dataset once and take both parts from the same result, instead of
# recomputing the identical seeded split for each part.
dataset_splits = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = dataset_splits['train']
val_dataset = dataset_splits['test']

# Define LORA configuration
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    # Module-name suffixes of the BERT encoder that receive LoRA adapters.
    target_modules=["query", "key", "value", "dense"],
    # The regression head (attribute `regression_head`, inner layer `linear`) matches
    # none of the target_modules, so without this entry it would stay frozen at its
    # random initialisation and only the adapters would be trained.
    modules_to_save=["regression_head"],
    lora_dropout=0.1,
    bias="none",
    task_type="SEQ_CLS",
)

# Apply LoRA to the model
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Define compute_metrics function
def compute_metrics(eval_pred):
    """Compute MSE, MAE and RMSE between predictions and labels.

    Args:
        eval_pred: Pair ``(predictions, labels)``. Each element may be a NumPy
            array, a tensor exposing ``detach()``, or any array-like; both are
            flattened before comparison.

    Returns:
        dict with float entries ``"mse"``, ``"mae"`` and ``"rmse"``. Pairs where
        either value is non-finite are ignored. If no finite pair remains, all
        three entries are NaN, so that a run with no usable predictions is not
        reported as a perfect score of 0.0.

    Raises:
        ValueError: If predictions and labels have different numbers of elements.
    """
    def _as_flat_array(values):
        # Tensors are moved to host memory first; everything else goes through NumPy.
        if not isinstance(values, np.ndarray):
            if hasattr(values, "detach"):
                values = values.detach().cpu().numpy()
            else:
                values = np.asarray(values)
        return values.reshape(-1).astype(np.float64)

    predictions, labels = eval_pred
    predictions = _as_flat_array(predictions)
    labels = _as_flat_array(labels)

    if predictions.shape != labels.shape:
        raise ValueError(
            f"predictions and labels differ in size: {predictions.size} vs {labels.size}"
        )

    finite_mask = np.isfinite(predictions) & np.isfinite(labels)
    if not finite_mask.any():
        nan = float("nan")
        return {"mse": nan, "mae": nan, "rmse": nan}

    errors = predictions[finite_mask] - labels[finite_mask]
    mse = float(np.mean(np.square(errors)))
    mae = float(np.mean(np.abs(errors)))
    rmse = float(np.sqrt(mse))
    return {"mse": mse, "mae": mae, "rmse": rmse}

# TrainingArguments
# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases; pick whichever keyword the installed TrainingArguments declares so
# the cell runs on both old and new versions.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in getattr(TrainingArguments, "__dataclass_fields__", {})
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="/content/bert_fpllm_output",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    warmup_steps=10,
    optim="adamw_torch_fused",
    num_train_epochs=3,
    # A positive max_steps caps the run length and takes precedence over num_train_epochs.
    max_steps=800,
    learning_rate=2e-5,
    logging_steps=50,
    fp16=True,  # Enable mixed precision training (BERT run)
    lr_scheduler_type="constant",
    weight_decay=0.1,
    # Evaluate and checkpoint on the same 50-step cadence so the best checkpoint
    # can be restored at the end.
    eval_steps=50,
    logging_strategy="steps",
    save_strategy="steps",
    save_steps=50,
    label_names=[],
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    report_to="none",
    **{_eval_strategy_key: "steps"},
)

def custom_compute_loss(model, inputs, return_outputs=False):
    """Compute the MSE loss between the model's predictions and ``inputs["labels"]``.

    Args:
        model: Callable invoked as ``model(**inputs)``. Its result may be a
            mapping with a ``"logits"`` entry, an object with a ``logits``
            attribute, or the prediction tensor itself.
        inputs: Mapping of keyword arguments for the model; ``"labels"`` is optional.
        return_outputs: When true, return ``(loss, outputs)`` instead of ``loss``.

    Returns:
        The MSE loss, or ``None`` when ``inputs`` has no labels; paired with the
        raw model outputs when ``return_outputs`` is true.
    """
    labels = inputs.get("labels")  # Get labels from inputs

    # Forward pass to get predictions (logits)
    outputs = model(**inputs)
    if isinstance(outputs, dict):
        logits = outputs["logits"]
    else:
        logits = getattr(outputs, "logits", outputs)

    # Calculate loss against labels reshaped to a single column.
    loss = None
    if labels is not None:
        loss = nn.MSELoss()(logits, labels.reshape(-1, 1).float())

    return (loss, outputs) if return_outputs else loss


# Trainer Initialization and Training
# Build the Trainer from the LoRA-wrapped model, the training arguments and the two dataset splits.
# Note that custom_compute_loss is not passed here, so it plays no part in this run.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    # Notebook-defined callback: requests a stop once 3 evaluations in a row fail to improve the metric.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

# Start fine-tuning.
trainer.train()

No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


trainable params: 1,339,392 || all params: 110,822,401 || trainable%: 1.2086


Step,Training Loss,Validation Loss,Mse,Mae,Rmse
50,1.2563,0.134497,0.134497,0.306069,0.366738
100,0.5112,0.116738,0.116738,0.290131,0.341669
150,0.4853,0.136197,0.136197,0.307691,0.369049
200,0.5405,0.119269,0.119269,0.292412,0.345353
250,0.4764,0.116383,0.116383,0.2898,0.34115
300,0.472,0.103739,0.103739,0.278182,0.322086
350,0.4371,0.111907,0.111907,0.285705,0.334526
400,0.4396,0.102986,0.102986,0.277415,0.320915
450,0.4603,0.102871,0.102871,0.277251,0.320735
500,0.4358,0.122314,0.122314,0.295174,0.349734


Early stopping triggered after 3 epochs without improvement.


TrainOutput(global_step=600, training_loss=0.5297805404663086, metrics={'train_runtime': 2363.9025, 'train_samples_per_second': 2.707, 'train_steps_per_second': 0.338, 'total_flos': 0.0, 'train_loss': 0.5297805404663086, 'epoch': 0.027149321266968326})

## Analytics

In [None]:
# Summary of the data used for training; the bare print('\n') calls only add spacing.
print('\n')
# Dataset repr of the full tokenized dataset.
print(f'Dataset tokenize structure: {tokenized_dataset}')
print('\n')
# Row counts of the two splits.
print(f"Train Dataset Size: {len(train_dataset)}")
print(f" Eval Dataset Size: {len(val_dataset)}")
print('\n')
# Fare range the MinMaxScaler was fitted on (data_max_/data_min_ of its single feature).
print(f"Maximun fare: {fare_scaler.data_max_[0]}")
print(f"Minimum fare: {fare_scaler.data_min_[0]}")




Dataset tokenize structure: Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 221000
})


Train Dataset Size: 176800
 Eval Dataset Size: 44200


Maximun fare: 457.0
Minimum fare: 50.0


## Prompt Design

In [None]:
def generate_final_prompt(db_path='flights.db'):
    """Concatenate every question/fare pair of ``flight_qa`` into one prompt string.

    Each row contributes ``"<question> [INST] <fare> [/INST]"``; the pieces are
    joined without a separator, in ascending ``id`` order.

    Args:
        db_path: Path of the SQLite database holding the ``flight_qa`` table.

    Returns:
        The concatenated prompt (an empty string when the table has no rows).
    """
    conn = sqlite3.connect(db_path)
    try:
        # Retrieve question and fare from flight_qa table in insertion (id) order.
        rows = conn.execute("SELECT question, fare FROM flight_qa ORDER BY id").fetchall()
    finally:
        # Release the connection even if the query fails.
        conn.close()

    # Join once instead of growing a string inside a loop.
    return "".join(f"{question} [INST] {fare} [/INST]" for question, fare in rows)

# Main execution
# NOTE(review): this call appends 1000 more rows to the same flight_qa table that was
# populated for training, so every re-run of this cell grows the table — confirm this is intended.
create_and_populate_tables(flights_subset_df, num_records=1000)
# Build the prompt from every row currently in the table.
final_prompt = generate_final_prompt()
# Show only the first 78 characters (roughly the first question/fare pair).
print(final_prompt[0:78])

What is the fare for a flight from BLI to LIT with UK? [INST] 263.63 [/INST]Wh


## Evaluation

In [None]:
# Evaluate the model
# Run one evaluation pass with the trainer built above and keep the metrics dict.
# NOTE(review): the early-stopping callback's on_evaluate presumably fires here too,
# so its counter keeps advancing after training — verify against the printed output.
eval_results = trainer.evaluate()

# Print the evaluation results
print(eval_results)

Early stopping triggered after 5 epochs without improvement.
{'eval_loss': 0.10287108272314072, 'eval_mse': 0.10287108272314072, 'eval_mae': 0.27725088596343994, 'eval_rmse': 0.32073522214303296, 'eval_runtime': 188.0918, 'eval_samples_per_second': 234.992, 'eval_steps_per_second': 29.374, 'epoch': 0.027149321266968326}
