<a href="https://colab.research.google.com/github/frank-morales2020/Cloud_curious/blob/master/MISTRAL_FARE_PREDICTION_DEMO.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

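**Datasets**

- Airline delay causes: https://www.kaggle.com/datasets/giovamata/airlinedelaycauses
- US airline flight routes and fares, 1993–2024 (the dataset downloaded in the Feature Engineering section): https://www.kaggle.com/datasets/bhavikjikadara/us-airline-flight-routes-and-fares-1993-2024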

# MISTRAL

## LIBRARIES

In [None]:
# Dependency setup for a Google Colab runtime.
# The commands run in the order written. NOTE(review): flash-attn is installed
# with --no-build-isolation, presumably so it builds against the torch that is
# already present -- confirm before reordering these lines.
# NOTE(review): no versions are pinned, so a fresh run may resolve different
# releases than the ones that produced the recorded outputs.

# PyTorch and TensorBoard.
!pip install torch tensorboard --quiet

# Hugging Face libraries (upgraded to the latest available release).
!pip install  --upgrade transformers datasets accelerate evaluate bitsandbytes --quiet

# FlashAttention only supports Ampere GPUs or newer, so this notebook needs an
# A100 or L4 runtime in Google Colab. The install below is unconditional.
!pip install -U flash-attn --no-build-isolation --quiet


# PEFT (LoRA adapters), TRL, and build helpers.
# NOTE(review): `datasets` is also installed by the Hugging Face line above.
! pip install peft --quiet
! pip install datasets trl ninja packaging --quiet

# NOTE(review): an earlier comment here suggested enabling a second, opt-in
# flash-attn install only on A100 GPUs; flash-attn is already installed
# unconditionally above, so that duplicate command has been dropped.
!pip install diffusers safetensors  --quiet
!pip install colab-env --quiet


In [None]:
import torch
import os
import sys
import json
import IPython
from datetime import datetime
from datasets import load_dataset
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    AutoTokenizer,
    TrainingArguments,
)
from trl import SFTTrainer

## FEATURE ENGINEERING  

In [None]:
import kagglehub
import pandas as pd
import os
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from transformers import TrainingArguments, Trainer
import numpy as np
import torch
from torch.utils.data import Dataset
import torch.nn as nn

# Data loading and preprocessing.
# Downloads the Kaggle dataset, locates a CSV inside the returned directory,
# and builds `flights_df` with complete rows and explicit column dtypes.
db_name = "bhavikjikadara/us-airline-flight-routes-and-fares-1993-2024"
dataset_path = kagglehub.dataset_download(db_name)

# os.listdir() returns entries in arbitrary order; sorting makes the choice of
# CSV deterministic if the dataset directory ever holds more than one.
files = sorted(os.listdir(dataset_path))
csv_file_path = next(
    (os.path.join(dataset_path, f) for f in files if f.endswith('.csv')),
    None,
)

# Fail before any parsing is attempted when there is nothing to read.
if csv_file_path is None:
    raise FileNotFoundError("No CSV file found in the dataset directory.")

# Columns the rest of the notebook relies on, mapped to the dtype each one
# must have. Rows missing any of these columns are discarded.
required_column_dtypes = {
    'Geocoded_City1': str,
    'Geocoded_City2': str,
    'city1': str,
    'city2': str,
    'carrier_lg': str,
    'Year': int,
    'quarter': int,
    'nsmiles': float,
    'passengers': int,
    'fare': float,
}

# One chained expression instead of in-place mutation plus ten separate casts:
# each step returns a new frame, so nothing is modified behind the reader's back.
flights_df = (
    pd.read_csv(csv_file_path)
    .dropna(subset=list(required_column_dtypes))
    .astype(required_column_dtypes)
)

# Textual representation of flight data
def flight_to_text(row):
    """Render one flight record as the English description used as model input.

    Args:
        row: Mapping-like record (for example a DataFrame row) providing the
            keys 'city1', 'city2', 'carrier_lg', 'Year', 'quarter', 'nsmiles'
            and 'passengers'.

    Returns:
        str: Three sentences covering route/carrier/period, distance, and
        passenger count, separated by single spaces.
    """
    origin, destination = row['city1'], row['city2']
    carrier = row['carrier_lg']
    year, quarter = row['Year'], row['quarter']

    route_part = f"Flight from {origin} to {destination} on {carrier} in {year}, quarter {quarter}."
    distance_part = f"Distance: {row['nsmiles']} miles."
    passengers_part = f"Passengers: {row['passengers']}."
    return " ".join([route_part, distance_part, passengers_part])

# Number of examples drawn for EACH split: the training set and the test set
# both receive exactly this many rows.
sample_size = 10000

# One natural-language description per flight record.
flights_df['text'] = flights_df.apply(flight_to_text, axis=1)

# Split descriptions and fares together so they stay aligned; the fixed
# random_state makes the draw repeatable.
all_texts = flights_df['text'].tolist()
all_fares = flights_df['fare'].tolist()
train_texts, test_texts, train_fares, test_fares = train_test_split(
    all_texts,
    all_fares,
    train_size=sample_size,
    test_size=sample_size,
    random_state=42,
)

class FlightDataset(Dataset):
    """Torch dataset pairing tokenizer encodings with fare targets.

    Args:
        encodings: Mapping from field name to a per-example sequence of
            values; each field is indexed by example position.
        labels: Sequence of fares, one per example. Its length defines the
            dataset length.
    """

    def __init__(self, encodings, labels):
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example `idx` as a dict of tensors, plus a 'labels' entry."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])

        # Fare target as a float32 tensor of shape (1, 1).
        sample['labels'] = torch.tensor([[self.labels[idx]]], dtype=torch.float32)
        return sample

# NOTE(review): `tokenizer` is not created in any cell shown in this notebook;
# it must already exist in the kernel before this point.

# Pad on the left so that padding tokens precede the real tokens.
tokenizer.padding_side = 'left'

# Training split: tokenize (padded within the split), then wrap with fares.
train_encodings = tokenizer(train_texts, padding=True, truncation=True)
train_dataset = FlightDataset(train_encodings, train_fares)

# Test split: same treatment, padded independently of the training split.
test_encodings = tokenizer(test_texts, padding=True, truncation=True)
test_dataset = FlightDataset(test_encodings, test_fares)

  flights_df = pd.read_csv(csv_file_path)


## FINE TUNING

In [None]:
from peft import  LoraConfig, TaskType, get_peft_model
from transformers import AutoModelForCausalLM, DataCollatorForLanguageModeling, TrainingArguments, Trainer

# Fine-tune the causal language model for fare prediction with PEFT (LoRA).
#
# NOTE(review): `base_model` and `tokenizer` are not created in any cell shown
# in this notebook; both must already exist in the kernel before this cell runs.

# Device handle. It is not used again in this cell: the model is placed with
# `trainer.model.to(training_args.device)` further down.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Sub-modules that receive LoRA adapters. Defined once and passed to LoraConfig
# (previously this name was assigned twice, ended up as a 1-tuple because of a
# trailing comma, and was never actually used).
target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]

lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=target_modules,
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
)

# Wrap the model with PEFT.
# NOTE(review): this rebinds `base_model` to the wrapped model, so re-running
# the cell without reloading the model would wrap it a second time.
base_model = get_peft_model(base_model, lora_config)

# Print trainable parameters for verification.
base_model.print_trainable_parameters()

output_dir = "/content/gdrive/MyDrive/model/Mistral-7B-fareprediction-attention-2"

training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=3,
    gradient_accumulation_steps=2,
    warmup_steps=10,
    optim="adamw_torch_fused",
    num_train_epochs=1,
    # The recorded run stopped at global_step=100 (about 0.06 of an epoch),
    # i.e. the step budget, not the epoch count, ended training.
    max_steps=100,
    learning_rate=5e-6,
    logging_steps=10,
    bf16=True,
    tf32=True,
    lr_scheduler_type="constant",
    weight_decay=0.2,
    # Evaluate every 10 steps and checkpoint every 20.
    eval_steps=10,
    report_to="none",
    save_steps=20,
    # NOTE(review): newer transformers releases name this argument
    # `eval_strategy`; confirm against the installed version.
    evaluation_strategy="steps",
    max_grad_norm=1.0,
    logging_strategy="steps",
    save_strategy="steps",
    # Reload the checkpoint with the lowest eval_loss once training finishes.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

# NOTE(review): DataCollatorForLanguageModeling(mlm=False) is a language-
# modelling collator and presumably derives `labels` from `input_ids`. If so,
# the fare values stored under 'labels' by FlightDataset never reach the loss
# and this run optimises next-token prediction on the flight descriptions --
# confirm against the collator documentation.
trainer = Trainer(
    model=base_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
)

# Move the model to the device selected by the training arguments.
trainer.model.to(training_args.device)

trainer.train()

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


trainable params: 20,971,520 || all params: 7,262,707,713 || trainable%: 0.2888


Step,Training Loss,Validation Loss
10,2.8642,2.632018
20,2.3786,2.060677
30,1.7753,1.545895
40,1.4302,1.293041
50,1.2375,1.170424
60,1.1704,1.130794
70,1.133,1.107469
80,1.1032,1.089792
90,1.0685,1.071693
100,1.0646,1.058095


TrainOutput(global_step=100, training_loss=1.522563991546631, metrics={'train_runtime': 3722.0377, 'train_samples_per_second': 0.161, 'train_steps_per_second': 0.027, 'total_flos': 1540432429056000.0, 'train_loss': 1.522563991546631, 'epoch': 0.059988002399520096})

## EVALUATION

In [None]:
from peft import PeftModel
# 7. Evaluate the fine-tuned model for fare prediction

# Adapter checkpoint saved at step 100 under the training output directory.
adapter_checkpoint = '/content/gdrive/MyDrive/model/Mistral-7B-fareprediction-attention-2/checkpoint-100'

# Load the PEFT weights and apply them to the model held in `base_model`.
fine_tuned_model = PeftModel.from_pretrained(base_model, adapter_checkpoint)

# Attach a single-output linear layer, sized from the model config, as the
# regression head.
# NOTE(review): the head is constructed here with fresh weights and no cell
# shown in this notebook trains it -- verify before trusting the metrics below.
hidden_size = fine_tuned_model.config.hidden_size
regression_head = nn.Linear(hidden_size, 1)
fine_tuned_model.base_model.model.regression_head = regression_head

def predict_fare(text):
    """Predict a fare for a single flight description.

    Tokenizes `text`, runs the fine-tuned model, takes the final-layer hidden
    state at the last sequence position and feeds it to the model's
    `regression_head`.

    Args:
        text: One flight description string.

    Returns:
        float: The scalar produced by the regression head.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Model and inputs must live on the same device.
    fine_tuned_model.to(device)
    # Prediction only: switch off dropout so repeated calls on the same text
    # give the same answer.
    fine_tuned_model.eval()

    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)

    # Gradients are never used here; disabling autograd avoids keeping
    # activations for a backward pass on every one of the evaluation calls.
    with torch.no_grad():
        outputs = fine_tuned_model(**inputs, output_hidden_states=True)

        # Final layer, last token position, all hidden features.
        last_hidden_state = outputs.hidden_states[-1][:, -1, :]

        # Cast to float32 before the head (the backbone may run in lower precision).
        last_hidden_state = last_hidden_state.type(torch.float32)

        prediction = fine_tuned_model.regression_head(last_hidden_state)

    return prediction.item()

# Predict a fare for every held-out description, one text at a time.
y_pred = list(map(predict_fare, test_texts))

# Compare predictions with the true fares of the test split.
mse = mean_squared_error(test_fares, y_pred)
r2 = r2_score(test_fares, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

## GEMINI ANALYSIS

In [None]:
!pip install google-generativeai -q
# Used to securely store your API key
from google.colab import userdata
import google.generativeai as genai

# Gemini model used for the written analysis of the experiment.
model_name = 'gemini-1.5-pro'

# Read the API key from the Colab secret named 'GEMINI' and register it with
# the client library, so the key never appears in the notebook itself.
GOOGLE_API_KEY = userdata.get('GEMINI')
genai.configure(api_key=GOOGLE_API_KEY)

model = genai.GenerativeModel(model_name)

In [None]:


def generate_gemini_feedback_prompt(observations, feedback):
    """Build the markdown experiment report that is sent to Gemini.

    Besides its two arguments, the report reads the notebook-level names
    `model_id`, `db_name`, `sample_size`, `r2` and `mse`, so those must be
    defined before this function is called.

    Args:
        observations: Text placed under the "Observations" heading.
        feedback: Text placed under the "Feedback" heading.

    Returns:
        str: The report, beginning and ending with a newline.
    """
    report_lines = [
        "",
        "**Task:** Flight Fare Prediction",
        "",
        f"**Model:** {model_id}",
        "",
        f"**Dataset:** {db_name}",
        "",
        f"**Sample Size:** {sample_size} (training and testing)",
        "",
        "**Target of Prediction:** Flight Fare",
        "",
        "**Evaluation Metrics:**",
        f"- R2: {r2}",
        f"- MSE: {mse}",
        "",
        "**Observations:**",
        f"{observations}",
        "",
        "**Feedback:**",
        f"{feedback}",
        "",
    ]
    return "\n".join(report_lines)

# ... (rest of your existing code for submitting to Gemini and printing analysis) ...

In [None]:
# Build the experiment report, append the analysis instruction, and submit the
# combined prompt to Gemini.
observations_value = "The model showed promising results... (your observations)"
feedback_value = "It would be beneficial if Gemini... (your feedback)"

# Report section: task, model, dataset and the measured metrics.
prompt_with_eval_details = generate_gemini_feedback_prompt(
    observations_value,
    feedback_value,
)

# Instruction section, appended after the report.
prompt_for_analysis = """
You are a helpful fare prediction expert.
Please, make an additional analysis of this Fine-Tuning experiment report with metrics susch as MSE and R2.
"""

final_prompt = f"{prompt_with_eval_details}{prompt_for_analysis}"

model_name = "gemini-1.5-pro"  # Replace with desired model
model = genai.GenerativeModel(model_name)

# Send the prompt and keep the text of the reply.
response = model.generate_content(final_prompt)
llm_analysis = response.text

print("\n\n## LLM Analysis:\n")
print(llm_analysis)