🔗 Ensemble Model
---
We’ll reuse core components built earlier:

- A fine-tuned LLaMA model
- An XGBoost regression model, stored in Hugging Face
- A ChromaDB vector store, stored on Google Drive and also available on AWS S3
- A GPT-4o mini + RAG pipeline

We'll run all three models on the same test data, gather their predictions, and train a Linear Regression Ensemble. The ensemble learns how to combine these predictions to output a more accurate final price.

Once trained, we'll save the ensemble as ensemble_model.pkl, ready for later use.

- 🧑‍💻 Skill Level: Advanced
- ⚙️ Hardware: ⚠️ GPU required (use Google Colab)
- 🛠️ Requirements: 

    - 🔑 Hugging Face Token and OpenAI Key — must be set in Google Colab secrets or .env files if you are running with your own GPU
    - Completion of Part 9 of [this series of notebooks](https://github.com/lisekarimi/lexo)
- 🎯 Task: Train and save the Ensemble Model

---
📢 Find more LLM notebooks on my [GitHub repository](https://github.com/lisekarimi/lexo)

In [None]:
# Install required packages in Google Colab
# NOTE(review): openai, pandas, scikit-learn, joblib, requests and transformers are
# imported below but not listed here — presumably preinstalled on Colab; confirm
# before running on your own GPU machine.
%pip install -q tqdm huggingface_hub numpy sentence-transformers datasets chromadb xgboost peft torch bitsandbytes

In [None]:
# imports

import os
import re
import zipfile
import chromadb
import joblib
import numpy as np
import pandas as pd
import requests
import torch
from datasets import load_dataset
from google.colab import userdata
from huggingface_hub import HfApi, hf_hub_download, login
from openai import OpenAI
from peft import PeftModel
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.metrics import r2_score
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

In [None]:
# Mount Google Drive to access saved ChromaDB and XGBoost model files
# (the configuration cell's ROOT lives under this mount point).

from google.colab import drive
drive.mount("/content/drive")

In [None]:
# Load from Colab's secure storage

# OpenAI client; used by gpt4o_price for chat completions.
openai_api_key = userdata.get("OPENAI_API_KEY")
openai = OpenAI(api_key=openai_api_key)

# Hugging Face login; the same token is reused for HfApi and hf_hub_download.
hf_token = userdata.get("HF_TOKEN")
login(hf_token, add_to_git_credential=True)

In [None]:
# Configuration

HF_USER = "lisekarimi"
# Working directory on the mounted Google Drive (ChromaDB files and saved models).
ROOT = "/content/drive/MyDrive/snapr"
os.makedirs(ROOT, exist_ok=True)

# Hugging Face model repo: the XGBoost model is downloaded from it and the
# trained ensemble is uploaded to it.
api = HfApi(token=hf_token)
REPO_NAME = "smart-deal-finder-models"
REPO_ID = f"{HF_USER}/{REPO_NAME}"

### 📥 Load Test Dataset

In [None]:
# If you hit "NotImplementedError: Loading a dataset cached in a LocalFileSystem is not supported", run:
# %pip install -U datasets

In [None]:
# Load the pricing dataset from the Hugging Face Hub and keep only its "test" split;
# both the ensemble's training and evaluation subsets are drawn from this split.
DATASET_NAME = f"{HF_USER}/pricer-data"
dataset = load_dataset(DATASET_NAME)
test = dataset["test"]

In [None]:
# Build the model-facing text for an item (question header and price removed)
def description(item):
    """Return the item's text without the question or the price, prefixed with "passage: ".

    Args:
        item: Mapping with a "text" entry holding the full training prompt.

    Returns:
        "passage: " followed by everything before the first "\\n\\nPrice is $" marker,
        with the leading question removed.
    """
    question = "How much does this cost to the nearest dollar?\n\n"
    price_marker = "\n\nPrice is $"
    without_question = item["text"].replace(question, "")
    # Keep only what precedes the first price marker (whole text if it is absent).
    body, _, _ = without_question.partition(price_marker)
    return f"passage: {body}"


description(test[0])

### 📥 Load Models and ChromaDB

In [None]:
# ChromaDB

CHROMA_PATH = f"{ROOT}/chroma"
COLLECTION_NAME = "price_items"
CHROMA_ZIP_URL = "https://aiprojects-lise-karimi.s3.eu-west-3.amazonaws.com/smart-deal-finder/chroma.zip"

# Download and unzip if CHROMA_PATH doesn't exist or is empty.
# An empty directory is treated as "missing" so that a previously failed download
# does not leave us silently opening an empty vector store.
if not os.path.isdir(CHROMA_PATH) or not os.listdir(CHROMA_PATH):
    zip_path = "/tmp/chroma.zip"
    # Stream to disk instead of holding the whole archive in memory, and fail
    # loudly on HTTP errors rather than writing an error page into the zip file.
    with requests.get(CHROMA_ZIP_URL, stream=True, timeout=60) as r:
        r.raise_for_status()
        with open(zip_path, "wb") as f:
            for chunk in r.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)
    # Create the target directory only once the download has succeeded.
    os.makedirs(CHROMA_PATH, exist_ok=True)
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(CHROMA_PATH)

# Open the persisted store; `collection` is queried by gpt4o_price for similar products.
client = chromadb.PersistentClient(path=CHROMA_PATH)
collection = client.get_or_create_collection(name=COLLECTION_NAME)

In [None]:
# Embedding Model
# Shared by xgboost_price (feature vector) and gpt4o_price (ChromaDB query vector).
# Loaded directly on the GPU ("cuda"), so this cell needs a CUDA runtime.

embedding_model = SentenceTransformer("intfloat/e5-small-v2", device="cuda")

In [None]:
# Fine Tuned Llama Model
# Loads the 4-bit quantized base model and applies the fine-tuned adapter on top.

BASE_MODEL = "meta-llama/Meta-Llama-3.1-8B"
FINETUNED_MODEL = "ed-donner/pricer-2024-09-13_13.04.39"
# Adapter revision passed to PeftModel.from_pretrained so the same weights load every run.
REVISION = "e8d637df551603dc86cd7a1598a8f44af4d7ae36"

# Quantization config (4-bit, NF4 with double quantization, bfloat16 compute)
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
)

# Load tokenizer; the EOS token is reused as the padding token.
# NOTE(review): trust_remote_code=True permits running code shipped with the repo —
# probably unnecessary for this tokenizer; confirm and drop if so.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Load base model (quantized, device placement chosen automatically)
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL, quantization_config=quant_config, device_map="auto"
)

# Load fine-tuned model (adapter applied to the base model at the pinned revision)
fine_tuned_model = PeftModel.from_pretrained(
    base_model, FINETUNED_MODEL, revision=REVISION
)

# Align generation config with the tokenizer's padding token
fine_tuned_model.generation_config.pad_token_id = tokenizer.pad_token_id

# Footprint is reported in bytes; divide by 1e6 to print megabytes.
print(f"Memory footprint: {fine_tuned_model.get_memory_footprint() / 1e6:.1f} MB")

In [None]:
# XGBoost Trained Model
# Downloaded from the Hugging Face model repo (REPO_ID) and deserialized with joblib.
# NOTE: joblib.load unpickles the file, so only load artifacts from a repo you trust.
# NOTE: MODEL_FILENAME is reassigned later in the notebook for the ensemble file.

MODEL_FILENAME = "xgboost_model.pkl"
model_path = hf_hub_download(repo_id=REPO_ID, filename=MODEL_FILENAME, token=hf_token)
xgb_model = joblib.load(model_path)

### 📊 Model prediction collection

In [None]:
def extract_tagged_price(output: str):
    """Parse the price that follows the first 'Price is $' marker in ``output``.

    Only the text between the first and (if present) second marker is searched,
    with thousands separators removed.

    Args:
        output: Model output text expected to contain 'Price is $'.

    Returns:
        The first number found after the marker as a float, or 0.0 when the
        marker or a number is missing, or when any other error occurs.
    """
    try:
        segments = output.split("Price is $")
        after_marker = segments[1].replace(",", "")
        found = re.search(r"[-+]?\d*\.\d+|\d+", after_marker)
        if found is None:
            return 0.0
        return float(found.group())
    except Exception:
        # Best-effort parser: a missing marker or non-string input yields 0.0.
        return 0.0

In [None]:
def ft_llama_price(description: str):
    """Predict a price with the fine-tuned LLaMA model.

    Wraps the description in the pricing prompt, generates up to 5 new tokens
    on the GPU, and parses the number that follows 'Price is $'.

    Args:
        description: Product text to price.

    Returns:
        The parsed price as a float (0.0 when no price can be extracted).
    """
    prompt = (
        f"How much does this cost to the nearest dollar?\n\n{description}\n\nPrice is $"
    )
    encoded = tokenizer(prompt, return_tensors="pt").to("cuda")
    generated = fine_tuned_model.generate(
        **encoded,
        max_new_tokens=5,
        num_return_sequences=1,
    )
    # The decoded sequence contains the prompt plus the completion, so the
    # 'Price is $' marker is present for the parser.
    decoded = tokenizer.decode(generated[0])
    return extract_tagged_price(decoded)

In [None]:
def xgboost_price(description: str):
    """Predict a price with the XGBoost model from the description's embedding.

    Args:
        description: Product text to price.

    Returns:
        The prediction as a float rounded to 2 decimals, floored at 0.
    """
    embeddings = embedding_model.encode([description], normalize_embeddings=True)
    raw_prediction = xgb_model.predict([embeddings[0]])[0]
    # Negative predictions are clamped to zero.
    non_negative = max(0, raw_prediction)
    return round(float(non_negative), 2)

In [None]:
def gpt4o_price(item):
    """Estimate an item's price with GPT-4o mini, using similar products as context.

    The item's description is embedded, the 5 nearest entries are fetched from the
    ChromaDB collection, and their texts and prices are added to the prompt.

    Args:
        item: Dataset row; it is passed to ``description`` to obtain the text to price.

    Returns:
        The first number found in the model's reply as a float, or 0.0 when the
        reply is empty or contains no number (same fallback as
        ``extract_tagged_price``).
    """

    def get_embedding(text):
        # Returns a 2-D array with a single row, the shape the query below expects.
        return embedding_model.encode([text], normalize_embeddings=True)

    def find_similars(text):
        results = collection.query(
            query_embeddings=get_embedding(text).astype(float).tolist(), n_results=5
        )
        docs = results["documents"][0]
        prices = [m["price"] for m in results["metadatas"][0]]
        return docs, prices

    def format_context(similars, prices):
        header = (
            "To provide some context, here are similar products and their prices:\n\n"
        )
        entries = [
            f"Product:\n{sim}\nPrice is ${price:.2f}\n\n"
            for sim, price in zip(similars, prices)
        ]
        return header + "".join(entries)

    def build_messages(item_text, similars, prices):
        system_message = (
            "You are a pricing expert. "
            "Given a product description and a few similar products with their prices, "
            "estimate the most likely price. "
            "Respond ONLY with a number, no words."
        )
        context = format_context(similars, prices)
        user_prompt = (
            "Estimate the price for the following product:\n\n"
            + item_text
            + "\n\n"
            + context
        )
        return [
            {"role": "system", "content": system_message},
            {"role": "user", "content": user_prompt},
            {"role": "assistant", "content": "Price is $"},
        ]

    # Build the description once and reuse it for retrieval and for the prompt.
    item_text = description(item)
    docs, prices = find_similars(item_text)
    messages = build_messages(item_text, docs, prices)
    response = openai.chat.completions.create(
        model="gpt-4o-mini", messages=messages, seed=42, max_tokens=5
    )
    # The message content may be None; treat that like an empty reply.
    reply = response.choices[0].message.content or ""
    cleaned = reply.replace("$", "").replace(",", "")
    match = re.search(r"[-+]?\d*\.\d+|\d+", cleaned)
    # re.search returns None when there is no number; fall back to 0.0 instead of
    # raising AttributeError and aborting the whole (paid) prediction loop.
    return float(match.group()) if match else 0.0

### ✂️ Split dataset and process

In [None]:
# Split the indices of the dataset's test split into an ensemble-training part
# and an ensemble-evaluation part, then keep only a small sample of each.
print("Splitting entire dataset...")
np.random.seed(42)  # fixed seed so the shuffle below is reproducible
all_indices = list(range(len(test)))
np.random.shuffle(all_indices)  # in-place shuffle

train_split_size = int(0.8 * len(all_indices))
train_indices = all_indices[:train_split_size]  # 80% of total
test_indices = all_indices[train_split_size:]  # 20% of total

# The two slices above are disjoint, so the samples below never overlap.
train_indices = train_indices[:250]  # First 250 from training split
test_indices = test_indices[:50]  # First 50 from testing split

In [None]:
# Process subset of TRAINING data
# Each list receives one entry per item, in the same order, so they stay aligned
# row-by-row when assembled into the feature dataframe.
ft_llama_preds_train = []
gpt4omini_preds_train = []
xgboost_preds_train = []
true_prices_train = []

for i in tqdm(train_indices):
    item = test[i]
    text = description(item)
    true_prices_train.append(item["price"])
    ft_llama_preds_train.append(ft_llama_price(text))
    # gpt4o_price takes the raw item: it builds the description itself.
    gpt4omini_preds_train.append(gpt4o_price(item))
    xgboost_preds_train.append(xgboost_price(text))

In [None]:
# Inspect the collected training predictions next to the true prices.
print("True Prices:", true_prices_train)
print("FT-LLaMA Predictions:", ft_llama_preds_train)
print("GPT-4o-mini Predictions:", gpt4omini_preds_train)
print("XGBoost Predictions:", xgboost_preds_train)

Example output:
- True Prices: [245.0, 24.99, 302.4, 737.0, ...]
- FT-LLaMA Predictions: [99.0, 53.0, 550.0, 852.0, ...]
- GPT-4o-mini Predictions: [179.99, 97.0, 348.0, 769.0, ...]
- XGBoost Predictions: [220.19, 59.85, 254.29, 335.76, 165.04, ...]

In [None]:
# Create features for TRAINING data
# One (FT-LLaMA, GPT-4o-mini, XGBoost) prediction triple per item.
train_triples = list(
    zip(ft_llama_preds_train, gpt4omini_preds_train, xgboost_preds_train)
)
# Per-item maximum and mean of the three base-model predictions.
maxes_train = [max(llama, gpt, xgb) for llama, gpt, xgb in train_triples]
means_train = [np.mean([llama, gpt, xgb]) for llama, gpt, xgb in train_triples]

# Create TRAINING dataframe (same column names and order as the TEST dataframe)
train_columns = {
    "FT_LLaMA": ft_llama_preds_train,
    "GPT4oMini": gpt4omini_preds_train,
    "XGBoost": xgboost_preds_train,
    "Max": maxes_train,
    "Mean": means_train,
}
X_train = pd.DataFrame(train_columns)

# Regression target: the true prices, aligned row-by-row with X_train.
y_train = pd.Series(true_prices_train)

### 🏋️Train the Ensemble Model

In [None]:
# Fit the linear ensemble on the base-model predictions.
# NOTE(review): ordinary least squares has no random component, so this seed is
# likely redundant — confirm before relying on it for reproducibility.
np.random.seed(42)
lr = LinearRegression()
lr.fit(X_train, y_train)

# Print feature coefficients
# NOTE: "Mean" is the average of the three model columns, so the features are
# linearly dependent; individual coefficients should be interpreted with care.
feature_columns = X_train.columns.tolist()
for feature, coef in zip(feature_columns, lr.coef_):
    print(f"{feature}: {coef:.2f}")
print(f"Intercept={lr.intercept_:.2f}")

- FT_LLaMA: 0.52
- GPT4oMini: 0.17
- XGBoost: -0.31
- Max: 0.45
- Mean: 0.13
- Intercept=-6.06

---
FT_LLaMA is the most influential model in the ensemble.

Max prediction also has strong positive impact.

GPT4oMini and Mean contribute less, but still add value.

XGBoost has a negative coefficient, acting as a counterbalance.


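Overall: FT_LLaMA leads, max adds value, XGBoost corrects for overestimation—resulting in a balanced ensemble.

Caveat: `Mean` is exactly the average of the three model columns, so the features are linearly dependent and the individual coefficients are not uniquely determined. Read them as a rough guide to how the ensemble weights its inputs, not as a precise ranking of the models.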

### 🔮 Prediction

In [None]:
# Process subset of TEST data
# Four parallel lists, one entry per evaluated item, kept in the same order.
ft_llama_preds_test = []
gpt4omini_preds_test = []
xgboost_preds_test = []
true_prices_test = []

print("Processing TEST data (50 items)...")
for i in tqdm(test_indices):
    item = test[i]
    text = description(item)
    true_prices_test.append(item["price"])
    ft_llama_preds_test.append(ft_llama_price(text))
    # gpt4o_price takes the raw item: it builds the description itself.
    gpt4omini_preds_test.append(gpt4o_price(item))
    xgboost_preds_test.append(xgboost_price(text))

# Create features for TEST data
# One (FT-LLaMA, GPT-4o-mini, XGBoost) prediction triple per item.
test_triples = list(
    zip(ft_llama_preds_test, gpt4omini_preds_test, xgboost_preds_test)
)
maxes_test = [max(llama, gpt, xgb) for llama, gpt, xgb in test_triples]
means_test = [np.mean([llama, gpt, xgb]) for llama, gpt, xgb in test_triples]

# Create TEST dataframe (same column names and order as the TRAINING dataframe)
test_columns = {
    "FT_LLaMA": ft_llama_preds_test,
    "GPT4oMini": gpt4omini_preds_test,
    "XGBoost": xgboost_preds_test,
    "Max": maxes_test,
    "Mean": means_test,
}
X_test = pd.DataFrame(test_columns)

# Ground-truth prices, aligned row-by-row with X_test.
y_test = pd.Series(true_prices_test)

### 🧪 Evaluation

In [None]:
# Evaluate on the test set
# Scores the fitted ensemble on the held-out sample built in the previous cell.
print("Evaluating model...")
y_pred = lr.predict(X_test)
r2 = r2_score(y_test, y_pred)
print(f"R² score: {r2:.4f}")

# Calculate RMSE (square root of the mean squared error, in price units)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE: {rmse:.2f}")

# Calculate MAPE (mean absolute error relative to the true price, in percent)
# NOTE(review): divides by y_test, so a true price of 0 would give inf/NaN —
# presumably all prices are positive; confirm against the dataset.
mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
print(f"MAPE: {mape:.2f}%")

Evaluating model...
- R² score: 0.7376
- RMSE: 127.62
- MAPE: 29.70%

---

- R² = 0.74: This is a solid R² value, indicating that our model explains about 74% of the variance in the price data. Generally, an R² above 0.7 is considered good for price prediction tasks.
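- RMSE = 127.6: Root-mean-square error in dollars; it penalizes large misses more heavily than a plain average error. Good if prices are in the thousands, but substantial for items priced in the tens or hundreds of dollars.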
- MAPE = 29.7%: This means our predictions are off by roughly 30% on average. Typical for price prediction, but there’s room for improvement.


### 🚀 Push to HF

In [None]:
# Serialize Ensemble model locally for Hugging Face upload

MODEL_DIR = os.path.join(ROOT, "models")
# NOTE: rebinds MODEL_FILENAME, which earlier held the XGBoost model's file name.
MODEL_FILENAME = "ensemble_model.pkl"
LOCAL_MODEL = os.path.join(MODEL_DIR, MODEL_FILENAME)

os.makedirs(MODEL_DIR, exist_ok=True)
# Persist the fitted LinearRegression ensemble with joblib.
joblib.dump(lr, LOCAL_MODEL)

# Create the model repo if it doesn't exist (private; no error if it already exists)
api.create_repo(repo_id=REPO_ID, repo_type="model", private=True, exist_ok=True)

# Upload the saved model to the repo root under the same file name
api.upload_file(
    path_or_fileobj=LOCAL_MODEL,
    path_in_repo=MODEL_FILENAME,
    repo_id=REPO_ID,
    repo_type="model",
)