In [1]:
import os
import re
import math
import json
import joblib
from tqdm import tqdm
import torch
import random
from dotenv import load_dotenv
from huggingface_hub import login,HfApi
import numpy as np
import pickle
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
import chromadb
from xgboost import XGBRegressor
from sklearn.manifold import TSNE

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
load_dotenv()
os.environ['HF_TOKEN'] = os.getenv('HF_TOKEN')
DB = "../chroma_db"

In [3]:
# Log in to HuggingFace

hf_token = os.environ['HF_TOKEN']
login(hf_token, add_to_git_credential=True)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [4]:
HF_USER = "Codexankit"
DATASET_NAME = f"{HF_USER}/items-prompt-lite"

In [5]:
dataset = load_dataset(DATASET_NAME)
train = dataset['train']
test = dataset['test']

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Generating train split: 100%|██████████| 20000/20000 [00:00<00:00, 196263.77 examples/s]
Generating val split: 100%|██████████| 1000/1000 [00:00<00:00, 167070.46 examples/s]
Generating test split: 100%|██████████| 1000/1000 [00:00<00:00, 104421.64 examples/s]


In [7]:
print(train[0]["prompt"])

What does this cost to the nearest dollar?

Title: Schlage F59 & 613 Andover Interior Knob (Deadbolt Included)  
Category: Home Hardware  
Brand: Schlage  
Description: A single‑piece oil‑rubbed bronze knob that mounts to a deadbolt for secure, easy interior door use.  
Details: Designed for a 4" minimum center‑to‑center door prep, it offers a lifetime mechanical and finish warranty and comes ready for quick installation.

Price is $


In [8]:
print(test[0]["prompt"])

What does this cost to the nearest dollar?

Title: Excess V2 Distortion/Modulation Pedal  
Category: Music Pedals  
Brand: Old Blood Noise  
Description: A versatile pedal offering distortion and three modulation modes—delay, chorus, and harmonized fifths—with full control over signal routing and expression.  
Details: Features include separate gain, tone, and volume controls; time, depth, and volume per modulation; order switching, soft‑touch bypass, and expression jack for dynamic control.

Price is $


In [9]:
# Load embedding model
model_embedding = SentenceTransformer("intfloat/e5-small-v2")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Loading weights: 100%|██████████| 199/199 [00:00<00:00, 297.52it/s, Materializing param=pooler.dense.weight]                               
[1mBertModel LOAD REPORT[0m from: intfloat/e5-small-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


In [10]:
model_embedding

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False, 'architecture': 'BertModel'})
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

### Create a Chroma VectorStore
You can configure Chroma to save and load the database from your local machine, using the PersistentClient.

Data will be persisted automatically and loaded on start (if it exists).

Collections are where you'll store your embeddings, documents, and any additional metadata. Collections index your embeddings and documents, and enable efficient retrieval and filtering. You can create a collection with a name:

In [11]:
client = chromadb.PersistentClient(path=DB)

In [12]:
# Check if the collection exists and delete it if it does
collection_name = "price_items"
existing_collection_names = client.list_collections()

if collection_name in existing_collection_names:
    client.delete_collection(collection_name)
    print(f"Deleted existing collection: {collection_name}")

collection = client.create_collection(collection_name)

In [14]:
# Format description function (no price in text)
def description(item):
    text = item["prompt"].replace(
        "How much does this cost to the nearest dollar?\n\n", ""
    )
    text = text.split("\n\nPrice is $")[0]
    return f"passage: {text}"


description(train[0])

'passage: What does this cost to the nearest dollar?\n\nTitle: Schlage F59 & 613 Andover Interior Knob (Deadbolt Included)  \nCategory: Home Hardware  \nBrand: Schlage  \nDescription: A single‑piece oil‑rubbed bronze knob that mounts to a deadbolt for secure, easy interior door use.  \nDetails: Designed for a 4" minimum center‑to‑center door prep, it offers a lifetime mechanical and finish warranty and comes ready for quick installation.'

In [16]:
batch_size = 300  # how many items to insert into Chroma at once
encode_batch_size = 1024  # how many items to encode at once in GPU memory

for i in tqdm(range(0, len(train), batch_size), desc="Processing batches"):
    end_idx = min(i + batch_size, len(train))

    # Collect documents and metadata
    documents = [description(train[j]) for j in range(i, end_idx)]
    metadatas = [{"completion": train[j]["completion"]} for j in range(i, end_idx)]
    ids = [f"doc_{j}" for j in range(i, end_idx)]

    # GPU batch encoding
    vectors = model_embedding.encode(
        documents,
        batch_size=encode_batch_size,
        show_progress_bar=False,
        normalize_embeddings=True,
    ).tolist()

    # Insert into Chroma
    collection.add(
        ids=ids, documents=documents, embeddings=vectors, metadatas=metadatas
    )

print("✅ Embedding and storage to ChromaDB completed.")

Processing batches: 100%|██████████| 67/67 [12:40<00:00, 11.36s/it]

✅ Embedding and storage to ChromaDB completed.





##  Embedding-Based Regression with XGBoost

In [21]:
# Step 1: Load vectors and prices from Chroma
result = collection.get(include=["embeddings", "documents", "metadatas"])

vectors = np.asarray(result["embeddings"], dtype=np.float32)
documents = result["documents"]

prices = np.asarray(
    [float(meta["completion"]) for meta in result["metadatas"]],
    dtype=np.float32
)

In [23]:
# Step 2: Train XGBoost model
xgb_model = XGBRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    random_state=42,
    n_jobs=-1
)
xgb_model.fit(vectors, prices)

In [24]:
# Step 3: Serialize XGBoost model locally for Hugging Face upload 
ROOT = os.path.abspath(os.path.join(os.getcwd(), ".."))
MODEL_DIR = os.path.join(ROOT, "models")
MODEL_FILENAME = "xgboost_model.pkl"
LOCAL_MODEL = os.path.join(MODEL_DIR, MODEL_FILENAME)

os.makedirs(MODEL_DIR, exist_ok=True)
joblib.dump(xgb_model, LOCAL_MODEL)

['d:\\AmaZon_Price_Analyzer\\models\\xgboost_model.pkl']

In [25]:
# Step 4: Push serialized XGBoost model to Hugging Face Hub
api = HfApi(token=hf_token)
REPO_NAME = "smart-deal-finder-models"
REPO_ID = f"{HF_USER}/{REPO_NAME}"

# Create the model repo if it doesn't exist
api.create_repo(repo_id=REPO_ID, repo_type="model", private=True, exist_ok=True)

# Upload the saved model
api.upload_file(
    path_or_fileobj=LOCAL_MODEL,
    path_in_repo=MODEL_FILENAME,
    repo_id=REPO_ID,
    repo_type="model",
)

Processing Files (1 / 1): 100%|██████████| 1.35MB / 1.35MB,  321kB/s  
New Data Upload: 100%|██████████| 1.35MB / 1.35MB,  321kB/s  


CommitInfo(commit_url='https://huggingface.co/Codexankit/smart-deal-finder-models/commit/3b6b8399daf31a4861b9b37572c17d5d815dfa62', commit_message='Upload xgboost_model.pkl with huggingface_hub', commit_description='', oid='3b6b8399daf31a4861b9b37572c17d5d815dfa62', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Codexankit/smart-deal-finder-models', endpoint='https://huggingface.co', repo_type='model', repo_id='Codexankit/smart-deal-finder-models'), pr_revision=None, pr_num=None)