In [None]:
# imports

import os
import re
import math
import json
from tqdm import tqdm
import random
from dotenv import load_dotenv
from huggingface_hub import login
import numpy as np
import pickle
from openai import OpenAI
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
import chromadb
from items import Item
from testing import Tester
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.metrics import mean_squared_error, r2_score
import joblib


In [None]:
# Configuration constants

# Question text; `description` further down splits item prompts on its trailing part.
QUESTION = 'How much does this cost to the nearest dollar?\n\n'
# Path handed to chromadb.PersistentClient when the vector store is opened below.
DB = 'products_vectorstore'
# Environment: read .env, then make sure both API keys are present in os.environ

load_dotenv(override=True)

# Keep whatever value is already set; otherwise fall back to the placeholder string.
os.environ.setdefault('OPENAI_API_KEY', 'your-key-if-not-using-env')
os.environ.setdefault('HF_TOKEN', 'your-key-if-not-using-env')

# Log in to the Hugging Face hub with the token resolved above.
hf_token = os.environ['HF_TOKEN']
login(hf_token, add_to_git_credential=True)

from items import Item

# Load the held-out items from the local pickle.
# NOTE(review): pickle.load executes arbitrary code from the file - only use a test.pkl you produced yourself.
with open('test.pkl', 'rb') as pickle_file:
    test = pickle.load(pickle_file)

# Open the persisted Chroma store and fetch every record of the 'products' collection.
client = chromadb.PersistentClient(path=DB)
collection = client.get_or_create_collection('products')
result = collection.get(include=['embeddings', 'documents', 'metadatas'])

# Split the result into the embedding matrix, the raw documents and the price targets.
vectors = np.array(result['embeddings'])
documents = result['documents']
prices = [entry['price'] for entry in result['metadatas']]


# CatBoost GBT

We will now train a CatBoost gradient-boosted tree (GBT) regressor, which takes the place of the Random Forest model.

Can you spot the difference from what we did in Week 6? In week 6 we used the word2vec model to form vectors; this time we'll use the vectors we already have in Chroma, from the SentenceTransformer model.

In [None]:
from catboost import CatBoostRegressor
import numpy as np

# Hyperparameters for the CatBoost regressor
catboost_params = {
    'iterations': 1000,
    'learning_rate': 0.03,
    'depth': 6,
    'loss_function': 'RMSE',
    'verbose': 100,
}
model = CatBoostRegressor(**catboost_params)

# Fit on the Chroma embeddings against the prices stored in their metadata
model.fit(vectors, prices)

# NOTE(review): the file keeps the 'random_forest' name although it holds a CatBoost model -
# presumably because RandomForestAgent loads this path; confirm before renaming.
joblib.dump(model, 'random_forest_model.pkl')

In [None]:
# NOTE(review): `model` is the fitted CatBoostRegressor, whereas the other Tester.test call in
# this notebook passes a function that takes an item and returns a price. Confirm that Tester
# accepts a model object here; otherwise wrap the model in an item -> price function first.
Tester.test(model, test)

In [None]:
from agents.specialist_agent import SpecialistAgent
from agents.frontier_agent import FrontierAgent
from agents.random_forest_agent import RandomForestAgent
from agents.my_specialist_agent import MySpecialistAgent

# One instance of each pricing agent; the cells below call .price(<description text>) on each.
# FrontierAgent is the only one constructed with the Chroma collection.
specialist = SpecialistAgent()
my_specialist = MySpecialistAgent()
frontier = FrontierAgent(collection)
random_forest = RandomForestAgent()

def description(item):
    """
    Pull the product description out of an item's prompt.

    The description is the text that follows the "...to the nearest dollar?" question
    (plus its blank line) and precedes the blank line and "Price is $" marker.
    Raises IndexError if the question marker is not present in the prompt.
    """
    after_question = item.prompt.split("to the nearest dollar?\n\n")[1]
    before_price = after_question.split("\n\nPrice is $")[0]
    return before_price
def rf(item):
    """Price an item with the `random_forest` agent, using the description parsed from its prompt."""
    text = description(item)
    return random_forest.price(text)

In [None]:
product = "Quadcast HyperX condenser mic for high quality audio for podcasting"

# Print each agent's estimate for the sample product, one per line, in a fixed order
for agent in (specialist, my_specialist, frontier, random_forest):
    print(agent.price(product))

In [None]:
# Per-agent estimates plus the true prices for the slice of `test` used to train the ensemble.
# NOTE: `prices` is rebound here - it previously held the Chroma metadata prices used to fit
# the CatBoost model, so re-running that cell after this one would see this list instead.
specialists, my_specialists, frontiers, random_forests, prices = [], [], [], [], []

for item in tqdm(test[1040:1250]):
    text = description(item)
    # Query the agents in a fixed order and file each estimate into its own list
    for estimates, agent in (
        (specialists, specialist),
        (my_specialists, my_specialist),
        (frontiers, frontier),
        (random_forests, random_forest),
    ):
        estimates.append(agent.price(text))
    prices.append(item.price)

In [None]:
# Row-wise summary features over the agents' estimates.
# Min and Max look at specialist, frontier and random forest only (the same three that
# EnsembleAgent.price uses), while Mean averages all four estimates.
mins, maxes, means = [], [], []
for s, ms, f, r in zip(specialists, my_specialists, frontiers, random_forests):
    mins.append(min(s, f, r))
    maxes.append(max(s, f, r))
    means.append(np.mean([s, ms, f, r]))

# Feature frame: column names and order must match what EnsembleAgent.price builds
feature_data = {
    'Specialist': specialists,
    'MySpecialist': my_specialists,
    'Frontier': frontiers,
    'RandomForest': random_forests,
    'Min': mins,
    'Max': maxes,
    'Mean': means,
}
X = pd.DataFrame(feature_data)

# Targets as a Series aligned with the rows of X
y = pd.Series(prices)

# Ensemble GBT

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Seed NumPy's global RNG in addition to the estimator's own random_state
np.random.seed(42)

# Hyperparameters of the gradient-boosted ensemble model
gbt_params = dict(
    n_estimators=150,
    max_depth=3,
    random_state=42,
    learning_rate=0.05,
    subsample=0.8,
    min_samples_split=4,
    min_samples_leaf=2,
    max_features='sqrt',
)

# The variable keeps the name `lr` even though the estimator is a GradientBoostingRegressor
lr = GradientBoostingRegressor(**gbt_params)
lr.fit(X, y)

feature_columns = X.columns.tolist()

# Report how much each input column contributes to the fitted model
print("Feature importances:")
for column, score in zip(feature_columns, lr.feature_importances_):
    print(f"{column}: {score:.4f}")

# Persist the model under the path that EnsembleAgent loads with joblib
joblib.dump(lr, 'ensemble_model.pkl')

In [None]:
from agents.ensemble_agent import EnsembleAgent
# Build the ensemble agent, handing it the Chroma collection opened earlier in the notebook
ensemble = EnsembleAgent(collection)

In [None]:
# Smoke test: the ensemble's estimate for the sample product defined earlier (shown as the cell output)
ensemble.price(product)

In [None]:
def ensemble_pricer(item):
    """Return the ensemble's price estimate for an item's description, floored at 0."""
    estimate = ensemble.price(description(item))
    return max(0, estimate)

In [None]:
# Evaluate the ensemble: Tester is given the item -> price function and the test items
Tester.test(ensemble_pricer, test)

# More changes

## Added my_specialist_agent

In [None]:
import modal
from agents.agent import Agent


class MySpecialistAgent(Agent):
    """
    An Agent that runs my own fine-tuned LLM, which is deployed remotely on Modal
    as the "Pricer" class of the "my_pricer-service" app.
    """

    # Own identity (was "Specialist Agent") so that this agent's log lines can be told
    # apart from those of the other agents running alongside it in the ensemble.
    name = "My Specialist Agent"
    color = Agent.RED

    def __init__(self):
        """
        Set up this Agent by looking up the deployed Modal class and instantiating it
        """
        self.log("My Specialist Agent is initializing - connecting to modal")
        # Points at my Modal service, which serves the custom fine-tuned model
        Pricer = modal.Cls.from_name("my_pricer-service", "Pricer")
        self.pricer = Pricer()
        self.log("My Specialist Agent is ready")

    def price(self, description: str) -> float:
        """
        Make a remote call to return the estimate of the price of this item

        :param description: the description of a product
        :return: the value returned by the remote `price` method
        """
        self.log("My Specialist Agent is calling remote fine-tuned model")
        result = self.pricer.price.remote(description)
        self.log(f"My Specialist Agent completed - predicting ${result:.2f}")
        return result


## Modified ensemble_agent

In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression
import joblib
import numpy as np
from agents.agent import Agent
from agents.specialist_agent import SpecialistAgent
from agents.frontier_agent import FrontierAgent
from agents.random_forest_agent import RandomForestAgent
from agents.my_specialist_agent import MySpecialistAgent

# NOTE(review): module-level instance created as soon as this code runs. EnsembleAgent.__init__
# builds its own self.specialist, and `price` shadows this name with a local, so nothing in
# this cell reads the global - confirm it is still needed.
specialist = SpecialistAgent()

class EnsembleAgent(Agent):
    """
    Agent that asks several pricing agents for an estimate and feeds their answers
    into the trained model stored in 'ensemble_model.pkl' to produce a final price.
    """

    name = "Ensemble Agent"
    color = Agent.YELLOW

    def __init__(self, collection):
        """
        Create the underlying agents and load the trained ensemble model.

        :param collection: passed through to FrontierAgent
        """
        self.log("Initializing Ensemble Agent")
        self.specialist = SpecialistAgent()
        # Second specialist, backed by my own fine-tuned model
        self.my_specialist = MySpecialistAgent()
        self.frontier = FrontierAgent(collection)
        # Author's note: the model behind this agent is a regularized, pruned CatBoost model
        self.random_forest = RandomForestAgent()
        # Author's note: the persisted ensemble model is a gradient-boosted tree model
        self.model = joblib.load('ensemble_model.pkl')
        self.log("Ensemble Agent is ready")

    def price(self, description: str) -> float:
        """
        Run the ensemble: ask each agent to price the product, assemble their
        estimates (plus min, max and mean) into a one-row feature frame, and
        return the loaded model's prediction, floored at 0.

        :param description: the description of a product
        :return: an estimate of its price
        """
        self.log("Running Ensemble Agent - collaborating with specialist, frontier and random forest agents")
        specialist_est = self.specialist.price(description)
        my_specialist_est = self.my_specialist.price(description)
        frontier_est = self.frontier.price(description)
        forest_est = self.random_forest.price(description)

        # Min/Max deliberately cover only these three estimates; Mean covers all four
        core_estimates = [specialist_est, frontier_est, forest_est]
        all_estimates = [specialist_est, my_specialist_est, frontier_est, forest_est]

        # Column names and order mirror the frame the model was trained on
        features = pd.DataFrame({
            'Specialist': [specialist_est],
            'MySpecialist': [my_specialist_est],
            'Frontier': [frontier_est],
            'RandomForest': [forest_est],
            'Min': [min(core_estimates)],
            'Max': [max(core_estimates)],
            'Mean': [np.mean(all_estimates)],
        })

        prediction = self.model.predict(features)[0]
        final_price = max(0, prediction)
        self.log(f"Ensemble Agent complete - returning ${final_price:.2f}")
        return final_price