<a href="https://colab.research.google.com/github/goelnikhils-lgtm/languagemodels/blob/main/RecommendationEngine_TwoTower_Ranker_Using_LLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#feature computing
#credit - >https://github.com/decodingml/personalized-recommender-course/blob/main/notebooks/1_fp_computing_features.ipynb
#https://medium.com/data-science-collective/4-deploy-scalable-tiktok-like-recommenders-bdf117c55648
#https://medium.com/data-science-collective/using-llms-to-build-tiktok-like-recommenders-bd001c1329d2
#an important point about two-tower networks: each of the two tower neural nets can be fed any data set, which makes the model lean toward either Collaborative Filtering or Content Filtering
#if we add interaction features to the item or user tower, then we orient TTF toward the content-based filtering aspect
#the two trained towers are used at online inference time
#the retrieval towers are trained so that a user and their candidate items end up close together in embedding space. THAT IS THE OBJECTIVE OF TWO TOWER
#once the embeddings are trained through the towers, candidate items are found using these embeddings, and these candidate items are stored in a Vector DB
#when a new user arrives in real time / online, the user embedding is generated, then a similarity search against the candidate item embeddings produces a list of items
#this list of items is given to the online ranker model to rank the items for the user. the ranker model can use extra features as well

In [None]:
import time
# Wall-clock timestamp taken when this cell runs.
# NOTE(review): no cell visible here reads it — presumably meant for reporting total runtime.
notebook_start_time = time.time()

In [None]:
#setup enviornment
import sys
from pathlib import Path

def is_google_colab() ->bool:
  """Tell whether the notebook is running inside Google Colab.

  Returns:
    True when the string form of the active IPython shell contains
    ``google.colab``, False otherwise.
  """
  shell_repr = str(get_ipython())
  return "google.colab" in shell_repr

def clone_repository() ->None:
  """Clone the course repository and make it the working directory.

  Safe to call more than once in the same kernel:
    * if the current directory already is the checkout, nothing happens
      (cloning again would nest a second copy and ``cd`` one level deeper);
    * if the checkout already exists next to the notebook, the clone is
      skipped and only the directory change is performed.
  """
  repo_name = "hands-on-recommender-system"

  if Path.cwd().name == repo_name:
    # Already inside the checkout (the cell was re-run) — do not nest.
    return

  shell = get_ipython()
  if not Path(repo_name).exists():
    # Same command the `!git clone ...` shell escape would run.
    shell.system("git clone https://github.com/decodingml/hands-on-recommender-system.git")
  # Same effect as the `%cd` line magic: changes the kernel's working directory.
  shell.run_line_magic("cd", repo_name)

def install_dependencies() ->None:
  """Install the project's dependencies into the running environment.

  Runs two shell commands in order: upgrade ``uv`` through pip, then let
  ``uv`` install everything listed in ``pyproject.toml`` (all extras) into
  the system interpreter. Expects ``pyproject.toml`` in the current directory.
  """
  shell = get_ipython()
  commands = (
    "pip install --upgrade uv",
    "uv pip install --all-extras --system --requirement pyproject.toml",
  )
  for command in commands:
    # Equivalent to writing `!<command>` in the cell.
    shell.system(command)

# Work out the project root. Locally the root is taken to be the parent of the
# current directory; in Colab the repository is cloned first and the (new)
# working directory itself is the root.
if not is_google_colab():
  root_dir = str(Path().absolute().parent)
  print("Local Enviornment")
else:
  clone_repository()
  install_dependencies()

  # Evaluated after the clone step, so this is the checkout directory.
  root_dir = str(Path().absolute())
  print("Colab Enviornment")

# Make the project root importable (added at most once) so the notebook can
# import the project's `recsys` Python module.
if root_dir not in sys.path:
  print(f"adding the following directory to the PYTHONPATH:{root_dir}")
  sys.path.append(root_dir)


In [None]:
# Pin TensorFlow to a fixed version before the remaining packages are installed.
!pip install tensorflow==2.19.0

In [None]:
# Third-party packages used by later cells (logging, feature store, LLM chain, ranker, retrieval).
# NOTE(review): none of these are version-pinned, so a fresh run may resolve to
# different (possibly incompatible) releases; the TensorFlow pin lives in the previous cell.
!pip install loguru
!pip install hopsworks
!pip install langchain
!pip install langchain-openai
!pip install catboost
!pip install tensorflow_recommenders

In [None]:
# NOTE(review): this installs whatever PyPI publishes under the name `recsys`.
# The setup cell appends the cloned project's root to the END of sys.path so that its own
# `recsys` module can be imported — an installed package of the same name would be found
# first. Confirm this install is really wanted.
!pip install --upgrade recsys

In [None]:
  # Show the metadata (version, install location) of the installed `recsys` distribution.
  pip show recsys

In [None]:
#Feature pipeline computing features
#imports
#%load_ext autoreload
#%autoreload 2

# Workaround for imp module error in autoreload
# Loads the autoreload extension through the IPython API (equivalent of the commented-out
# %load_ext / %autoreload 2 magics above) and degrades to a printed notice if the
# extension cannot be imported.
try:
    from IPython.extensions import autoreload
    get_ipython().run_line_magic('load_ext', 'IPython.extensions.autoreload')
    get_ipython().run_line_magic('autoreload', '2')
except ImportError:
    print("Could not load autoreload extension.")

import warnings
from pprint import pprint

import polars as pl
import torch
from loguru import logger
from sentence_transformers import SentenceTransformer

warnings.filterwarnings("ignore")

from recsys import hopsworks_integration
from recsys.config import settings
from recsys.features.articles import(
    compute_features_articles,
    generate_embeddings_for_dataframe
)


from recsys.features.customers import DatasetSampler , compute_features_customers
from recsys.features.interaction import generate_interaction_data
from recsys.features.ranking import compute_ranking_dataset
from recsys.features.transactions import compute_features_transactions
from recsys.hopsworks_integration import feature_store
from recsys.raw_data_sources import h_and_m as h_and_m_raw_data


In [None]:
# Print the resolved project settings as a plain dict.
# NOTE(review): if `settings` carries API keys or tokens they will be written into the
# saved cell output — confirm before sharing the notebook.
pprint(dict(settings))
# Cell output: the dataset sizes DatasetSampler reports as supported.
DatasetSampler.get_supported_sizes()

In [None]:
#connect to Hopsworks feature store
# `project` and `fs` are reused by the later upload, feature-view and model-registry cells.
project , fs = hopsworks_integration.get_feature_store()

In [None]:
#H&M dataset
# Load the raw articles table through the project's H&M raw-data helper.
articles_df = h_and_m_raw_data.extract_articles_df()
# Cell output: shape of the raw table.
articles_df.shape

In [None]:
# Preview the first three raw article rows.
articles_df.head(3)

In [None]:
# Per-column count of missing values in the raw articles table.
articles_df.null_count()
#observation from the rendered output: only one null record

In [None]:
# Replace the raw articles table with its feature-engineered version.
# NOTE(review): `articles_df` is rebound in place, so re-running this cell feeds the
# already-processed frame back into compute_features_articles — re-run the load cell first.
articles_df = compute_features_articles(articles_df)
# Cell output: shape after feature engineering.
articles_df.shape

In [None]:
# Preview the first three rows of the feature-engineered articles table.
articles_df.head(3)

In [None]:
#create embeddings from the articles description
# Log the first three `article_description` values (numbered from 1) to show the text
# that will be embedded in the next step.
for i , desc in enumerate(articles_df['article_description'].head(n=3)):
  logger.info(f"Item{i+1}:\n{desc}")

In [None]:
# Pick the compute device for the embedding model: CUDA if available,
# otherwise Apple's MPS backend if available, otherwise CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

logger.info(f"loading '{settings.FEATURES_EMBEDDING_MODEL_ID}' embedding model to {device=}")
embedding_model = SentenceTransformer(settings.FEATURES_EMBEDDING_MODEL_ID, device=device)

# Embed the `article_description` column in batches of 128 and rebind `articles_df`
# to the frame returned by the helper.
articles_df = generate_embeddings_for_dataframe(
    articles_df,
    "article_description",
    embedding_model,
    batch_size=128,
)

In [None]:
# Show the description text next to its generated embedding for the first three articles.
articles_df[["article_description","embeddings"]].head(3)

In [None]:
#Customer Data
# Load the raw customers table through the project's H&M raw-data helper.
customers_df = h_and_m_raw_data.extract_customers_df()
# Cell output: shape of the raw table.
customers_df.shape

In [None]:
# A notebook cell only renders its LAST expression, so the preview is shown explicitly
# with display(); otherwise the head() result would be silently discarded.
display(customers_df.head(3))
# Cell output: per-column count of missing values in the raw customers table.
customers_df.null_count()

In [None]:
# Replace the raw customers table with its feature-engineered version, asking the helper
# to drop customers whose age is missing (drop_null_age=True).
customers_df = compute_features_customers(customers_df,drop_null_age=True)
# Cell output: shape after feature engineering.
customers_df.shape

In [None]:
#Transactions Data
# Load the raw transactions table through the project's H&M raw-data helper.
transactions_df = h_and_m_raw_data.extract_transactions_df()
# Cell output: shape of the raw table.
transactions_df.shape

In [None]:
# Preview the first three raw transaction rows.
transactions_df.head(3)

In [None]:
#transactions feature engineering
# `transactions_df` is rebound to the engineered frame returned by the helper.
transactions_df = compute_features_transactions(transactions_df)
# Cell output: shape after feature engineering.
transactions_df.shape

In [None]:
#sampling a smaller data set, as the transactions table has ~30M rows
# The sample size comes from settings.CUSTOMER_DATA_SIZE.
sampler = DatasetSampler(size=settings.CUSTOMER_DATA_SIZE)
# NOTE(review): the keyword is spelled `transations_df` (missing "c") — confirm it matches
# the parameter name in DatasetSampler.sample before "fixing" it here.
dataset_subset = sampler.sample(customers_df = customers_df ,transations_df = transactions_df)
# From here on both frames refer to the sampled subset, not the full tables.
customers_df = dataset_subset["customers_df"]
transactions_df = dataset_subset["transactions_df"]

In [None]:
#interaction data: we need the interactions of customers with articles, derived from the (sampled) transactions
interaction_df = generate_interaction_data(transactions_df)
# Cell output: shape of the generated interaction table.
interaction_df.shape

In [None]:
# Preview the first three interaction rows.
interaction_df.head(3)

In [None]:
#look at interaction score distribution
# Count how many rows fall under each interaction_score value.
# `group_by` is the current polars spelling; the older `groupby` alias was deprecated
# and later removed, so it fails on recent polars releases.
interaction_df.group_by("interaction_score").agg(pl.count("interaction_score").alias("total_interactions"))
#0 - no interaction
#1 - A customer clicked on item
#2 - A customer bought an item

In [None]:
#create hopsworks feature groups
logger.info("Uploading 'customers' Feature Group to Hopsworks.")
# The embedding dimension is read from `embedding_model` (the SentenceTransformer loaded
# earlier). The previous code referenced `model`, which is not defined until the
# two-tower training cell much later, so this cell failed on a fresh kernel.
# NOTE(review): confirm create_customers_feature_group actually accepts
# `customers_age_embedding_dim`; the keyword is kept exactly as it was.
customers_fg = feature_store.create_customers_feature_group(
    fs,
    df=customers_df,
    customers_age_embedding_dim=embedding_model.get_sentence_embedding_dimension(),
    online_enabled=True,
)

In [None]:
#for Articles
logger.info("Uploading 'articles' Feature Group to Hopsworks.")
# The embedding dimension is taken from the same SentenceTransformer that produced the
# `embeddings` column, and the group is created with online serving enabled.
articles_fg = feature_store.create_articles_feature_group(
    fs,
    df=articles_df,
    articles_embedding_dim=embedding_model.get_sentence_embedding_dimension(),
    online_enabled=True,
)

In [None]:
#transactions
logger.info("Uploading 'transactions' Feature Group to Hopsworks.")
# Uploads the (sampled, feature-engineered) transactions frame with online serving enabled.
transactions_fg = feature_store.create_transactions_feature_group(
    fs,
    df=transactions_df,
    online_enabled=True,
)

In [None]:
#interactions
logger.info("Uploading 'interactions' Feature Group to Hopsworks.")
# Uploads the interaction table generated from the transactions, with online serving enabled.
interactions_fg = feature_store.create_interactions_feature_group(
    fs,
    df=interaction_df,
    online_enabled=True,
)

In [None]:
#compute Ranking dataset
# Built from the three feature-group handles created above (not from the local frames).
ranking_df = compute_ranking_dataset(transactions_fg,customers_fg,articles_fg)
# Cell output: shape of the ranking dataset.
ranking_df.shape

In [None]:
# Preview the first three rows of the ranking dataset.
ranking_df.head(3)

In [None]:
# Class balance of the ranking target: select the `label` column as a Series and count
# each distinct value. (`get_column` returns the Series; a polars DataFrame has no
# `get_column_names` method, so the earlier call raised AttributeError.)
ranking_df.get_column("label").value_counts()

In [None]:
#loading this dataset to hopsworks
logger.info("Uploading 'ranking' Dataset to Hopsworks.")
# The three source feature groups are passed as `parents` alongside the ranking frame.
ranking_ds = feature_store.create_ranking_dataset(
    fs,
    df=ranking_df,
    parents =[articles_fg,customers_fg,transactions_fg],
    online_enabled=True,
)

In [None]:
#MOVING ON TO THE RETRIEVAL (TRAINING) PIPELINE
# The package is `hopsworks_integration` (as imported and used everywhere else in this
# notebook); the previous spelling `hopworks_integration` raised ImportError.
from recsys import hopsworks_integration , training
from recsys.config import settings
from pprint import pprint

# Print the resolved settings again at the start of the training stage.
pprint(dict(settings))

In [None]:
#connect to hopsworks feature store
# Rebinds `project` and `fs` for the training stage.
project , fs = hopsworks_integration.get_feature_store()
#

In [None]:
#create hopsworks feature views
# Feature view backing the two-tower (retrieval) training data set built in the next cell.
feature_view = hopsworks_integration.feature_store.create_retrieval_feature_view(fs)

In [None]:
#creating a training data set for training the retrieval two-tower neural network
#there are two towers:
#  - customer (query) tower, independent of the articles tower
#  - item (articles) tower

#for training the customer tower we will use the following features
#ID , age , month_sin and month_cos (indicating purchase month)

#for the article tower we will use
#article_id, garment_group_name , index_group_name

# Batch size comes from settings.TWO_TOWER_MODEL_BATCH_SIZE.
dataset = training.two_tower.TwoTowerDataset(
    feature_view = feature_view, batch_size = settings.TWO_TOWER_MODEL_BATCH_SIZE
)
# Train / validation datasets consumed by the trainer in the next cell.
train_ds , val_ds = dataset.get_train_val_split()

#let's look at the dataset
dataset.properties["train_df"].head()


In [None]:
#creating the two towers: 1 - customer (query) tower, 2 - articles (item) tower

# Each factory receives the dataset object and builds one tower.
query_model_factory = training.two_tower.QueryTowerFactory(dataset=dataset)
query_model = query_model_factory.build()

item_model_factory = training.two_tower.ItemTowerFactory(dataset=dataset)
item_model = item_model_factory.build()

# Combine both towers into the joint retrieval model.
model_factory = training.two_tower.TwoTowerModelFactory(dataset = dataset)
model = model_factory.build(query_model,item_model)

#training the model
trainer = training.two_tower.TwoTowerTrainer(dataset = dataset , model=model)
# `history` is read by the plotting cell via history.history["loss"] / ["val_loss"].
history = trainer.train(train_ds,val_ds)


In [None]:
#plot the training and validation loss
import matplotlib.pyplot as plt

# Two stacked panels: training loss on top, validation loss below.
fig, (ax1,ax2) = plt.subplots(2,1,figsize=(10,6))

#Training Loss Subplot
ax1.plot(history.history["loss"] , label = "Training Loss", color = "blue")
ax1.set_title("Training Loss over time")
ax1.set_xlabel("Epoch")
ax1.set_ylabel("Loss")
ax1.legend()
ax1.grid(True)  # was `as1.grid(True)`, which raised NameError

#Validation Loss Subplot
ax2.plot(history.history["val_loss"] , label = "Validation Loss", color = "red")
ax2.set_title("Validation Loss over time")
ax2.set_xlabel("Epoch")
ax2.set_ylabel("Loss")
ax2.legend()
ax2.grid(True)  # was `as2.grid(True)`, which raised NameError

# Keep titles and axis labels of the two panels from overlapping.
plt.tight_layout()
plt.show()

In [None]:
#push the models to hopsworks registry
# The registry handle must be named `mr`: both register() calls below pass `mr`.
# (It was previously bound to `m`, leaving `mr` undefined on a fresh kernel.)
mr = project.get_model_registry()

# Query (customer) tower, registered together with the query-side frame.
query_model = hopsworks_integration.two_tower_serving.HopsworksQueryModel(model = model.query_model)
query_model.register(
    mr= mr,
    feature_view = feature_view,
    query_df = dataset.properties["query_df"]
)


# Candidate (item) tower. Two fixes here:
#  * it is registered with the ITEM frame; it previously received
#    dataset.properties["query_df"], i.e. the customer-side data;
#  * the wrapper class is spelled like its siblings (HopsworksQueryModel /
#    HopsworksRankingModel).
# NOTE(review): confirm the class name `HopsworksCandidateModel` and the "item_df" key
# against recsys.hopsworks_integration.two_tower_serving / TwoTowerDataset.properties.
item_model = hopsworks_integration.two_tower_serving.HopsworksCandidateModel(model = model.item_model)
item_model.register(
    mr= mr,
    feature_view = feature_view,
    item_df = dataset.properties["item_df"]
)


In [None]:
#ranking model
#get the training data for ranking model
feature_view_ranking = hopsworks_integration.feature_store.create_ranking_feature_view(fs)

# Validation fraction comes from settings.RANKING_DATASET_VALIDATION_SPLIT_SIZE.
X_train, X_val, y_train, y_val = feature_view_ranking.train_test_split(
    test_size = settings.RANKING_DATASET_VALIDATION_SPLIT_SIZE,
    description = "Ranking Training Dataset"
)
# A cell only renders its LAST expression, so the feature preview is shown explicitly;
# otherwise X_train.head(3) would be computed and thrown away.
display(X_train.head(3))
y_train.head(3)

In [None]:
#training the ranking model - Catboost is the ranker model
# NOTE(review): `model` and `trainer` are rebound here, replacing the two-tower model and
# trainer from the retrieval section — re-running earlier cells that use `model`
# (e.g. the registry cell) after this one will see the ranking model instead.
model = training.ranking.RankingModelFactory.build()
trainer = training.ranking.RankingTrainer(model = model , train_dataset = (X_train,y_train) , eval_dataset = (X_val,y_val))
trainer.fit()

In [None]:
#evaluate the ranking model
# `metrics` is passed on to register() below.
metrics = trainer.evaluate(log=True)
# NOTE: not the last expression of the cell, so its return value is not rendered.
trainer.get_feature_importance()
mr = project.get_model_registry()

# Wrap the trained ranker and register it with its feature view, the training features
# and the evaluation metrics.
ranking_model = hopsworks_integration.ranking_serving.HopsworksRankingModel(model = model)
ranking_model.register(mr, feature_view_ranking, X_train,metrics)

In [35]:
#using LLM for ranking
import logging
import hopsworks
from langchain import PromptTemplate , LLMChain
from langchain_core.output_parsers import BaseOutputParser
from langchain_openai import ChatOpenAI

class ScoreOutputParser(BaseOutputParser[float]):
    """Extract the purchase probability from the chain's response."""

    def parse(self, output) -> float:
        """Return the probability that follows the ``Probability:`` label.

        Args:
            output: Mapping returned by the chain; the generated text is read
                from its ``'text'`` key.

        Returns:
            The parsed probability, guaranteed to lie in ``[0, 1]``.

        Raises:
            ValueError: If the label is missing, the value after it is not a
                number, or the number is outside ``[0, 1]``.
        """
        text = output['text']
        if "Probability:" not in text:
            raise ValueError("Text does not contain 'Probability:' label")
        probability_str = text.split("Probability:")[1].strip()
        probability = float(probability_str)
        #ensure the probability is in the valid range[0,1]
        if not (0.0 <= probability <= 1.0):
            raise ValueError(f"Invalid probability:{probability}")
        return probability


class Predict(object):
    """LLM-based ranker: scores each candidate article with a chat model.

    This class was previously nested (by indentation) inside
    ``ScoreOutputParser`` together with ``PROMPT_TEMPLATE``, so the name
    ``Predict`` did not exist at module level and ``self.PROMPT_TEMPLATE``
    could not be resolved on a ``Predict`` instance. Both now live here.
    """

    # Upper bound on the number of candidates scored per request
    # (each candidate costs one LLM call).
    MAX_CANDIDATES = 20

    # Prompt sent to the LLM; the placeholders are exactly the names listed in
    # `self.input_features`. The text is kept verbatim.
    PROMPT_TEMPLATE: str = """
      You are a helpful assistant specialized in predicting customer behavior . Your task is to analyze the features of product and predict the probability of it being purchased by customer

      ###Instructions:
      1.Use the provided features of the product to make your prediction.
      2.Consider the following numeric and categorical features:
      -Numeric features: These are quantitaive attributes , such as numerical identifiers or measurements.
      -Categorical features: These describe qualitative aspects , like product category , color, and material
      3.Your response should only include the probability of purchase for the positive class(e.g likelihood of being purchased) , as a value between 0 and 1.

       ### Product and User Features:
            Numeric features:
            - Age: {age}
            - Month Sin: {month_sin}
            - Month Cos: {month_cos}

            Categorical features:
            - Product Type: {product_type_name}
            - Product Group: {product_group_name}
            - Graphical Appearance: {graphical_appearance_name}
            - Colour Group: {colour_group_name}
            - Perceived Colour Value: {perceived_colour_value_name}
            - Perceived Colour Master Value: {perceived_colour_master_name}
            - Department Name: {department_name}
            - Index Name: {index_name}
            - Department: {index_group_name}
            - Sub-Department: {section_name}
            - Group: {garment_group_name}

            ### Your Task:
            Based on the features provided, predict the probability that the customer will purchase this product to 4-decimals precision. Provide the output in the following format:
            Probability:
        """

    def __init__(self):
        """Fetch the OpenAI key, build the chain and create the output parser."""
        # Order matters: each candidate's feature values are matched to these
        # names positionally in `_preprocess_features`.
        self.input_features = ["age","month_sin","month_cos","product_type_name","product_group_name","graphical_appearance_name","colour_group_name",
                              "perceived_colour_value_name","perceived_colour_master_name","department_name","index_name","index_group_name",
                              "section_name","garment_group_name"]
        self._retrieve_secrets()
        self.LLM = self._build_lang_chain()
        self.parser = ScoreOutputParser()

    def _retrieve_secrets(self):
        """Log in to Hopsworks and read the ``OPENAI_API_KEY`` secret."""
        hopsworks.login()
        secrets_api = hopsworks.get_secrets_api()
        self.openai_api_key = secrets_api.get_secret("OPENAI_API_KEY").value

    def predict(self, inputs):
        """Score up to ``MAX_CANDIDATES`` candidate articles.

        Args:
            inputs: Sequence whose first element is a mapping with
                ``"ranking_features"`` (one list of feature values per
                candidate, ordered like ``self.input_features``) and
                ``"article_ids"`` (ids aligned with those lists).

        Returns:
            ``{"article_ids": [...], "scores": [...]}`` with one score per
            candidate. A candidate whose LLM call or parsing fails gets the
            minimum score ``0.0``; the failure is logged rather than hidden.
        """
        payload = inputs[0]
        # Read instead of pop(): the caller's payload is left intact, so the
        # same request object can be scored again.
        features = payload["ranking_features"][:self.MAX_CANDIDATES]
        article_ids = payload["article_ids"][:self.MAX_CANDIDATES]

        #preprocess features for LLM model
        candidates = self._preprocess_features(features)

        scores = []
        for candidate in candidates:
            try:
                # `self.LLM` is the chain built in __init__. The previous code
                # called a non-existent `self.llm_invoke` on an undefined
                # `candidate`, so every candidate silently scored 0.
                response = self.LLM.invoke(candidate)
                score = self.parser.parse(response)
            except Exception as exception:
                # Best effort: keep ranking the remaining candidates, but
                # leave a trace of what went wrong.
                logging.error("LLM scoring failed for a candidate: %s", exception)
                score = 0.0
            scores.append(score)

        return {
            "article_ids": article_ids,
            "scores": scores,
        }

    def _preprocess_features(self, features):
        """Convert ranking features into prompt parameters.

        Each feature list is zipped positionally with ``self.input_features``
        to form one ``{feature_name: value}`` dict per candidate.
        """
        return [dict(zip(self.input_features, feature_set)) for feature_set in features]

    def _build_lang_chain(self):
        """Build the prompt -> chat-model chain used to score one candidate."""
        model = ChatOpenAI(
            model_name='gpt-4o-mini-2024-07-18',
            temperature=0.7,
            openai_api_key=self.openai_api_key,
        )
        prompt = PromptTemplate(
            input_variables=self.input_features,
            template=self.PROMPT_TEMPLATE,
        )
        langchain = LLMChain(
            llm=model,
            prompt=prompt,
            verbose=True,
        )
        return langchain

In [34]:
# Example usage of the Predict class for LLM ranking
# NOTE(review): the output recorded under this cell is
# "name 'Predict' is not defined" — `Predict` has to exist at module level for this
# cell to do anything beyond printing that error.

# Instantiate the Predict class
# This will attempt to retrieve the OpenAI API key and build the LangChain model
try:
    llm_ranker = Predict()
    print("Predict class instantiated successfully.")
except Exception as e:
    # Any failure is reported as text and the rest of the cell is skipped via the
    # `if llm_ranker:` guard, so the cell itself never raises.
    print(f"Error instantiating Predict class: {e}")
    llm_ranker = None # Set to None if instantiation fails

if llm_ranker:
    # Prepare some sample input data
    # Replace this with your actual ranking features and article IDs
    # Each inner list carries 14 values: three numeric ones followed by eleven
    # categorical strings, in the order Predict.input_features lists them.
    sample_inputs = [
        {
            "ranking_features": [
                [49.0, 0.707, 0.707, "Tops", "Garment Upper Body", "Solid", "Black", "Dark", "Black", "Ladieswear", "Ladieswear", "Ladieswear", "Blouses", "Woven Tops"],
                [25.0, -0.707, 0.707, "Shoes", "Shoes", "Metallic", "Silver", "Light", "Silver", "Divided", "Divided", "Divided", "Outdoor", "Outdoor"],
                # Add more sample feature sets as needed
            ],
            "article_ids": [
                "663713001",
                "541518023",
                # Add corresponding article IDs
            ]
        }
    ]

    # Get predictions from the LLM ranker
    try:
        predictions = llm_ranker.predict(sample_inputs)
        print("\nPredictions:")
        print(predictions)
    except Exception as e:
        print(f"\nError during prediction: {e}")

Error instantiating Predict class: name 'Predict' is not defined
