# Content-based Filtering

The content-based filtering recommendation algorithm suggests items similar to ones the user has liked before by analyzing the features of the items and the user’s preferences. It compares the content (e.g., product descriptions, keywords, tags) of items with a user’s historical interactions (such as movies they have watched or articles they have read) and suggests items with similar attributes.

Imagine a user watching a sci-fi movie, and the system extracts features like “genre: sci-fi,” “director: Christopher Nolan,” and “themes: space exploration.” The system would then recommend other movies sharing these features, e.g., another sci-fi film with similar themes and directors.

In this notebook, we would take a very simple and straight-forward approach to implement Content-based Filtering. We make recommendations by finding the most similar items to the user's last interacted item, where similarity is defined based on the textual description of the items. The score is the similarity between the target item and the neighbors.

Specifically, we will:
- Select a few description fields from the item metadata
- Transform the features into numerical format, e.g. TFIDF for text data, Multi-Hot Encoders for categorical features
- Concat and normalize the transformation output
- Use cosine similarity to look up neighbor items and use it as the prediction score

# Set up

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys

import dill
import mlflow
import numpy as np  # required for the scikit-learn pipeline to work
import pandas as pd
import torch
from dotenv import load_dotenv
from loguru import logger
from pydantic import BaseModel
from sklearn.preprocessing import MinMaxScaler

load_dotenv()

sys.path.insert(0, "..")


from src.eval import (
    create_label_df,
    create_rec_df,
    log_classification_metrics,
    log_ranking_metrics,
    merge_recs_with_target,
)
from src.id_mapper import IDMapper
from src.model import ContentBased
from src.train_utils import map_indice

# Controller

In [3]:
class Args(BaseModel):
    testing: bool = False
    log_to_mlflow: bool = True
    experiment_name: str = "FSDS RecSys - L5 - Reco Algo"
    run_name: str = "005-content-based"
    notebook_persist_dp: str = None
    random_seed: int = 41

    user_col: str = "user_id"
    item_col: str = "parent_asin"
    rating_col: str = "rating"
    timestamp_col: str = "timestamp"

    top_K: int = 100
    top_k: int = 10

    batch_size: int = 128

    def init(self):
        self.notebook_persist_dp = os.path.abspath(f"data/{self.run_name}")

        if not os.environ.get("MLFLOW_TRACKING_URI"):
            logger.warning(
                f"Environment variable MLFLOW_TRACKING_URI is not set. Setting self.log_to_mlflow to false."
            )
            self.log_to_mlflow = False

        if self.log_to_mlflow:
            logger.info(
                f"Setting up MLflow experiment {self.experiment_name} - run {self.run_name}..."
            )

            mlflow.set_experiment(self.experiment_name)
            mlflow.start_run(run_name=self.run_name)

        return self


args = Args().init()

print(args.model_dump_json(indent=2))

[32m2024-09-22 21:05:47.117[0m | [1mINFO    [0m | [36m__main__[0m:[36minit[0m:[36m29[0m - [1mSetting up MLflow experiment FSDS RecSys - L5 - Reco Algo - run 005-content-based...[0m


{
  "testing": false,
  "log_to_mlflow": true,
  "experiment_name": "FSDS RecSys - L5 - Reco Algo",
  "run_name": "005-content-based",
  "notebook_persist_dp": "/Users/dvq/frostmourne/reco-algo/notebooks/data/005-content-based",
  "random_seed": 41,
  "user_col": "user_id",
  "item_col": "parent_asin",
  "rating_col": "rating",
  "timestamp_col": "timestamp",
  "top_K": 100,
  "top_k": 10,
  "batch_size": 128
}


# Implement

In [4]:
def init_model(train_item_features, device):
    model = ContentBased(item_features=train_item_features, device=device)
    return model

In [5]:
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available() else "cpu"
)
# device = 'cpu'
logger.info(f"Using {device} device")

[32m2024-09-22 21:05:47.257[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m7[0m - [1mUsing mps device[0m


In [6]:
with open("../data/item_metadata_pipeline.dill", "rb") as f:
    item_metadata_pipeline = dill.load(f)

# Test implementation

In [7]:
# Mock data
user_indices = [0, 0, 1, 2, 2]
item_indices = [0, 1, 2, 3, 4]
ratings = [1, 4, 5, 3, 2]
timestamps = [0, 1, 2, 3, 4]
main_category = [
    "All Electronics",
    "Video Games",
    "All Electronics",
    "Video Games",
    "Unknown",
]
title = ["All Electronics", "Video Games", "All Electronics", "Video Games", "Unknown"]
description = [[], [], ["Video games blah blah"], [], ["blah blah"]]
categories = [[], ["Headsets"], ["Video Games"], [], ["blah blah"]]
price = ["from 14.99", "14.99", "price: 9.99", "20 dollars", "None"]

train_df = pd.DataFrame(
    {
        "user_indice": user_indices,
        "item_indice": item_indices,
        args.rating_col: ratings,
        args.timestamp_col: timestamps,
        "main_category": main_category,
        "title": title,
        "description": description,
        "categories": categories,
        "price": price,
    }
)
# Drop duplicated item features so that the ContentBased model will fit correctly in terms of index mapping
fit_df = train_df.drop_duplicates(subset=["item_indice"])
train_item_features = item_metadata_pipeline.transform(fit_df).astype(np.float32)

val_user_indices = [0, 1, 2]
val_item_indices = [2, 1, 2]
val_ratings = [2, 4, 5]
val_timestamps = [5, 6, 7]
val_main_category = ["All Electronics", "Video Games", "All Electronics"]
val_title = ["All Electronics", "Video Games", "All Electronics"]
val_description = [["Video games blah blah"], [], ["Video games blah blah"]]
val_categories = [["Video Games"], ["Headsets"], ["Video Games"]]
val_price = ["price: 9.99", "14.99", "price: 9.99"]

val_df = pd.DataFrame(
    {
        "user_indice": val_user_indices,
        "item_indice": val_item_indices,
        args.rating_col: val_ratings,
        args.timestamp_col: val_timestamps,
        "main_category": val_main_category,
        "title": val_title,
        "description": val_description,
        "categories": val_categories,
        "price": val_price,
    }
)
val_item_features = item_metadata_pipeline.transform(val_df).astype(np.float32)

In [8]:
n_users = len(set(user_indices))
n_items = len(set(item_indices))

model = init_model(train_item_features, device)

items1 = [1, 2]
items2 = [0, 3]
predictions = model.predict(items1, items2)
print(predictions)

print("\n\n")

users = [0, 1]
anchor_items = [2, 3]
recommendations = model.recommend(users, anchor_items, k=args.top_K)
print(recommendations)

tensor([0.1166, 0.0384], device='mps:0')





Generating Recommendations:   0%|          | 0/2 [00:00<?, ?it/s]

{'user_indice': [0, 0, 0, 0, 1, 1, 1, 1], 'recommendation': [0, 4, 1, 3, 1, 4, 0, 2], 'score': [0.6481854319572449, 0.13637012243270874, 0.03882099688053131, 0.03839520364999771, 0.9612818360328674, 0.3155585825443268, 0.16881369054317474, 0.03839520364999771]}


# Prep data

In [9]:
def get_last_item(df, item_sequence_col="item_sequence"):
    return df.assign(
        last_item_indice=lambda df: df[item_sequence_col].apply(lambda s: s[-1])
    )

In [10]:
train_df = pd.read_parquet("../data/train_features_neg_df.parquet")
val_df = pd.read_parquet("../data/val_features_neg_df.parquet")
idm = IDMapper().load("../data/idm.json")
assert (val_df[args.timestamp_col].min() - train_df[args.timestamp_col].max()) > 0
val_timestamp = train_df[args.timestamp_col].max() + 1
print(f"{val_timestamp=}")

val_timestamp=np.int64(1628641464793)


In [11]:
train_df = train_df.pipe(get_last_item)
val_df = val_df.pipe(get_last_item)
full_df = pd.concat([train_df, val_df], axis=0)
user_ids = train_df[args.user_col].values
item_ids = train_df[args.item_col].values
unique_user_ids = list(set(user_ids))
unique_item_ids = list(set(item_ids))

logger.info(f"{len(unique_user_ids)=:,.0f}, {len(unique_item_ids)=:,.0f}")

[32m2024-09-22 21:05:54.993[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m9[0m - [1mlen(unique_user_ids)=20,366, len(unique_item_ids)=4,696[0m


In [12]:
train_df = train_df.pipe(map_indice, idm, args.user_col, args.item_col)
val_df = val_df.pipe(map_indice, idm, args.user_col, args.item_col)
full_df = full_df.pipe(map_indice, idm, args.user_col, args.item_col)

In [13]:
train_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,main_category,title,description,categories,price,item_sequence,last_item_indice
0,AFFJSIHCRAXI3J3NRBLGKNNCR2RA,B00CXTX2YW,0.0,1515421489907,13258,528,Video Games,Xbox 360 Wireless Controller - Camouflage,[Ambush your opponents with the Xbox 360 Speci...,"[Video Games, Legacy Systems, Xbox Systems, Xb...",40.99,"[4148.0, 400.0, 791.0, 3069.0, 3062.0, 1152.0,...",2895.0
1,AFVONP56RLBX43PFSSG3YCXNLKUQ,B00267S2A0,0.0,1504677642111,7725,4233,Video Games,Call of Duty: Modern Warfare 2 - Playstation 3,"[Product Description, On November 10, 2009, ac...","[Video Games, Legacy Systems, PlayStation Syst...",23.88,"[-1.0, -1.0, -1.0, -1.0, 742.0, 4535.0, 1985.0...",3091.0
2,AFCB6BTUBDB4OFJWXPOITK44EZJA,B01A5BEBX0,0.0,1577800282838,9434,1424,Computers,PECHAM Vertical Stand for PS4 Slim / PS4 with ...,[],"[Video Games, PlayStation 4, Accessories, Cool...",,"[731.0, 3617.0, 2522.0, 393.0, 592.0, 362.0, 2...",762.0
3,AHNKYFBDA2RBJBABIDP4LBP2W4EQ,B004MPR0ZC,0.0,1397353145000,12208,1919,Video Games,CTA Digital Nintendo 3Ds Cartridge Storage Sol...,"[Product Description, The Nintendo 3DS is the ...","[Video Games, Legacy Systems, Nintendo Systems...",,"[1286.0, 2124.0, 1518.0, 941.0, 1668.0, 1659.0...",2159.0
4,AFAUFOADANHRMH3GR2FA73A6NHIA,B08VFQ3XJX,5.0,1408655680000,17057,4555,Video Games,Final Fantasy X,"[Product Description, Final Fantasy X finally ...","[Video Games, Legacy Systems, PlayStation Syst...",20.0,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 108.0, 24...",986.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
340085,AGEBVJUFAAPDBFKJ2DIJGCDDSAEQ,B0036F0V4G,0.0,1452044872000,9445,414,Video Games,Metal Gear Rising Revengeance - Xbox 360,"[Product Description, Product Overview, METAL ...","[Video Games, Legacy Systems, Xbox Systems, Xb...",26.99,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 2702.0, 6...",525.0
340086,AF6ROEEZD4VJPH6D6JFRRC6UZ3LQ,B073232J24,0.0,1282760896000,5419,3449,Video Games,NHL 18 - PlayStation 4,"[EA SPORTS NHL 18 delivers the speed, creativi...","[Video Games, PlayStation 4, Games]",11.99,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",4453.0
340087,AG27GY5VFWMIOQLBQY5C6R6FD4JQ,B000A2R54M,0.0,1468348193000,18174,30,Video Games,Mario Kart DS,"[Amazon.com, The ultimate, Mario Kart, race is...","[Video Games, Kids and Family]",37.51,"[-1.0, -1.0, -1.0, 3739.0, 3171.0, 2109.0, 322...",144.0
340088,AEFJKBRX2TPPMMJ53DKHANKRYDXQ,B08N6NMGNB,5.0,1510350967758,10245,4603,Video Games,Thrustmaster T300 RS - Gran Turismo Edition Ra...,[Works with PS5 games (PS5 games compatibility...,"[Video Games, Legacy Systems, PlayStation Syst...",449.0,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 391...",1026.0


# Fit

In [14]:
fit_df = train_df.drop_duplicates(subset=[args.item_col])
train_item_features = item_metadata_pipeline.transform(fit_df).astype(np.float32)
model = init_model(train_item_features, device)

# Predict

In [15]:
user_id = val_df.sample(1)[args.user_col].values[0]
test_df = val_df.loc[lambda df: df[args.user_col].eq(user_id)]
test_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,main_category,title,description,categories,price,item_sequence,last_item_indice
5,AE3U66S5YBEMPF36PVYR6QAS5ETA,B005OGKLK4,0.0,1630814199104,16399,4366,Video Games,Bejeweled 3 (with Zuma & Feeding Frenzy 2) - P...,"[Product Description, The biggest, brightest B...","[Video Games, Legacy Systems, PlayStation Syst...",16.68,"[-1, -1, -1, 1907, 2351, 845, 725, 812, 4299, ...",4281
190,AE3U66S5YBEMPF36PVYR6QAS5ETA,B00DDILSBG,0.0,1630816891154,16399,840,Video Games,Final Fantasy XV Deluxe Edition - PlayStation 4,"[""Get ready to be at the centre of the ultimat...","[Video Games, PlayStation 4, Games]",129.95,"[1907, 2351, 845, 725, 812, 4299, 4281, 4459, ...",1367
199,AE3U66S5YBEMPF36PVYR6QAS5ETA,B01N6S068R,5.0,1630570514906,16399,4299,Video Games,Nintendo Switch Wired Internet LAN Adapter by ...,[Enjoy the speed and stability of a wired inte...,"[Video Games, Legacy Systems, Nintendo Systems...",34.99,"[-1, -1, -1, -1, -1, 1907, 2351, 845, 725, 812]",812
426,AE3U66S5YBEMPF36PVYR6QAS5ETA,B007MXX3J8,0.0,1630814322899,16399,331,Video Games,BlazBlue: Continuum Shift EXTEND Limited Editi...,"[Product Description, The next installment in ...","[Video Games, Legacy Systems, Xbox Systems, Xb...",32.95,"[-1, -1, 1907, 2351, 845, 725, 812, 4299, 4281...",4459
529,AE3U66S5YBEMPF36PVYR6QAS5ETA,B00002NDRY,0.0,1630816495703,16399,3059,Video Games,Age of Empires 2: Age of Kings - PC,"[Product description, Age of Empires II: Age o...","[Video Games, PC, Games]",64.88,"[-1, 1907, 2351, 845, 725, 812, 4299, 4281, 44...",650
588,AE3U66S5YBEMPF36PVYR6QAS5ETA,B07DK1H3H5,5.0,1630814199104,16399,4459,Video Games,Cyberpunk 2077 - PC [Game Download Code in Box],"[Cyberpunk 2077 is an open world, an action ad...","[Video Games, PC, Games]",,"[-1, -1, -1, 1907, 2351, 845, 725, 812, 4299, ...",4281
627,AE3U66S5YBEMPF36PVYR6QAS5ETA,B087LSSNG1,5.0,1630814322899,16399,650,Video Games,Xenoblade Chronicles: Definitive Edition - Nin...,[Discover the origins of Shulk as he and his c...,"[Video Games, Nintendo Switch, Games]",54.98,"[-1, -1, 1907, 2351, 845, 725, 812, 4299, 4281...",4459
866,AE3U66S5YBEMPF36PVYR6QAS5ETA,B00Z9TMBOU,0.0,1630570514906,16399,3259,Video Games,Battlefield 1 - Xbox One,"[Product Description, Experience the dawn of a...","[Video Games, Xbox One, Games]",9.99,"[-1, -1, -1, -1, -1, 1907, 2351, 845, 725, 812]",812
990,AE3U66S5YBEMPF36PVYR6QAS5ETA,B08L6L6KQL,0.0,1630817766917,16399,162,Video Games,HORI Compact Playstand for Nintendo Switch Off...,[Officially Licensed by Nintendo. Enjoy playin...,"[Video Games, Nintendo Switch, Accessories, Mo...",22.33,"[2351, 845, 725, 812, 4299, 4281, 4459, 650, 1...",1340
1527,AE3U66S5YBEMPF36PVYR6QAS5ETA,B06XP8FMM2,5.0,1630817766917,16399,2721,All Electronics,"Keten Repair Kit for NS Switch, 17in1 Professi...",[],"[Video Games, Nintendo Switch, Accessories, Re...",11.98,"[2351, 845, 725, 812, 4299, 4281, 4459, 650, 1...",1340


In [16]:
item_id = test_df.loc[lambda df: df[args.rating_col].gt(0)][args.item_col].values[0]
logger.info(
    f"Test predicting before training with {args.user_col} = {user_id} and {args.item_col} = {item_id}"
)
anchor_item_indice = test_df["last_item_indice"].values[0]
item_indice = idm.get_item_index(item_id)
model.predict([item_indice], [anchor_item_indice])

[32m2024-09-22 21:06:00.692[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mTest predicting before training with user_id = AE3U66S5YBEMPF36PVYR6QAS5ETA and parent_asin = B01N6S068R[0m


tensor([0.0050], device='mps:0')

# Recommend

In [17]:
val_anchor_item_indices = val_df["last_item_indice"].values
val_user_indices = val_df["user_indice"].values

In [18]:
recommendations = model.recommend(val_user_indices, val_anchor_item_indices, args.top_K)

Generating Recommendations:   0%|          | 0/1898 [00:00<?, ?it/s]

# Evaluate

## Ranking metrics

In [19]:
recommendations_df = pd.DataFrame(recommendations).pipe(
    create_rec_df, idm, args.user_col, args.item_col
)
recommendations_df

Unnamed: 0,user_indice,recommendation,score,rec_ranking,user_id,parent_asin
0,11638,2913,0.735254,1.0,AHO3T4INZKH5C7IIV5JERA3H2SKQ,B076FCM9B5
1,11638,3899,0.707038,3.0,AHO3T4INZKH5C7IIV5JERA3H2SKQ,B003JBHG9K
2,11638,4343,0.589987,5.0,AHO3T4INZKH5C7IIV5JERA3H2SKQ,B002BSC5HA
3,11638,720,0.420673,7.0,AHO3T4INZKH5C7IIV5JERA3H2SKQ,B01GVOFIPK
4,11638,281,0.334560,9.0,AHO3T4INZKH5C7IIV5JERA3H2SKQ,B002I0K780
...,...,...,...,...,...,...
189795,10038,4212,0.065141,192.0,AHAKU6TTWIHJPZIODW7MGC52M2DA,B06ZYHGC9R
189796,10038,4554,0.065099,194.0,AHAKU6TTWIHJPZIODW7MGC52M2DA,B001ELJE4W
189797,10038,3873,0.065023,196.0,AHAKU6TTWIHJPZIODW7MGC52M2DA,B0009HAYDC
189798,10038,4383,0.064556,198.0,AHAKU6TTWIHJPZIODW7MGC52M2DA,B0072A4GQK


In [20]:
label_df = create_label_df(
    val_df,
    user_col=args.user_col,
    item_col=args.item_col,
    rating_col=args.rating_col,
    timestamp_col=args.timestamp_col,
)
label_df

Unnamed: 0,user_id,parent_asin,rating,rating_rank
548,AFI3SEGKSS7X7CVOC3HS5U5RAIWQ,B077GG9D5D,5.0,1.0
697,AGDFPA6XVENIY7XJPIOXBUHRJQ6A,B09V25XG1G,5.0,1.0
109,AHMJVCKVHJIT2R5NWWV4HG4TDH6A,B07C2XYDW8,5.0,1.0
197,AGQAPJGNR3IDAJCPD2YPU7CRRPBA,B0C7BN9G35,5.0,1.0
797,AF6IUPCJM4FDNOQZNCDOGTLM2M4Q,B09T5VN7D1,4.0,1.0
...,...,...,...,...
850,AFB6FYPPCN33UMUU5536IHXNOHCQ,B001E8WQJA,0.0,18.0
1106,AG4RCXKPTC6QRORJLUSBY4SO2IAA,B00DBLBMBQ,0.0,18.0
1003,AESD4RLWUKM6JTD6SNNWYLHLLQQA,B00Z9TMBOU,0.0,18.0
52,AFB6FYPPCN33UMUU5536IHXNOHCQ,B0892HRCST,0.0,19.0


In [21]:
eval_df = merge_recs_with_target(
    recommendations_df,
    label_df,
    k=args.top_K,
    user_col=args.user_col,
    item_col=args.item_col,
    rating_col=args.rating_col,
)
eval_df

Unnamed: 0,user_indice,recommendation,score,rec_ranking,user_id,parent_asin,rating,rating_rank
151,14192.0,3254.0,0.880842,1,AE2AZ2MNROPF33U6SS53VI22OXJA,B078T3R8YS,0,
152,14192.0,3254.0,0.880842,2,AE2AZ2MNROPF33U6SS53VI22OXJA,B078T3R8YS,0,
95,14192.0,1864.0,0.387307,3,AE2AZ2MNROPF33U6SS53VI22OXJA,B00EW6QT76,0,
96,14192.0,1864.0,0.387307,4,AE2AZ2MNROPF33U6SS53VI22OXJA,B00EW6QT76,0,
195,14192.0,1907.0,0.370678,5,AE2AZ2MNROPF33U6SS53VI22OXJA,B0B9MJK753,0,
...,...,...,...,...,...,...,...,...
191508,16101.0,2573.0,0.061997,196,AHZNHP6OKXRZV2UJMYDPLWCKFKEA,B003G2Z4FK,0,
191488,16101.0,2468.0,0.061555,197,AHZNHP6OKXRZV2UJMYDPLWCKFKEA,B001EYUQN2,0,
191489,16101.0,2468.0,0.061555,198,AHZNHP6OKXRZV2UJMYDPLWCKFKEA,B001EYUQN2,0,
191519,16101.0,1053.0,0.061463,199,AHZNHP6OKXRZV2UJMYDPLWCKFKEA,B0050SWNZW,0,


In [22]:
ranking_report = log_ranking_metrics(args, eval_df)

  return (1 + beta_sqr) * precision_arr * recall_arr / (beta_sqr * precision_arr + recall_arr)


## Classification metrics

In [23]:
val_anchor_item_indices = val_df["last_item_indice"].values
val_item_indices = val_df["item_indice"].values

In [24]:
classifications = (
    model.predict(val_item_indices, val_anchor_item_indices)
    .cpu()
    .detach()
    .numpy()
    .reshape(-1, 1)
)
classifications = MinMaxScaler(feature_range=(0, 1)).fit_transform(classifications)

In [25]:
eval_classification_df = val_df.assign(
    classification_proba=classifications,
    label=lambda df: df[args.rating_col].gt(0).astype(int),
)
eval_classification_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,main_category,title,description,categories,price,item_sequence,last_item_indice,classification_proba,label
0,AHO3T4INZKH5C7IIV5JERA3H2SKQ,B01GY35QPU,0.0,1638057548682,11638,1902,Video Games,Detroit Become Human - PlayStation 4,[Detroit: Become Human is the latest title in ...,"[Video Games, PlayStation 4, Games]",21.4,"[3579, 878, 434, 1579, 3669, 3322, 3495, 2339,...",287,0.289372,0
1,AEMYS2WV33NDV3OAJFOM7SSSIDTA,B0088TN7BO,0.0,1640914537652,14788,2180,Video Games,LEGO Lord of the Rings - Nintendo 3DS,"[Product Description, Based on The Lord of the...","[Video Games, Legacy Systems, Nintendo Systems...",19.95,"[-1, -1, -1, -1, -1, 3244, 2872, 2621, 3048, 3...",3049,0.314741,0
2,AEXKGQQMYQQUNWVGD66TG3VT4V4A,B00IPTUJ8G,0.0,1653120417877,19789,4619,Video Games,Borderlands 2,"[Borderlands 2 for PS Vita, View Larger, View ...","[Video Games, Legacy Systems, PlayStation Syst...",48.1,"[-1, -1, -1, -1, -1, 2358, 3142, 4521, 2292, 2...",2541,0.274276,0
3,AGQAPJGNR3IDAJCPD2YPU7CRRPBA,B00CMQTVK0,0.0,1643392366551,6414,1765,Video Games,Xbox One with Kinect (Day One Edition),"[Get more with Xbox One., Introducing Xbox One...","[Video Games, Xbox One, Consoles]",589.99,"[-1, -1, -1, -1, 3142, 4274, 4345, 809, 416, 1...",1578,0.257726,0
4,AGIJWTPKBANKWNEM2AHK7PWTBYLQ,B01BF9X9VQ,0.0,1647513889874,1195,2412,Video Games,World of Warcraft: Legion - Standard Edition -...,"[Kingdoms will burn, The Burning Legion surges...","[Video Games, PC, Games]",28.6,"[170, 3609, 1588, 1467, 3786, 1627, 4234, 3056...",162,0.265222,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1893,AF5T2J7T33UFSPUGCKTPTGU7EY7A,B008I2LRMC,0.0,1654892101229,14749,1743,Video Games,F1: 2012,"[Product Description, F1 2012 is designed to b...","[Video Games, Legacy Systems, Xbox Systems, Xb...",33.03,"[601, 1842, 3391, 2640, 761, 3624, 1079, 2656,...",4449,0.253910,0
1894,AGXQLR7TWHVUQLDBKKOBI4OQQGRQ,B01N3ASPNV,0.0,1637285457188,8208,3069,All Electronics,amFilm Tempered Glass Screen Protector for Nin...,[],"[Video Games, Nintendo Switch, Accessories, Fa...",8.91,"[-1, -1, 4227, 1039, 1352, 2619, 2655, 1305, 1...",3173,0.245217,0
1895,AFH63KLSVQQYRNFS7NLQGD3GSP3A,B094YHB1QK,5.0,1652564728981,9221,2581,Video Games,PlayStation DualSense Wireless Controller – Ga...,[Plot a course for astronomical adventures on ...,"[Video Games, PlayStation 5, Accessories, Cont...",74.99,"[-1, 2705, 3691, 1804, 111, 3330, 1775, 2979, ...",4206,0.248265,1
1896,AFPPTJOEUPVXA5C63SNRGID3EQNA,B0BVVTQ5JP,4.0,1635968491390,4173,3191,Computers,Logitech G502 HERO High Performance Wired Gami...,[Logitech updated its iconic G502 gaming mouse...,"[Video Games, PC, Accessories, Gaming Mice]",45.87,"[-1, -1, -1, -1, -1, 2379, 1799, 1932, 1568, 161]",161,0.282582,1


In [26]:
classification_report = log_classification_metrics(
    args,
    eval_classification_df,
    target_col="label",
    prediction_col="classification_proba",
)

# Clean up

In [27]:
all_params = [args]

if args.log_to_mlflow:
    for params in all_params:
        params_dict = params.dict()
        params_ = {f"{params.__repr_name__()}.{k}": v for k, v in params_dict.items()}
        mlflow.log_params(params_)

    mlflow.end_run()

2024/09/22 21:06:15 INFO mlflow.tracking._tracking_service.client: 🏃 View run 005-content-based at: http://localhost:5003/#/experiments/1/runs/6a780f1f2e204bf1b53318d7da3751fc.
2024/09/22 21:06:15 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5003/#/experiments/1.
