In [36]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor


In [37]:
df = pd.read_csv("product_info.csv")    

In [43]:
global num, cat, text

# features
num = ["price_usd", "sale_price_usd", "value_price_usd",
           "child_min_price", "child_max_price", "child_count"]
cat = ["brand_name", "primary_category", "secondary_category"]
text = "ingredients"



def train_regressor(df, target_col):

    # ensure numeric types
    for c in num:
        df[c] = pd.to_numeric(df[c], errors="coerce")

    # target + rows with target present
    y = pd.to_numeric(df[target_col], errors="coerce")
    mask = y.notna()
    X, y = df.loc[mask, num + cat + [text]], y.loc[mask]

    # transformers
    num_pipe = Pipeline([
        ("imp", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ])
    cat_pipe = Pipeline([
        ("imp", SimpleImputer(strategy="most_frequent")),
        ("oh", OneHotEncoder(handle_unknown="ignore"))
    ])
    txt_pipe = Pipeline([
        ("imp", SimpleImputer(strategy="constant", fill_value="")),
        ("flat", FunctionTransformer(lambda a: a.ravel(), feature_names_out="one-to-one")),
        ("tfidf", TfidfVectorizer(max_features=1000))
    ])

    pre = make_column_transformer(
        (num_pipe, num),
        (cat_pipe, cat),
        (txt_pipe, [text])
    )

    model = make_pipeline(
        pre,
        XGBRegressor(
            objective="reg:squarederror",
            n_estimators=300,
            max_depth=6,
            n_jobs=-1,
            random_state=42
        )
    )

    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.20, random_state=42)
    model.fit(X_tr, y_tr)

    y_hat = model.predict(X_te)
    rmse = float(np.sqrt(mean_squared_error(y_te, y_hat)))
    r2 = float(r2_score(y_te, y_hat))

    return model, {"rmse": rmse, "r2": r2}


In [48]:

# Train on rating
rating_model, rating_metrics = train_regressor(df, "rating")

In [49]:

# Train on loves_count
loves_model, loves_metrics = train_regressor(df, "loves_count")

In [60]:
from sklearn.preprocessing import StandardScaler

num = ["price_usd","sale_price_usd","value_price_usd","child_min_price","child_max_price","child_count"]
cat = ["brand_name","primary_category","secondary_category"]
text = "ingredients"

X_full = df[num + cat + [text]].copy()

df["pred_rating"] = rating_model.predict(X_full)
df["pred_loves"]  = loves_model.predict(X_full)
df["loves_log"]   = np.log1p(pd.to_numeric(df["loves_count"], errors="coerce")).fillna(0)

# category-normalized z-scores
g = df.groupby("primary_category")
def z(col):
    mu = g[col].transform("mean")
    sd = g[col].transform("std").replace(0, np.nan)
    return (df[col] - mu) / sd

df["z_pred_rating"] = z("pred_rating")
df["z_pred_loves"]  = z("pred_loves")
df["z_loves_log"]   = z("loves_log")

# effective price + z
df["price_eff"] = pd.to_numeric(df["sale_price_usd"], errors="coerce").fillna(
                  pd.to_numeric(df["price_usd"], errors="coerce"))
df["z_price"] = z("price_eff")

# final combined score (tune weights)
df["gem_score"] = 1.2*df["z_pred_rating"] + 0.8*(df["z_pred_loves"] - df["z_loves_log"]) - 0.4*df["z_price"]

top50 = (df.sort_values("gem_score", ascending=False)
           [["product_id","product_name","brand_name","primary_category",
             "pred_rating","rating","pred_loves","loves_count","price_usd",
             "gem_score"]]
           .head(50))


# Format numbers for readability
top50_readable = top50.copy().reset_index(drop=True)

# Round key numeric columns
top50_readable["pred_rating"] = top50_readable["pred_rating"].round(2)
top50_readable["rating"] = top50_readable["rating"].round(2)
top50_readable["pred_loves"] = top50_readable["pred_loves"].round(0).astype(int)
top50_readable["loves_count"] = top50_readable["loves_count"].astype(int)
top50_readable["price_usd"] = top50_readable["price_usd"].round(2)
top50_readable["gem_score"] = top50_readable["gem_score"].round(2)


# Reorder / rename columns for clarity
top50_readable = top50_readable.rename(columns={
    "pred_rating": "Pred Rating",
    "rating": "Actual Rating",
    "pred_loves": "Pred Loves",
    "loves_count": "Actual Loves",
    "price_usd": "Price",
    "gem_score": "Gem Score"
})

# Show top 10 in a pretty format
print(top50_readable.head(10).to_string(index=False))





product_id                                                    product_name                  brand_name primary_category  Pred Rating  Actual Rating  Pred Loves  Actual Loves  Price  Gem Score
   P420652              Lip Sleeping Mask Intense Hydration with Vitamin C                     LANEIGE         Skincare         4.38           4.35     1065929       1081315   24.0      15.40
   P428224                                     No. 3 Hair Repair Perfector                     Olaplex             Hair         4.20           4.33      516548        527201   30.0      13.61
 P97989778                                         Soft Pinch Liquid Blush Rare Beauty by Selena Gomez           Makeup         4.53           4.54     1390104       1401068   23.0      11.14
   P427417                     Niacinamide 10% + Zinc 1% Oil Control Serum                The Ordinary         Skincare         4.26           4.24      752819        763168    6.0      10.21
   P406080                              

In [61]:
from IPython.display import display

display(top50_readable)

Unnamed: 0,product_id,product_name,brand_name,primary_category,Pred Rating,Actual Rating,Pred Loves,Actual Loves,Price,Gem Score
0,P420652,Lip Sleeping Mask Intense Hydration with Vitam...,LANEIGE,Skincare,4.38,4.35,1065929,1081315,24.0,15.4
1,P428224,No. 3 Hair Repair Perfector,Olaplex,Hair,4.2,4.33,516548,527201,30.0,13.61
2,P97989778,Soft Pinch Liquid Blush,Rare Beauty by Selena Gomez,Makeup,4.53,4.54,1390104,1401068,23.0,11.14
3,P427417,Niacinamide 10% + Zinc 1% Oil Control Serum,The Ordinary,Skincare,4.26,4.24,752819,763168,6.0,10.21
4,P406080,Brazilian Bum Bum Body Cream,Sol de Janeiro,Bath & Body,4.46,4.3,450426,470934,48.0,9.53
5,P394534,Black Opium Eau de Parfum,Yves Saint Laurent,Fragrance,4.63,4.64,272051,276396,155.0,9.51
6,P427419,Hyaluronic Acid 2% + B5 Hydrating Serum,The Ordinary,Skincare,4.19,4.21,717555,720504,15.7,9.39
7,P377873,Radiant Creamy Concealer,NARS,Makeup,4.26,4.31,1150340,1153594,32.0,8.09
8,P67988453,Gloss Bomb Universal Lip Luminizer,Fenty Beauty by Rihanna,Makeup,4.59,4.64,954505,968317,21.0,7.68
9,P417312,Brazilian Crush Cheirosa ’62 Bum Bum Hair & Bo...,Sol de Janeiro,Fragrance,4.25,4.3,230664,245440,24.0,7.53
