In [5]:
import csv
import pandas as pd

df = pd.read_csv("product_info.csv")
print(df.columns)

Index(['product_id', 'product_name', 'brand_id', 'brand_name', 'loves_count',
       'rating', 'reviews', 'size', 'variation_type', 'variation_value',
       'variation_desc', 'ingredients', 'price_usd', 'value_price_usd',
       'sale_price_usd', 'limited_edition', 'new', 'online_only',
       'out_of_stock', 'sephora_exclusive', 'highlights', 'primary_category',
       'secondary_category', 'tertiary_category', 'child_count',
       'child_max_price', 'child_min_price'],
      dtype='object')


**Hidden Gem Predictor**


In [None]:
# Make sure numeric columns are properly parsed
df["rating"] = pd.to_numeric(df["rating"], errors="coerce")
df["reviews"] = pd.to_numeric(df["reviews"], errors="coerce")
df["price_usd"] = pd.to_numeric(df["price_usd"], errors="coerce")

# Apply your SQL conditions:
# - rating >= 4.7
# - reviews < 50 (handling NaN as 0)
hidden_gems = (
    df[(df["rating"] >= 4.4) & (df["reviews"].fillna(0) < 50)]
    .sort_values(["rating", "reviews"], ascending=[False, True])
    .loc[:, ["product_id", "product_name", "brand_name", "primary_category",
             "rating", "reviews", "loves_count", "price_usd"]]
)

print(hidden_gems.columns) # Variables in the dataset.
print(hidden_gems.shape[0]) # Number of training data.

Index(['product_id', 'product_name', 'brand_name', 'primary_category',
       'rating', 'reviews', 'loves_count', 'price_usd'],
      dtype='object')
1025


In [22]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

num_cols = ["loves_count", "price_usd", "sale_price_usd"]
for c in num_cols:
    df[c] = pd.to_numeric(df[c], errors="coerce")


# Create hidden gem label
df["hidden_gem"] = ((df["rating"] >= 4.4) & (df["reviews"].fillna(0) < 50)).astype(int)

num_features = ["price_usd", "sale_price_usd", "loves_count"]
cat_features = ["brand_name", "primary_category"]

X = df[num_features + cat_features]
y = df["hidden_gem"]

numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])  

categorical_transformer = Pipeline(steps=[ 
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])  

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_features),
        ("cat", categorical_transformer, cat_features)
    ])  


model = Pipeline(steps=[
    ("preprocessor", preprocessor),         
    ("model", XGBClassifier(use_label_encoder=False, eval_metric="logloss"))
])  


# Set 80 percent of the whole data as training data, and 20 percent as test data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.99      0.93      1489
           1       0.54      0.12      0.20       210

    accuracy                           0.88      1699
   macro avg       0.72      0.55      0.57      1699
weighted avg       0.85      0.88      0.84      1699



Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
