# Content-Based Recommender

In [1]:
pip install pandas matplotlib seaborn scikit-learn

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Applications/Xcode.app/Contents/Developer/usr/bin/python3 -m pip install --upgrade pip' command.
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
import warnings

warnings.filterwarnings('ignore')

In [3]:
# Read the feature-engineered product table from the processed-data folder.
df = pd.read_csv(filepath_or_buffer="../data/processed/feature_engineer_df.csv")

In [None]:
# Preview the first rows only instead of rendering the whole frame.
df.head()

In [4]:
# Work on an independent copy: plain assignment would only alias `df`, so every
# column added to `df_copy` below would silently appear on `df` as well.
df_copy = df.copy()

# Train a Linear Model to Learn Weights

In [5]:
import ast

def safely_convert_to_list(val):
    """Coerce a cell value into a Python list, falling back to an empty list.

    Args:
        val: Either a list, a string holding a Python literal (e.g. "['a', 'b']"),
            or any other value (NaN, None, numbers, ...).

    Returns:
        list: ``val`` itself when it already is a list; the parsed literal when
        ``val`` is a string that evaluates to a list; ``[]`` in every other case,
        including strings that cannot be parsed.
    """
    if isinstance(val, list):
        return val
    if not isinstance(val, str):
        return []

    try:
        parsed = ast.literal_eval(val)
    # Only the failures literal_eval is documented to raise on malformed input;
    # a bare `except:` would also swallow KeyboardInterrupt / SystemExit.
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        return []

    return parsed if isinstance(parsed, list) else []

In [6]:
# Turn each stored cell into a real Python list (anything unparsable becomes []).
df_copy['function_tags'] = df_copy['function_tags'].map(safely_convert_to_list)

In [7]:
# Single source of truth: skin type -> function tags considered beneficial for it.
skin_type_tag_map = {
    "dry": ["hydrating", "barrier-repair", "healing"],
    "oily": ["oil-control", "exfoliating", "acne-fighting"],
    "sensitive": ["soothing", "anti-redness", "anti-inflammatory"],
    "acne-prone": ["acne-fighting", "oil-control", "anti-inflammatory"],
    "mature": ["anti-aging", "firming", "antioxidant"],
    "dull": ["brightening", "antioxidant", "exfoliating"]
}

# Per-skin-type lists are derived from the map (as independent copies) rather
# than retyped by hand, so the two can never drift apart.
dry_tags = list(skin_type_tag_map["dry"])
oily_tags = list(skin_type_tag_map["oily"])
sensitive_tags = list(skin_type_tag_map["sensitive"])
acne_prone_tags = list(skin_type_tag_map["acne-prone"])
mature_tags = list(skin_type_tag_map["mature"])
dull_tags = list(skin_type_tag_map["dull"])

In [8]:
import re

def parse_function_tags(raw_tags):
    """Flatten and normalise a list of raw tag strings.

    Each string element may itself hold several comma-separated tags, optionally
    wrapped in square brackets. Tags are stripped and lower-cased; empty tags
    and the literal ``'none'`` are dropped.

    Args:
        raw_tags: A list of raw tag strings. Any non-list input yields ``[]``;
            non-string elements inside the list are ignored.

    Returns:
        list[str]: The unique cleaned tags in first-seen order.
    """
    if not isinstance(raw_tags, list):
        return []

    cleaned = []
    for tag_str in raw_tags:
        if not isinstance(tag_str, str):
            continue
        # Remove square brackets, split by comma, strip whitespace
        for tag in re.sub(r'[\[\]]', '', tag_str).split(','):
            tag = tag.strip().lower()
            if tag and tag != 'none':
                cleaned.append(tag)

    # dict.fromkeys removes duplicates while keeping insertion order; a set
    # would make the tag order differ between runs and so between exports.
    return list(dict.fromkeys(cleaned))

In [9]:
# Normalise every tag list: flatten, lower-case, drop blanks/'none', de-duplicate.
df_copy['function_tags'] = df_copy['function_tags'].map(parse_function_tags)

In [10]:
# Inspect the cleaned tag lists produced by parse_function_tags.
df_copy['function_tags']

0       [hydrating, barrier-repair, antioxidant, anti-...
1       [anti-inflammatory, anti-redness, hydrating, b...
2                                                      []
3       [anti-inflammatory, brightening, soothing, ant...
4                                     [soothing, healing]
                              ...                        
2281    [hydrating, healing, soothing, barrier-repair,...
2282                                                   []
2283    [hydrating, brightening, soothing, barrier-rep...
2284    [anti-inflammatory, hydrating, brightening, so...
2285    [anti-inflammatory, healing, brightening, soot...
Name: function_tags, Length: 2286, dtype: object

In [11]:
def get_matching_tags(tag_list, skin_type_tags):
    """Return the entries of ``tag_list`` that also occur in ``skin_type_tags``.

    The order of ``tag_list`` is kept, and repeated entries are kept as well.
    """
    matches = []
    for candidate in tag_list:
        if candidate in skin_type_tags:
            matches.append(candidate)
    return matches

In [12]:
# Tags on each product that are relevant to dry skin.
df_copy['dry_tags'] = df_copy['function_tags'].map(
    lambda product_tags: get_matching_tags(product_tags, dry_tags)
)

In [None]:
# Preview the first rows only instead of rendering the whole frame.
df_copy.head()

In [13]:
# Tags on each product that are relevant to oily skin.
df_copy['oily_tags'] = df_copy['function_tags'].map(
    lambda product_tags: get_matching_tags(product_tags, oily_tags)
)

In [None]:
# Preview the first rows only instead of rendering the whole frame.
df_copy.head()

In [14]:
# Tags on each product that are relevant to sensitive skin.
df_copy['sensitive_tags'] = df_copy['function_tags'].map(
    lambda product_tags: get_matching_tags(product_tags, sensitive_tags)
)

In [15]:
# Tags on each product that are relevant to acne-prone skin.
df_copy['acne_prone_tags'] = df_copy['function_tags'].map(
    lambda product_tags: get_matching_tags(product_tags, acne_prone_tags)
)

In [16]:
# Tags on each product that are relevant to mature skin.
df_copy['mature_tags'] = df_copy['function_tags'].map(
    lambda product_tags: get_matching_tags(product_tags, mature_tags)
)

In [17]:
# Tags on each product that are relevant to dull skin.
df_copy['dull_tags'] = df_copy['function_tags'].map(
    lambda product_tags: get_matching_tags(product_tags, dull_tags)
)

In [None]:
# Preview the first rows only instead of rendering the whole frame.
df.head()

In [18]:
# Binary training targets: True when a product matched at least one tag for
# the skin type, False when its matched-tag list is empty.
for source_col, flag_col in [
    ('dry_tags', 'target_dry'),
    ('oily_tags', 'target_oily'),
    ('sensitive_tags', 'target_sensitive'),
    ('acne_prone_tags', 'target_acne_prone'),
    ('mature_tags', 'target_mature'),
    ('dull_tags', 'target_dull'),
]:
    df_copy[flag_col] = df_copy[source_col].apply(lambda matched: len(matched) > 0)

In [None]:
# Preview the first rows only instead of rendering the whole frame.
df_copy.head()

In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Model inputs: the same four pre-scaled columns are used for every skin type.
# NOTE(review): every target is derived from function_tags, and
# 'num_function_tags_scaled' is presumably a count of those same tags, so this
# feature may leak the label — confirm before trusting the reported metrics.
features = [
    'rating_scaled',
    'is_recommended_scaled',
    'total_pos_feedback_count_scaled',
    'num_function_tags_scaled'
]

# Skin type label -> name of its boolean target column.
skin_target_map = {
    'dry': 'target_dry',
    'oily': 'target_oily',
    'sensitive': 'target_sensitive',
    'acne-prone': 'target_acne_prone',
    'mature': 'target_mature',
    'dull': 'target_dull'
}

# Fitted estimators and their per-feature coefficients, keyed by skin type.
skin_models = {}
skin_weights = {}

# The design matrix is identical for every skin type; only the target changes.
X = df_copy[features]

for skin_type, target_col in skin_target_map.items():
    print(f"\n🧪 Training model for {skin_type} skin")

    y = df_copy[target_col]

    # 80/20 holdout with a fixed seed, so every skin type is evaluated on the
    # same rows and reruns reproduce the same split.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    print(f"🔢 y_train distribution for {skin_type}:")
    print(y_train.value_counts())

    model = LogisticRegression(C=0.1)
    model.fit(X_train, y_train)

    # Keep the estimator, plus a feature -> coefficient mapping. Only the
    # coefficients are stored here; the intercept is not.
    skin_models[skin_type] = model
    skin_weights[skin_type] = {
        column: coefficient
        for column, coefficient in zip(X.columns, model.coef_[0])
    }

    # Holdout evaluation
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"✅ Accuracy for {skin_type}: {acc:.2f}")
    print(classification_report(y_test, y_pred))


🧪 Training model for dry skin
🔢 y_train distribution for dry:
target_dry
True     1037
False     791
Name: count, dtype: int64
✅ Accuracy for dry: 0.82
              precision    recall  f1-score   support

       False       0.85      0.72      0.78       199
        True       0.81      0.90      0.85       259

    accuracy                           0.82       458
   macro avg       0.83      0.81      0.81       458
weighted avg       0.82      0.82      0.82       458


🧪 Training model for oily skin
🔢 y_train distribution for oily:
target_oily
True     938
False    890
Name: count, dtype: int64
✅ Accuracy for oily: 0.79
              precision    recall  f1-score   support

       False       0.79      0.82      0.80       245
        True       0.78      0.75      0.77       213

    accuracy                           0.79       458
   macro avg       0.79      0.78      0.78       458
weighted avg       0.79      0.79      0.79       458


🧪 Training model for sensitive skin
🔢

# Insight

## 🧠 Model Evaluation Summary (Per Skin Type)

Below is a summary of classification performance for each skin type using logistic regression. All models were trained on four engineered features and evaluated on a holdout test set (20%).

| Skin Type     | Accuracy | Precision (True) | Recall (True) | F1-score (True) | Notes |
|---------------|----------|------------------|---------------|-----------------|-------|
| **Dry**       | 0.82     | 0.81             | 0.90          | 0.85            | Excellent performance with high recall — great at identifying dry-suitable products. |
| **Oily**      | 0.79     | 0.78             | 0.75          | 0.77            | Solid performance, balanced precision and recall. |
| **Sensitive** | 0.77     | 0.76             | 0.70          | 0.73            | Good, though recall could be improved for better sensitivity coverage. |
| **Acne-Prone**| 0.77     | 0.71             | 0.91          | 0.80            | Prioritizes finding acne-friendly products well (high recall). |
| **Mature**    | 0.82     | 0.86             | 0.79          | 0.82            | Excellent performance with balanced metrics. |
| **Dull**      | 0.82     | 0.79             | 0.91          | 0.85            | Strong ability to catch dullness-targeted products. |

### ✅ General Insights:
- All models generalize reasonably well to the 20% holdout set (accuracy between **77% and 82%**).
- **High recall** for key types (dry, acne-prone, dull) ensures relevant products are rarely missed.
- No significant class imbalance was found during training (`y_train.value_counts()`).
- The models are ready to be used for personalized product recommendation.



# Use the Learned Weights for Scoring

In [20]:
# Scoring helper: linear combination of a row's feature values and learned weights.
def compute_relevance_score(row, weights):
    """Return the weighted sum of ``row[feature] * weight`` over ``weights``.

    Args:
        row: Mapping-like object indexable by feature name.
        weights: Dict of feature name -> weight.
    """
    total = 0
    for feature, weight in weights.items():
        total += row[feature] * weight
    return total

In [21]:
# One raw relevance column per skin type: each row's weighted feature sum
# using the weights stored for that skin type.
for skin_type, weights in skin_weights.items():
    df_copy[f'relevance_score_{skin_type}'] = df_copy.apply(
        compute_relevance_score, axis=1, args=(weights,)
    )


In [None]:
# Preview the first rows only instead of rendering the whole frame.
df_copy.head()

In [22]:
# Recommendation Function

def recommend_products(df, skin_type, top_n=10, category=None):
    """
    Recommends top N products for a given skin type based on relevance scores.

    The input frame is never modified: filtering and sorting both return new
    objects, so no defensive copy of the whole frame is made.

    Args:
        df (pd.DataFrame): The DataFrame containing products and relevance scores.
            Must have a ``relevance_score_<skin_type>`` column and, when
            ``category`` is given, a ``primary_category`` column.
        skin_type (str): Skin type to recommend for (e.g., 'dry', 'oily', etc.)
        top_n (int): Number of products to return.
        category (str, optional): If provided, filters results by primary_category
            (case-insensitive exact match). Falsy values disable the filter.

    Returns:
        pd.DataFrame: Top N recommended products, highest score first.

    Raises:
        ValueError: If ``df`` has no relevance-score column for ``skin_type``.
    """
    score_col = f'relevance_score_{skin_type}'

    if score_col not in df.columns:
        raise ValueError(f"No relevance score found for skin type '{skin_type}'")

    candidates = df
    if category:
        same_category = df['primary_category'].str.lower() == category.lower()
        # Boolean indexing yields a new frame; `df` itself is left untouched.
        candidates = df[same_category]

    return candidates.sort_values(score_col, ascending=False).head(top_n)


In [23]:
# Normalize scores for UI 

from sklearn.preprocessing import MinMaxScaler

# Add a min-max scaled companion column for each raw relevance score, fitting
# a fresh scaler per skin type.
for skin_type in skin_weights:
    score_col = f'relevance_score_{skin_type}'
    norm_col = f'{score_col}_scaled'
    scaler = MinMaxScaler()
    df_copy[norm_col] = scaler.fit_transform(df_copy[[score_col]])

In [None]:
# Preview the first rows only instead of rendering the whole frame.
df_copy.head()

# Save Trained Models

In [24]:
pip install joblib

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Applications/Xcode.app/Contents/Developer/usr/bin/python3 -m pip install --upgrade pip' command.
Note: you may need to restart the kernel to use updated packages.


In [25]:
# save each model

import joblib
import os

# Make sure the output directory is present (no error if it already exists).
# NOTE(review): 'models' is resolved against the current working directory,
# while the data paths in this notebook use '../data/...' — confirm the
# intended location.
os.makedirs('models', exist_ok=True)

# Persist one fitted estimator per skin type.
for skin_type in skin_models:
    model = skin_models[skin_type]
    filename = f'models/logreg_{skin_type}.joblib'
    joblib.dump(model, filename)
    print(f"✅ Saved {skin_type} model to {filename}")


✅ Saved dry model to models/logreg_dry.joblib
✅ Saved oily model to models/logreg_oily.joblib
✅ Saved sensitive model to models/logreg_sensitive.joblib
✅ Saved acne-prone model to models/logreg_acne-prone.joblib
✅ Saved mature model to models/logreg_mature.joblib
✅ Saved dull model to models/logreg_dull.joblib


In [26]:
# List all column names currently on the working frame.
df_copy.columns

Index(['product_id', 'product_name', 'brand_name', 'ingredients', 'rating',
       'primary_category', 'skin_type', 'review_text_cleaned',
       'is_recommended', 'reviews', 'price_usd', 'total_pos_feedback_count',
       'what_does_it_do', 'who_is_it_good_for', 'who_should_avoid',
       'function_tags', 'num_ingredients', 'num_function_tags',
       'has_irritants', 'skin_match_score', 'parsed_function_tags',
       'matched_tags', 'rating_scaled', 'is_recommended_scaled',
       'reviews_scaled', 'price_usd_scaled', 'total_pos_feedback_count_scaled',
       'skin_match_score_scaled', 'num_ingredients_scaled',
       'num_function_tags_scaled', 'relevance_score', 'dry_tags', 'oily_tags',
       'sensitive_tags', 'acne_prone_tags', 'mature_tags', 'dull_tags',
       'target_dry', 'target_oily', 'target_sensitive', 'target_acne_prone',
       'target_mature', 'target_dull', 'relevance_score_dry',
       'relevance_score_oily', 'relevance_score_sensitive',
       'relevance_score_acne-

In [33]:
# Columns exported for downstream use: product identity/display fields followed
# by the raw (unscaled) relevance score of every skin type.
cols_to_keep = (
    ['product_name', 'brand_name', 'rating', 'price_usd', 'function_tags']
    + [
        'relevance_score_dry',
        'relevance_score_oily',
        'relevance_score_sensitive',
        'relevance_score_acne-prone',
        'relevance_score_mature',
        'relevance_score_dull',
    ]
)

In [34]:
# Narrow the working frame down to the export columns, all rows kept.
df_final = df_copy.loc[:, cols_to_keep]

In [35]:
# Display the trimmed export table.
df_final

Unnamed: 0,product_name,brand_name,rating,price_usd,function_tags,relevance_score_dry,relevance_score_oily,relevance_score_sensitive,relevance_score_acne-prone,relevance_score_mature,relevance_score_dull
0,Renewing Eye Cream,Murad,4.031620,89.0,"[hydrating, barrier-repair, antioxidant, anti-...",1.138888,1.133340,1.302729,1.136431,1.308429,1.520420
1,Goodbye Acne Max Complexion Correction Pads,Peter Thomas Roth,4.419882,48.0,"[anti-inflammatory, anti-redness, hydrating, b...",0.862160,0.880449,1.080717,0.873969,1.035583,1.266697
2,Grape Water Moisturizing Face Mist,Caudalie,4.443390,12.0,[],-0.132345,0.080326,0.173657,-0.041518,-0.160145,0.130056
3,Clarifying Lotion 1,CLINIQUE,4.515000,20.0,"[anti-inflammatory, brightening, soothing, ant...",0.583642,0.597385,0.839134,0.596371,0.772908,0.994846
4,7 Day Face Scrub Cream Rinse-Off Formula,CLINIQUE,4.532099,26.0,"[soothing, healing]",0.054958,0.087585,0.366474,0.082570,0.245430,0.459274
...,...,...,...,...,...,...,...,...,...,...,...
2281,Vinoperfect Radiance Dark Spot Serum Vitamin C...,Caudalie,4.244681,82.0,"[hydrating, healing, soothing, barrier-repair,...",0.664468,0.816099,0.877833,0.726673,0.682358,0.965432
2282,One-Step Gentle Exfoliating Cleanser with Oran...,Clarins,4.613767,39.0,[],0.060923,0.116436,0.375142,0.096677,0.227772,0.456514
2283,Essential-C Toner,Murad,4.308823,42.0,"[hydrating, brightening, soothing, barrier-rep...",1.893813,1.809169,2.004532,1.852842,2.138785,2.349712
2284,Essential-C Day Moisture Broad Spectrum SPF 30...,Murad,4.248492,68.0,"[anti-inflammatory, hydrating, brightening, so...",1.947492,1.978088,2.035883,1.945347,2.048271,2.314612


In [36]:
# Write the trimmed recommendation table to the processed-data folder; the
# row index is not written.
df_final.to_csv(path_or_buf="../data/processed/final_df.csv", index=False)