In [1]:
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import pandas as pd



In [2]:
# Load the cosmetics data from CSV file
cosmetics_df = pd.read_csv("D:/download/cosmetics.csv/cosmetics.csv")




In [3]:
# Step 1: Prepare Data with Multi-Output Targets
def prepare_data(df):
    """
    Prepares features and targets for multi-output classification.

    Parameters:
    - df (DataFrame): The cosmetics dataset.

    Returns:
    - X (DataFrame): Feature data.
    - y (DataFrame): Multi-output target data for skin types.
    """
    X = df[['Ingredients', 'Price']]
    y = df[['Oily', 'Normal', 'Dry']]
    return X, y



In [4]:
# Step 2: Create Multi-Output Model Pipeline
def create_multioutput_pipeline():
    """
    Creates a multi-output machine learning pipeline with TF-IDF and RandomForestClassifier.

    Returns:
    - pipeline (Pipeline): Scikit-learn pipeline for multi-output classification.
    """
    preprocessor = ColumnTransformer(
        transformers=[
            ('text', TfidfVectorizer(), 'Ingredients'),  # TF-IDF for text features
            ('num', StandardScaler(), ['Price'])         # Standard scaling for price
        ])
    
    # MultiOutputClassifier wraps RandomForest to handle multiple target columns
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('clf', MultiOutputClassifier(RandomForestClassifier(random_state=42)))
    ])
    return pipeline



In [5]:
# Step 3: Train and Evaluate the Model
def train_model(df):
    """
    Trains a multi-output model for all skin types.

    Parameters:
    - df (DataFrame): The cosmetics dataset.

    Returns:
    - pipeline (Pipeline): Trained model pipeline.
    """
    X, y = prepare_data(df)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    pipeline = create_multioutput_pipeline()
    pipeline.fit(X_train, y_train)
    
    return pipeline



In [10]:
# Step 4: Generate Recommendations
def generate_recommendations(df, model, skin_type, category, price_range=(0, 100), top_n=5):
    """
    Uses the trained model to predict and rank recommendations based on skin type and price range.

    Parameters:
    - df (DataFrame): The cosmetics dataset.
    - model (Pipeline): Trained multi-output model.
    - skin_type (str): Target skin type to filter ('Oily', 'Normal', 'Dry').
    - category (str): Product category to filter.
    - price_range (tuple): Price range for recommendations, e.g., (min_price, max_price).
    - top_n (int): Number of top recommendations to return.

    Returns:
    - recommendations (DataFrame): DataFrame of top recommendations based on suitability.
    """
    min_price, max_price = price_range
    category_df = df[(df['Label'] == category) & (df['Price'] >= min_price) & (df['Price'] <= max_price)]
    if category_df.empty:
        print(f"No products found for {category} in the price range {price_range}")
        return pd.DataFrame()
    # Predict probabilities for the specified skin type
    probabilities = model.predict_proba(category_df[['Ingredients', 'Price']])
    skin_type_index = {'Oily': 0, 'Normal': 1, 'Dry': 2}[skin_type]
    #suitability_scores = [p[skin_type_index][:, 1] for p in probabilities]  # Probability of suitability for skin type
    # Extract probability of suitability (1st class) for the selected skin type
    skin_type_probabilities = probabilities[0]  # Since this is the 1D array of predicted probabilities

    suitability_scores = skin_type_probabilities[:, 1]  # Get the probabilities for class 1 (suitable)

    # Add suitability scores to the DataFrame
    # Add suitability scores to the DataFrame
    category_df = category_df.assign(predicted_suitability=suitability_scores)
    recommendations = category_df.sort_values(by=['predicted_suitability', 'Rank'], ascending=False).head(top_n)
    
    return recommendations[['Brand', 'Name', 'Price', 'Rank', 'predicted_suitability']]



In [12]:
# Step 5: Train the Model and Get Recommendations
# Train multi-output model for all skin types
multi_skin_model = train_model(cosmetics_df)

# Example usage: Get top 5 recommendations for 'Moisturizer' for 'Oily' skin within a price range
print("\nTop Recommendations for Oily Skin - Moisturizer within Price Range (20, 50):")
recommendations = generate_recommendations(cosmetics_df, multi_skin_model, skin_type='Oily', category='Moisturizer', price_range=(20, 50), top_n=5)
recommendations


Top Recommendations for Oily Skin - Moisturizer within Price Range (20, 50):


Unnamed: 0,Brand,Name,Price,Rank,predicted_suitability
45,GLOW RECIPE,Watermelon Glow Sleeping Mask,45,4.1,1.0
280,PHILOSOPHY,Hope In A Jar,39,4.0,1.0
36,LANEIGE,Water Sleeping Mask,25,4.4,0.99
46,HERBIVORE,Pink Cloud Rosewater Moisture Crème,48,4.6,0.97
34,IT COSMETICS,Your Skin But Better CC+ Cream Oil-Free Matte ...,38,3.9,0.97


In [15]:

# Example usage: Get top 5 recommendations for 'Moisturizer' for 'Oily' skin within a price range
print("\nTop Recommendations for Oily Skin - Moisturizer within Price Range (20, 50):")
recommendations = generate_recommendations(cosmetics_df, multi_skin_model, skin_type='Dry', category='Cleanser', price_range=(20, 250), top_n=5)
recommendations


Top Recommendations for Oily Skin - Moisturizer within Price Range (20, 50):


Unnamed: 0,Brand,Name,Price,Rank,predicted_suitability
376,ORIGINS,GinZing™ Refreshing Scrub Cleanser,20,4.5,0.97
328,BOSCIA,Detoxifying Black Charcoal Cleanser,30,4.3,0.97
404,INDIE LEE,Brightening Cleanser,32,4.4,0.96
299,DRUNK ELEPHANT,T.L.C. Framboos™ Glycolic Night Serum,90,4.3,0.96
362,FARMACY,Clean Bee Ultra Gentle Facial Cleanser,28,4.3,0.96


In [16]:
# Example usage: Get top 5 recommendations for 'Moisturizer' for 'Oily' skin within a price range
print("\nTop Recommendations for Oily Skin - Moisturizer within Price Range (20, 50):")
recommendations = generate_recommendations(cosmetics_df, multi_skin_model, skin_type='Oily', category='Cleanser', price_range=(20, 250), top_n=5)
recommendations


Top Recommendations for Oily Skin - Moisturizer within Price Range (20, 50):


Unnamed: 0,Brand,Name,Price,Rank,predicted_suitability
376,ORIGINS,GinZing™ Refreshing Scrub Cleanser,20,4.5,0.97
328,BOSCIA,Detoxifying Black Charcoal Cleanser,30,4.3,0.97
404,INDIE LEE,Brightening Cleanser,32,4.4,0.96
299,DRUNK ELEPHANT,T.L.C. Framboos™ Glycolic Night Serum,90,4.3,0.96
362,FARMACY,Clean Bee Ultra Gentle Facial Cleanser,28,4.3,0.96


In [17]:
import joblib

# Save the model
joblib.dump(multi_skin_model, 'recommendation_model.pkl')


['recommendation_model.pkl']

In [19]:
# Load the saved model
loaded_model = joblib.load('recommendation_model.pkl')


In [22]:
# Example usage: Get top 5 recommendations for 'Moisturizer' for 'Oily' skin within a price range
print("\nTop Recommendations for Dry Skin - cleanser within Price Range (20, 150):")
recommendations = generate_recommendations(cosmetics_df, loaded_model, skin_type='Dry', category='Cleanser', price_range=(20, 150), top_n=10)
recommendations


Top Recommendations for Dry Skin - cleanser within Price Range (20, 150):


Unnamed: 0,Brand,Name,Price,Rank,predicted_suitability
376,ORIGINS,GinZing™ Refreshing Scrub Cleanser,20,4.5,0.97
328,BOSCIA,Detoxifying Black Charcoal Cleanser,30,4.3,0.97
404,INDIE LEE,Brightening Cleanser,32,4.4,0.96
299,DRUNK ELEPHANT,T.L.C. Framboos™ Glycolic Night Serum,90,4.3,0.96
362,FARMACY,Clean Bee Ultra Gentle Facial Cleanser,28,4.3,0.96
408,REN CLEAN SKINCARE,Evercalm™ Gentle Cleansing Milk,30,4.3,0.96
518,KIEHL'S SINCE 1851,Clearly Corrective™ Brightening & Exfoliating ...,29,4.3,0.96
571,KIEHL'S SINCE 1851,Herbal-Infused Micellar Cleansing Water,28,3.7,0.96
351,CAUDALIE,Vinopure Natural Salicylic Acid Pore Minimizin...,28,3.8,0.9525
391,IT COSMETICS,Confidence in a Cleanser™ Skin-Transforming Hy...,28,4.7,0.95


In [23]:
# Example usage: Get top 5 recommendations for 'Moisturizer' for 'Oily' skin within a price range
print("\nTop Recommendations for Normal Skin - cleanser within Price Range (20, 150):")
recommendations = generate_recommendations(cosmetics_df, loaded_model, skin_type='Normal', category='Cleanser', price_range=(20, 150), top_n=10)
recommendations


Top Recommendations for Normal Skin - cleanser within Price Range (20, 150):


Unnamed: 0,Brand,Name,Price,Rank,predicted_suitability
376,ORIGINS,GinZing™ Refreshing Scrub Cleanser,20,4.5,0.97
328,BOSCIA,Detoxifying Black Charcoal Cleanser,30,4.3,0.97
404,INDIE LEE,Brightening Cleanser,32,4.4,0.96
299,DRUNK ELEPHANT,T.L.C. Framboos™ Glycolic Night Serum,90,4.3,0.96
362,FARMACY,Clean Bee Ultra Gentle Facial Cleanser,28,4.3,0.96
408,REN CLEAN SKINCARE,Evercalm™ Gentle Cleansing Milk,30,4.3,0.96
518,KIEHL'S SINCE 1851,Clearly Corrective™ Brightening & Exfoliating ...,29,4.3,0.96
571,KIEHL'S SINCE 1851,Herbal-Infused Micellar Cleansing Water,28,3.7,0.96
351,CAUDALIE,Vinopure Natural Salicylic Acid Pore Minimizin...,28,3.8,0.9525
391,IT COSMETICS,Confidence in a Cleanser™ Skin-Transforming Hy...,28,4.7,0.95


In [25]:
# Example usage: Get top 5 recommendations for 'Moisturizer' for 'Oily' skin within a price range
print("\nTop Recommendations for Normal Skin - cleanser within Price Range (20, 150):")
recommendations = generate_recommendations(cosmetics_df, loaded_model, skin_type='Normal', category='Cleanser', price_range=(20, 150), top_n=5)
recommendations


Top Recommendations for Normal Skin - cleanser within Price Range (20, 150):


Unnamed: 0,Brand,Name,Price,Rank,predicted_suitability
376,ORIGINS,GinZing™ Refreshing Scrub Cleanser,20,4.5,0.97
328,BOSCIA,Detoxifying Black Charcoal Cleanser,30,4.3,0.97
404,INDIE LEE,Brightening Cleanser,32,4.4,0.96
299,DRUNK ELEPHANT,T.L.C. Framboos™ Glycolic Night Serum,90,4.3,0.96
362,FARMACY,Clean Bee Ultra Gentle Facial Cleanser,28,4.3,0.96


In [33]:
# Example usage: Get top 5 recommendations for 'Moisturizer' for 'Oily' skin within a price range
print("\nTop Recommendations for Normal Skin - cleanser within Price Range (20, 150):")
recommendations1 = generate_recommendations(cosmetics_df, loaded_model, skin_type='Normal', category='Treatment', price_range=(50, 150), top_n=10)
recommendations1


Top Recommendations for Normal Skin - cleanser within Price Range (20, 150):


Unnamed: 0,Brand,Name,Price,Rank,predicted_suitability
589,GLAMGLOW,SUPERMUD® Activated Charcoal Treatment,59,4.2,0.98
796,PHILOSOPHY,Time In A Bottle 100% In-Control,76,4.0,0.98
755,DR. DENNIS GROSS SKINCARE,Doctor's Kit Gold Standard Anti-Aging Solution,59,5.0,0.97
667,DR. DENNIS GROSS SKINCARE,Alpha Beta® Medi–Spa Peel,64,4.5,0.97
579,DRUNK ELEPHANT,C-Firma™ Day Serum,80,4.1,0.97
646,CAUDALIE,Vine[Activ] Vitamin C Anti-Wrinkle Serum,52,4.1,0.97
689,DR. DENNIS GROSS SKINCARE,C+ Collagen Brighten & Firm Vitamin C Serum,78,3.9,0.97
621,OLEHENRIKSEN,Glow Cycle Retin-ALT Power Serum,58,4.4,0.96
795,DR. BRANDT SKINCARE,Bright Biotic™ Dark Spot Minimizing Serum,55,4.3,0.96
745,ORIGINS,Dr. Andrew Weil For Origins™ Mega-Mushroom Rel...,56,4.1,0.96


In [34]:
# Example usage: Get top 5 recommendations for 'Moisturizer' for 'Oily' skin within a price range
print("\nTop Recommendations for Normal Skin - cleanser within Price Range (20, 150):")
recommendations = generate_recommendations(cosmetics_df, loaded_model, skin_type='Dry', category='Treatment', price_range=(50, 150), top_n=10)
recommendations


Top Recommendations for Normal Skin - cleanser within Price Range (20, 150):


Unnamed: 0,Brand,Name,Price,Rank,predicted_suitability
589,GLAMGLOW,SUPERMUD® Activated Charcoal Treatment,59,4.2,0.98
796,PHILOSOPHY,Time In A Bottle 100% In-Control,76,4.0,0.98
755,DR. DENNIS GROSS SKINCARE,Doctor's Kit Gold Standard Anti-Aging Solution,59,5.0,0.97
667,DR. DENNIS GROSS SKINCARE,Alpha Beta® Medi–Spa Peel,64,4.5,0.97
579,DRUNK ELEPHANT,C-Firma™ Day Serum,80,4.1,0.97
646,CAUDALIE,Vine[Activ] Vitamin C Anti-Wrinkle Serum,52,4.1,0.97
689,DR. DENNIS GROSS SKINCARE,C+ Collagen Brighten & Firm Vitamin C Serum,78,3.9,0.97
621,OLEHENRIKSEN,Glow Cycle Retin-ALT Power Serum,58,4.4,0.96
795,DR. BRANDT SKINCARE,Bright Biotic™ Dark Spot Minimizing Serum,55,4.3,0.96
745,ORIGINS,Dr. Andrew Weil For Origins™ Mega-Mushroom Rel...,56,4.1,0.96


In [35]:
recommendations == recommendations1

Unnamed: 0,Brand,Name,Price,Rank,predicted_suitability
589,True,True,True,True,True
796,True,True,True,True,True
755,True,True,True,True,True
667,True,True,True,True,True
579,True,True,True,True,True
646,True,True,True,True,True
689,True,True,True,True,True
621,True,True,True,True,True
795,True,True,True,True,True
745,True,True,True,True,True


In [45]:

recommendations = generate_recommendations(cosmetics_df, loaded_model, skin_type='Dry', category='Treatment', price_range=(151, 150), top_n=10)
recommendations

ValueError: Found array with 0 sample(s) (shape=(0, 4184)) while a minimum of 1 is required by TfidfTransformer.

In [46]:
def generate_recommendations(df, model, skin_type, category, price_range=(0, 100), top_n=5):
    """
    Uses the trained model to predict and rank recommendations based on skin type and price range.

    Parameters:
    - df (DataFrame): The cosmetics dataset.
    - model (Pipeline): Trained multi-output model.
    - skin_type (str): Target skin type to filter ('Oily', 'Normal', 'Dry').
    - category (str): Product category to filter.
    - price_range (tuple): Price range for recommendations, e.g., (min_price, max_price).
    - top_n (int): Number of top recommendations to return.

    Returns:
    - recommendations (DataFrame): DataFrame of top recommendations based on suitability.
    """
    # Extract the price range
    min_price, max_price = price_range
    
    # Filter dataset based on category and price range
    category_df = df[(df['Label'] == category) & (df['Price'] >= min_price) & (df['Price'] <= max_price)]
    
    # Check if the category_df is empty
    if category_df.empty:
        print(f"No products found for {category} in the price range {price_range}")
        return pd.DataFrame()  # Return an empty DataFrame
    
    # Predict probabilities for the specified skin type
    probabilities = model.predict_proba(category_df[['Ingredients', 'Price']])
    
    # Map skin types to column indices in the probability array
    skin_type_index = {'Oily': 0, 'Normal': 1, 'Dry': 2}
    
    # Extract probabilities for the relevant skin type
    skin_type_probabilities = [p[skin_type_index[skin_type]] for p in probabilities]
    
    # We need to get the probability of suitability for class 1 (suitable) for the selected skin type
    suitability_scores = [p[1] for p in skin_type_probabilities]  # Probability of being suitable for the skin type
    
    # Add the suitability scores to the DataFrame
    category_df = category_df.assign(predicted_suitability=suitability_scores)
    
    # Sort and return the top_n recommendations
    recommendations = category_df.sort_values(by=['predicted_suitability', 'Rank'], ascending=False).head(top_n)
    
    return recommendations[['Brand', 'Name', 'Price', 'Rank', 'predicted_suitability']]

# Example usage: Get top 5 recommendations for 'Moisturizer' for 'Oily' skin within a price range
recommendations = generate_recommendations(cosmetics_df, multi_skin_model, skin_type='Oily', category='Moisturizer', price_range=(20, 50), top_n=5)
print(recommendations)


ValueError: Length of values (3) does not match length of index (148)

In [39]:
categories = list(cosmetics_df['Label'].unique())

In [40]:
categories

['Moisturizer',
 'Cleanser',
 'Treatment',
 'Face Mask',
 'Eye cream',
 'Sun protect']

In [43]:

recommendations = generate_recommendations(cosmetics_df, loaded_model, skin_type='Dry', category='Face Mask', price_range=(150, 350), top_n=5)
recommendations

Unnamed: 0,Brand,Name,Price,Rank,predicted_suitability
864,GLAMGLOW,The Ultimate Glow Set,169,0.0,0.94
999,AMOREPACIFIC,TIME RESPONSE Vintage Wash-off Masque,170,5.0,0.93
1080,EVE LOM,WHITE Brightening Mask,160,2.7,0.93
870,SK-II,Brightening Derm Revival Mask,170,4.5,0.9
940,LA MER,Treatment Lotion Hydrating Mask,150,4.1,0.845
