In [None]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
!pip install sentence-transformers


Looking in indexes: https://download.pytorch.org/whl/cu121


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
from sentence_transformers import SentenceTransformer


In [None]:
# Seed Python's and NumPy's global random generators with a fixed value.
random.seed(42)
np.random.seed(42)

# --- Dataset size configuration ---
num_products = 1500
num_chemicals = 2000
num_relations = 3500

# --- Product categories and skin types ---
# The skin-type labels are Indonesian: dry, oily, sensitive, combination, normal.
categories = ["serum", "moisturizer", "sunscreen", "cleanser", "toner", "mask"]
skin_types = ["kering", "berminyak", "sensitif", "kombinasi", "normal"]


In [None]:
# Brand names; generate_products() picks one at random per product.
brand_pool = [
    "Kaca", "Lumière", "DermaLab", "SkinMuse", "Aurelia", "NaturaDerm",
    "Cerin", "GlowWorks", "PureKind", "VitaSkin", "HydraTalk", "CalmTheory"
]

# Marketing-style active ingredient names.
# NOTE(review): not referenced by any cell visible in this notebook.
actives_pool = [
    "Niacinamide", "Alpha Arbutin", "Retinol", "Vitamin C", "Hyaluronic Acid",
    "Salicylic Acid", "Ceramide", "AHA BHA PHA", "Centella", "Azelaic Acid",
    "Tranexamic Acid", "Peptide", "Tea Tree", "Panthenol", "Snail Mucin"
]

# Product benefit labels.
# NOTE(review): not referenced by any cell visible in this notebook.
benefits_pool = [
    "Brightening", "Hydrating", "Soothing", "Barrier Repair",
    "Acne Care", "Anti Aging", "Oil Control", "UV Defense"
]

# Product form names keyed by category; generate_products() picks one form
# for the category it drew.
product_form_by_cat = {
    "serum": ["Serum", "Essence Serum", "Booster Serum"],
    "moisturizer": ["Moisturizer", "Gel Cream", "Barrier Cream"],
    "sunscreen": ["Sunscreen SPF50 PA++++", "UV Shield SPF50", "Sun Gel SPF50"],
    "cleanser": ["Gentle Cleanser", "Foam Cleanser", "Gel Cleanser"],
    "toner": ["Hydrating Toner", "Exfoliating Toner", "Calming Toner"],
    "mask": ["Clay Mask", "Sheet Mask", "Sleeping Mask"]
}
# Ingredient names used by generate_chemicals() as compound names.
real_compounds_pool = [
    "Glycerin","Propylene Glycol","Butylene Glycol","Pentylene Glycol",
    "Sodium Hyaluronate","Hyaluronic Acid","Panthenol","Betaine","Allantoin",
    "Squalane","Dimethicone","Caprylic/Capric Triglyceride","Cetearyl Alcohol",
    "Ceramide NP","Ceramide AP","Ceramide EOP","Cholesterol","Linoleic Acid",
    "Niacinamide","Alpha Arbutin","Retinol","Ascorbic Acid",
    "Salicylic Acid","Glycolic Acid","Lactic Acid","Azelaic Acid",
    "Tranexamic Acid","Kojic Acid","Bakuchiol","Centella Asiatica Extract",
    "Green Tea Extract","Licorice Root Extract","Tea Tree Oil",
    "Zinc Oxide","Titanium Dioxide","Octocrylene","Avobenzone",
    "Phenoxyethanol","Ethylhexylglycerin","Sodium Benzoate",
    "Potassium Sorbate","Disodium EDTA","Tocopherol",
    "Cocamidopropyl Betaine","Decyl Glucoside","Sodium Cocoyl Isethionate"
]

# Compound function labels (partly Indonesian, e.g. "humektan" = humectant,
# "pengawet" = preservative, "pelarut" = solvent); generate_chemicals()
# assigns one at random to each compound.
function_pool_real = [
    "humektan","emolien","antioksidan","pengawet",
    "pelarut","surfactant","UV filter","brightening agent",
    "exfoliant","soothing agent","barrier support"
]






In [None]:
def generate_products():
    """Build the synthetic product catalogue.

    Reads the notebook-level globals ``num_products``, ``num_chemicals``,
    ``categories``, ``brand_pool``, ``product_form_by_cat``, ``skin_types``,
    ``mapped_cmp_ids`` and ``cmp_to_active``. The last two are defined in a
    later cell, so that cell must have assigned them before this function is
    called.

    Returns:
        pandas.DataFrame with one row per product and the columns
        ``product_id``, ``product_name``, ``brand``, ``hero_ingredients``
        (hero names joined with ", "), ``category``, ``target_skin_type``,
        ``key_ingredients`` (compound ids joined with ",", heroes first),
        ``effectiveness_score``, ``safety_score``, ``popularity_index`` and
        ``description``.
    """
    all_cmp_ids = [f"CMP{j+1:04d}" for j in range(num_chemicals)]
    products = []

    for i in range(num_products):
        cat = random.choice(categories)
        brand = random.choice(brand_pool)
        form = random.choice(product_form_by_cat[cat])

        # Pick 1 or 2 hero ingredients from the id -> name mapping.
        hero_k = random.choice([1, 2])
        hero_cmps = random.sample(mapped_cmp_ids, k=hero_k)
        hero_names = [cmp_to_active[h] for h in hero_cmps]

        # 5-10 ingredients in total, heroes included; the remaining ones are
        # drawn from every compound id that is not already a hero.
        total_k = random.randint(5, 10)
        hero_set = set(hero_cmps)
        other_cmps = random.sample(
            [c for c in all_cmp_ids if c not in hero_set],
            k=total_k - hero_k,
        )

        key_ing = hero_cmps + other_cmps

        # Sometimes attach a percentage to the FIRST hero only. The name is
        # assembled from parts instead of using str.replace on the joined
        # string: replace would also rewrite the second hero whenever the
        # first hero's name occurs inside it (e.g. "Betaine" inside
        # "Cocamidopropyl Betaine").
        name_parts = list(hero_names)
        if random.random() < 0.6:
            pct = random.choice([2, 3, 4, 5, 7, 10])
            name_parts[0] = f"{name_parts[0]} {pct}%"
        hero_str = " + ".join(name_parts)

        product_name = f"{brand} {hero_str} {form}"

        products.append({
            "product_id": f"PRD{i+1:04d}",
            "product_name": product_name,
            "brand": brand,
            "hero_ingredients": ", ".join(hero_names),
            "category": cat,
            "target_skin_type": random.choice(skin_types),
            "key_ingredients": ",".join(key_ing),
            "effectiveness_score": round(random.uniform(0.4, 1.0), 2),
            "safety_score": round(random.uniform(0.5, 1.0), 2),
            "popularity_index": round(random.uniform(0.0, 1.0), 2),
            "description": f"Produk dengan efek {random.choice(['melembapkan', 'mencerahkan', 'menenangkan', 'melindungi'])}."
        })

    return pd.DataFrame(products)

In [None]:
def generate_chemicals(n_chemicals=None, compound_pool=None, function_pool=None):
    """Build the synthetic compound table.

    Args:
        n_chemicals: number of rows to generate. Defaults to the notebook
            global ``num_chemicals``.
        compound_pool: sequence of compound names. Defaults to the notebook
            global ``real_compounds_pool``.
        function_pool: sequence of function labels. Defaults to the notebook
            global ``function_pool_real``.

    Returns:
        pandas.DataFrame with one row per compound. The first
        ``len(compound_pool)`` rows use each pool name exactly once (in
        shuffled order); later rows reuse names picked at random, so
        ``compound_name`` is not unique when ``n_chemicals`` exceeds the pool
        size. All numeric attributes are random draws.

    Raises:
        IndexError: if a name or function label has to be drawn from an
            empty pool (raised by ``random.choice``).
    """
    if n_chemicals is None:
        n_chemicals = num_chemicals
    if compound_pool is None:
        compound_pool = real_compounds_pool
    if function_pool is None:
        function_pool = function_pool_real

    base_names = list(compound_pool)
    pool = base_names.copy()
    random.shuffle(pool)

    chemicals = []
    for i in range(n_chemicals):
        compound_name = pool[i] if i < len(pool) else random.choice(base_names)

        # Draw the three element counts in the same order as before, but
        # leave oxygen out of the formula when its count is 0 rather than
        # emitting a literal "O0".
        carbons = random.randint(1, 40)
        hydrogens = random.randint(2, 80)
        oxygens = random.randint(0, 15)
        formula = f"C{carbons}H{hydrogens}"
        if oxygens:
            formula += f"O{oxygens}"

        chemicals.append({
            "compound_id": f"CMP{i+1:04d}",
            "compound_name": compound_name,

            "molecular_formula": formula,
            "molecular_weight": round(random.uniform(80, 900), 2),
            "reactivity_score": round(random.uniform(0.1, 1.0), 2),
            "toxicity_level": round(random.uniform(0.0, 1.0), 2),
            "solubility": random.choice(["water", "alcohol", "oil"]),
            "function": random.choice(function_pool),
            "origin": random.choice(["alami", "sintetis", "turunan bio"]),

            "stability_index": round(random.uniform(0.3, 1.0), 2),
            "skin_absorption_rate": round(random.uniform(0.1, 1.0), 2),
            "ph_value": round(random.uniform(3.0, 9.0), 2)
        })
    return pd.DataFrame(chemicals)


In [None]:
def generate_relations(df_products, df_chemicals, n_relations=None):
    """Build random product-compound relation rows.

    Each row pairs a randomly chosen product id with a randomly chosen
    compound id and fills the remaining attributes with random draws. The
    pairs are drawn independently, so the same pair can occur more than once.

    Args:
        df_products: DataFrame with a ``product_id`` column.
        df_chemicals: DataFrame with a ``compound_id`` column.
        n_relations: number of rows to generate. Defaults to the notebook
            global ``num_relations``.

    Returns:
        pandas.DataFrame with the columns ``relation_id``, ``product_id``,
        ``compound_id``, ``percentage_in_formula``, ``role_in_product``,
        ``synergy_score``, ``interaction_type`` and
        ``potential_new_compound``.

    Raises:
        IndexError: if rows are requested while either id column is empty
            (raised by ``random.choice``).
    """
    if n_relations is None:
        n_relations = num_relations

    # Convert the id columns to lists once instead of once per generated row.
    product_ids = df_products["product_id"].tolist()
    compound_ids = df_chemicals["compound_id"].tolist()

    relations = []
    for i in range(n_relations):
        relations.append({
            "relation_id": f"REL{i+1:04d}",
            "product_id": random.choice(product_ids),
            "compound_id": random.choice(compound_ids),
            "percentage_in_formula": round(random.uniform(0.1, 5.0), 2),
            "role_in_product": random.choice(["bahan aktif", "pengawet", "pewangi", "pelarut"]),
            "synergy_score": round(random.uniform(0.2, 1.0), 2),
            "interaction_type": random.choice(["sinergis", "netral", "antagonis"]),
            "potential_new_compound": random.choice([True, False])
        })
    return pd.DataFrame(relations)



In [None]:
# Compounds are generated first because the product generator reads the
# hero-ingredient lookup that is derived from them below.
df_chemicals = generate_chemicals()

# Hero lookup: the first 10 compound ids mapped to their names.
hero_rows = df_chemicals.head(10)
cmp_to_active = dict(zip(hero_rows["compound_id"], hero_rows["compound_name"]))
mapped_cmp_ids = list(cmp_to_active)

df_products = generate_products()

# Generate the relations, then attach each compound's name via a left join
# on compound_id.
compound_names = df_chemicals[["compound_id", "compound_name"]]
df_relations = generate_relations(df_products, df_chemicals).merge(
    compound_names,
    how="left",
    on="compound_id",
)

# Preview the first rows of each table.
for frame in (df_products, df_chemicals, df_relations):
    print(frame.head())

  product_id                                       product_name       brand  \
0    PRD0001             PureKind Bakuchiol 10% UV Shield SPF50    PureKind   
1    PRD0002                 Kaca Ascorbic Acid 5% Gel Cleanser        Kaca   
2    PRD0003  CalmTheory Ascorbic Acid 7% + Niacinamide Boos...  CalmTheory   
3    PRD0004              DermaLab Niacinamide 10% Gel Cleanser    DermaLab   
4    PRD0005     DermaLab Cetearyl Alcohol 7% Exfoliating Toner    DermaLab   

             hero_ingredients   category target_skin_type  \
0                   Bakuchiol  sunscreen         sensitif   
1               Ascorbic Acid   cleanser           normal   
2  Ascorbic Acid, Niacinamide      serum           normal   
3                 Niacinamide   cleanser        berminyak   
4            Cetearyl Alcohol      toner           normal   

                                     key_ingredients  effectiveness_score  \
0  CMP0003,CMP1308,CMP1485,CMP0178,CMP0935,CMP032...                 0.64   
1  C

In [None]:
# Write each generated table to its own CSV file, without the row index.
for csv_name, frame in (
    ("products.csv", df_products),
    ("chemicals.csv", df_chemicals),
    ("relations.csv", df_relations),
):
    frame.to_csv(csv_name, index=False)

print("✅ Semua file berhasil disimpan!")



✅ Semua file berhasil disimpan!


In [None]:
# Read the three tables back from the CSV files written earlier.
df_products, df_chemicals, df_relations = (
    pd.read_csv(csv_name)
    for csv_name in ("products.csv", "chemicals.csv", "relations.csv")
)

# Report the row count of each table.
print("Jumlah data:")
for label, frame in (
    ("Products:", df_products),
    ("Chemicals:", df_chemicals),
    ("Relations:", df_relations),
):
    print(label, len(frame))

Jumlah data:
Products: 1500
Chemicals: 2000
Relations: 3500


In [None]:
# Show the column names of the products table.
print(df_products.columns)


Index(['product_id', 'product_name', 'brand', 'hero_ingredients', 'category',
       'target_skin_type', 'key_ingredients', 'effectiveness_score',
       'safety_score', 'popularity_index', 'description'],
      dtype='object')


In [None]:
def _product_sentence(row):
    """Compose the descriptive text for a single product row."""
    pieces = [
        f"{row['product_name']}. ",
        f"Kategori {row['category']}. ",
        f"Cocok untuk kulit {row['target_skin_type']}. ",
        f"Mengandung {row['key_ingredients']}. ",
        f"Efektivitas {row['effectiveness_score']} dan keamanan {row['safety_score']}. ",
        f"{row['description']}",
    ]
    return "".join(pieces)


# One descriptive text per product, stored in a new 'text' column.
df_products['text'] = df_products.apply(_product_sentence, axis=1)


In [None]:
# Load the sentence-embedding model named "all-mpnet-base-v2"; later cells
# reuse this `model` object.
model = SentenceTransformer("all-mpnet-base-v2")

# Plain Python list of the product texts to encode.
texts = df_products["text"].tolist()

# Encode all product texts; convert_to_numpy=True requests a NumPy array.
embeddings = model.encode(texts, convert_to_numpy=True)
# Store each row's vector as a Python list in a new 'embedding' column.
df_products["embedding"] = embeddings.tolist()

# The recorded output of this cell shows shape (1500, 768).
print("Vector shape:", embeddings.shape)


Vector shape: (1500, 768)


In [None]:
def _chemical_sentence(row):
    """Compose the descriptive text for a single compound row."""
    pieces = [
        f"{row['compound_name']} dengan formula {row['molecular_formula']}. ",
        f"Berat molekul {row['molecular_weight']}. ",
        f"Reaktivitas {row['reactivity_score']}, toksisitas {row['toxicity_level']}. ",
        f"Larut dalam {row['solubility']}. ",
        f"Berfungsi sebagai {row['function']} dan berasal dari sumber {row['origin']}. ",
        f"Stabilitas {row['stability_index']}, absorpsi kulit {row['skin_absorption_rate']}, pH {row['ph_value']}.",
    ]
    return "".join(pieces)


# One descriptive text per compound, stored in a new 'text' column.
df_chemicals['text'] = df_chemicals.apply(_chemical_sentence, axis=1)


In [None]:
# Encode every compound text with the `model` loaded in an earlier cell;
# convert_to_numpy=True requests a NumPy array.
chemical_embeddings = model.encode(df_chemicals["text"].tolist(), convert_to_numpy=True)
# Store each row's vector as a Python list in a new 'embedding' column.
df_chemicals["embedding"] = chemical_embeddings.tolist()


In [None]:
def _relation_sentence(row):
    """Compose the descriptive text for a single product-compound relation row."""
    pieces = [
        f"Relasi antara produk {row['product_id']} ",
        f"dan senyawa {row['compound_name']} ({row['compound_id']}). ",
        f"Persentase komposisi {row['percentage_in_formula']} persen. ",
        f"Berperan sebagai {row['role_in_product']}. ",
        f"Skor sinergi {row['synergy_score']}. ",
        f"Tipe interaksi {row['interaction_type']}. ",
        f"Potensi senyawa baru: {row['potential_new_compound']}.",
    ]
    return "".join(pieces)


# One descriptive text per relation, stored in a new 'text' column.
df_relations['text'] = df_relations.apply(_relation_sentence, axis=1)


In [None]:
# Encode every relation text with the `model` loaded in an earlier cell;
# convert_to_numpy=True requests a NumPy array.
relation_embeddings = model.encode(df_relations["text"].tolist(), convert_to_numpy=True)
# Store each row's vector as a Python list in a new 'embedding' column.
df_relations["embedding"] = relation_embeddings.tolist()

In [None]:
# Write the three tables, now including the 'text' and 'embedding' columns,
# to separate CSV files without the row index.
# NOTE(review): the 'embedding' cells hold Python lists, so they end up as
# text in the CSV and must be parsed back into vectors when reloaded.
# Consider a typed format if these files are consumed programmatically.
df_products.to_csv("products_with_embeddings.csv", index=False)
df_chemicals.to_csv("chemicals_with_embeddings.csv", index=False)
df_relations.to_csv("relations_with_embeddings.csv", index=False)
