In [None]:
import os, pickle
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

from transformers import AutoTokenizer, AutoConfig, AutoModel

In [None]:
# Training configuration shared by the cells below.
MODEL_NAME = "bert-base-uncased"   # can be swapped for a smaller/larger checkpoint depending on the GPU
MAX_LEN = 128       # max token length passed to the tokenizer
BATCH_SIZE = 8      # batch size for the train/val DataLoaders
EPOCHS = 10         # number of training epochs
LR = 2e-5           # AdamW learning rate
EMBED_DIM = 128     # output size of the embedding head
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
SEED = 42
# Seed NumPy and PyTorch; SEED is also passed to train_test_split as random_state.
np.random.seed(SEED)
torch.manual_seed(SEED)

print("Device:", DEVICE)

Device: cuda


In [None]:
# Load the product catalogue.
df = pd.read_csv("skincare_dataset.csv")
print("Rows:", len(df))

# Columns read by this cell:
#   text fields : product_name, brand, short_description, what_is_it,
#                 what_does_it_do, ingredients_list
#   label field : Label
#   skin flags  : Combination, Dry, Normal, Oily, Sensitive (only those present are kept)
skin_cols = [c for c in ["Combination","Dry","Normal","Oily","Sensitive"] if c in df.columns]
print("Skin columns used:", skin_cols)

# Text field used for training the embedding and the classifier:
# the descriptive columns joined with single spaces, missing values treated as "".
df["text"] = (
    df["product_name"].fillna("") + " " +
    df["brand"].fillna("") + " " +
    df["short_description"].fillna("") + " " +
    df["what_is_it"].fillna("") + " " +
    df["what_does_it_do"].fillna("") + " " +
    df["ingredients_list"].fillna("")
)

# Map Label to a simpler string for matching: missing -> "", cast to str, lowercased.
# (The lowercasing was documented but previously not applied.)
df["Label_clean"] = df["Label"].fillna("").astype(str).str.lower()


Rows: 1098
Skin columns used: ['Combination', 'Dry', 'Normal', 'Oily', 'Sensitive']


In [None]:
# Without at least one skin-type column there is nothing to predict.
if not skin_cols:
    raise ValueError("Dataset must contain skin type binary columns.")

# Multi-hot target matrix: one row per product, one column per entry of skin_cols.
y = df[skin_cols].astype(int).to_numpy()
texts = df["text"].tolist()

# Split texts, targets and the original row indices together so they stay aligned.
(
    train_texts, val_texts,
    y_train, y_val,
    train_idx, val_idx,
) = train_test_split(
    texts,
    y,
    df.index.tolist(),
    test_size=0.15,
    random_state=SEED,
)

In [None]:
# Tokenizer loaded from the same checkpoint name (MODEL_NAME) as the encoder below.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

class BertEmbedForMultiLabel(nn.Module):
    """Pretrained encoder with two heads: multi-label logits and a unit-norm embedding.

    Args:
        model_name: checkpoint name passed to ``AutoModel.from_pretrained``.
        num_labels: number of output logits of the classifier head.
        embed_dim: size of the embedding produced by the retrieval head.
    """

    def __init__(self, model_name, num_labels, embed_dim=EMBED_DIM):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        hidden = self.bert.config.hidden_size
        # classifier (multi-label)
        self.classifier = nn.Sequential(
            nn.Linear(hidden, 256),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(256, num_labels)
        )
        # embedding head for retrieval
        self.embed_head = nn.Sequential(
            nn.Linear(hidden, 256),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(256, embed_dim)
        )

    def forward(self, input_ids, attention_mask):
        """Encode a batch and return ``(logits, embedding)``.

        ``logits`` are raw scores (suitable for BCEWithLogitsLoss); ``embedding``
        is L2-normalised along dim 1.
        """
        out = self.bert(input_ids=input_ids, attention_mask=attention_mask, return_dict=True)
        # Prefer the encoder's pooled output. Encoders that expose none (attribute
        # missing or None) fall back to the first-token hidden state, so a different
        # MODEL_NAME does not crash here.
        pooled = getattr(out, "pooler_output", None)
        if pooled is None:
            pooled = out.last_hidden_state[:, 0]
        logits = self.classifier(pooled)            # raw logits for BCEWithLogits
        embedding = self.embed_head(pooled)         # raw embedding
        embedding = nn.functional.normalize(embedding, p=2, dim=1)  # normalize
        return logits, embedding

# One classifier output per column of the target matrix y.
num_labels = y.shape[1]
model = BertEmbedForMultiLabel(MODEL_NAME, num_labels, EMBED_DIM)
# Move the model to DEVICE; as the cell's last expression this also displays the module repr.
model.to(DEVICE)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

BertEmbedForMultiLabel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, el

In [None]:
# 6) Dataset class
class SkinProductDataset(Dataset):
    """Dataset pairing each product text with its multi-label target row.

    Texts are tokenised on access; each sample is truncated/padded to
    ``max_len`` tokens and returned as a dict of tensors plus a float
    ``"labels"`` tensor.
    """

    def __init__(self, texts, labels, tokenizer, max_len=MAX_LEN):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoded = self.tokenizer(
            self.texts[idx],
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
        )
        # Drop dim 0 of every tokenizer tensor for this single text.
        sample = {}
        for key, tensor in encoded.items():
            sample[key] = tensor.squeeze(0)
        sample["labels"] = torch.tensor(self.labels[idx], dtype=torch.float)
        return sample

# Wrap both splits; only the training loader reshuffles.
train_ds = SkinProductDataset(texts=train_texts, labels=y_train, tokenizer=tokenizer)
val_ds = SkinProductDataset(texts=val_texts, labels=y_val, tokenizer=tokenizer)

train_dl = DataLoader(train_ds, shuffle=True, batch_size=BATCH_SIZE)
val_dl = DataLoader(val_ds, shuffle=False, batch_size=BATCH_SIZE)

In [None]:
# AdamW over every model parameter (encoder and both heads) at learning rate LR.
optim = AdamW(model.parameters(), lr=LR)
# Multi-label objective: takes the classifier's raw logits.
criterion = nn.BCEWithLogitsLoss()

In [None]:
def evaluate_val():
    """Score the current model on the validation loader.

    Returns:
        tuple: ``(macro_f1, probs, targets)`` where ``probs`` are the stacked
        sigmoid outputs, ``targets`` the stacked label rows, and ``macro_f1``
        is computed on ``probs >= 0.5``.
    """
    model.eval()
    prob_chunks, target_chunks = [], []

    with torch.no_grad():
        for batch in val_dl:
            logits, _ = model(
                input_ids=batch["input_ids"].to(DEVICE),
                attention_mask=batch["attention_mask"].to(DEVICE),
            )
            prob_chunks.append(torch.sigmoid(logits).cpu().numpy())
            target_chunks.append(batch["labels"].cpu().numpy())

    probs = np.vstack(prob_chunks)
    targets = np.vstack(target_chunks)

    # Fixed 0.5 cut-off turns probabilities into hard multi-label predictions.
    hard_preds = (probs >= 0.5).astype(int)
    macro_f1 = f1_score(targets, hard_preds, average="macro", zero_division=0)

    return macro_f1, probs, targets


# Training loop with best-checkpoint tracking.
#
# best_f1   : highest validation macro-F1 seen so far (any epoch).
# threshold : minimum macro-F1 a checkpoint must reach to be written to disk.
#
# "best_model.pt" is only overwritten when an epoch BOTH improves on best_f1 and
# clears the threshold, so a later, worse epoch can no longer replace a better
# checkpoint.
best_f1 = -1.0
threshold = 0.70

for epoch in range(EPOCHS):
    model.train()
    total_loss = 0.0

    for batch in tqdm(train_dl, desc=f"Train epoch {epoch+1}"):
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        labels = batch["labels"].to(DEVICE)

        optim.zero_grad()
        # Only the classifier logits enter the loss; the embedding output is discarded here.
        logits, _ = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(logits, labels)
        loss.backward()
        optim.step()

        # Weight by batch size so the epoch average is per-sample.
        total_loss += loss.item() * input_ids.size(0)

    avg_loss = total_loss / len(train_dl.dataset)
    f1_val, _, _ = evaluate_val()

    print(f"\nEpoch {epoch+1} | Train loss: {avg_loss:.4f} | Val F1-macro: {f1_val:.4f}")

    if f1_val > best_f1:
        # New best score: always record it so the final report is accurate.
        best_f1 = f1_val
        if f1_val >= threshold:
            torch.save(model.state_dict(), "best_model.pt")
            print(f"MODEL OK — F1 {f1_val:.4f} >= {threshold}. Saved.")
        else:
            print(f"F1 {f1_val:.4f} < {threshold} — model NOT saved.")
    else:
        print(f"F1 {f1_val:.4f} did not improve on best {best_f1:.4f} — model NOT saved.")


if best_f1 < threshold:
    raise ValueError(
        f"Training FAILED: best F1 = {best_f1:.4f} (< {threshold}). "
        "Model not good enough — adjust hyperparameters or retry training."
    )

# Weights after the last epoch (may differ from the best checkpoint above).
torch.save(model.state_dict(), "final_bert_hybrid.pt")
print("Training finished. Best val F1:", best_f1)

Train epoch 1:   0%|          | 0/117 [00:00<?, ?it/s]


Epoch 1 | Train loss: 0.6506 | Val F1-macro: 0.7296
MODEL OK — F1 0.7296 >= 0.7. Saved.


Train epoch 2:   0%|          | 0/117 [00:00<?, ?it/s]


Epoch 2 | Train loss: 0.6457 | Val F1-macro: 0.7285
MODEL OK — F1 0.7285 >= 0.7. Saved.


Train epoch 3:   0%|          | 0/117 [00:00<?, ?it/s]


Epoch 3 | Train loss: 0.6406 | Val F1-macro: 0.7241
MODEL OK — F1 0.7241 >= 0.7. Saved.


Train epoch 4:   0%|          | 0/117 [00:00<?, ?it/s]


Epoch 4 | Train loss: 0.6106 | Val F1-macro: 0.7065
MODEL OK — F1 0.7065 >= 0.7. Saved.


Train epoch 5:   0%|          | 0/117 [00:00<?, ?it/s]


Epoch 5 | Train loss: 0.5677 | Val F1-macro: 0.7276
MODEL OK — F1 0.7276 >= 0.7. Saved.


Train epoch 6:   0%|          | 0/117 [00:00<?, ?it/s]


Epoch 6 | Train loss: 0.5052 | Val F1-macro: 0.7281
MODEL OK — F1 0.7281 >= 0.7. Saved.


Train epoch 7:   0%|          | 0/117 [00:00<?, ?it/s]


Epoch 7 | Train loss: 0.4387 | Val F1-macro: 0.7410
MODEL OK — F1 0.7410 >= 0.7. Saved.


Train epoch 8:   0%|          | 0/117 [00:00<?, ?it/s]


Epoch 8 | Train loss: 0.3744 | Val F1-macro: 0.7757
MODEL OK — F1 0.7757 >= 0.7. Saved.


Train epoch 9:   0%|          | 0/117 [00:00<?, ?it/s]


Epoch 9 | Train loss: 0.3016 | Val F1-macro: 0.7487
MODEL OK — F1 0.7487 >= 0.7. Saved.


Train epoch 10:   0%|          | 0/117 [00:00<?, ?it/s]


Epoch 10 | Train loss: 0.2731 | Val F1-macro: 0.7758
MODEL OK — F1 0.7758 >= 0.7. Saved.
Training finished. Best val F1: 0.7758350817464837


In [None]:
import os

# Confirm that both checkpoint files were written by the training cell.
checkpoint_report = {
    "best_model.pt:": os.path.exists("best_model.pt"),
    "final_bert_hybrid.pt:": os.path.exists("final_bert_hybrid.pt"),
}
for report_label, is_present in checkpoint_report.items():
    print(report_label, is_present)

best_model.pt: True
final_bert_hybrid.pt: True


# Build product embeddings

In [None]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel

Device: cuda
Model loaded successfully.


Building embeddings: 100%|██████████| 35/35 [00:11<00:00,  3.12it/s]

Embeddings shape: (1098, 128)

Saved:
- product_embeddings.npy
- product_index.pkl
- tokenizer_saved/





In [None]:
# Configuration for the embedding-build stage.
MODEL_NAME = "bert-base-uncased"
MAX_LEN = 128       # max token length passed to the tokenizer
EMBED_DIM = 128     # output size of the embedding head
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print("Device:", DEVICE)

In [None]:
df = pd.read_csv("skincare_dataset.csv")

# Join the descriptive columns with single spaces, treating missing values as "".
text_parts = [
    df[col].fillna("")
    for col in (
        "product_name",
        "brand",
        "short_description",
        "what_is_it",
        "what_does_it_do",
        "ingredients_list",
    )
]
combined = text_parts[0]
for part in text_parts[1:]:
    combined = combined + " " + part
df["text"] = combined

# Keep only the skin-type columns that exist in the file; their count sizes the classifier head.
skin_cols = [
    c for c in ["Combination","Dry","Normal","Oily","Sensitive"]
    if c in df.columns
]
num_labels = len(skin_cols)

In [None]:
class BertEmbedForMultiLabel(nn.Module):
    """Pretrained encoder with two heads: multi-label logits and a unit-norm embedding.

    Args:
        model_name: checkpoint name passed to ``AutoModel.from_pretrained``.
        num_labels: number of output logits of the classifier head.
        embed_dim: size of the embedding produced by the retrieval head.
    """

    def __init__(self, model_name, num_labels, embed_dim=128):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        hidden = self.bert.config.hidden_size

        # Multi-label classification head.
        self.classifier = nn.Sequential(
            nn.Linear(hidden, 256),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(256, num_labels)
        )

        # Embedding head used for retrieval.
        self.embed_head = nn.Sequential(
            nn.Linear(hidden, 256),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(256, embed_dim)
        )

    def forward(self, input_ids, attention_mask):
        """Encode a batch and return ``(logits, emb)``.

        ``logits`` are raw classifier scores; ``emb`` is L2-normalised along dim 1.
        """
        out = self.bert(input_ids=input_ids, attention_mask=attention_mask, return_dict=True)
        # Prefer the encoder's pooled output. Encoders that expose none (attribute
        # missing or None) fall back to the first-token hidden state, so a different
        # MODEL_NAME does not crash here.
        pooled = getattr(out, "pooler_output", None)
        if pooled is None:
            pooled = out.last_hidden_state[:, 0]
        logits = self.classifier(pooled)
        emb = self.embed_head(pooled)
        emb = nn.functional.normalize(emb, p=2, dim=1)
        return logits, emb

In [None]:
# Rebuild the tokenizer and the network, then restore the checkpoint saved during training.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

model = BertEmbedForMultiLabel(MODEL_NAME, num_labels, EMBED_DIM)
model = model.to(DEVICE)
model.load_state_dict(
    torch.load("best_model.pt", map_location=DEVICE)
)

# Switch to evaluation mode before embeddings are computed.
model.eval()
print("Model loaded successfully.")

In [None]:
# Embed every product text in fixed-size batches, without gradient tracking.
all_texts = df["text"].tolist()
batch_size = 32
emb_list = []

for i in tqdm(range(0, len(all_texts), batch_size), desc="Building embeddings"):
    batch = all_texts[i:i + batch_size]
    enc = tokenizer(
        batch,
        return_tensors="pt",
        max_length=MAX_LEN,
        truncation=True,
        padding=True,
    )
    input_ids = enc["input_ids"].to(DEVICE)
    att = enc["attention_mask"].to(DEVICE)

    with torch.no_grad():
        # Only the embedding output is kept; the logits are ignored here.
        _, emb = model(input_ids=input_ids, attention_mask=att)
    emb_list.append(emb.cpu().numpy())

# Stack the per-batch arrays into a single matrix, one row per text.
embeddings = np.vstack(emb_list)
print("Embeddings shape:", embeddings.shape)

In [None]:
# Persist the embedding matrix, the product table (index reset) and the tokenizer files.
np.save("product_embeddings.npy", embeddings)
df.reset_index(drop=True).to_pickle("product_index.pkl")
tokenizer.save_pretrained("tokenizer_saved")

print(
    "\nSaved:",
    "- product_embeddings.npy",
    "- product_index.pkl",
    "- tokenizer_saved/",
    sep="\n",
)

In [None]:
# Bundle both checkpoints, the embedding matrix, the product index and the tokenizer directory into one archive.
!zip -r skincare_model_bundle.zip best_model.pt final_bert_hybrid.pt product_embeddings.npy product_index.pkl tokenizer_saved

  adding: best_model.pt (deflated 7%)
  adding: final_bert_hybrid.pt (deflated 7%)
  adding: product_embeddings.npy (deflated 7%)
  adding: product_index.pkl (deflated 88%)
  adding: tokenizer_saved/ (stored 0%)
  adding: tokenizer_saved/special_tokens_map.json (deflated 42%)
  adding: tokenizer_saved/vocab.txt (deflated 53%)
  adding: tokenizer_saved/tokenizer.json (deflated 71%)
  adding: tokenizer_saved/tokenizer_config.json (deflated 75%)


In [None]:
# Colab-only helper: this import is available only inside Google Colab.
from google.colab import files
# Hand the bundled archive to Colab's download helper.
files.download("skincare_model_bundle.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>