In [4]:
!pip install openpyxl
import openpyxl

Collecting openpyxl
  Downloading openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting et-xmlfile (from openpyxl)
  Downloading et_xmlfile-2.0.0-py3-none-any.whl.metadata (2.7 kB)
Downloading openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
Downloading et_xmlfile-2.0.0-py3-none-any.whl (18 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-2.0.0 openpyxl-3.1.5


In [5]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
from tqdm import tqdm
import os
import pickle
import json

# Load the labelled review dataset.
data = pd.read_excel('dataset1.xlsx')

# Encode the string categories as integer class ids; the fitted encoder is kept
# so predictions can be mapped back to category names.
label_encoder = LabelEncoder()
data['Category'] = label_encoder.fit_transform(data['Category'])
num_labels = len(label_encoder.classes_)

# Combine the input columns into a single space-separated text input.
# Missing cells are blanked first: astype(str) alone would turn NaN into the
# literal token "nan", whereas the inference code passes an empty string for a
# missing field, so training and inference inputs would disagree.
text_columns = ['Skin_Type', 'Product', 'Brand', 'Ingredients', 'Review_Cleaned']
data['input_text'] = data[text_columns].fillna('').astype(str).agg(' '.join, axis=1)

# Tokenization and preprocessing use BERT's own WordPiece tokenizer.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

class ProductReviewDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for sequence classification.

    Args:
        texts: Sequence of input strings (list, NumPy array or pandas Series).
        labels: Sequence of integer class ids aligned one-to-one with ``texts``.
        tokenizer: Hugging Face tokenizer, invoked as ``tokenizer(text, ...)``.
        max_length (int): Every item is truncated/padded to exactly this many tokens.

    Raises:
        ValueError: If ``texts`` and ``labels`` have different lengths.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        # Copy into plain lists so __getitem__ always indexes by position.
        # A pandas Series would otherwise be indexed by *label*, which raises
        # KeyError (or returns the wrong row) once the index has been shuffled
        # by a train/validation split.
        self.texts = list(texts)
        self.labels = list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(self.texts)} and {len(self.labels)})"
            )
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of (text, label) pairs."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize item ``idx`` and return its tensors.

        Returns:
            dict: ``input_ids`` and ``attention_mask`` (1-D, length ``max_length``)
            and ``labels`` (0-dim ``torch.long`` tensor).
        """
        # Calling the tokenizer directly is the documented entry point;
        # encode_plus is the legacy spelling of the same operation.
        encoding = self.tokenizer(
            self.texts[idx],
            add_special_tokens=True,
            max_length=self.max_length,
            truncation=True,
            padding='max_length',
            return_tensors='pt'
        )
        return {
            # return_tensors='pt' adds a leading batch axis of size 1; drop it
            # so the DataLoader can stack items into (batch, max_length).
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

# Seed torch's global RNG: it drives both the random initialisation of the
# newly added classification head and the shuffling order of the training
# DataLoader, so without it two runs of this cell are not comparable.
SEED = 42
torch.manual_seed(SEED)

# Hold out 20% of the rows as a validation set (reproducible via random_state).
train_texts, val_texts, train_labels, val_labels = train_test_split(
    data['input_text'].values,
    data['Category'].values,
    test_size=0.2,
    random_state=SEED
)

# Create DataLoaders; only the training loader is shuffled.
batch_size = 16
train_dataset = ProductReviewDataset(train_texts, train_labels, tokenizer)
val_dataset = ProductReviewDataset(val_texts, val_labels, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Load pretrained BERT with a classification head sized for our categories.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)
model = model.to(device)

# Optimizer (constant learning rate; no LR scheduler is used).
optimizer = AdamW(model.parameters(), lr=1e-5)

# Training function
def train_epoch(model, data_loader, optimizer, device):
    """Run one optimisation pass over ``data_loader``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=..., labels=...)``
            whose output exposes ``.loss`` and ``.logits``.
        data_loader: Iterable of batches; each batch is a dict with
            ``input_ids``, ``attention_mask`` and ``labels`` tensors.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.

    Returns:
        tuple: ``(accuracy, mean_loss)`` where ``accuracy`` is a 0-dim float64
        tensor on ``device`` and ``mean_loss`` is the mean of the per-batch
        losses. Both are NaN when the loader yields no batches.
    """
    model.train()
    losses = []
    # Tensor accumulator so the return type is the same even for an empty loader
    # (an int 0 has no .double()).
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)
    # Count what was actually processed: len(data_loader.dataset) overstates the
    # denominator when the loader uses drop_last or a subset sampler.
    samples_seen = 0

    for batch in tqdm(data_loader, desc="Training", unit="batch"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        preds = outputs.logits.argmax(dim=1)

        correct_predictions += (preds == labels).sum()
        samples_seen += labels.size(0)
        losses.append(loss.item())

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    if samples_seen == 0:
        # Nothing was processed: report "undefined" instead of crashing.
        return (
            torch.full((), float("nan"), dtype=torch.float64, device=device),
            np.float64("nan"),
        )

    return correct_predictions.double() / samples_seen, np.mean(losses)

# Evaluation function
def eval_model(model, data_loader, device):
    """Evaluate ``model`` on ``data_loader`` without updating any weights.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=..., labels=...)``
            whose output exposes ``.loss`` and ``.logits``.
        data_loader: Iterable of batches; each batch is a dict with
            ``input_ids``, ``attention_mask`` and ``labels`` tensors.
        device: Device the batch tensors are moved to.

    Returns:
        tuple: ``(accuracy, mean_loss)`` where ``accuracy`` is a 0-dim float64
        tensor on ``device`` and ``mean_loss`` is the mean of the per-batch
        losses. Both are NaN when the loader yields no batches.
    """
    model.eval()
    # Tensor accumulator so the return type is the same even for an empty loader
    # (an int 0 has no .double()).
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)
    # Count what was actually processed: len(data_loader.dataset) overstates the
    # denominator when the loader uses drop_last or a subset sampler.
    samples_seen = 0
    losses = []

    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Validation", unit="batch"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            preds = outputs.logits.argmax(dim=1)

            correct_predictions += (preds == labels).sum()
            samples_seen += labels.size(0)
            losses.append(outputs.loss.item())

    if samples_seen == 0:
        # Nothing was processed: report "undefined" instead of crashing.
        return (
            torch.full((), float("nan"), dtype=torch.float64, device=device),
            np.float64("nan"),
        )

    return correct_predictions.double() / samples_seen, np.mean(losses)

# Create the output directory that receives every artifact the inference code loads.
os.makedirs('model_artifacts', exist_ok=True)
best_weights_path = 'model_artifacts/best_model_weights.pth'

# Training loop
epochs = 1
# Start below any reachable accuracy so the first epoch is always checkpointed,
# even when its validation accuracy is exactly 0 (a threshold of 0 would skip it
# and leave the inference code without any artifacts to load).
best_val_acc = float('-inf')
checkpoint_saved = False

for epoch in range(epochs):
    print(f'Epoch {epoch + 1}/{epochs}')
    train_acc, train_loss = train_epoch(model, train_loader, optimizer, device)
    print(f'Train loss: {train_loss} | Train accuracy: {train_acc}')

    val_acc, val_loss = eval_model(model, val_loader, device)
    print(f'Validation loss: {val_loss} | Validation accuracy: {val_acc}')

    # Save the model if it's the best so far
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        checkpoint_saved = True

        # Save model weights
        torch.save(model.state_dict(), best_weights_path)

        # Save tokenizer
        tokenizer.save_pretrained('model_artifacts/tokenizer')

        # Save label encoder
        with open('model_artifacts/label_encoder.pkl', 'wb') as f:
            pickle.dump(label_encoder, f)

        # Save model configuration. max_length is read from the training
        # dataset so the recorded value cannot drift from what was really used.
        model_config = {
            'max_length': train_dataset.max_length,
            'num_labels': num_labels,
            'model_name': 'bert-base-uncased'
        }
        with open('model_artifacts/model_config.json', 'w') as f:
            json.dump(model_config, f)

# Model evaluation on validation set.
# Report on the checkpoint that was actually saved as "best", not on whatever
# weights the last epoch happened to leave in memory.
if checkpoint_saved:
    model.load_state_dict(
        torch.load(best_weights_path, map_location=device, weights_only=True)
    )
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        preds = outputs.logits.argmax(dim=1)

        all_preds.extend(preds.cpu().tolist())
        all_labels.extend(labels.cpu().tolist())

# NOTE(review): the recorded run reports validation accuracy 1.0 after a single
# epoch. The input text includes the product name, which may reveal the category
# directly -- confirm there is no label leakage or duplicated rows across the split.
print("Classification Report:")
# Passing `labels` explicitly keeps target_names aligned with the class ids even
# when a rare class is absent from the validation split.
print(classification_report(
    all_labels,
    all_preds,
    labels=np.arange(num_labels),
    target_names=label_encoder.classes_
))



To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/1


Training: 100%|██████████| 433/433 [41:32<00:00,  5.76s/batch]


Train loss: 0.3275000738586551 | Train accuracy: 0.8816302933949993


Validation: 100%|██████████| 109/109 [02:32<00:00,  1.40s/batch]


Validation loss: 0.01212782291011936 | Validation accuracy: 1.0
Classification Report:
              precision    recall  f1-score   support

    Cleanser       1.00      1.00      1.00       818
   Face Mask       1.00      1.00      1.00         7
 Moisturizer       1.00      1.00      1.00       517
   Treatment       1.00      1.00      1.00       388

    accuracy                           1.00      1730
   macro avg       1.00      1.00      1.00      1730
weighted avg       1.00      1.00      1.00      1730



In [6]:
import torch
import json
import pickle
from transformers import BertTokenizer, BertForSequenceClassification

class SkincareReviewPredictor:
    """Predict a skincare product category with the fine-tuned BERT classifier.

    The predictor is rebuilt entirely from the saved artifacts: the model
    configuration (base model name, number of labels, max sequence length),
    the tokenizer, the fitted label encoder and the trained weights.
    """

    def __init__(self,
                 model_weights_path='model_artifacts/best_model_weights.pth',
                 config_path='model_artifacts/model_config.json',
                 tokenizer_path='model_artifacts/tokenizer',
                 label_encoder_path='model_artifacts/label_encoder.pkl'):
        """
        Initialize the predictor with saved model artifacts

        Args:
            model_weights_path (str): Path to saved model weights
            config_path (str): Path to model configuration file
            tokenizer_path (str): Path to saved tokenizer
            label_encoder_path (str): Path to saved label encoder
        """
        # Determine device
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Load model configuration
        with open(config_path, 'r') as f:
            self.model_config = json.load(f)

        # Load label encoder.
        # SECURITY: pickle.load can execute arbitrary code; only point this at
        # a label_encoder.pkl you produced yourself.
        with open(label_encoder_path, 'rb') as f:
            self.label_encoder = pickle.load(f)

        # Load tokenizer
        self.tokenizer = BertTokenizer.from_pretrained(tokenizer_path)

        # Build the architecture named in the saved configuration (falling back
        # to the original default for configs that lack the key), then
        # overwrite its weights with the fine-tuned ones.
        model_name = self.model_config.get('model_name', 'bert-base-uncased')
        self.model = BertForSequenceClassification.from_pretrained(
            model_name,
            num_labels=self.model_config['num_labels']
        )
        # weights_only=True restricts unpickling to tensors and plain
        # containers, which is all a state_dict contains.
        state_dict = torch.load(
            model_weights_path,
            map_location=self.device,
            weights_only=True
        )
        self.model.load_state_dict(state_dict)
        self.model = self.model.to(self.device)
        self.model.eval()

    def preprocess_input(self,
                          skin_type='',
                          product='',
                          brand='',
                          ingredients='',
                          review='',
                          max_length=None):
        """
        Preprocess input features into a single text input

        Args:
            skin_type (str): Skin type description
            product (str): Product name
            brand (str): Brand name
            ingredients (str): Product ingredients
            review (str): Product review text
            max_length (int | None): Maximum sequence length for tokenization.
                ``None`` (default) uses the ``max_length`` stored in the saved
                model configuration, or 128 if the configuration has none.

        Returns:
            dict: ``input_ids`` and ``attention_mask`` tensors of shape
            ``(1, max_length)`` on the predictor's device
        """
        if max_length is None:
            # Tokenize exactly as recorded at training time instead of relying
            # on a second hard-coded copy of the value.
            max_length = self.model_config.get('max_length', 128)

        # Combine input features into a single text
        input_text = f"{skin_type} {product} {brand} {ingredients} {review}".strip()

        # Tokenize input (calling the tokenizer directly is the documented
        # entry point; encode_plus is the legacy spelling).
        encoding = self.tokenizer(
            input_text,
            add_special_tokens=True,
            max_length=max_length,
            truncation=True,
            padding='max_length',
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].to(self.device),
            'attention_mask': encoding['attention_mask'].to(self.device)
        }

    def predict_category(self, **kwargs):
        """
        Predict category for given input

        Args:
            **kwargs: Keyword arguments for input features, forwarded unchanged
                to ``preprocess_input``

        Returns:
            str: Predicted category
        """
        # Preprocess input
        processed_input = self.preprocess_input(**kwargs)

        # Make prediction
        with torch.no_grad():
            outputs = self.model(
                input_ids=processed_input['input_ids'],
                attention_mask=processed_input['attention_mask']
            )
            preds = outputs.logits.argmax(dim=1)

        # Convert the predicted class id back to its original category name
        predicted_category = self.label_encoder.inverse_transform(preds.cpu().numpy())[0]

        return predicted_category

    def get_available_categories(self):
        """
        Return list of available categories

        Returns:
            list: Available categories
        """
        return list(self.label_encoder.classes_)

# Example usage demonstration
def main():
    """Interactive console demo: read product details and print the predicted category.

    The loop ends when every field is left blank or when the user answers
    anything other than "yes"/"y" to the continue prompt.
    """
    # Initialize the predictor
    predictor = SkincareReviewPredictor()

    # Print available categories
    print("Available Categories:")
    print(predictor.get_available_categories())

    # Interactive prediction loop
    while True:
        print("\n--- Skincare Product Category Prediction ---")

        # Get user inputs. Surrounding whitespace is stripped so that an answer
        # consisting only of spaces counts as "not provided".
        skin_type = input("Enter Skin Type (optional): ").strip()
        product = input("Enter Product Name (optional): ").strip()
        brand = input("Enter Brand Name (optional): ").strip()
        ingredients = input("Enter Ingredients (optional): ").strip()
        review = input("Enter Review Text (optional): ").strip()

        # Skip if no input provided
        if not any([skin_type, product, brand, ingredients, review]):
            print("No input provided. Exiting...")
            break

        try:
            # Predict category
            predicted_category = predictor.predict_category(
                skin_type=skin_type,
                product=product,
                brand=brand,
                ingredients=ingredients,
                review=review
            )

            print(f"\nPredicted Category: {predicted_category}")

        except Exception as e:
            # Deliberately broad: one failed prediction should not end the
            # interactive session.
            print(f"An error occurred: {e}")

        # Ask if user wants to continue; accept "y" and tolerate stray spaces so
        # a slightly different affirmative answer does not end the session.
        continue_prediction = input("\nDo you want to predict another category? (yes/no): ").strip().lower()
        if continue_prediction not in ('yes', 'y'):
            break

# Start the interactive demo when this code is executed directly (a script run
# or a notebook cell, where __name__ is "__main__"); importing it as a module
# does not start the loop.
if __name__ == "__main__":
    main()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Available Categories:
['Cleanser', 'Face Mask', 'Moisturizer', 'Treatment']

--- Skincare Product Category Prediction ---

Predicted Category: Moisturizer
