# **1.0 Initial Setup**

In [1]:
# Import the necessary libraries
import numpy as np
import pandas as pd
import torch

from datetime import datetime
from torch.cuda.amp import autocast # automatic mixed precision
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm # progress bar
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Select the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print(f"GPU available: {torch.cuda.get_device_name(0)}")
else:
    print("GPU not available, using CPU instead.")

# Notebook-wide settings
pd.set_option("display.max_colwidth", 100)  # cap the displayed width of text cells
pd.set_option("display.max_rows", None)     # no limit on displayed rows
# Gradient scaler for mixed-precision training
# NOTE(review): no visible cell uses `scaler` (the notebook only runs inference) - confirm before relying on it
scaler = torch.cuda.amp.GradScaler()
start_time = datetime.now()                 # reference point for the execution-time report


GPU available: NVIDIA GeForce RTX 4050 Laptop GPU


# **2.0 Load Data**

In [2]:
# Read the preprocessed skincare review table from disk
skincare_reviews = pd.read_csv(
    "processed_data/skincare_reviews.csv",
    encoding="utf-8",
    low_memory=False,
)


In [3]:
# Preview the first 5 reviews, transposed so every column reads as a row
skincare_reviews.head(5).T


Unnamed: 0,0,1,2,3,4
product_id,P503052,P423688,P504045,P479125,P440650
product_name,Mini Flatter Me Digestive Enzyme Supplement,Daily Microfoliant Exfoliator,Daily Dose Bioretinol + Mineral SPF 40 with Bakuchiol,Mini Cloud Dew Oil-Free Gel Cream,Mini Retinol Youth Renewal Serum
brand_name,HUM Nutrition,Dermalogica,Supergoop!,Summer Fridays,Murad
average_rating,4.2643,4.696,4.7357,4.4613,4.6089
ingredients,"['Protease 3.0, Protease 4.5, Protease 6.0, Peptidase, Bromelain, Neutral Bacterial Protease, Pa...","['Microcrystalline Cellulose, Magnesium Oxide, Sodium Cocoyl Isethionate, Colloidal Oatmeal, Dis...","['Aqua/Water/Eau, Dicaprylyl Ether, Caprylic/Capric Triglyceride, Butyloctyl Salicylate, Polygly...","['Water/Aqua/Eau, Propanediol, Dicaprylyl Carbonate, Glycerin, Pentylene Glycol, Sodium Hyaluron...","['Water/Aqua/Eau, Cyclopentasiloxane, Isopropyl Palmitate, C14-22 Alcohols, Polymethylsilsesquio..."
price_usd,15.0,65.0,46.0,17.0,32.0
limited_edition,na,na,na,na,na
highlights,"['Vegan', 'Clean at Sephora', 'Without Phthalates', 'Without Parabens', 'Gluten Free']","['Vegan', 'Good for: Dullness/Uneven Texture', 'allure 2019 Best of Beauty Award Winner', 'Salic...","['Good for: Dullness/Uneven Texture', 'Good for: Loss of firmness', 'Clean at Sephora', 'Retinol...","['Best for Normal Skin', 'Plumping', 'Hydrating', 'Clean at Sephora', 'Good for: Dryness']","['Good for: Dark Circles', 'Good for: Anti-Aging', 'Retinol', 'Without Parabens', 'Gluten Free',..."
secondary_category,Wellness,Cleansers,Treatments,Moisturizers,Mini Size
tertiary_category,Beauty Supplements,Exfoliators,Face Serums,Moisturizers,na


# **3.0 Checking for Missing and Duplicate Data**

In [4]:
# Count the rows that exactly repeat an earlier row
skincare_reviews.duplicated(keep="first").sum()


0

In [5]:
# Count missing values per column; every count is expected to be 0
skincare_reviews.isna().sum()


product_id            0
product_name          0
brand_name            0
average_rating        0
ingredients           0
price_usd             0
limited_edition       0
highlights            0
secondary_category    0
tertiary_category     0
author_id             0
author_rating         0
review_text           0
dtype: int64

# **4.0 Sentiment-Calibrated Rating Adjustment (SCRA) via DeBERTa ABSA**

The fundamental concept of Decoding-enhanced BERT with Disentangled Attention (DeBERTa)

<img src="images/image_3.png" width="60%" height="60%">

**Image Source:** https://paperswithcode.com/method/deberta

## **4.1 DeBERTa Model Initialization for ABSA**

In [6]:
# Load the pretrained DeBERTa ABSA checkpoint: first its tokenizer, then the classifier
model_name = "yangheng/deberta-v3-large-absa-v1.1"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Move the classifier's weights onto the selected device
model = model.to(device)


In [7]:
# Thin Dataset wrapper so the review texts can be served through a DataLoader
class ReviewDataset(Dataset):
    """Expose an indexable collection of review texts as a PyTorch ``Dataset``."""

    def __init__(self, texts):
        """Store ``texts`` as given; the collection is referenced, not copied."""
        self.texts = texts

    def __len__(self):
        """Return the number of stored texts."""
        n_texts = len(self.texts)
        return n_texts

    def __getitem__(self, idx):
        """Return the text stored at position ``idx``."""
        text = self.texts[idx]
        return text

# Number of reviews passed to the model per forward pass
batch_size = 3

# Serve the review texts in their original row order (no shuffling)
review_dataset = ReviewDataset(skincare_reviews["review_text"].to_list())
review_loader = DataLoader(
    review_dataset,
    batch_size=batch_size,
    shuffle=False,
)


In [8]:
# Score one batch of review texts with the ABSA classifier
def analyze_sentiment_batch(review_texts):
    """Return the classifier's per-class probabilities for each text in the batch.

    The texts are tokenized together (padded to a common length, truncated to at
    most 512 tokens), moved to ``device`` and run through ``model`` without
    gradient tracking and under autocast. The logits are brought back to the CPU
    and soft-maxed over the last dimension.

    Returns a nested Python list: one list of probabilities per input text.
    """
    # Tokenize the whole batch at once and place every tensor on the model's device
    encoded = tokenizer(
        review_texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    )
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    # Inference only: no gradients, mixed precision enabled
    with torch.no_grad(), autocast():
        logits = model(**encoded).logits

    # Normalise the logits into probabilities on the CPU
    return torch.softmax(logits.detach().cpu(), dim=-1).tolist()


In [9]:
# Define thresholds for categorizing sentiments
thresholds = {
    # (lower bound, upper bound), where None represents negative/positive infinity
    "1": (None, 0.2),   # strong negative sentiment
    "2": (0.2, 0.4),    # negative sentiment
    "3": (0.4, 0.6),    # neutral sentiment
    "4": (0.6, 0.8),    # positive sentiment
    "5": (0.8, None),   # strong positive sentiment
}

# Refine the sentiment scores and categorize them
def refine_sentiment(scores):
    """Map class probabilities to a rating category string from "1" to "5".

    Parameters
    ----------
    scores : mapping
        Must provide the keys "positive" and "negative"; any other keys are ignored.

    Returns
    -------
    str
        The key of the `thresholds` bucket (lower bound exclusive, upper bound
        inclusive) that contains the overall score. If no bucket matches - which
        happens when the score is NaN, because every comparison with NaN is
        False - the neutral category "3" is returned.
    """
    # Calculate an overall sentiment score considering both positive and negative scores
    overall_score = (scores["positive"] - scores["negative"] + 1) / 2 # normalize to range [0, 1]

    # Determine category based on overall_score
    for category, (lower, upper) in thresholds.items():
        above_lower = lower is None or overall_score > lower
        within_upper = upper is None or overall_score <= upper
        if above_lower and within_upper:
            return category

    # Fall back to the neutral bucket. The fallback has to be one of the numeric
    # category strings: a non-numeric label cannot be converted to float later on.
    return "3"


In [10]:
# Collects one rating category per review, in the order the loader yields them
results = []

# Layout of the tqdm progress bar
custom_bar_format = "{desc}: {percentage:0.2f}%|{bar:20}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}{postfix}]"

# Score each batch with DeBERTa, then turn every probability list into a rating category
for batch in tqdm(review_loader, bar_format=custom_bar_format):
    predictions = analyze_sentiment_batch(batch)
    for p in predictions:
        # Positions 0 / 1 / 2 are read as the negative / neutral / positive probabilities
        class_scores = {"negative": p[0], "neutral": p[1], "positive": p[2]}
        results.append(refine_sentiment(class_scores))


100.00%|████████████████████| 17020/17020 [12:45<00:00, 22.23it/s]


In [11]:
# Update DataFrame with adjusted author ratings based on sentiment analysis
# `results` holds one category string per review in row order: the DataLoader was
# built from the review_text column with shuffle=False
skincare_reviews["adjusted_author_rating"] = results


In [12]:
# Preview the first 5 rows (transposed) after adding adjusted_author_rating
skincare_reviews.head(5).T


Unnamed: 0,0,1,2,3,4
product_id,P503052,P423688,P504045,P479125,P440650
product_name,Mini Flatter Me Digestive Enzyme Supplement,Daily Microfoliant Exfoliator,Daily Dose Bioretinol + Mineral SPF 40 with Bakuchiol,Mini Cloud Dew Oil-Free Gel Cream,Mini Retinol Youth Renewal Serum
brand_name,HUM Nutrition,Dermalogica,Supergoop!,Summer Fridays,Murad
average_rating,4.2643,4.696,4.7357,4.4613,4.6089
ingredients,"['Protease 3.0, Protease 4.5, Protease 6.0, Peptidase, Bromelain, Neutral Bacterial Protease, Pa...","['Microcrystalline Cellulose, Magnesium Oxide, Sodium Cocoyl Isethionate, Colloidal Oatmeal, Dis...","['Aqua/Water/Eau, Dicaprylyl Ether, Caprylic/Capric Triglyceride, Butyloctyl Salicylate, Polygly...","['Water/Aqua/Eau, Propanediol, Dicaprylyl Carbonate, Glycerin, Pentylene Glycol, Sodium Hyaluron...","['Water/Aqua/Eau, Cyclopentasiloxane, Isopropyl Palmitate, C14-22 Alcohols, Polymethylsilsesquio..."
price_usd,15.0,65.0,46.0,17.0,32.0
limited_edition,na,na,na,na,na
highlights,"['Vegan', 'Clean at Sephora', 'Without Phthalates', 'Without Parabens', 'Gluten Free']","['Vegan', 'Good for: Dullness/Uneven Texture', 'allure 2019 Best of Beauty Award Winner', 'Salic...","['Good for: Dullness/Uneven Texture', 'Good for: Loss of firmness', 'Clean at Sephora', 'Retinol...","['Best for Normal Skin', 'Plumping', 'Hydrating', 'Clean at Sephora', 'Good for: Dryness']","['Good for: Dark Circles', 'Good for: Anti-Aging', 'Retinol', 'Without Parabens', 'Gluten Free',..."
secondary_category,Wellness,Cleansers,Treatments,Moisturizers,Mini Size
tertiary_category,Beauty Supplements,Exfoliators,Face Serums,Moisturizers,na


## **4.2 Calibration of Author Rating**

Take the average of `author_rating` and `adjusted_author_rating` for each row in skincare_reviews to get the `calibrated_author_rating`.

**Formula:** $\dfrac{\text{author\_rating} + \text{adjusted\_author\_rating}}{2} = \text{calibrated\_author\_rating}$

**Example:** $\dfrac{5 + 3}{2} = 4$

However, I will ignore the rows where **review_text is "-"**, which means that the author did not write any review text; for those rows I **keep the original rating given by that author**.

This step is crucial to **maintain the integrity of the original ratings** where the text analysis would not provide meaningful sentiment insight.

In [13]:
# Average the author's own rating with the sentiment-derived rating
author_rating = skincare_reviews["author_rating"]
sentiment_rating = skincare_reviews["adjusted_author_rating"].astype(float)
averaged_rating = (author_rating + sentiment_rating) / 2

# Rows whose review text is "-" have nothing to analyse, so they keep the original author rating
has_no_review_text = skincare_reviews["review_text"] == "-"
skincare_reviews["calibrated_author_rating"] = np.where(has_no_review_text,
                                                        author_rating,
                                                        averaged_rating)


In [14]:
# Preview the first 5 rows (transposed) after adding calibrated_author_rating
skincare_reviews.head(5).T

Unnamed: 0,0,1,2,3,4
product_id,P503052,P423688,P504045,P479125,P440650
product_name,Mini Flatter Me Digestive Enzyme Supplement,Daily Microfoliant Exfoliator,Daily Dose Bioretinol + Mineral SPF 40 with Bakuchiol,Mini Cloud Dew Oil-Free Gel Cream,Mini Retinol Youth Renewal Serum
brand_name,HUM Nutrition,Dermalogica,Supergoop!,Summer Fridays,Murad
average_rating,4.2643,4.696,4.7357,4.4613,4.6089
ingredients,"['Protease 3.0, Protease 4.5, Protease 6.0, Peptidase, Bromelain, Neutral Bacterial Protease, Pa...","['Microcrystalline Cellulose, Magnesium Oxide, Sodium Cocoyl Isethionate, Colloidal Oatmeal, Dis...","['Aqua/Water/Eau, Dicaprylyl Ether, Caprylic/Capric Triglyceride, Butyloctyl Salicylate, Polygly...","['Water/Aqua/Eau, Propanediol, Dicaprylyl Carbonate, Glycerin, Pentylene Glycol, Sodium Hyaluron...","['Water/Aqua/Eau, Cyclopentasiloxane, Isopropyl Palmitate, C14-22 Alcohols, Polymethylsilsesquio..."
price_usd,15.0,65.0,46.0,17.0,32.0
limited_edition,na,na,na,na,na
highlights,"['Vegan', 'Clean at Sephora', 'Without Phthalates', 'Without Parabens', 'Gluten Free']","['Vegan', 'Good for: Dullness/Uneven Texture', 'allure 2019 Best of Beauty Award Winner', 'Salic...","['Good for: Dullness/Uneven Texture', 'Good for: Loss of firmness', 'Clean at Sephora', 'Retinol...","['Best for Normal Skin', 'Plumping', 'Hydrating', 'Clean at Sephora', 'Good for: Dryness']","['Good for: Dark Circles', 'Good for: Anti-Aging', 'Retinol', 'Without Parabens', 'Gluten Free',..."
secondary_category,Wellness,Cleansers,Treatments,Moisturizers,Mini Size
tertiary_category,Beauty Supplements,Exfoliators,Face Serums,Moisturizers,na


In [15]:
# Display the data types of each column
# adjusted_author_rating was assigned from a Python list of category strings,
# so it is expected to show up as object here
skincare_reviews.dtypes


product_id                   object
product_name                 object
brand_name                   object
average_rating              float64
ingredients                  object
price_usd                   float64
limited_edition              object
highlights                   object
secondary_category           object
tertiary_category            object
author_id                     int64
author_rating                 int64
review_text                  object
adjusted_author_rating       object
calibrated_author_rating    float64
dtype: object

**Note:** There is a mismatch between the data types of the `author_rating` and `adjusted_author_rating` columns. The `author_rating` column is an integer (`int64`), whereas `adjusted_author_rating` is an object. So, I will convert the `adjusted_author_rating` column to numeric format.

In [16]:
# Parse the rating category strings into numbers; an unparseable value raises
skincare_reviews["adjusted_author_rating"] = pd.to_numeric(
    skincare_reviews["adjusted_author_rating"],
    errors="raise",
)

# From here on, display text cells in full instead of truncating them
pd.set_option("display.max_colwidth", None)


In [17]:
# Show 2 example reviews the author rated 5 but the sentiment model rated 3 (calibrated_author_rating = 4)
is_five_star = skincare_reviews["author_rating"].eq(5)
is_neutral_sentiment = skincare_reviews["adjusted_author_rating"].eq(3)
skincare_reviews_filtered = skincare_reviews[is_five_star & is_neutral_sentiment]

example_columns = ["author_rating", "review_text", "adjusted_author_rating", "calibrated_author_rating"]
skincare_reviews_filtered[example_columns].head(2)


Unnamed: 0,author_rating,review_text,adjusted_author_rating,calibrated_author_rating
48,5,Great product I have been trying this cream and I have really liked the results. I got the small size .5 oz/ 15ml which I would say is a perfect size for traveling but definitely not for regular use. Since I am 32 I have been looking for more products that are for anti aging. This product has been great for dryness and uneven texture. I would recommend getting the bigger size even though it’s $120. I definitely think it’s worth the price.,3,4.0
501,5,"gifted by sunday riley for an honest review. i was really scared to use this product at first because i’ve heard of how strong it can be, but i’ve used it a couple times now and haven’t had strong reactions. it does burn a little but only the first time. now my skin feels smoother in the morning.",3,4.0


In [18]:
# Drop the "review_text" column as it is no longer needed after I calibrated the author ratings
# Rebinding with errors="ignore" (instead of inplace=True) keeps this cell safe to re-run:
# a second run finds the column already gone and leaves the frame unchanged instead of raising KeyError
skincare_reviews = skincare_reviews.drop(columns="review_text", errors="ignore")


In [46]:
# Display random rows to verify the changes in ratings
# A fixed random_state makes the same 10 rows appear on every run, so the displayed output is reproducible
skincare_reviews[["author_rating", "adjusted_author_rating", "calibrated_author_rating"]].sample(10, random_state=42)


Unnamed: 0,author_rating,adjusted_author_rating,calibrated_author_rating
24227,4,3,3.5
1919,4,5,4.5
4290,5,5,5.0
7053,5,5,5.0
15591,5,5,5.0
26009,5,5,5.0
19593,5,5,5.0
25089,5,5,5.0
18188,5,3,4.0
27210,5,5,5.0


In [20]:
# Display the distribution of calibrated author ratings, from the highest rating to the lowest
skincare_reviews["calibrated_author_rating"].value_counts().sort_index(ascending=False)


calibrated_author_rating
5.0    38205
4.5     7454
4.0     2053
3.5      943
3.0      734
2.5      564
2.0      468
1.5      376
1.0      263
Name: count, dtype: int64

# **5.0 Export Calibrated Data**

In [21]:
# Write the calibrated reviews to disk, leaving out the DataFrame index
skincare_reviews.to_csv(
    "processed_data/skincare_reviews_calibrated.csv",
    index=False,
    encoding="utf-8",
)


In [22]:
# Free up GPU memory
# Remove the notebook's `model` name first, then ask PyTorch to release the
# CUDA memory its caching allocator is holding but no longer using
del model
torch.cuda.empty_cache()


<blockquote style="background-color: yellow; color: black;"><strong>End of Part 2</strong></blockquote>


In [23]:
# Measure the wall-clock time elapsed since start_time was recorded
end_time = datetime.now()
execution_time = end_time - start_time

# Break the elapsed seconds down into hours, minutes and seconds
elapsed_seconds = execution_time.total_seconds()
hours, remainder = divmod(elapsed_seconds, 3600)
minutes, seconds = divmod(remainder, 60)

# Report start, end and duration
timestamp_format = "%Y-%m-%d %H:%M:%S"
print(f"Start Time      : {start_time.strftime(timestamp_format)}")
print(f"End Time        : {end_time.strftime(timestamp_format)}")
print(f"Execution Time  : {int(hours)}h {int(minutes)}m {int(seconds)}s")


Start Time      : 2024-04-24 06:24:03
End Time        : 2024-04-24 06:36:54
Execution Time  : 0h 12m 51s
