### Installs

In [1]:
!pip install vaderSentiment
!pip install setfit
!pip install openpyxl
!pip install xlsxwriter



In [2]:
from google.colab import drive
import os

# Mount Google Drive at /content/drive so the data files referenced below are reachable.
# `drive` comes from `google.colab`, so this cell only works inside a Colab runtime.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import numpy as np
import pandas as pd
import datetime

# For model training
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from setfit import SetFitModel, Trainer, TrainingArguments
from datasets import Dataset

# For model evaluation
from sklearn.metrics import (
    classification_report, confusion_matrix,
    mean_squared_error, mean_absolute_error, r2_score,
    roc_curve, auc, precision_recall_curve, roc_auc_score
)

# For visualization
from seaborn import heatmap
from matplotlib import pyplot as plt

# For export
import openpyxl
import xlsxwriter

import torch

pd.set_option('display.max_columns', None)

  return datetime.utcnow().replace(tzinfo=utc)


### Function definitions

In [4]:
def get_distributions(data, cols):
  """Show one distribution plot per requested column.

  Columns with dtype ``object`` are drawn as a bar chart of their value
  counts (ordered by category label); every other column is drawn as a
  histogram with the bin edges used as x ticks.

  Args:
    data: pandas DataFrame that contains the columns to plot.
    cols: iterable of column names present in ``data``.

  Returns:
    None. Every figure is displayed with ``plt.show()`` and then closed.
  """
  for var in cols:
    fig, ax = plt.subplots()

    if data[var].dtypes == "object":
      # Get value counts for the categorical variable
      counts = data[var].value_counts().sort_index()
      ax.bar(counts.index.astype(str), counts.values, edgecolor='black')
      plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    else:
      # Missing values are dropped first: the histogram range cannot be
      # computed when the column contains NaN.
      counts, bin_edges, _ = ax.hist(data[var].dropna(), edgecolor='black')
      ax.set_xticks(bin_edges)
      plt.setp(ax.get_xticklabels(), rotation=45)

    # Shared formatting so both plot types look alike
    ax.grid(True, axis='y', alpha=0.3, linestyle='--', linewidth=0.7)
    ax.set_axisbelow(True)
    ax.set_xlabel(var, fontsize=12)
    ax.set_ylabel('Frequency', fontsize=12)
    ax.set_title(f'Distribution of {var}', fontsize=14)

    fig.tight_layout()
    plt.show()
    # Close this specific figure so a long column list does not pile up open figures
    plt.close(fig)

In [5]:
def get_metrics_table(y_true, pred_dict):
  """Build one table with the classification report of every model output.

  Args:
    y_true: true labels, in the same order as each prediction in ``pred_dict``.
    pred_dict: mapping of model-output name -> predicted labels.

  Returns:
    pandas DataFrame with the columns ``model_output``, ``metrics``,
    ``negative``, ``neutral``, ``positive``, ``accuracy``, ``macro avg`` and
    ``weighted avg``, sorted by ``accuracy`` in ascending order. An empty
    ``pred_dict`` gives an empty table with those columns.
  """
  columns = ['model_output', 'metrics', 'negative', 'neutral', 'positive',
             'accuracy', 'macro avg', 'weighted avg']

  # Collect the per-model reports and concatenate once: growing a DataFrame
  # inside the loop is quadratic and concatenating onto an empty frame
  # triggers a pandas FutureWarning.
  reports = []
  for var_name, y_pred in pred_dict.items():
    report = (
        pd.DataFrame(classification_report(y_true, y_pred, output_dict=True))
        .reset_index()
        .rename(columns={'index': 'metrics'})
    )
    report['model_output'] = var_name
    reports.append(report)

  if not reports:
    return pd.DataFrame(columns=columns)

  # reindex keeps the fixed column order and adds NaN for any class column
  # that a report does not contain.
  full_metrics_table = pd.concat(reports).reindex(columns=columns)
  return full_metrics_table.sort_values(['accuracy'], ascending=True)

In [6]:
# For model evaluation - plotting confusion matrix with seaborn
def plot_confusion_matrix(y_true, y_pred, title, xticks = None, yticks= None, color="viridis",  annotation=True):
    """Draw the confusion matrix of ``y_true`` vs ``y_pred`` as a heatmap.

    Args:
        y_true: true labels.
        y_pred: predicted labels, in the same order as ``y_true``.
        title: title placed on the plot.
        xticks: tick labels for the predicted axis; ``None`` lets seaborn
            pick its default labels.
        yticks: tick labels for the true axis; ``None`` lets seaborn pick
            its default labels.
        color: colormap name handed to the heatmap.
        annotation: whether the cell counts are written into the cells.

    Returns:
        The matplotlib figure. It is closed before being returned so it is
        not rendered inline; callers can still use ``fig.savefig``.
    """
    # `is None` instead of `== None`: comparing an array of labels with `==`
    # yields an element-wise result whose truth value is ambiguous.
    if xticks is None:
      xticks = True
    if yticks is None:
      yticks = True
    cm = confusion_matrix(y_true, y_pred)
    fig, ax = plt.subplots()
    # Draw on the axes created above rather than on whatever is current
    heatmap(cm, annot=annotation, cmap=color, xticklabels=xticks, yticklabels=yticks, fmt='g', ax=ax)
    ax.set(xlabel="Predicted Value", ylabel="True Value")
    # Put the predicted-value axis on top of the matrix
    ax.xaxis.tick_top()
    ax.xaxis.set_label_position('top')
    ax.set_title(title)
    # Close only this figure; other open figures are left alone
    plt.close(fig)
    return fig

### Data import

In [7]:
# Data folder on the mounted Drive
# (earlier location, kept for reference: '/content/drive/MyDrive/Colab Notebooks/DermaLLM/data/')
data_dir = '/content/drive/MyDrive/Colab Notebooks/DermaLLM_old_backup/DermaLLM/data/'

# Plot images are dumped into the "plots" sub-folder of the data folder
plot_path = data_dir + 'plots/'

# Current date as MM_DD_YYYY, appended to the names of exported files
td_date = f"{datetime.datetime.now():%m_%d_%Y}"

In [8]:
# Import clean reviews table
# NOTE(review): unpickling can run arbitrary code, so only load pickle files from a trusted source.
reviews_df_clean = pd.read_pickle(data_dir + "reviews_data_clean_12_11_2025.pkl")
# Print the column names, then show the frame as the cell output
print(reviews_df_clean.columns)
reviews_df_clean

Index(['author_id', 'rating', 'is_recommended', 'submission_time',
       'review_text', 'review_title', 'skin_tone', 'eye_color', 'skin_type',
       'hair_color', 'product_id', 'product_name', 'brand_name', 'price_usd',
       'helpfulness', 'total_feedback_count', 'total_pos_feedback_count',
       'total_neg_feedback_count', 'unq_review_cnt_per_prod'],
      dtype='object')


  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)


Unnamed: 0,author_id,rating,is_recommended,submission_time,review_text,review_title,skin_tone,eye_color,skin_type,hair_color,product_id,product_name,brand_name,price_usd,helpfulness,total_feedback_count,total_pos_feedback_count,total_neg_feedback_count,unq_review_cnt_per_prod
0,538863,1,0.0,2018-11-01,One use and into the trash this went. I woke u...,one and done,fair,blue,combination,blonde,P420652,Lip Sleeping Mask Intense Hydration with Vitam...,LANEIGE,24.0,,0,0,0,10226
1,561736,5,1.0,2018-07-28,This is my nightly hero. It is the one facial...,Awesome!,light,blue,combination,blonde,P421998,Midnight Recovery Concentrate Moisturizing Fac...,Kiehl's Since 1851,56.0,0.833333,6,5,1,729
2,561736,5,1.0,2018-07-28,This is my nightly hero. It is the one facial...,Awesome!,light,blue,combination,blonde,P445951,Midnight Recovery Concentrate Moisturizing Fac...,Kiehl's Since 1851,30.0,0.833333,6,5,1,729
3,602980,5,1.0,2018-10-03,This has become a must have addition to my mor...,Brain Must!!,lightMedium,blue,combination,blonde,P423159,Brain Dust,Moon Juice,38.0,1.000000,1,1,0,37
4,696309,5,1.0,2019-06-17,"Well, I was a skeptic, but after trying a samp...",replacing my other serums,lightMedium,brown,dry,brown,P444222,Luxury Beauty Serum Calming Treatment,Saint Jane Beauty,125.0,1.000000,30,30,0,247
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
581780,orderGen270100,5,1.0,2019-02-28,I have dry sensitive combination skin and my f...,Can’t beat the price and you can see the resul...,fair,blue,combination,blonde,P427419,Hyaluronic Acid 2% + B5 Hydrating Serum,The Ordinary,15.7,,0,0,0,2351
581781,orderGen39837,5,1.0,2018-05-18,This is great for fine lines! I’ve never revie...,Actually working on my fine lines!,light,green,normal,blonde,P429515,C-Tango Vitamin C Eye Cream,Drunk Elephant,64.0,0.250000,8,2,6,917
581782,orderGen39837,5,1.0,2020-08-24,I got a facial and was told I have beautiful s...,Love this product,light,green,normal,blonde,P456418,Wild Rose Night-Brightening Sleeping Facial,KORRES,50.0,0.900000,10,9,1,222
581783,orderGen51156,5,1.0,2020-02-10,Nice cooling sensation once applied to the ski...,cooling and moisturizing,lightMedium,brown,combination,black,P433443,Aqua Bomb Sleeping Mask,belif,38.0,0.000000,1,0,1,627


### Feature engineering

In [9]:
# Combined field "<review text> — <review title>". Both parts are cast to str
# first, so a missing value would appear as the literal text "nan".
reviews_df_clean['review_txt_tt'] = (
    reviews_df_clean['review_text'].astype(str)
    .str.cat(reviews_df_clean['review_title'].astype(str), sep=" — ")
)
# Share of rows where the combined field is not null
print(reviews_df_clean['review_txt_tt'].notnull().mean())

1.0


  return datetime.utcnow().replace(tzinfo=utc)


### Creating Target

In [10]:
# Target label derived from the star rating:
#   rating >= 4 -> 'positive', rating <= 2 -> 'negative', everything else -> 'neutral'
reviews_df_clean['sentiment_rating'] = np.select(
    [reviews_df_clean['rating'] >= 4, reviews_df_clean['rating'] <= 2],
    ['positive', 'negative'],
    default='neutral',
)

  return datetime.utcnow().replace(tzinfo=utc)


In [11]:
# Inspect the target column: distinct labels, class frequencies and dtype
print("Unique values in sentiment_rating:", reviews_df_clean['sentiment_rating'].unique(), sep="\n")
print("\nValue counts:", reviews_df_clean['sentiment_rating'].value_counts(), sep="\n")
print("\nData type:", reviews_df_clean['sentiment_rating'].dtype, sep="\n")

Unique values in sentiment_rating:
['negative' 'positive' 'neutral']

Value counts:
sentiment_rating
positive    474060
negative     64395
neutral      43330
Name: count, dtype: int64

Data type:
object


  return datetime.utcnow().replace(tzinfo=utc)


In [12]:
# Containers shared by the model sections:
#   pred_dict - model-output name -> predicted labels (fed to get_metrics_table)
#   cm_dict   - name -> confusion-matrix figure
pred_dict, cm_dict = {}, {}

## Models Training and Evaluations

We will be using the abbreviations below for new variables:

- "ssc" = "sentiment_score"
- "scp" = "sentiment_compound"
- "slb" = "sentiment_label"
- "rv" = "review"
- "tt" = "title"
- "txt" = "text"
- "neg" = "negative"
- "neu" = "neutral"
- "pos" = "positive"



### 1) Vader Sentiment Analyzer

In [15]:
# Per-field VADER outputs and the metrics table that is filled in by the loop below
vader_results, vader_metrics = {}, None

In [17]:
# Text column -> short code used in the names of the stored outputs
X_vars_dict = {"review_text":"txt", "review_title":"tt", "review_txt_tt":"txt_tt"}

# Build the analyzer once instead of once per text field
analyzer = SentimentIntensityAnalyzer()
y = reviews_df_clean['sentiment_rating']
tick_labels = ['Negative', 'Neutral', 'Positive']

for var, code in X_vars_dict.items():

  # Split data. VADER is not fitted, so only the held-out part is scored.
  # The split arguments are the same for every field, so each field is
  # evaluated against the same y_test.
  X = reviews_df_clean[var]
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=62)

  # Compound score (-1 to 1) for each held-out review
  vader_compound = X_test.apply(lambda x: analyzer.polarity_scores(str(x))['compound'])

  # Map the compound score to a label with the +/-0.05 cut-offs
  vader_labels = vader_compound.apply(
      lambda x: 'positive' if x >= 0.05 else ('negative' if x <= -0.05 else 'neutral')
  )

  # Store predictions
  pred_dict[f'slb_rv_{code}_vader'] = vader_labels
  vader_results[code+'_vader'] = {
      'y_pred': vader_labels,
      'compound_scores': vader_compound,
      'y_test': y_test,
      'X_test': X_test
  }

  cm_dict[var] = plot_confusion_matrix(y_test, vader_labels, xticks=tick_labels, yticks=tick_labels, title= var + " - Confusion Matrix")

# Build the metrics table once, after all fields are scored. Doing it inside
# the loop recomputed every earlier report on each iteration.
vader_metrics = get_metrics_table(y_test, pred_dict).sort_values(['accuracy'], ascending=False)



  full_metrics_table = pd.concat([full_metrics_table, curr])[full_metrics_table.columns]
  return datetime.utcnow().replace(tzinfo=utc)
  full_metrics_table = pd.concat([full_metrics_table, curr])[full_metrics_table.columns]
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  full_metrics_table = pd.concat([full_metrics_table, curr])[full_metrics_table.columns]
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().r

In [18]:
# Output results to excel for easier review

# The confusion-matrix images are written to plot_path first, so make sure it exists
os.makedirs(plot_path, exist_ok=True)

# `insert_image` is an xlsxwriter worksheet method, so that engine is requested
# explicitly. The "_" before the date matches the naming of the custom-model export.
with pd.ExcelWriter(data_dir + "vader_models_results_test_" + td_date + ".xlsx", engine="xlsxwriter") as writer:
    vader_metrics.to_excel(writer, sheet_name = "metrics", index=False)

    worksheet = writer.sheets['metrics']

    # Add confusion matrices, one below the other (30 rows apart) at column index 10
    for position, (plot_name, fig) in enumerate(cm_dict.items()):
        # Set the figure size
        fig.set_size_inches(8, 6)
        # Save the figure to an image file
        img_path = plot_path + plot_name + '.png'
        fig.savefig(img_path, bbox_inches='tight')
        # Insert the image into the worksheet
        worksheet.insert_image(position * 30, 10, img_path)
        plt.close(fig)  # Close the figure to free memory

# No writer.close() here: leaving the `with` block already saved and closed the
# workbook, and a second close only produces an "already closed" warning.

  warn("Calling close() on already closed file.")


In [None]:
# Save copy of the data with vader scores
# reviews_df_clean.to_pickle(data_dir+f'reviews_ssc_vader_{td_date}.pkl')

In [None]:
reviews_df_clean

### 2) TF-IDF Custom Analyzer

#### Import saved data

In [20]:
# Per-field TF-IDF outputs and the metrics table filled in by the loop below
custom_results, custom_metrics = {}, None
# Start from empty prediction / confusion-matrix containers for this section
pred_dict, cm_dict = {}, {}

In [22]:
# Text column -> short code used in the names of the stored outputs
X_vars_dict = {"review_text":"txt", "review_title":"tt", "review_txt_tt":"txt_tt"}

y = reviews_df_clean['sentiment_rating']
tick_labels = ['Negative', 'Neutral', 'Positive']

for var, code in X_vars_dict.items():

  # Split data. `stratify=y` matches the split used in the VADER and SetFit
  # sections, so the models are compared on the same held-out rows and the
  # class proportions are kept in both parts.
  X = reviews_df_clean[var]
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=62)

  # The vocabulary is learned on the training part only
  vectorizer = TfidfVectorizer(max_features=3000, ngram_range=(1,2), min_df=5, max_df=0.7 )
  X_train_tfidf = vectorizer.fit_transform(X_train)
  X_test_tfidf = vectorizer.transform(X_test)

  # Train logistic regression to get probability scores
  lr = LogisticRegression(max_iter=100, random_state=62, class_weight='balanced', solver='saga', n_jobs=-1)
  lr.fit(X_train_tfidf, y_train)

  # Class probabilities, one column per entry of lr.classes_
  all_probs = lr.predict_proba(X_test_tfidf)

  # Check what the classes are
  print("Class order:", lr.classes_)  # e.g., ['negative', 'neutral', 'positive']

  # Predicted label for each held-out review
  custom_labels = lr.predict(X_test_tfidf)

  # Store predictions
  pred_dict[f'slb_rv_{code}_custom'] = custom_labels
  custom_results[code+'_custom'] = {
      'y_pred': custom_labels,
      'compound_scores': all_probs.max(axis=1),
      'y_test': y_test,
      'X_test': X_test,
      'all_probs':all_probs
  }

  cm_dict[var] = plot_confusion_matrix(y_test, custom_labels, xticks=tick_labels, yticks=tick_labels, title= var + " - Confusion Matrix")

# Build the metrics table once, after all fields are scored. Doing it inside
# the loop recomputed every earlier report on each iteration.
custom_metrics = get_metrics_table(y_test, pred_dict).sort_values(['accuracy'], ascending=False)



Class order: ['negative' 'neutral' 'positive']


  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  full_metrics_table = pd.concat([full_metrics_table, curr])[full_metrics_table.columns]
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.

Class order: ['negative' 'neutral' 'positive']


  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  full_metrics_table = pd.concat([full_metrics_table, curr])[full_metrics_table.columns]
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.

Class order: ['negative' 'neutral' 'positive']


  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  full_metrics_table = pd.concat([full_metrics_table, curr])[full_metrics_table.columns]
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.

In [23]:
# Output results to excel for easier review

# The confusion-matrix images are written to plot_path first, so make sure it exists
os.makedirs(plot_path, exist_ok=True)

# `insert_image` is an xlsxwriter worksheet method, so that engine is requested explicitly
with pd.ExcelWriter(data_dir+"custom_models_results_test_"+td_date+".xlsx", engine="xlsxwriter") as writer:
    custom_metrics.to_excel(writer, sheet_name = "metrics", index=False)

    worksheet = writer.sheets['metrics']

    # Add confusion matrices, one below the other (30 rows apart) at column index 10
    for position, (plot_name, fig) in enumerate(cm_dict.items()):
        # Set the figure size
        fig.set_size_inches(8, 6)
        # Save the figure to an image file
        img_path = plot_path + plot_name + '.png'
        fig.savefig(img_path, bbox_inches='tight')
        # Insert the image into the worksheet
        worksheet.insert_image(position * 30, 10, img_path)
        plt.close(fig)  # Close the figure to free memory

# No writer.close() here: leaving the `with` block already saved and closed the
# workbook, and a second close only produces an "already closed" warning.

  warn("Calling close() on already closed file.")


In [None]:
reviews_df_clean

### 3) SetFit Model Analyzer

In [None]:
# Import saved data
# NOTE(review): the file name follows the pattern of the commented-out
# `to_pickle` call in the VADER section; presumably it was written by an
# earlier run - confirm the file exists before running this cell.

reviews_df_clean = pd.read_pickle(data_dir+'reviews_ssc_vader_12_17_2025.pkl')

In [None]:
# Features (review text) and target, then a stratified 80/20 train/test split
X, y = reviews_df_clean['review_text'], reviews_df_clean['sentiment_rating']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=62,
    stratify=reviews_df_clean['sentiment_rating'],
)

In [None]:
def train_setfit_model(X_train, y_train, X_test, y_test, field_name):
    """Train a SetFit classifier on one text field and score the test texts.

    Args:
        X_train: training texts (object with ``.tolist()``, e.g. a pandas Series).
        y_train: training labels, same order as ``X_train``.
        X_test: evaluation texts.
        y_test: evaluation labels, same order as ``X_test``.
        field_name: display name used in the printed progress and report headers.

    Returns:
        Tuple ``(model, predictions, proba)``: the trained model, its
        predictions for the evaluation texts and the values returned by
        ``predict_proba`` for those texts.
    """

    # Initialize model
    model = SetFitModel.from_pretrained(
        "sentence-transformers/paraphrase-mpnet-base-v2",
        labels=["negative", "neutral", "positive"],
        device="cpu"
        # device="cuda" if torch.cuda.is_available() else "cpu"
    )

    # Prepare data
    train_dataset = Dataset.from_dict({
        "text": X_train.tolist(),
        "label": y_train.tolist()
    })

    eval_dataset = Dataset.from_dict({
        "text": X_test.tolist(),
        "label": y_test.tolist()
    })

    # Training arguments
    args = TrainingArguments(
        batch_size=8,
        num_epochs=1,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
    )

    # Trainer
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        metric="accuracy",
    )

    # Train
    print(f"\n{'='*50}")
    print(f"Training SetFit on: {field_name}")
    print(f"{'='*50}")
    trainer.train()

    # The model must score the review TEXT. Passing the label column (as the
    # earlier version did) classifies the words "negative"/"neutral"/"positive"
    # instead of the reviews.
    eval_texts = eval_dataset['text']

    # Predictions
    predictions = model.predict(eval_texts)

    # Get probabilities for ROC curves
    proba = model.predict_proba(eval_texts)

    # Evaluate
    print(f"\n{field_name} Results:")
    print(classification_report(eval_dataset['label'], predictions,
                                target_names=['negative', 'neutral', 'positive']))

    return model, predictions, proba

In [None]:
# Run for all 3 fields
results = {}

# result key -> (text column, display name used in the printed report)
setfit_fields = {
    'txt': ('review_text', "Review text"),
    'tt': ('review_title', "Review Title"),
    'txt_tt': ('review_txt_tt', "Combined Text"),
}

# One loop instead of three copy-pasted stanzas. The split arguments are the
# same for every field, so y_train / y_test are identical across iterations.
for code, (text_col, display_name) in setfit_fields.items():
    X_train_field, X_test_field, y_train, y_test = train_test_split(
        reviews_df_clean[text_col], reviews_df_clean['sentiment_rating'],
        test_size=0.2, random_state=62, stratify=reviews_df_clean['sentiment_rating'])

    results[code] = train_setfit_model(X_train_field, y_train, X_test_field, y_test, display_name)

In [None]:
reviews_df_clean

In [None]:
# Each entry of `results` is the (model, predictions, proba) tuple returned by
# train_setfit_model; predictions cover the TEST rows only.
for model, (_fitted_model, predictions_categories, predictions_proba) in results.items():

  pred_dict[model] = predictions_categories
  # Compare against y_test (the labels of the scored rows), not the full
  # label column, whose length differs from the predictions.
  cm_dict[model] = plot_confusion_matrix(y_test, predictions_categories, xticks=['Negative', 'Neutral', 'Positive'],yticks=['Negative', 'Neutral', 'Positive'], title= model + " - Confusion Matrix")

  # Add prediction to reviews data. Building Series on the test index lets
  # pandas align the values: rows outside the test set get NaN.
  # NOTE(review): assumes the index of reviews_df_clean is unique - confirm.
  reviews_df_clean[f'slb_rv_{model}_setfit'] = pd.Series(np.asarray(predictions_categories), index=y_test.index)

  # A single column cannot hold the whole probability matrix, so keep the
  # probability of the most likely class (same convention as the TF-IDF
  # section's 'compound_scores').
  # NOTE(review): assumes predict_proba returned shape (n_test, n_classes) - confirm.
  max_proba = np.asarray(predictions_proba).max(axis=1)
  reviews_df_clean[f'ssc_rv_{model}_setfit'] = pd.Series(max_proba, index=y_test.index)

In [None]:
reviews_df_clean.columns

In [None]:
# Sample only 8 examples per class for training (24 total)
def get_few_shot_sample(df, n_per_class=8, label_col='sentiment_rating', random_state=62):
    """Sample up to ``n_per_class`` rows from every class of ``label_col``.

    Args:
        df: pandas DataFrame that contains ``label_col``.
        n_per_class: maximum number of rows drawn per class; a class with
            fewer rows contributes all of its rows.
        label_col: name of the column that holds the class label.
        random_state: seed passed to each per-class ``sample`` call.

    Returns:
        DataFrame with all original columns (including ``label_col``), the
        classes in sorted order and a fresh 0..n-1 index.
    """
    # Iterating over the groups keeps the label column in the result.
    # `groupby(...).apply(...)` on the grouping column is deprecated and
    # newer pandas versions drop that column from the applied frames.
    samples = [
        group.sample(n=min(n_per_class, len(group)), random_state=random_state)
        for _, group in df.groupby(label_col)
    ]
    if not samples:
        # Empty input: nothing to sample, return an empty frame with the same columns
        return df.iloc[0:0].reset_index(drop=True)
    return pd.concat(samples).reset_index(drop=True)

# Few-shot training set. The examples are drawn from the training split only:
# sampling from the whole table could pick rows that are also in the test set
# and leak them into training.
# NOTE(review): assumes the index of reviews_df_clean is unique - confirm.
few_shot_df = get_few_shot_sample(reviews_df_clean.loc[X_train.index], n_per_class=8)

# Few-shot training texts and labels
X_train_few = few_shot_df['review_text']
y_train_few = few_shot_df['sentiment_rating']

# Train with very limited data, evaluate on the full test set
model_few_shot, _, _ = train_setfit_model(
    X_train_few, y_train_few, X_test, y_test,
    "SetFit Few-Shot (24 examples)"
)

In [None]:
reviews_df_clean

## Best Model Results and Mapping

### Run best model on the full data

In [25]:
analyzer = SentimentIntensityAnalyzer()

# Name the columns explicitly instead of relying on `var`, a loop variable
# left over from an earlier cell.
best_text_col = 'review_txt_tt'
best_label_col = 'slb_rv_txt_tt_vader'

# Apply to each review: full dictionary of VADER scores
reviews_df_clean['ssc_rv_txt_tt_vader'] = reviews_df_clean[best_text_col].apply(
    lambda x: analyzer.polarity_scores(str(x))
)

# Extract compound score (-1 to 1)
reviews_df_clean['scp_rv_txt_tt_vader'] = reviews_df_clean['ssc_rv_txt_tt_vader'].apply(
    lambda x: x['compound']
)

# Map the compound score to a label with the +/-0.05 cut-offs
reviews_df_clean[best_label_col] = reviews_df_clean['scp_rv_txt_tt_vader'].apply(
    lambda x: 'positive' if x >= 0.05 else ('negative' if x <= -0.05 else 'neutral')
)

# Store the predicted labels (the earlier version stored the raw text column here)
pred_dict[best_label_col] = reviews_df_clean[best_label_col]
cm_dict[best_text_col] = plot_confusion_matrix(reviews_df_clean['sentiment_rating'], reviews_df_clean[best_label_col], xticks=['Negative', 'Neutral', 'Positive'],yticks=['Negative', 'Neutral', 'Positive'], title= best_text_col + " - Confusion Matrix")

  return datetime.utcnow().replace(tzinfo=utc)


In [None]:
# Text column -> short code used in the names of the new columns
X_vars_dict = {"review_text":"txt", "review_title":"tt", "review_txt_tt":"txt_tt"}

# Class label -> abbreviation used in the probability column names
class_abbr = {'negative': 'neg', 'neutral': 'neu', 'positive': 'pos'}

for var, code in X_vars_dict.items():

  # Split data (stratified, like the evaluation splits above)
  X = reviews_df_clean[var]
  y = reviews_df_clean['sentiment_rating']

  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=62)

  # Fit the vocabulary and the classifier on the training part only
  vectorizer = TfidfVectorizer(max_features=3000, ngram_range=(1,2), min_df=5, max_df=0.7 )
  X_train_tfidf = vectorizer.fit_transform(X_train)

  # Train logistic regression to get probability scores
  lr = LogisticRegression(max_iter=100, random_state=62, class_weight='balanced', solver='saga', n_jobs=-1)
  lr.fit(X_train_tfidf, y_train)

  # Score EVERY review, so each array has one row per DataFrame row and can be
  # stored as a column. (Test-set arrays are shorter than the table, and an
  # already vectorised matrix must not be passed to the vectorizer again.)
  # Rows of the training part receive in-sample predictions.
  all_probs = lr.predict_proba(vectorizer.transform(X))

  # Check what the classes are
  print("Class order:", lr.classes_)  # e.g., ['negative', 'neutral', 'positive']

  # One probability column per class, named from lr.classes_ rather than an
  # assumed order, and per text field so the fields do not overwrite each other
  for class_idx, class_name in enumerate(lr.classes_):
    short_name = class_abbr.get(class_name, class_name)
    reviews_df_clean[f'prob_{short_name}_rv_{code}_custom'] = all_probs[:, class_idx]

  # For this method, we will take compound as the max probability, which corresponds to the label
  reviews_df_clean[f'scp_rv_{code}_custom'] = all_probs.max(axis=1)

  # Predicted label = class with the highest probability
  reviews_df_clean[f'slb_rv_{code}_custom'] = lr.classes_[all_probs.argmax(axis=1)]

### Aggregate scores to product level

In [44]:
# Review quality score (quick method): 60% from the text length (capped at
# 500 characters) and 40% from whether the VADER label agrees with the
# rating-based label.
length_part = (reviews_df_clean['review_text'].str.len() / 500).clip(upper=1) * 0.6
agreement_part = (
    reviews_df_clean['sentiment_rating'] == reviews_df_clean['slb_rv_txt_tt_vader']
).astype(float) * 0.4
reviews_df_clean['review_quality_score'] = length_part + agreement_part


def _most_common(values):
    """Return the first mode of ``values``, or NaN when ``values`` is empty."""
    return values.mode()[0] if len(values) > 0 else np.nan


# One row per product. Named aggregation assigns the output column names
# directly, so no positional renaming is needed afterwards.
reviews_prod_lvl = (
    reviews_df_clean
    .groupby(['product_id', 'product_name'])
    .agg(
        # Number of reviews
        total_reviews=('author_id', 'count'),
        # Rating-based sentiment distribution
        positive_rating_count=('sentiment_rating', lambda s: (s == 'positive').sum()),
        neutral_rating_count=('sentiment_rating', lambda s: (s == 'neutral').sum()),
        negative_rating_count=('sentiment_rating', lambda s: (s == 'negative').sum()),
        dominant_rating_sentiment=('sentiment_rating', _most_common),
        # Model output: most common predicted label and mean compound score
        predicted_sentiment=('slb_rv_txt_tt_vader', _most_common),
        predicted_sentiment_score=('scp_rv_txt_tt_vader', 'mean'),
        # Averages of the quality score and the star rating
        avg_review_quality=('review_quality_score', 'mean'),
        avg_rating=('rating', 'mean'),
    )
    .reset_index()
)

# Share of each rating-based sentiment, in percent with two decimals
for sentiment in ('positive', 'neutral', 'negative'):
    reviews_prod_lvl[f'{sentiment}_rating_pct'] = (
        reviews_prod_lvl[f'{sentiment}_rating_count'] / reviews_prod_lvl['total_reviews'] * 100
    ).round(2)

# The five highest-scoring reviews of every product ...
top_reviews = (
    reviews_df_clean
    .sort_values('review_quality_score', ascending=False)
    .groupby(['product_id'])
    .head(5)
)

# ... joined into one numbered string per product ("1. ... ; 2. ...")
best_reviews_formatted = (
    top_reviews
    .groupby('product_id')['review_text']
    .apply(lambda reviews: ' ; '.join(f"{rank}. {review}" for rank, review in enumerate(reviews, start=1)))
    .rename('review_sample')
    .reset_index()
)

# Attach the review sample to the product summary
reviews_prod_lvl = reviews_prod_lvl.merge(best_reviews_formatted, on=['product_id'], how='left')

# Final column order of the product-level table
final_columns = (
    ['product_id', 'product_name', 'total_reviews', 'avg_rating']
    + [f'{sentiment}_rating_count' for sentiment in ('positive', 'neutral', 'negative')]
    + [f'{sentiment}_rating_pct' for sentiment in ('positive', 'neutral', 'negative')]
    + ['dominant_rating_sentiment', 'predicted_sentiment', 'predicted_sentiment_score',
       'avg_review_quality', 'review_sample']
)

# Keep the final columns, most-reviewed products first
reviews_prod_lvl = reviews_prod_lvl[final_columns].sort_values('total_reviews', ascending=False)

reviews_prod_lvl

  return datetime.utcnow().replace(tzinfo=utc)


Unnamed: 0,product_id,product_name,total_reviews,avg_rating,positive_rating_count,neutral_rating_count,negative_rating_count,positive_rating_pct,neutral_rating_pct,negative_rating_pct,dominant_rating_sentiment,predicted_sentiment,predicted_sentiment_score,avg_review_quality,review_sample
387,P420652,Lip Sleeping Mask Intense Hydration with Vitam...,10226,4.327303,8259,689,1278,80.76,6.74,12.50,positive,positive,0.710803,0.652860,1. This is one of the best products i have eve...
484,P427421,Protini Polypeptide Firming Refillable Moistur...,4075,3.964908,2846,423,806,69.84,10.38,19.78,positive,positive,0.624273,0.675171,"1. Love this! I have pretty normal skin/combo,..."
931,P450271,Green Clean Makeup Meltaway Cleansing Balm Lim...,3887,4.494983,3350,185,352,86.18,4.76,9.06,positive,positive,0.762690,0.693891,1. There is a REASON this product has an allur...
359,P417238,Green Clean Makeup Removing Cleansing Balm,3887,4.494983,3350,185,352,86.18,4.76,9.06,positive,positive,0.762690,0.693891,1. There is a REASON this product has an allur...
60,P269122,Alpha Beta Extra Strength Daily Peel Pads,3804,4.597003,3524,147,133,92.64,3.86,3.50,positive,positive,0.757206,0.676755,1. I don’t think I’ll ever be without these ag...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1749,P480176,Mini Calendula Deep Clean Foaming Face Wash,1,5.000000,1,0,0,100.00,0.00,0.00,positive,positive,0.932700,0.528400,1. Love this! Smells so good and a little goe...
1594,P475083,Skincare Essentials Kit,1,4.000000,1,0,0,100.00,0.00,0.00,positive,positive,0.966900,0.674800,1. I love this Clarins double serum! I’ve used...
1582,P474941,3 Step Intro Kit Type II,1,3.000000,0,1,0,0.00,100.00,0.00,neutral,positive,0.664300,0.130800,1. Didn’t see much of a difference however it’...
2283,P504882,Bye Bye Bumps - Best of Body Kit,1,5.000000,1,0,0,100.00,0.00,0.00,positive,positive,0.935200,0.626800,1. I love this so much. Definitely worth the p...


  return datetime.utcnow().replace(tzinfo=utc)


In [47]:
# Predicted-sentiment counts for products whose dominant rating-based sentiment is negative
negative_dominant = reviews_prod_lvl['dominant_rating_sentiment'] == "negative"
reviews_prod_lvl.loc[negative_dominant, 'predicted_sentiment'].value_counts()

  return datetime.utcnow().replace(tzinfo=utc)


Unnamed: 0_level_0,count
predicted_sentiment,Unnamed: 1_level_1
positive,65
negative,20
neutral,3


### Save data with sentiment scores

In [48]:
# Write the product-level summary table to a pickle file in the data folder
reviews_prod_lvl.to_pickle(data_dir+"reviews_prod_lvl.pkl")

## Code references

### Vader Sentiment Analyzer

Sentiment Analysis Article: https://medium.com/@rslavanyageetha/vader-a-comprehensive-guide-to-sentiment-analysis-in-python-c4f1868b0d2e



### SetFit Model Analyzer

Github repo: https://github.com/huggingface/setfit/


@misc{https://doi.org/10.48550/arxiv.2209.11055,
  doi = {10.48550/ARXIV.2209.11055},
  url = {https://arxiv.org/abs/2209.11055},
  author = {Tunstall, Lewis and Reimers, Nils and Jo, Unso Eun Seo and Bates, Luke and Korat, Daniel and Wasserblat, Moshe and Pereg, Oren},
  keywords = {Computation and Language (cs.CL), FOS: Computer and information sciences, FOS: Computer and information sciences},
  title = {Efficient Few-Shot Learning Without Prompts},
  publisher = {arXiv},
  year = {2022},
  copyright = {Creative Commons Attribution 4.0 International}
}