<a href="https://colab.research.google.com/github/BTT-Cadence-Design-Systems-2A/AI-Studio-Project/blob/Bert-base-multilingual-uncased-sentiment/Cadence_2A_Bert_base_multilingual_uncased_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

 **Install libraries**

In [None]:
!pip install -U datasets huggingface_hub

**Imports & config**

In [None]:
import json
import fsspec
from itertools import islice
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Download the NLTK resources this notebook needs at runtime.
# Presumably "punkt"/"punkt_tab" back word_tokenize and "wordnet"/"omw-1.4"
# back WordNetLemmatizer — TODO confirm against the installed NLTK version.
nltk.download("punkt")
nltk.download("wordnet")
nltk.download("omw-1.4")
nltk.download("punkt_tab")

# Dataset repository; the loaders below build their hf://datasets/... URLs from it.
REPO = "McAuley-Lab/Amazon-Reviews-2023"

# Categories loaded together with their metadata for the merge experiment.
CATEGORIES = [
    "Software",
    "Video_Games",
    "All_Beauty",
]

# Category names used when loading reviews for the sentiment milestone.
ALL_CATEGORIES = [
    "All_Beauty",
    "Amazon_Fashion",
    "Appliances",
    "Arts_Crafts_and_Sewing",
    "Automotive",
    "Baby_Products",
    "Beauty_and_Personal_Care",
    "Books",
    "CDs_and_Vinyl",
    "Cell_Phones_and_Accessories",
    "Clothing_Shoes_and_Jewelry",
    "Digital_Music",
    "Electronics",
    "Gift_Cards",
    "Grocery_and_Gourmet_Food",
    "Handmade_Products",
    "Health_and_Household",
    "Health_and_Personal_Care",
    "Home_and_Kitchen",
    "Industrial_and_Scientific",
    "Kindle_Store",
    "Magazine_Subscriptions",
    "Movies_and_TV",
    "Musical_Instruments",
    "Office_Products",
    "Patio_Lawn_and_Garden",
    "Pet_Supplies",
    "Software",
    "Sports_and_Outdoors",
    "Subscription_Boxes",
    "Tools_and_Home_Improvement",
    "Toys_and_Games",
    "Video_Games",
    "Unknown",
]

# Row caps per category: reviews and metadata records respectively.
N_PER_CAT = 10_000
N_META = 60_000

# Let long text cells show up to 200 characters when frames are displayed.
pd.set_option("display.max_colwidth", 200)

**Helpers: stream JSONL records, ensure an 'asin' column, and load one category**

In [None]:
def stream_jsonl(url: str, limit: int | None = None):
    """
    Stream a JSONL file record-by-record (e.g. from a Hugging Face ``hf://`` URL)
    Normalizes mixed-type fields like 'price'

    Args:
        url: Location handed to ``fsspec.open``; the file is read in text mode.
        limit: Maximum number of records to yield; ``None`` means no cap.

    Yields:
        The parsed JSON value of each non-blank line. When a record has a
        non-null ``"price"``, that value is converted to ``str`` so the field
        has one type across records.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with fsspec.open(url, "rt") as f:
        yielded = 0
        for line in f:
            if limit is not None and yielded >= limit:
                break
            if not line.strip():
                # A blank line is not a record; json.loads would raise on it.
                continue
            obj = json.loads(line)

            if "price" in obj and obj["price"] is not None:
                obj["price"] = str(obj["price"])

            yield obj
            yielded += 1


def ensure_asin(df: pd.DataFrame) -> pd.DataFrame:
    """
    Make sure ``df`` carries an 'asin' column and return the same frame.

    The first column found among the known ASIN-like names is copied into
    'asin' (in place) when 'asin' itself is absent. If none of the names
    exist, a diagnostic with an example row is printed for non-empty frames
    and the frame is returned untouched.
    """
    candidates = ("asin", "parent_asin", "product_id", "item_id", "Parent_ASIN", "ParentAsin")
    source = next((name for name in candidates if name in df.columns), None)

    if source is None:
        if len(df) > 0:
            print("No recognizable ASIN-like key found. Example row:\n", df.head(1).to_dict("records")[0])
        return df

    # 'asin' is the first candidate, so any other match means 'asin' is missing.
    if source != "asin":
        df["asin"] = df[source]
    return df


def load_category(category: str, n_reviews: int, n_meta: int):
    """
    Build the review and metadata DataFrames for a single category.

    At most ``n_reviews`` review records and ``n_meta`` metadata records are
    taken from the start of each stream. Both frames get a ``category``
    column. Returns ``(reviews_df, meta_df)``.
    """
    def _head_frame(url: str, max_rows: int) -> pd.DataFrame:
        # Pull only the first max_rows records off the stream, then tag them.
        rows = islice(stream_jsonl(url), max_rows)
        return pd.DataFrame(rows).assign(category=category)

    reviews = _head_frame(
        f"hf://datasets/{REPO}/raw/review_categories/{category}.jsonl",
        n_reviews,
    )
    meta = _head_frame(
        f"hf://datasets/{REPO}/raw/meta_categories/meta_{category}.jsonl",
        n_meta,
    )
    return reviews, meta

**Load & sample each category (streaming), concatenate, and inspect schemas and key columns**

In [None]:
# Load every category in CATEGORIES and stack the per-category frames.
all_reviews, all_meta = [], []

for cat in CATEGORIES:
    r_df, m_df = load_category(cat, n_reviews=N_PER_CAT, n_meta=N_META)
    all_reviews.append(r_df)
    all_meta.append(m_df)

reviews_df = pd.concat(all_reviews, ignore_index=True)
meta_df    = pd.concat(all_meta,    ignore_index=True)

reviews_df = ensure_asin(reviews_df)
meta_df    = ensure_asin(meta_df)

# Drop rows that have no product key; skipped when no 'asin' column could be derived.
if "asin" in reviews_df:
    reviews_df = reviews_df[reviews_df["asin"].notna()]
if "asin" in meta_df:
    meta_df = meta_df[meta_df["asin"].notna()]

print(f"Loaded rows -> reviews: {len(reviews_df):,} | meta: {len(meta_df):,}")
display(reviews_df.head(2))
display(meta_df.head(2))

# Same guard as the filter above: ensure_asin may leave a frame without
# 'asin', and indexing it unconditionally would raise KeyError.
if "asin" in reviews_df:
    print(f"Unique products in reviews: {reviews_df['asin'].nunique():,}")
if "asin" in meta_df:
    print(f"Unique products in meta: {meta_df['asin'].nunique():,}")


In [None]:
# print(reviews_df.columns)
# print(meta_df.columns)
# merged = reviews_df.merge(meta_df, on="parent_asin", how="left", suffixes=("_review", "_meta"))
# print(merged)
# print(merged.columns)
# merged.shape

**Merge reviews with product metadata (match on asin, fall back to parent_asin)**

In [None]:
# Attach product metadata to every review: first by matching 'asin', then by
# matching the review's 'parent_asin' against the metadata 'asin' as a fallback.

# Key/bookkeeping columns; everything else in meta_df is treated as a payload column.
meta_keys = {"asin", "parent_asin", "category"}
meta_keep = ["asin", "parent_asin"] + [c for c in meta_df.columns if c not in meta_keys]


# Primary join: review asin == meta asin. Columns present on both sides get
# the "_review" / "_meta" suffixes.
m1 = reviews_df.merge(meta_df[meta_keep], on="asin", how="left", suffixes=("_review", "_meta"))


# Fallback join: review parent_asin == meta asin. The metadata key columns are
# renamed first so they do not collide with the review's own keys.
m2 = reviews_df.merge(
    meta_df[meta_keep].rename(columns={"asin": "asin_meta2", "parent_asin": "parent_asin_meta2"}),
    left_on="parent_asin",
    right_on="asin_meta2",
    how="left",
)


# Start from the primary join and fill its missing metadata values from the
# fallback join, one payload column at a time.
# NOTE(review): values from m2 are aligned to m1 by row index. This assumes
# both left merges produced the same rows in the same order — verify that
# meta_df has unique 'asin' values, otherwise rows can be duplicated.
merged = m1.copy()
for col in meta_keep:
    if col in {"asin", "parent_asin"}:
        continue
    col_m1 = col  # NOTE(review): never read afterwards
    col_m2 = col + "_m2"
    # Only columns that kept their plain name in m2 are handled here.
    # NOTE(review): assumes such a column also kept its plain name in m1 — confirm.
    if col in m2.columns:
        merged[col_m2] = m2[col]  # temporary helper column
        # Keep m1's value where present, otherwise take the fallback value.
        merged[col] = merged[col].where(merged[col].notna(), merged[col_m2])
        merged.drop(columns=[col_m2], inplace=True)


# Record which metadata asin the fallback join matched.
if "asin_meta2" in m2.columns:
    merged["asin_meta_fallback"] = m2["asin_meta2"]

print("Merged shape:", merged.shape)


# Columns of `merged` counted as metadata: anything suffixed "_meta" plus the listed names.
meta_signal = [c for c in merged.columns if c.endswith("_meta") or c in ["average_rating", "rating_number", "price", "store", "categories", "details", "title", "images", "videos", "main_category"]]
# Fraction of rows with at least one non-null value among those columns.
coverage = merged[meta_signal].notna().any(axis=1).mean() if meta_signal else 0.0
print(f"Rows with ANY meta fields: {coverage:.2%}")

display(merged.head(5))

# **Milestone #1: Sentiment Analysis of a Singular Review**


Goal: Take the reviews dataframe, keep only the rating, title, category, and text columns, and then train a model that predicts the rating from the review text.


In [None]:
def load_category_into_review(category: str, n_reviews: int):
    """
    Build a slim review DataFrame for one category.

    Only the ``rating``, ``title`` and ``text`` fields of the first
    ``n_reviews`` records are kept (a missing field becomes ``None``), and a
    ``category`` column is added.
    """
    wanted_fields = ("rating", "title", "text")
    reviews_url = f"hf://datasets/{REPO}/raw/review_categories/{category}.jsonl"

    trimmed_rows = [
        {field: record.get(field) for field in wanted_fields}
        for record in islice(stream_jsonl(reviews_url), n_reviews)
    ]

    frame = pd.DataFrame(trimmed_rows)
    return frame.assign(category=category)

In [None]:
# Collect up to N_PER_CAT reviews for every category in ALL_CATEGORIES.
# load_category_into_review takes records from the start of each stream,
# so this is a head-of-file slice rather than a random sample.
sentiment_reviews =  []

for cat in ALL_CATEGORIES:
    r_df = load_category_into_review(cat, n_reviews=N_PER_CAT)
    sentiment_reviews.append(r_df)

# Stack the per-category frames into one frame with a fresh index.
reviews_df_milestone1 = pd.concat(sentiment_reviews, ignore_index=True)


print(f"Loaded rows -> reviews: {len(reviews_df_milestone1):,}")
display(reviews_df_milestone1.head(2))

In [None]:
# Print the frame summary, then show how many reviews each rating value has.
reviews_df_milestone1.info()
reviews_df_milestone1['rating'].value_counts()

## Milestone #1: Data Cleaning

In [None]:
# Count missing values per column.
reviews_df_milestone1.isna().sum()

### Text Normalization (removing punctuation)

In [None]:
import string


# Translation table that deletes every character in string.punctuation.
# Built once at definition time instead of on every call.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def remove_punctuation(text: str) -> str:
    """
    Remove all ASCII punctuation (``string.punctuation``) from a string.

    Args:
        text: Value to clean. Anything that is not a ``str`` (e.g. ``None``
            or NaN from a missing cell) is treated as empty.

    Returns:
        ``text`` without punctuation characters, or ``""`` for non-string
        input. Characters are deleted, not replaced by spaces, so ``"a.b"``
        becomes ``"ab"``.
    """
    if not isinstance(text, str):
        return ""
    return text.translate(_PUNCTUATION_TABLE)

In [None]:
"""
   Creates the clean_review and clean_title columns. These two columns will be used during model training.
"""
def _normalize_text_column(column):
    """Lower-case, strip punctuation, collapse whitespace runs to one space, and trim."""
    lowered = column.str.lower()
    without_punctuation = lowered.apply(remove_punctuation)
    single_spaced = without_punctuation.str.replace(r"\s+", " ", regex=True)
    return single_spaced.str.strip()


# The review body and the review title go through the same normalization.
reviews_df_milestone1['clean_review'] = _normalize_text_column(reviews_df_milestone1['text'])
reviews_df_milestone1['clean_title'] = _normalize_text_column(reviews_df_milestone1['title'])

### Lemmatization of Reviews

In [None]:
# Single WordNetLemmatizer instance reused by lemmatize_text for every token.
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text: str) -> str:
    """
    Tokenize ``text`` with ``word_tokenize`` and lemmatize every token.

    Returns the lemmas joined by single spaces. Non-string input (such as
    ``None`` or NaN) yields an empty string.
    """
    if isinstance(text, str):
        return " ".join(lemmatizer.lemmatize(word) for word in word_tokenize(text))
    return ""

In [None]:
# Lemmatize the cleaned review body and title into their own columns.
reviews_df_milestone1['lemmatized_review'] = (
    reviews_df_milestone1['clean_review'].apply(lemmatize_text)
)
reviews_df_milestone1['lemmatized_title'] = (
    reviews_df_milestone1['clean_title'].apply(lemmatize_text)
)

### Creating Sentiment Labels


In [None]:
def create_sentiment_label(rating: int) -> str:
    """
    Map a star rating to a coarse sentiment class.

    Ratings of 4 or more are 'positive', ratings of 2 or less are 'negative',
    and everything in between is 'neutral'. A NaN rating fails both
    comparisons and therefore also comes back as 'neutral'.
    """
    if rating >= 4:
        return 'positive'
    if rating <= 2:
        return 'negative'
    return 'neutral'

In [None]:
# Derive the sentiment class of every review from its star rating.
reviews_df_milestone1['sentiment_labels'] = reviews_df_milestone1['rating'].apply(create_sentiment_label)

In [None]:
# Preview the first rows, including the newly added label column.
reviews_df_milestone1.head()

### Tokenization of Reviews


In [None]:
# documents = reviews_df_milestone1['clean_review'].tolist()

In [None]:
# vectorizer = TfidfVectorizer(
#     stop_words="english",   # remove english stopwords like this, a, the, etc
#     # max_features=5000,      # keep top 5000 words (tune this)
# )
# X = vectorizer.fit_transform(documents)

In [None]:
# print(f"Vocabulary size: {len(vectorizer.vocabulary_)}")

# df_tfidf = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
# df_tfidf.head()

In [None]:
from nltk.tokenize import word_tokenize
# Split every cleaned review into a list of tokens with word_tokenize.
reviews_df_milestone1['tokenized_review'] = reviews_df_milestone1['clean_review'].apply(word_tokenize)

In [None]:
# Preview the first five rows, including the token lists.
reviews_df_milestone1.head(5)

# Bert-base Multilingual Uncased Model for sentiment analysis

### Install libraries

In [None]:
!pip install transformers
!pip install torch
!pip install datasets

### Convert "clean review" column to Dataset

In [None]:
# from datasets import Dataset
# clean_reviews_dataset = Dataset.from_pandas(reviews_df_milestone1[['clean_review']])

In [None]:
# print(clean_reviews_dataset)

### Load model and model's tokenizer to convert cleaned review text to number embeddings

In [None]:
# from os import truncate
# from transformers import BertTokenizer
# tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

# # define the tokenization function
# def tokenize_text(examples):
#   return tokenizer(examples['clean_review'], padding=True, truncation=True, max_length=256)

# # apply tokenization func to the clean review text
# tokenized_clean_reviews = clean_reviews_dataset.map(tokenize_text, batched=True)

In [None]:
# print(tokenized_clean_reviews)

### Split the tokenized text into training and validation set

In [None]:
# train_valid_dataset = tokenized_clean_reviews.train_test_split(test_size=0.2)
# train_dataset = train_valid_dataset['train']
# valid_dataset = train_valid_dataset['test']

In [None]:
# print(train_dataset)
# print(f"train dataset shape: {train_dataset.shape}")
# print(valid_dataset)
# print(f"valid dataset shape: {valid_dataset.shape}")

### Create data loader to manage batches of data during training
Dataloader is used to organize data for model training by providing efficient ways to batch, shuffle, and transform data.


In [None]:
# from torch.utils.data import DataLoader

# train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=8)
# valid_dataloader = DataLoader(valid_dataset, batch_size=8)

### Setting up the model and config for fine-tuning the model

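AdamW is the optimizer that updates the model's weights during training (Adam with decoupled weight decay); the learning rate itself is set in the training arguments.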




In [None]:
# from transformers import BertForSequenceClassification, Trainer, TrainingArguments
# from torch.optim import AdamW

# # load the pre-trained bert model for sequence classification
# model = BertForSequenceClassification.from_pretrained('bert-base-cased', num_labels=3) # num_labels will be 3 (we are classifying positive, negative, or neutral)

In [None]:
# # define training argument
# training_args = TrainingArguments(
#     output_dir='./results', # ouput directory for res
#     eval_strategy='epoch', # evaluation strategy
#     num_train_epochs=3, # num of training epochs
#     learning_rate=2e-5, # learning rate
#     per_device_train_batch_size=8, # batch size for training
#     per_device_eval_batch_size=8, # batch size for evaluation
#     weight_decay=0.01 # weight decay

# )

In [None]:
# # define Trainer
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=valid_dataset
# )

In [None]:
# trainer.train()

### Unsupervised Learning (Clustering) with paraphrase-multilingual-MiniLM-L12-v2

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch, numpy as np
from sklearn.cluster import KMeans
from sentence_transformers import SentenceTransformer, util

In [None]:
# Load model
model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
# model = AutoModel.from_pretrained("bert-base-multilingual-uncased")

In [None]:
# Embed every cleaned review, in DataFrame row order, so that row i of
# data_emb belongs to row i of reviews_df_milestone1.
data = reviews_df_milestone1['clean_review'].tolist()
data_emb = model.encode(data)

In [None]:
# Dump the embedding matrix and its shape as a sanity check.
print(data_emb)
print(data_emb.shape)

### Cluster the review embeddings with K-Means

In [None]:
# Two clusters, in the hope that they separate positive from negative reviews.
# K-Means only groups similar embeddings; the cluster ids have no built-in
# sentiment meaning.
num_clusters = 2
# Fixed seed so cluster ids and memberships are reproducible across runs.
clustering_model = KMeans(n_clusters=num_clusters, random_state=42)
clustering_model.fit(data_emb)
cluster_assignment = clustering_model.labels_

In [None]:
# Store each review's K-Means cluster id next to its text.
reviews_df_milestone1['model predictions'] = cluster_assignment

In [None]:
# Preview the first 15 rows together with their cluster ids.
reviews_df_milestone1.head(15)

### Checking clusters
Although we ask K-Means for a fixed number of clusters, the clusters do not directly correspond to positive, negative, or neutral sentiment; the algorithm just groups similar embeddings together.

In [None]:
# Print a few example reviews for every cluster id that is actually present,
# rather than a hard-coded count that can drift from the number of clusters
# the model was fitted with.
for c in sorted(reviews_df_milestone1['model predictions'].unique()):
    print(f"Cluster {c}")
    print(reviews_df_milestone1[reviews_df_milestone1['model predictions']==c]["clean_review"].head(5).tolist())

### Cosine Similarity between sentiment label embeddings and clean review text embeddings

This approach needs no training. We use the paraphrase-multilingual-MiniLM-L12-v2 encoder to embed a few example phrases for each sentiment label as well as every clean review, then compute the cosine similarity between them and assign each review the label whose embedding is most similar.

In [None]:
# labels = ['positive', 'negative', 'neutral']
# label_emb = model.encode(labels)
# Seed phrases that describe each sentiment class.
POS = [
    "This review is positive.",
    "I loved it.",
    "excellent, satisfied, would recommend",
    "great quality",
    "works perfectly",
]
NEU = [
    "This review is neutral.",
    "it is okay",
    "average, acceptable",
    "neither good nor bad",
]
NEG = [
    "This review is negative.",
    "I hated it.",
    "terrible, disappointed, refund",
    "poor quality",
    "does not work",
]


def proto_embed(texts):
    """Encode ``texts`` with the shared ``model`` and average them into one prototype vector."""
    embedded = model.encode(texts)
    prototype = np.mean(embedded, axis=0)
    return prototype


# One prototype per class, computed and stacked in the order positive, neutral, negative.
p_pos, p_neu, p_neg = (proto_embed(seed_texts) for seed_texts in (POS, NEU, NEG))
protos = np.stack((p_pos, p_neu, p_neg))  # shape: [3, d]

In [None]:
# Score every review against the three sentiment prototypes with cosine
# similarity. A plain dot product is only a cosine when both sides have unit
# length, and the averaged prototypes do not, so a longer prototype would
# win more often regardless of direction.
proto_unit = protos / np.linalg.norm(protos, axis=1, keepdims=True)
data_norms = np.linalg.norm(data_emb, axis=1, keepdims=True)
# Dividing the small [N, 3] result avoids copying the full embedding matrix;
# the clip guards against an all-zero embedding.
scores = (data_emb @ proto_unit.T) / np.clip(data_norms, 1e-12, None)  # [N, 3]
# Label order must match the row order of `protos` (positive, neutral, negative).
labels = np.array(["positive","neutral","negative"])
pred = labels[scores.argmax(axis=1)]
reviews_df_milestone1["sentiment_pred"] = pred

In [None]:
# Preview the first 15 rows together with the predicted sentiment.
reviews_df_milestone1.head(15)

### Model performance compared with rating labels

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Compare the similarity-based predictions with the rating-derived labels.
y_true = reviews_df_milestone1['sentiment_labels']
y_pred = reviews_df_milestone1['sentiment_pred']

accuracy = accuracy_score(y_true, y_pred)
print(f"Accuracy: {accuracy:.2%}")

# The results get their own names: assigning them to `classification_report`
# or `confusion_matrix` would overwrite the imported functions, and the cell
# would fail the next time it is run.
report_text = classification_report(y_true, y_pred)
print(f"Classification Report: {report_text}")

conf_matrix = confusion_matrix(y_true, y_pred)
print(f"Confusion Matrix: {conf_matrix}")
