<a href="https://colab.research.google.com/github/BTT-Cadence-Design-Systems-2A/AI-Studio-Project/blob/Bert-base-multilingual-uncased-sentiment/Cadence_2A_Bert_base_cased_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

 **Install libraries**

In [None]:
!pip install -U datasets huggingface_hub

**Imports & config**

In [2]:
import json
import fsspec
from itertools import islice
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

nltk.download("punkt")
nltk.download("wordnet")
nltk.download("omw-1.4")
nltk.download("punkt_tab")

REPO = "McAuley-Lab/Amazon-Reviews-2023"


CATEGORIES = ["Software", "Video_Games", "All_Beauty"]
ALL_CATEGORIES = ["All_Beauty", "Amazon_Fashion", "Appliances", "Arts_Crafts_and_Sewing", "Automotive", "Baby_Products", "Beauty_and_Personal_Care", "Books",
              "CDs_and_Vinyl", "Cell_Phones_and_Accessories", "Clothing_Shoes_and_Jewelry", "Digital_Music", "Electronics", "Gift_Cards", "Grocery_and_Gourmet_Food",
              "Handmade_Products", "Health_and_Household", "Health_and_Personal_Care", "Home_and_Kitchen", "Industrial_and_Scientific",
              "Kindle_Store", "Magazine_Subscriptions", "Movies_and_TV", "Musical_Instruments", "Office_Products", "Patio_Lawn_and_Garden", "Pet_Supplies",
              "Software", "Sports_and_Outdoors", "Subscription_Boxes", "Tools_and_Home_Improvement", "Toys_and_Games", "Video_Games",
              "Unknown"]


N_PER_CAT = 10_000
N_META    = 60_000

pd.set_option("display.max_colwidth", 200)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


**Load & sample each category (streaming) and concatenate**

In [3]:
def stream_jsonl(url: str, limit: int | None = None):
    """
    Stream a JSONL file line-by-line from Hugging Face
    Normalizes mixed-type fields like 'price'
    """
    with fsspec.open(url, "rt") as f:
        for idx, line in enumerate(f):
            if limit is not None and idx >= limit:
                break
            obj = json.loads(line)


            if "price" in obj and obj["price"] is not None:
                obj["price"] = str(obj["price"])

            return_obj = obj
            yield return_obj


def ensure_asin(df: pd.DataFrame) -> pd.DataFrame:
    """
    Ensure there is an 'asin' column
    """
    for cand in ["asin", "parent_asin", "product_id", "item_id", "Parent_ASIN", "ParentAsin"]:
        if cand in df.columns:
            if "asin" not in df.columns:
                df["asin"] = df[cand]
            return df
    if len(df) > 0:
        print("No recognizable ASIN-like key found. Example row:\n", df.head(1).to_dict("records")[0])
    return df


def load_category(category: str, n_reviews: int, n_meta: int):
    """
    Load one category's reviews and meta as DataFrames
    """
    reviews_url = f"hf://datasets/{REPO}/raw/review_categories/{category}.jsonl"
    meta_url    = f"hf://datasets/{REPO}/raw/meta_categories/meta_{category}.jsonl"

    reviews_df = pd.DataFrame(islice(stream_jsonl(reviews_url), n_reviews)).assign(category=category)
    meta_df    = pd.DataFrame(islice(stream_jsonl(meta_url),    n_meta)).assign(category=category)
    return reviews_df, meta_df

**Inspect schemas and key columns**

In [None]:
all_reviews, all_meta = [], []

for cat in CATEGORIES:
    r_df, m_df = load_category(cat, n_reviews=N_PER_CAT, n_meta=N_META)
    all_reviews.append(r_df)
    all_meta.append(m_df)

reviews_df = pd.concat(all_reviews, ignore_index=True)
meta_df    = pd.concat(all_meta,    ignore_index=True)

reviews_df = ensure_asin(reviews_df)
meta_df    = ensure_asin(meta_df)


if "asin" in reviews_df:
    reviews_df = reviews_df[reviews_df["asin"].notna()]
if "asin" in meta_df:
    meta_df = meta_df[meta_df["asin"].notna()]

print(f"Loaded rows -> reviews: {len(reviews_df):,} | meta: {len(meta_df):,}")
display(reviews_df.head(2))
display(meta_df.head(2))

print(f"Unique products in reviews: {reviews_df['asin'].nunique():,}")
print(f"Unique products in meta: {meta_df['asin'].nunique():,}")


In [5]:
# print(reviews_df.columns)
# print(meta_df.columns)
# merged = reviews_df.merge(meta_df, on="parent_asin", how="left", suffixes=("_review", "_meta"))
# print(merged)
# print(merged.columns)
# merged.shape

**Helper: ensure_asin + normalize IDs**

In [None]:
meta_keys = {"asin", "parent_asin", "category"}
meta_keep = ["asin", "parent_asin"] + [c for c in meta_df.columns if c not in meta_keys]


m1 = reviews_df.merge(meta_df[meta_keep], on="asin", how="left", suffixes=("_review", "_meta"))


m2 = reviews_df.merge(
    meta_df[meta_keep].rename(columns={"asin": "asin_meta2", "parent_asin": "parent_asin_meta2"}),
    left_on="parent_asin",
    right_on="asin_meta2",
    how="left",
)


merged = m1.copy()
for col in meta_keep:
    if col in {"asin", "parent_asin"}:
        continue
    col_m1 = col
    col_m2 = col + "_m2"
    if col in m2.columns:
        merged[col_m2] = m2[col]
        merged[col] = merged[col].where(merged[col].notna(), merged[col_m2])
        merged.drop(columns=[col_m2], inplace=True)


if "asin_meta2" in m2.columns:
    merged["asin_meta_fallback"] = m2["asin_meta2"]

print("Merged shape:", merged.shape)


meta_signal = [c for c in merged.columns if c.endswith("_meta") or c in ["average_rating", "rating_number", "price", "store", "categories", "details", "title", "images", "videos", "main_category"]]
coverage = merged[meta_signal].notna().any(axis=1).mean() if meta_signal else 0.0
print(f"Rows with ANY meta fields: {coverage:.2%}")

display(merged.head(5))

# **Milestone #1: Sentiment Analysis of a Singular Review**


Goal: Take the reviews dataframe, only maintain the rating, title, category, and text columns, and then train a model that predicts the rating given a review text


In [7]:
def load_category_into_review(category: str, n_reviews: int):
    """
    Load one category's reviews as DataFrames
    """
    reviews_url = f"hf://datasets/{REPO}/raw/review_categories/{category}.jsonl"

    data = (
        {k: row.get(k) for k in ["rating", "title", "text"]}
        for row in islice(stream_jsonl(reviews_url), n_reviews)
    )

    reviews_df = pd.DataFrame(data).assign(category=category)
    return reviews_df

In [8]:
sentiment_reviews =  []

for cat in ALL_CATEGORIES:
    r_df = load_category_into_review(cat, n_reviews=N_PER_CAT)
    sentiment_reviews.append(r_df)

reviews_df_milestone1 = pd.concat(sentiment_reviews, ignore_index=True)


print(f"Loaded rows -> reviews: {len(reviews_df_milestone1):,}")
display(reviews_df_milestone1.head(2))

Loaded rows -> reviews: 340,000


Unnamed: 0,rating,title,text,category
0,5.0,Such a lovely scent but not overpowering.,"This spray is really nice. It smells really good, goes on really fine, and does the trick. I will say it feels like you need a lot of it though to get the texture I want. I have a lot of hair, med...",All_Beauty
1,4.0,Works great but smells a little weird.,"This product does what I need it to do, I just wish it was odorless or had a soft coconut smell. Having my head smell like an orange coffee is offputting. (granted, I did know the smell was descri...",All_Beauty


In [9]:
reviews_df_milestone1.info()
reviews_df_milestone1['rating'].value_counts()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 340000 entries, 0 to 339999
Data columns (total 4 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   rating    340000 non-null  float64
 1   title     340000 non-null  object 
 2   text      340000 non-null  object 
 3   category  340000 non-null  object 
dtypes: float64(1), object(3)
memory usage: 10.4+ MB


Unnamed: 0_level_0,count
rating,Unnamed: 1_level_1
5.0,224659
4.0,53930
3.0,26023
1.0,21621
2.0,13767


## Milestone #1: Data Cleaning

In [10]:
reviews_df_milestone1.isna().sum()

Unnamed: 0,0
rating,0
title,0
text,0
category,0


### Text Normalization (removing punctuation)

In [11]:
import string


def remove_punctuation(text: str) -> str:
    """
    Function removes all punctuation from a string
    """
    if not isinstance(text, str):
        return ""
    return text.translate(str.maketrans("", "", string.punctuation))

In [12]:
"""
   Creates clean_review and clean_title and clean_review. These two columns will be used during model training.
"""
reviews_df_milestone1['clean_review'] = (
    reviews_df_milestone1['text']
    .str.lower()
    .apply(remove_punctuation)
    .str.replace(r"\s+", " ", regex=True)
    .str.strip()
)

reviews_df_milestone1['clean_title'] = (
    reviews_df_milestone1['title']
    .str.lower()
    .apply(remove_punctuation)
    .str.replace(r"\s+", " ", regex=True)
    .str.strip()
)

### Lemmitization of Reviews

In [13]:
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text: str) -> str:
    if not isinstance(text, str):
        return ""
    tokens = word_tokenize(text)
    lemmas = [lemmatizer.lemmatize(token) for token in tokens]
    return " ".join(lemmas)

In [14]:
reviews_df_milestone1['lemmatized_review'] = reviews_df_milestone1['clean_review'].apply(lemmatize_text)
reviews_df_milestone1['lemmatized_title'] = reviews_df_milestone1['clean_title'].apply(lemmatize_text)

### Creating Sentiment Labels


In [15]:
def create_sentiment_label(rating: int) -> str:
  if rating >= 4:
    return 'positive'
  elif rating <= 2:
    return 'negative'
  else:
    return 'neutral'

In [16]:
reviews_df_milestone1['sentiment_labels'] = (
    reviews_df_milestone1['rating']
    .apply(create_sentiment_label)
)

In [None]:
reviews_df_milestone1.head()

### Tokenization of Reviews


In [None]:
# documents = reviews_df_milestone1['clean_review'].tolist()

In [None]:
# vectorizer = TfidfVectorizer(
#     stop_words="english",   # remove english stopwords like this, a, the, etc
#     # max_features=5000,      # keep top 5000 words (tune this)
# )
# X = vectorizer.fit_transform(documents)

In [None]:
# print(f"Vocabulary size: {len(vectorizer.vocabulary_)}")

# df_tfidf = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
# df_tfidf.head()

In [None]:
from nltk.tokenize import word_tokenize
reviews_df_milestone1['tokenized_review'] = reviews_df_milestone1['clean_review'].apply(word_tokenize)

In [None]:
reviews_df_milestone1.head(5)

### Creating another column for sentiment label classes
For this label, we will set 0 - negative, 1 - neutral, 2 - positive.

In [72]:
def create_sentiment_label_classes(rating: int) -> str:
  if rating >= 4:
    return 2
  elif rating <= 2:
    return 0
  else:
    return 1

In [73]:
reviews_df_milestone1['sentiment_label_classes'] = (
    reviews_df_milestone1['rating']
    .apply(create_sentiment_label_classes)
)

In [None]:
reviews_df_milestone1.head(10)

# Bert-base Cased Model for sentiment analysis

### Install libraries

In [None]:
!pip install transformers
!pip install torch
!pip install datasets

### Convert "clean review" column to Dataset

In [76]:
from datasets import Dataset
clean_reviews_dataset = Dataset.from_pandas(reviews_df_milestone1[['clean_review', 'sentiment_label_classes']]).rename_column("sentiment_label_classes", "labels")

In [77]:
print(clean_reviews_dataset)

Dataset({
    features: ['clean_review', 'labels'],
    num_rows: 340000
})


### Load model and model's tokenizer to convert cleaned review text to number embeddings

In [88]:
# Load model directly
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

# define the tokenization function
def tokenize_text(examples):
  return tokenizer(examples['clean_review'], padding=True, truncation=True, max_length=128)

# apply tokenization func to the clean review text
tokenized_clean_reviews = clean_reviews_dataset.map(tokenize_text, batched=True)

Map:   0%|          | 0/340000 [00:00<?, ? examples/s]

In [89]:
print(tokenized_clean_reviews)

Dataset({
    features: ['clean_review', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 340000
})


### Split the tokenized text into training, validation, and test set
We will use 70% training set, 15% validation set and 15% test set

In [90]:
train_dataset, valid_test_dataset = tokenized_clean_reviews.train_test_split(test_size=0.3, seed=42).values()
valid_dataset, test_dataset = valid_test_dataset.train_test_split(test_size=0.5, seed=42).values()

In [91]:
print(train_dataset)
print(f"train dataset shape: {train_dataset.shape}")
print(valid_dataset)
print(f"valid dataset shape: {valid_dataset.shape}")
print(test_dataset)
print(f"test dataset shape: {test_dataset.shape}")

Dataset({
    features: ['clean_review', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 238000
})
train dataset shape: (238000, 5)
Dataset({
    features: ['clean_review', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 51000
})
valid dataset shape: (51000, 5)
Dataset({
    features: ['clean_review', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 51000
})
test dataset shape: (51000, 5)


### Create data loader to manage batches of data during training
Dataloader is used to organize data for model training by providing efficient ways to batch, shuffle, and transform data.


In [92]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=8)
valid_dataloader = DataLoader(valid_dataset, batch_size=8)
test_dataloader = DataLoader(test_dataset, batch_size=8)

### Setting up the model and config for fine-tuning the model

AdamW is for adjusting learning rate during training.




In [None]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments
from torch.optim import AdamW

# load the pre-trained bert model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

In [94]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [95]:
# define training argument
training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/Bert-base Multilingual Uncase Model checkpoint', # ouput directory for model checkpoint
    eval_strategy='epoch', # evaluation strategy
    num_train_epochs=3, # num of training epochs
    learning_rate=2e-5, # learning rate
    per_device_train_batch_size=8, # batch size for training
    per_device_eval_batch_size=8, # batch size for evaluation
    weight_decay=0.01 # weight decay

)

In [96]:
# define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset
)

In [97]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.4634,0.439781
2,0.3824,0.40535
3,0.3661,0.421329


TrainOutput(global_step=89250, training_loss=0.4188374367935651, metrics={'train_runtime': 5290.5135, 'train_samples_per_second': 134.959, 'train_steps_per_second': 16.87, 'total_flos': 4.6965745064448e+16, 'train_loss': 0.4188374367935651, 'epoch': 3.0})

In [None]:
!pip install evaluate

### Evaluating the model

In [106]:
res = trainer.predict(test_dataset)
logits = res.predictions # pred scores for each class
labels = res.label_ids # label for each class
pred = logits.argmax(axis=-1)   # pick the class with highest score

In [109]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

acc = accuracy_score(labels, pred)
prec, rec, f1, _ = precision_recall_fscore_support(labels, pred, average="weighted")

print("Accuracy:", acc)
print("Precision:", prec)
print("Recall:", rec)
print("F1:", f1)


Accuracy: 0.8730392156862745
Precision: 0.8523497528612406
Recall: 0.8730392156862745
F1: 0.8604433576104994


#### IGNORE EVERYTING BELOW THIS. CLUSTERING AND UNSUPERVISED LEARNING IS LESS RELIABLE FOR TEXT CLASSIFICATION.

### Unsupervised Learning (Clustering) with paraphrase-multilingual-MiniLM-L12-v2

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch, numpy as np
from sklearn.cluster import KMeans
from sentence_transformers import SentenceTransformer, util

In [None]:
# Load model
model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
# model = AutoModel.from_pretrained("bert-base-multilingual-uncased")

In [None]:
data = reviews_df_milestone1['clean_review'].tolist()
data_emb = model.encode(data)

In [None]:
print(data_emb)
print(data_emb.shape)

### Fine-tune the model

In [None]:
num_clusters = 2 # positive, negative
clustering_model = KMeans(n_clusters=num_clusters)
clustering_model.fit(data_emb)
cluster_assignment = clustering_model.labels_

In [None]:
reviews_df_milestone1['model predictions'] = cluster_assignment

In [None]:
reviews_df_milestone1.head(15)

### Checking clusters
Although we do 3 clusters, it doesn't directly mean positive, negative, neutral. It just pull similar thing closer

In [None]:
for c in range(3):
  print(f"Cluster {c}")
  print(reviews_df_milestone1[reviews_df_milestone1['model predictions']==c]["clean_review"].head(5).tolist())

### Cosine Similarity between sentiment label embeddings and clean review text embeddings

For this, we don't need training. We just need to use paraphrase-multilingual-MiniLM-L12-v2 encoder to encode the embeddings and calculate cosine similarity between them to determine what is the most similar one.

In [None]:
# labels = ['positive', 'negative', 'neutral']
# label_emb = model.encode(labels)
POS = [
  "This review is positive.", "I loved it.", "excellent, satisfied, would recommend",
  "great quality", "works perfectly"
]
NEU = [
  "This review is neutral.", "it is okay", "average, acceptable",
  "neither good nor bad"
]
NEG = [
  "This review is negative.", "I hated it.", "terrible, disappointed, refund",
  "poor quality", "does not work"
]

def proto_embed(texts):
    vecs = model.encode(texts)
    return np.mean(vecs, axis=0) # Calculate the mean of the embeddings

p_pos = proto_embed(POS)
p_neu = proto_embed(NEU)
p_neg = proto_embed(NEG)

# Stack the mean embeddings
protos = np.stack([p_pos, p_neu, p_neg])  # shape: [3, d]

In [None]:
# print(label_emb)
scores = data_emb @ protos.T                                  # [N, 3]
labels = np.array(["positive","neutral","negative"])
pred = labels[scores.argmax(axis=1)]
reviews_df_milestone1["sentiment_pred"] = pred

In [None]:
reviews_df_milestone1.head(15)

### Model performance compared with rating labels

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

y_true = reviews_df_milestone1['sentiment_labels']
y_pred = reviews_df_milestone1['sentiment_pred']

accuracy = accuracy_score(y_true, y_pred)
print(f"Accuracy: {accuracy:.2%}")

classification_report = classification_report(y_true, y_pred)
print(f"Classification Report: {classification_report}")

confusion_matrix = confusion_matrix(y_true, y_pred)
print(f"Confusion Matrix: {confusion_matrix}")
