<a href="https://colab.research.google.com/github/BTT-Cadence-Design-Systems-2A/AI-Studio-Project/blob/Bert-base-multilingual-uncased-sentiment/Cadence_2A_Bert_base_multilingual_uncased_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

 **Install libraries**

In [1]:
!pip install -U datasets huggingface_hub

Collecting datasets
  Downloading datasets-4.1.1-py3-none-any.whl.metadata (18 kB)
Collecting huggingface_hub
  Downloading huggingface_hub-0.35.1-py3-none-any.whl.metadata (14 kB)
Collecting pyarrow>=21.0.0 (from datasets)
  Downloading pyarrow-21.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Downloading datasets-4.1.1-py3-none-any.whl (503 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m503.6/503.6 kB[0m [31m37.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading huggingface_hub-0.35.1-py3-none-any.whl (563 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m563.3/563.3 kB[0m [31m43.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-21.0.0-cp312-cp312-manylinux_2_28_x86_64.whl (42.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m63.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyarrow, huggingface_hub, datasets
  Attempting uninstall: pyarrow
    Found existing instal

**Imports & config**

In [2]:
import json
import fsspec
from itertools import islice
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

nltk.download("punkt")
nltk.download("wordnet")
nltk.download("omw-1.4")
nltk.download("punkt_tab")

REPO = "McAuley-Lab/Amazon-Reviews-2023"


CATEGORIES = ["Software", "Video_Games", "All_Beauty"]
ALL_CATEGORIES = ["All_Beauty", "Amazon_Fashion", "Appliances", "Arts_Crafts_and_Sewing", "Automotive", "Baby_Products", "Beauty_and_Personal_Care", "Books",
              "CDs_and_Vinyl", "Cell_Phones_and_Accessories", "Clothing_Shoes_and_Jewelry", "Digital_Music", "Electronics", "Gift_Cards", "Grocery_and_Gourmet_Food",
              "Handmade_Products", "Health_and_Household", "Health_and_Personal_Care", "Home_and_Kitchen", "Industrial_and_Scientific",
              "Kindle_Store", "Magazine_Subscriptions", "Movies_and_TV", "Musical_Instruments", "Office_Products", "Patio_Lawn_and_Garden", "Pet_Supplies",
              "Software", "Sports_and_Outdoors", "Subscription_Boxes", "Tools_and_Home_Improvement", "Toys_and_Games", "Video_Games",
              "Unknown"]


N_PER_CAT = 10_000
N_META    = 60_000

pd.set_option("display.max_colwidth", 200)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


**Load & sample each category (streaming) and concatenate**

In [6]:
def stream_jsonl(url: str, limit: int | None = None):
    """
    Stream a JSONL file line-by-line from Hugging Face
    Normalizes mixed-type fields like 'price'
    """
    with fsspec.open(url, "rt") as f:
        for idx, line in enumerate(f):
            if limit is not None and idx >= limit:
                break
            obj = json.loads(line)


            if "price" in obj and obj["price"] is not None:
                obj["price"] = str(obj["price"])

            return_obj = obj
            yield return_obj


def ensure_asin(df: pd.DataFrame) -> pd.DataFrame:
    """
    Ensure there is an 'asin' column
    """
    for cand in ["asin", "parent_asin", "product_id", "item_id", "Parent_ASIN", "ParentAsin"]:
        if cand in df.columns:
            if "asin" not in df.columns:
                df["asin"] = df[cand]
            return df
    if len(df) > 0:
        print("No recognizable ASIN-like key found. Example row:\n", df.head(1).to_dict("records")[0])
    return df


def load_category(category: str, n_reviews: int, n_meta: int):
    """
    Load one category's reviews and meta as DataFrames
    """
    reviews_url = f"hf://datasets/{REPO}/raw/review_categories/{category}.jsonl"
    meta_url    = f"hf://datasets/{REPO}/raw/meta_categories/meta_{category}.jsonl"

    reviews_df = pd.DataFrame(islice(stream_jsonl(reviews_url), n_reviews)).assign(category=category)
    meta_df    = pd.DataFrame(islice(stream_jsonl(meta_url),    n_meta)).assign(category=category)
    return reviews_df, meta_df

**Inspect schemas and key columns**

In [7]:
all_reviews, all_meta = [], []

for cat in CATEGORIES:
    r_df, m_df = load_category(cat, n_reviews=N_PER_CAT, n_meta=N_META)
    all_reviews.append(r_df)
    all_meta.append(m_df)

reviews_df = pd.concat(all_reviews, ignore_index=True)
meta_df    = pd.concat(all_meta,    ignore_index=True)

reviews_df = ensure_asin(reviews_df)
meta_df    = ensure_asin(meta_df)


if "asin" in reviews_df:
    reviews_df = reviews_df[reviews_df["asin"].notna()]
if "asin" in meta_df:
    meta_df = meta_df[meta_df["asin"].notna()]

print(f"Loaded rows -> reviews: {len(reviews_df):,} | meta: {len(meta_df):,}")
display(reviews_df.head(2))
display(meta_df.head(2))

print(f"Unique products in reviews: {reviews_df['asin'].nunique():,}")
print(f"Unique products in meta: {meta_df['asin'].nunique():,}")


Loaded rows -> reviews: 30,000 | meta: 180,000


Unnamed: 0,rating,title,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase,category
0,1.0,malware,mcaffee IS malware,[],B07BFS3G7P,B0BQSK9QCF,AGCI7FAH4GL5FI65HYLKWTMFZ2CQ,1562182632076,0,False,Software
1,5.0,Lots of Fun,"I love playing tapped out because it is fun to watch the town grow by earning money and buying buildings. I love helping my neighbors, too.",[],B00CTQ6SIG,B00CTQ6SIG,AHSPLDNW5OOUK2PLH7GXLACFBZNQ,1424120336000,0,True,Software


Unnamed: 0,main_category,title,average_rating,rating_number,features,description,price,images,videos,store,categories,details,parent_asin,bought_together,category,subtitle,author,asin
0,Appstore for Android,Accupressure Guide,3.6,,[All the pressing point has been explained with the help of image for the ease of the user.],[Acupressure technique is very ancient and very effective technique to cure many medical problems. Acupressure is an ancient healing art that uses the fingers to press key points on the surface of...,0.0,"[{'large': 'https://m.media-amazon.com/images/I/41+4JZcQQyL.jpg', 'variant': 'MAIN'}, {'large': 'https://m.media-amazon.com/images/I/51P-UDgfJUL.jpg', 'variant': 'PT01'}, {'large': 'https://m.medi...","[{'title': '', 'url': '', 'user_id': ''}]",mAppsguru,[],"{'Release Date': '2015', 'Date first listed on Amazon': 'April 8, 2015', 'Developed By': 'mAppsguru', 'Size': '2.3MB', 'Version': '1.0', 'Application Permissions': ['Access information about netwo...",B00VRPSGEO,,Software,,,B00VRPSGEO
1,Appstore for Android,Ankylosaurus Fights Back - Smithsonian's Prehistoric Pals,4.0,,"[ENCOURAGE literacy skills with highlighted narration, FOLLOW along with three fun ways to read!, LEARN new vocabulary with tappable words, TAP objects to hear their name read aloud]","[Join Ankylosaurus in this interactive book app as he is so busy eating that he doesn’t notice the huge T-rex that is watching him! Explore pictures, learn new vocabulary, and follow along with th...",2.99,"[{'large': 'https://m.media-amazon.com/images/I/A1XJ+0NwHpL.png', 'variant': 'MAIN'}, {'large': 'https://m.media-amazon.com/images/I/A1ALOQjUkVL.png', 'variant': 'PT01'}, {'large': 'https://m.medi...","[{'title': '', 'url': '', 'user_id': ''}]","Oceanhouse Media, Inc",[],"{'Release Date': '2014', 'Date first listed on Amazon': 'September 26, 2014', 'Developed By': 'Oceanhouse Media, Inc', 'Size': '41.6MB', 'Version': '2.30', 'Application Permissions': ['Open networ...",B00NWQXXHQ,,Software,,,B00NWQXXHQ


Unique products in reviews: 18,846
Unique products in meta: 180,000


In [None]:
# print(reviews_df.columns)
# print(meta_df.columns)
# merged = reviews_df.merge(meta_df, on="parent_asin", how="left", suffixes=("_review", "_meta"))
# print(merged)
# print(merged.columns)
# merged.shape

**Helper: ensure_asin + normalize IDs**

In [8]:
meta_keys = {"asin", "parent_asin", "category"}
meta_keep = ["asin", "parent_asin"] + [c for c in meta_df.columns if c not in meta_keys]


m1 = reviews_df.merge(meta_df[meta_keep], on="asin", how="left", suffixes=("_review", "_meta"))


m2 = reviews_df.merge(
    meta_df[meta_keep].rename(columns={"asin": "asin_meta2", "parent_asin": "parent_asin_meta2"}),
    left_on="parent_asin",
    right_on="asin_meta2",
    how="left",
)


merged = m1.copy()
for col in meta_keep:
    if col in {"asin", "parent_asin"}:
        continue
    col_m1 = col
    col_m2 = col + "_m2"
    if col in m2.columns:
        merged[col_m2] = m2[col]
        merged[col] = merged[col].where(merged[col].notna(), merged[col_m2])
        merged.drop(columns=[col_m2], inplace=True)


if "asin_meta2" in m2.columns:
    merged["asin_meta_fallback"] = m2["asin_meta2"]

print("Merged shape:", merged.shape)


meta_signal = [c for c in merged.columns if c.endswith("_meta") or c in ["average_rating", "rating_number", "price", "store", "categories", "details", "title", "images", "videos", "main_category"]]
coverage = merged[meta_signal].notna().any(axis=1).mean() if meta_signal else 0.0
print(f"Rows with ANY meta fields: {coverage:.2%}")

display(merged.head(5))

Merged shape: (30000, 28)
Rows with ANY meta fields: 65.98%


Unnamed: 0,rating,title_review,text,images_review,asin,parent_asin_review,user_id,timestamp,helpful_vote,verified_purchase,...,price,images_meta,videos,store,categories,details,bought_together,subtitle,author,asin_meta_fallback
0,1.0,malware,mcaffee IS malware,[],B07BFS3G7P,B0BQSK9QCF,AGCI7FAH4GL5FI65HYLKWTMFZ2CQ,1562182632076,0,False,...,34.99,,"[{'title': 'McAfee REAL Support', 'url': 'https://www.amazon.com/vdp/1d78f93b842f4ad2b7a5784562785995?ref=dp_vse_rvc_0', 'user_id': 'AHL62TTXAOHG7TW7I42NKJJYMWXQ'}, {'title': 'How to Activate and ...",McAfee,"[Software, Antivirus & Security, Internet Security Suites]","{'Product Dimensions': '7.5 x 5.5 x 0.5 inches; 0.49 Ounces', 'Item model number': 'MTP00EAMXRAAS', 'Date First Available': 'September 26, 2018', 'Manufacturer': 'McAfee', 'Country of Origin': 'USA'}",,,,B0BQSK9QCF
1,5.0,Lots of Fun,"I love playing tapped out because it is fun to watch the town grow by earning money and buying buildings. I love helping my neighbors, too.",[],B00CTQ6SIG,B00CTQ6SIG,AHSPLDNW5OOUK2PLH7GXLACFBZNQ,1424120336000,0,True,...,0.0,"[{'large': 'https://m.media-amazon.com/images/I/A1oXfoxcSJL.png', 'variant': 'MAIN'}, {'large': 'https://m.media-amazon.com/images/I/A1REwvZmyCL.png', 'variant': 'PT01'}, {'large': 'https://m.medi...","[{'title': '', 'url': '', 'user_id': ''}]",Electronic Arts,[],"{'Release Date': '2013', 'Date first listed on Amazon': 'June 24, 2013', 'Developed By': 'Electronic Arts', 'Size': '73.2MB', 'Version': '4.62.0', 'Application Permissions': ['ACCESS_DOWNLOAD_MANA...",,,,B00CTQ6SIG
2,5.0,Light Up The Dark,"I love this flashlight app! It really illuminates the dark, very cool! Get this app, you will love it, really!",[],B0066WJLU6,B0066WJLU6,AHSPLDNW5OOUK2PLH7GXLACFBZNQ,1362399267000,0,True,...,,,,,,,,,,
3,4.0,Fun game,One of my favorite games,[],B00KCYMAWK,B00KCYMAWK,AH6CATODIVPVUOJEWHRSRCSKAOHA,1561061428662,0,True,...,0.0,"[{'large': 'https://m.media-amazon.com/images/I/A1ZIEO4ZTEL.jpg', 'variant': 'MAIN'}, {'large': 'https://m.media-amazon.com/images/I/B1uBRtRYlVL.png', 'variant': 'PT01'}, {'large': 'https://m.medi...","[{'title': '', 'url': 'https://images-na.ssl-images-amazon.com/images/I/D1uRaN4cXyS.mp4', 'user_id': ''}]",SG Interactive,[],"{'Release Date': '2014', 'Date first listed on Amazon': 'May 22, 2014', 'Developed By': 'SG Interactive', 'Size': '93.3MB', 'Version': '39.0.0', 'Application Permissions': ['Access information abo...",,,,B00KCYMAWK
4,4.0,I am not that good at it but my kids are,Cute game. I am not that good at it but my kids are. We love Nik Wallenda!,[],B00P1RK566,B00P1RK566,AEINY4XOINMMJCK5GZ3M6MMHBN6A,1418257196000,0,True,...,0.99,"[{'large': 'https://m.media-amazon.com/images/I/51kWAmsxozL.png', 'variant': 'MAIN'}, {'large': 'https://m.media-amazon.com/images/I/51KW52oWAUL.png', 'variant': 'PT01'}, {'large': 'https://m.medi...","[{'title': '', 'url': '', 'user_id': ''}]",Tapinator,[],"{'Release Date': '2014', 'Date first listed on Amazon': 'November 2, 2014', 'Developed By': 'Tapinator', 'Size': '26.4MB', 'Version': '1.0', 'Application Permissions': ['Access information about n...",,,,B00P1RK566


# **Milestone #1: Sentiment Analysis of a Singular Review**


Goal: Take the reviews dataframe, only maintain the rating, title, category, and text columns, and then train a model that predicts the rating given a review text


In [9]:
def load_category_into_review(category: str, n_reviews: int):
    """
    Load one category's reviews as DataFrames
    """
    reviews_url = f"hf://datasets/{REPO}/raw/review_categories/{category}.jsonl"

    data = (
        {k: row.get(k) for k in ["rating", "title", "text"]}
        for row in islice(stream_jsonl(reviews_url), n_reviews)
    )

    reviews_df = pd.DataFrame(data).assign(category=category)
    return reviews_df

In [10]:
sentiment_reviews =  []

for cat in ALL_CATEGORIES:
    r_df = load_category_into_review(cat, n_reviews=N_PER_CAT)
    sentiment_reviews.append(r_df)

reviews_df_milestone1 = pd.concat(sentiment_reviews, ignore_index=True)


print(f"Loaded rows -> reviews: {len(reviews_df_milestone1):,}")
display(reviews_df_milestone1.head(2))

Loaded rows -> reviews: 340,000


Unnamed: 0,rating,title,text,category
0,5.0,Such a lovely scent but not overpowering.,"This spray is really nice. It smells really good, goes on really fine, and does the trick. I will say it feels like you need a lot of it though to get the texture I want. I have a lot of hair, med...",All_Beauty
1,4.0,Works great but smells a little weird.,"This product does what I need it to do, I just wish it was odorless or had a soft coconut smell. Having my head smell like an orange coffee is offputting. (granted, I did know the smell was descri...",All_Beauty


In [11]:
reviews_df_milestone1.info()
reviews_df_milestone1['rating'].value_counts()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 340000 entries, 0 to 339999
Data columns (total 4 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   rating    340000 non-null  float64
 1   title     340000 non-null  object 
 2   text      340000 non-null  object 
 3   category  340000 non-null  object 
dtypes: float64(1), object(3)
memory usage: 10.4+ MB


Unnamed: 0_level_0,count
rating,Unnamed: 1_level_1
5.0,224659
4.0,53930
3.0,26023
1.0,21621
2.0,13767


## Milestone #1: Data Cleaning

In [12]:
reviews_df_milestone1.isna().sum()

Unnamed: 0,0
rating,0
title,0
text,0
category,0


### Text Normalization (removing punctuation)

In [13]:
import string


def remove_punctuation(text: str) -> str:
    """
    Function removes all punctuation from a string
    """
    if not isinstance(text, str):
        return ""
    return text.translate(str.maketrans("", "", string.punctuation))

In [14]:
"""
   Creates clean_review and clean_title and clean_review. These two columns will be used during model training.
"""
reviews_df_milestone1['clean_review'] = (
    reviews_df_milestone1['text']
    .str.lower()
    .apply(remove_punctuation)
    .str.replace(r"\s+", " ", regex=True)
    .str.strip()
)

reviews_df_milestone1['clean_title'] = (
    reviews_df_milestone1['title']
    .str.lower()
    .apply(remove_punctuation)
    .str.replace(r"\s+", " ", regex=True)
    .str.strip()
)

### Lemmitization of Reviews

In [15]:
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text: str) -> str:
    if not isinstance(text, str):
        return ""
    tokens = word_tokenize(text)
    lemmas = [lemmatizer.lemmatize(token) for token in tokens]
    return " ".join(lemmas)

In [14]:
reviews_df_milestone1['lemmatized_review'] = reviews_df_milestone1['clean_review'].apply(lemmatize_text)
reviews_df_milestone1['lemmatized_title'] = reviews_df_milestone1['clean_title'].apply(lemmatize_text)

### Creating Sentiment Labels


In [16]:
def create_sentiment_label(rating: int) -> str:
  if rating >= 4:
    return 'positive'
  elif rating <= 2:
    return 'negative'
  else:
    return 'neutral'

In [17]:
reviews_df_milestone1['sentiment_labels'] = (
    reviews_df_milestone1['rating']
    .apply(create_sentiment_label)
)

In [18]:
reviews_df_milestone1.head()

Unnamed: 0,rating,title,text,category,clean_review,clean_title,sentiment_labels
0,5.0,Such a lovely scent but not overpowering.,"This spray is really nice. It smells really good, goes on really fine, and does the trick. I will say it feels like you need a lot of it though to get the texture I want. I have a lot of hair, med...",All_Beauty,this spray is really nice it smells really good goes on really fine and does the trick i will say it feels like you need a lot of it though to get the texture i want i have a lot of hair medium th...,such a lovely scent but not overpowering,positive
1,4.0,Works great but smells a little weird.,"This product does what I need it to do, I just wish it was odorless or had a soft coconut smell. Having my head smell like an orange coffee is offputting. (granted, I did know the smell was descri...",All_Beauty,this product does what i need it to do i just wish it was odorless or had a soft coconut smell having my head smell like an orange coffee is offputting granted i did know the smell was described b...,works great but smells a little weird,positive
2,5.0,Yes!,"Smells good, feels great!",All_Beauty,smells good feels great,yes,positive
3,1.0,Synthetic feeling,Felt synthetic,All_Beauty,felt synthetic,synthetic feeling,negative
4,5.0,A+,Love it,All_Beauty,love it,a,positive


### Tokenization of Reviews


In [None]:
# documents = reviews_df_milestone1['clean_review'].tolist()

In [None]:
# vectorizer = TfidfVectorizer(
#     stop_words="english",   # remove english stopwords like this, a, the, etc
#     # max_features=5000,      # keep top 5000 words (tune this)
# )
# X = vectorizer.fit_transform(documents)

In [None]:
# print(f"Vocabulary size: {len(vectorizer.vocabulary_)}")

# df_tfidf = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
# df_tfidf.head()

In [19]:
from nltk.tokenize import word_tokenize
reviews_df_milestone1['tokenized_review'] = reviews_df_milestone1['clean_review'].apply(word_tokenize)

In [20]:
reviews_df_milestone1.head(5)

Unnamed: 0,rating,title,text,category,clean_review,clean_title,sentiment_labels,tokenized_review
0,5.0,Such a lovely scent but not overpowering.,"This spray is really nice. It smells really good, goes on really fine, and does the trick. I will say it feels like you need a lot of it though to get the texture I want. I have a lot of hair, med...",All_Beauty,this spray is really nice it smells really good goes on really fine and does the trick i will say it feels like you need a lot of it though to get the texture i want i have a lot of hair medium th...,such a lovely scent but not overpowering,positive,"[this, spray, is, really, nice, it, smells, really, good, goes, on, really, fine, and, does, the, trick, i, will, say, it, feels, like, you, need, a, lot, of, it, though, to, get, the, texture, i,..."
1,4.0,Works great but smells a little weird.,"This product does what I need it to do, I just wish it was odorless or had a soft coconut smell. Having my head smell like an orange coffee is offputting. (granted, I did know the smell was descri...",All_Beauty,this product does what i need it to do i just wish it was odorless or had a soft coconut smell having my head smell like an orange coffee is offputting granted i did know the smell was described b...,works great but smells a little weird,positive,"[this, product, does, what, i, need, it, to, do, i, just, wish, it, was, odorless, or, had, a, soft, coconut, smell, having, my, head, smell, like, an, orange, coffee, is, offputting, granted, i, ..."
2,5.0,Yes!,"Smells good, feels great!",All_Beauty,smells good feels great,yes,positive,"[smells, good, feels, great]"
3,1.0,Synthetic feeling,Felt synthetic,All_Beauty,felt synthetic,synthetic feeling,negative,"[felt, synthetic]"
4,5.0,A+,Love it,All_Beauty,love it,a,positive,"[love, it]"


# Bert-base Multilingual Uncased Model for sentiment analysis

### Install libraries

In [19]:
!pip install transformers
!pip install torch
!pip install datasets



### Convert "clean review" column to Dataset

In [24]:
# from datasets import Dataset
# clean_reviews_dataset = Dataset.from_pandas(reviews_df_milestone1[['clean_review']])

In [26]:
# print(clean_reviews_dataset)

Dataset({
    features: ['clean_review'],
    num_rows: 340000
})


### Load model and model's tokenizer to convert cleaned review text to number embeddings

In [47]:
# from os import truncate
# from transformers import BertTokenizer
# tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

# # define the tokenization function
# def tokenize_text(examples):
#   return tokenizer(examples['clean_review'], padding=True, truncation=True, max_length=256)

# # apply tokenization func to the clean review text
# tokenized_clean_reviews = clean_reviews_dataset.map(tokenize_text, batched=True)

Map:   0%|          | 0/340000 [00:00<?, ? examples/s]

In [49]:
# print(tokenized_clean_reviews)

Dataset({
    features: ['clean_review', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 340000
})


### Split the tokenized text into training and validation set

In [50]:
# train_valid_dataset = tokenized_clean_reviews.train_test_split(test_size=0.2)
# train_dataset = train_valid_dataset['train']
# valid_dataset = train_valid_dataset['test']

In [51]:
# print(train_dataset)
# print(f"train dataset shape: {train_dataset.shape}")
# print(valid_dataset)
# print(f"valid dataset shape: {valid_dataset.shape}")

Dataset({
    features: ['clean_review', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 272000
})
train dataset shape: (272000, 4)
Dataset({
    features: ['clean_review', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 68000
})
valid dataset shape: (68000, 4)


### Create data loader to manage batches of data during training
Dataloader is used to organize data for model training by providing efficient ways to batch, shuffle, and transform data.


In [52]:
# from torch.utils.data import DataLoader

# train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=8)
# valid_dataloader = DataLoader(valid_dataset, batch_size=8)

### Setting up the model and config for fine-tuning the model

AdamW is for adjusting learning rate during training.




In [53]:
# from transformers import BertForSequenceClassification, Trainer, TrainingArguments
# from torch.optim import AdamW

# # load the pre-trained bert model for sequence classification
# model = BertForSequenceClassification.from_pretrained('bert-base-cased', num_labels=3) # num_labels will be 3 (we are classifying positive, negative, or neutral)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [54]:
# # define training argument
# training_args = TrainingArguments(
#     output_dir='./results', # ouput directory for res
#     eval_strategy='epoch', # evaluation strategy
#     num_train_epochs=3, # num of training epochs
#     learning_rate=2e-5, # learning rate
#     per_device_train_batch_size=8, # batch size for training
#     per_device_eval_batch_size=8, # batch size for evaluation
#     weight_decay=0.01 # weight decay

# )

In [55]:
# # define Trainer
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=valid_dataset
# )

In [57]:
# trainer.train()

### Unsupervised Learning (Clustering) with paraphrase-multilingual-MiniLM-L12-v2

In [37]:
from transformers import AutoTokenizer, AutoModel
import torch, numpy as np
from sklearn.cluster import KMeans
from sentence_transformers import SentenceTransformer, util

In [22]:
# Load model
model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
# model = AutoModel.from_pretrained("bert-base-multilingual-uncased")

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [23]:
data = reviews_df_milestone1['clean_review'].tolist()
data_emb = model.encode(data)

In [24]:
print(data_emb)
print(data_emb.shape)

[[ 0.02831678 -0.06974673 -0.01311803 ...  0.02206064  0.09974912
   0.3209861 ]
 [-0.3951261  -0.00371481  0.11076389 ... -0.11905254  0.14949964
   0.15127476]
 [ 0.10073435 -0.20647696  0.00321975 ...  0.02306101 -0.15701975
   0.26182503]
 ...
 [-0.15839896  0.1480764  -0.2879947  ...  0.25574347  0.20856462
   0.46037543]
 [-0.26067835  0.03547997  0.03526209 ... -0.006985    0.05547178
   0.23127657]
 [ 0.02955304 -0.02411423  0.06500871 ...  0.06792857 -0.06772995
   0.10601745]]
(340000, 384)


### Fine-tune the model

In [30]:
num_clusters = 2 # positive, negative
clustering_model = KMeans(n_clusters=num_clusters)
clustering_model.fit(data_emb)
cluster_assignment = clustering_model.labels_

In [31]:
reviews_df_milestone1['model predictions'] = cluster_assignment

In [35]:
reviews_df_milestone1.head(15)

Unnamed: 0,rating,title,text,category,clean_review,clean_title,sentiment_labels,tokenized_review,model predictions
0,5.0,Such a lovely scent but not overpowering.,"This spray is really nice. It smells really good, goes on really fine, and does the trick. I will say it feels like you need a lot of it though to get the texture I want. I have a lot of hair, med...",All_Beauty,this spray is really nice it smells really good goes on really fine and does the trick i will say it feels like you need a lot of it though to get the texture i want i have a lot of hair medium th...,such a lovely scent but not overpowering,positive,"[this, spray, is, really, nice, it, smells, really, good, goes, on, really, fine, and, does, the, trick, i, will, say, it, feels, like, you, need, a, lot, of, it, though, to, get, the, texture, i,...",0
1,4.0,Works great but smells a little weird.,"This product does what I need it to do, I just wish it was odorless or had a soft coconut smell. Having my head smell like an orange coffee is offputting. (granted, I did know the smell was descri...",All_Beauty,this product does what i need it to do i just wish it was odorless or had a soft coconut smell having my head smell like an orange coffee is offputting granted i did know the smell was described b...,works great but smells a little weird,positive,"[this, product, does, what, i, need, it, to, do, i, just, wish, it, was, odorless, or, had, a, soft, coconut, smell, having, my, head, smell, like, an, orange, coffee, is, offputting, granted, i, ...",0
2,5.0,Yes!,"Smells good, feels great!",All_Beauty,smells good feels great,yes,positive,"[smells, good, feels, great]",1
3,1.0,Synthetic feeling,Felt synthetic,All_Beauty,felt synthetic,synthetic feeling,negative,"[felt, synthetic]",1
4,5.0,A+,Love it,All_Beauty,love it,a,positive,"[love, it]",1
5,4.0,Pretty Color,The polish was quiet thick and did not apply smoothly. I let dry overnight before adding a second coat since it was so thick.,All_Beauty,the polish was quiet thick and did not apply smoothly i let dry overnight before adding a second coat since it was so thick,pretty color,positive,"[the, polish, was, quiet, thick, and, did, not, apply, smoothly, i, let, dry, overnight, before, adding, a, second, coat, since, it, was, so, thick]",0
6,5.0,Handy,"Great for many tasks. I purchased these for makeup removal. No makeup on your washcloths. Disposable, so great for travel. Soft. Absorbant.",All_Beauty,great for many tasks i purchased these for makeup removal no makeup on your washcloths disposable so great for travel soft absorbant,handy,positive,"[great, for, many, tasks, i, purchased, these, for, makeup, removal, no, makeup, on, your, washcloths, disposable, so, great, for, travel, soft, absorbant]",0
7,3.0,Meh,These were lightweight and soft but much too small for my liking. I would have preferred two of these together to make one loc. For that reason I will not be repurchasing.,All_Beauty,these were lightweight and soft but much too small for my liking i would have preferred two of these together to make one loc for that reason i will not be repurchasing,meh,neutral,"[these, were, lightweight, and, soft, but, much, too, small, for, my, liking, i, would, have, preferred, two, of, these, together, to, make, one, loc, for, that, reason, i, will, not, be, repurcha...",0
8,5.0,Great for at home use and so easy to use!,This is perfect for my between salon visits. I have been using this now twice a week for over a month and I absolutely love it! My skin looks amazing and feels super smooth and silky. This is also...,All_Beauty,this is perfect for my between salon visits i have been using this now twice a week for over a month and i absolutely love it my skin looks amazing and feels super smooth and silky this is also su...,great for at home use and so easy to use,positive,"[this, is, perfect, for, my, between, salon, visits, i, have, been, using, this, now, twice, a, week, for, over, a, month, and, i, absolutely, love, it, my, skin, looks, amazing, and, feels, super...",0
9,5.0,Nice shampoo for the money,I get Keratin treatments at the salon at least 3-4 times a year (would do it more often if I could afford it). I am always in the market to use products that can help extend my salon visits. This ...,All_Beauty,i get keratin treatments at the salon at least 34 times a year would do it more often if i could afford it i am always in the market to use products that can help extend my salon visits this kerat...,nice shampoo for the money,positive,"[i, get, keratin, treatments, at, the, salon, at, least, 34, times, a, year, would, do, it, more, often, if, i, could, afford, it, i, am, always, in, the, market, to, use, products, that, can, hel...",0


### Checking clusters
Although we do 3 clusters, it doesn't directly mean positive, negative, neutral. It just pull similar thing closer

In [33]:
for c in range(3):
  print(f"Cluster {c}")
  print(reviews_df_milestone1[reviews_df_milestone1['model predictions']==c]["clean_review"].head(5).tolist())

Cluster 0
['this spray is really nice it smells really good goes on really fine and does the trick i will say it feels like you need a lot of it though to get the texture i want i have a lot of hair medium thickness i am comparing to other brands with yucky chemicals so im gonna stick with this try it', 'this product does what i need it to do i just wish it was odorless or had a soft coconut smell having my head smell like an orange coffee is offputting granted i did know the smell was described but i was hoping it would be light', 'the polish was quiet thick and did not apply smoothly i let dry overnight before adding a second coat since it was so thick', 'great for many tasks i purchased these for makeup removal no makeup on your washcloths disposable so great for travel soft absorbant', 'these were lightweight and soft but much too small for my liking i would have preferred two of these together to make one loc for that reason i will not be repurchasing']
Cluster 1
['smells good fee

### Cosine Similarity between sentiment label embeddings and clean review text embeddings

For this, we don't need training. We just need to use paraphrase-multilingual-MiniLM-L12-v2 encoder to encode the embeddings and calculate cosine similarity between them to determine what is the most similar one.

In [50]:
# labels = ['positive', 'negative', 'neutral']
# label_emb = model.encode(labels)
POS = [
  "This review is positive.", "I loved it.", "excellent, satisfied, would recommend",
  "great quality", "works perfectly"
]
NEU = [
  "This review is neutral.", "it is okay", "average, acceptable",
  "neither good nor bad"
]
NEG = [
  "This review is negative.", "I hated it.", "terrible, disappointed, refund",
  "poor quality", "does not work"
]

def proto_embed(texts):
    vecs = model.encode(texts)
    return np.mean(vecs, axis=0) # Calculate the mean of the embeddings

p_pos = proto_embed(POS)
p_neu = proto_embed(NEU)
p_neg = proto_embed(NEG)

# Stack the mean embeddings
protos = np.stack([p_pos, p_neu, p_neg])  # shape: [3, d]

In [52]:
# print(label_emb)
scores = data_emb @ protos.T                                  # [N, 3]
labels = np.array(["positive","neutral","negative"])
pred = labels[scores.argmax(axis=1)]
reviews_df_milestone1["sentiment_pred"] = pred

In [53]:
reviews_df_milestone1.head(15)

Unnamed: 0,rating,title,text,category,clean_review,clean_title,sentiment_labels,tokenized_review,model predictions,sentiment_pred
0,5.0,Such a lovely scent but not overpowering.,"This spray is really nice. It smells really good, goes on really fine, and does the trick. I will say it feels like you need a lot of it though to get the texture I want. I have a lot of hair, med...",All_Beauty,this spray is really nice it smells really good goes on really fine and does the trick i will say it feels like you need a lot of it though to get the texture i want i have a lot of hair medium th...,such a lovely scent but not overpowering,positive,"[this, spray, is, really, nice, it, smells, really, good, goes, on, really, fine, and, does, the, trick, i, will, say, it, feels, like, you, need, a, lot, of, it, though, to, get, the, texture, i,...",0,positive
1,4.0,Works great but smells a little weird.,"This product does what I need it to do, I just wish it was odorless or had a soft coconut smell. Having my head smell like an orange coffee is offputting. (granted, I did know the smell was descri...",All_Beauty,this product does what i need it to do i just wish it was odorless or had a soft coconut smell having my head smell like an orange coffee is offputting granted i did know the smell was described b...,works great but smells a little weird,positive,"[this, product, does, what, i, need, it, to, do, i, just, wish, it, was, odorless, or, had, a, soft, coconut, smell, having, my, head, smell, like, an, orange, coffee, is, offputting, granted, i, ...",0,positive
2,5.0,Yes!,"Smells good, feels great!",All_Beauty,smells good feels great,yes,positive,"[smells, good, feels, great]",1,positive
3,1.0,Synthetic feeling,Felt synthetic,All_Beauty,felt synthetic,synthetic feeling,negative,"[felt, synthetic]",1,positive
4,5.0,A+,Love it,All_Beauty,love it,a,positive,"[love, it]",1,positive
5,4.0,Pretty Color,The polish was quiet thick and did not apply smoothly. I let dry overnight before adding a second coat since it was so thick.,All_Beauty,the polish was quiet thick and did not apply smoothly i let dry overnight before adding a second coat since it was so thick,pretty color,positive,"[the, polish, was, quiet, thick, and, did, not, apply, smoothly, i, let, dry, overnight, before, adding, a, second, coat, since, it, was, so, thick]",0,negative
6,5.0,Handy,"Great for many tasks. I purchased these for makeup removal. No makeup on your washcloths. Disposable, so great for travel. Soft. Absorbant.",All_Beauty,great for many tasks i purchased these for makeup removal no makeup on your washcloths disposable so great for travel soft absorbant,handy,positive,"[great, for, many, tasks, i, purchased, these, for, makeup, removal, no, makeup, on, your, washcloths, disposable, so, great, for, travel, soft, absorbant]",0,positive
7,3.0,Meh,These were lightweight and soft but much too small for my liking. I would have preferred two of these together to make one loc. For that reason I will not be repurchasing.,All_Beauty,these were lightweight and soft but much too small for my liking i would have preferred two of these together to make one loc for that reason i will not be repurchasing,meh,neutral,"[these, were, lightweight, and, soft, but, much, too, small, for, my, liking, i, would, have, preferred, two, of, these, together, to, make, one, loc, for, that, reason, i, will, not, be, repurcha...",0,negative
8,5.0,Great for at home use and so easy to use!,This is perfect for my between salon visits. I have been using this now twice a week for over a month and I absolutely love it! My skin looks amazing and feels super smooth and silky. This is also...,All_Beauty,this is perfect for my between salon visits i have been using this now twice a week for over a month and i absolutely love it my skin looks amazing and feels super smooth and silky this is also su...,great for at home use and so easy to use,positive,"[this, is, perfect, for, my, between, salon, visits, i, have, been, using, this, now, twice, a, week, for, over, a, month, and, i, absolutely, love, it, my, skin, looks, amazing, and, feels, super...",0,positive
9,5.0,Nice shampoo for the money,I get Keratin treatments at the salon at least 3-4 times a year (would do it more often if I could afford it). I am always in the market to use products that can help extend my salon visits. This ...,All_Beauty,i get keratin treatments at the salon at least 34 times a year would do it more often if i could afford it i am always in the market to use products that can help extend my salon visits this kerat...,nice shampoo for the money,positive,"[i, get, keratin, treatments, at, the, salon, at, least, 34, times, a, year, would, do, it, more, often, if, i, could, afford, it, i, am, always, in, the, market, to, use, products, that, can, hel...",0,positive


### Model performance compared with rating labels

In [54]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

y_true = reviews_df_milestone1['sentiment_labels']
y_pred = reviews_df_milestone1['sentiment_pred']

accuracy = accuracy_score(y_true, y_pred)
print(f"Accuracy: {accuracy:.2%}")

classification_report = classification_report(y_true, y_pred)
print(f"Classification Report: {classification_report}")

confusion_matrix = confusion_matrix(y_true, y_pred)
print(f"Confusion Matrix: {confusion_matrix}")


Accuracy: 74.36%
Classification Report:               precision    recall  f1-score   support

    negative       0.41      0.70      0.52     35388
     neutral       0.14      0.21      0.17     26023
    positive       0.93      0.80      0.86    278589

    accuracy                           0.74    340000
   macro avg       0.49      0.57      0.51    340000
weighted avg       0.82      0.74      0.77    340000

Confusion Matrix: [[ 24893   4746   5749]
 [ 10202   5520  10301]
 [ 25801  30372 222416]]
