In [None]:
import pandas as pd
import re

# Load the raw reviews export. The relative path assumes Reviews.csv sits next to
# the notebook; when the file lives on Google Drive, mount it and adjust the path.
df = pd.read_csv("Reviews.csv")

# Preview the first rows. Only the last expression of a cell is rendered, so the
# column listing is left to the following cell instead of a bare `df.columns` here.
df.head()



Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [None]:
# Inspect the column names before choosing which fields to keep.
df.columns

Index(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
       'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text'],
      dtype='object')

In [None]:
# Keep only the three fields used downstream and give them analysis-friendly names.
# The mapping's key order doubles as the column order of the resulting frame.
column_names = {'Text': 'review_text', 'Score': 'rating', 'Time': 'timestamp'}
df = df[list(column_names)].rename(columns=column_names)

In [None]:
# Discard rows whose review body is missing, then rows that are only whitespace.
df = df.dropna(subset=['review_text'])
has_text = df['review_text'].str.strip().ne('')
df = df[has_text]

# Clean text
def clean_text(text):
    """Normalise a raw review into lowercase, letters-only, single-spaced text.

    Steps, in order: lowercase the text, strip links, drop every character
    that is not ``a-z`` or whitespace, then collapse whitespace runs and trim.

    Args:
        text (str): Raw review text.

    Returns:
        str: The cleaned text; may be empty when nothing survives cleaning.
    """
    text = text.lower()
    # Anchor both alternatives at a word boundary and require the dot after
    # "www", so elongated words such as "awwww" are not mistaken for links.
    text = re.sub(r"\bhttp\S+|\bwww\.\S+", "", text)  # remove links
    text = re.sub(r"[^a-z\s]", "", text)              # remove punctuation/numbers
    text = re.sub(r"\s+", " ", text).strip()          # single spaces, no edges
    return text

# Normalise every review; astype(str) guards against non-string cells.
# assign() returns a fresh frame, so the column writes below do not land on a
# filtered view of the frame produced by the row filters above.
df = df.assign(clean_text=df['review_text'].astype(str).apply(clean_text))

# Add word count and keep only reviews with at least five words.
df['word_count'] = df['clean_text'].str.split().str.len()
df = df[df['word_count'] >= 5].copy()

# Convert Unix timestamp (seconds since the epoch) to datetimes.
# pd.to_datetime needs no extra import and interprets the epoch as UTC, so the
# result does not depend on the timezone of the machine running the notebook.
df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s')

# Preview
df[['timestamp', 'rating', 'clean_text', 'word_count']].head()

Unnamed: 0,timestamp,rating,clean_text,word_count
0,2011-04-27,5,i have bought several of the vitality canned d...,48
1,2012-09-07,1,product arrived labeled as jumbo salted peanut...,31
2,2008-08-18,4,this is a confection that has been around a fe...,92
3,2011-06-13,2,if you are looking for the secret ingredient i...,41
4,2012-10-21,5,great taffy at a great price there was a wide ...,27


In [None]:
# Persist the cleaned reviews without the positional index.
clean_export_columns = ['timestamp', 'rating', 'clean_text', 'word_count']
df[clean_export_columns].to_csv('clean_reviews.csv', index=False)

Sentiment Analysis

In [None]:
from textblob import TextBlob

def get_sentiment(text):
    """Bucket a text into a sentiment label using its TextBlob polarity.

    Args:
        text (str): Text to score.

    Returns:
        str: "positive" when polarity is above 0.1, "negative" when it is
        below -0.1, and "neutral" for everything in between (bounds included).
    """
    score = TextBlob(text).sentiment.polarity
    if score > 0.1:
        return "positive"
    if score < -0.1:
        return "negative"
    # Scores inside the [-0.1, 0.1] dead band are treated as no clear sentiment.
    return "neutral"

# Label every cleaned review, then preview the text next to its label.
df['sentiment'] = df['clean_text'].map(get_sentiment)
df.loc[:, ['clean_text', 'sentiment']].head()

Unnamed: 0,clean_text,sentiment
0,i have bought several of the vitality canned d...,positive
1,product arrived labeled as jumbo salted peanut...,neutral
2,this is a confection that has been around a fe...,positive
3,if you are looking for the secret ingredient i...,positive
4,great taffy at a great price there was a wide ...,positive


Topic Extraction

In [None]:
def extract_topic(text):
    """Assign a coarse topic to a review from keyword rules.

    Rules are checked in order and the first match wins. Keywords are matched
    as whole words (case-insensitively), so "late" does not fire on words that
    merely contain it, such as "chocolate" or "plate".

    Args:
        text (str): Review text.

    Returns:
        str: One of "delivery issue", "pricing", "product quality",
        "customer service", or "other" when no rule matches.
    """
    text = text.lower()
    # (topic, pattern) pairs in priority order; \b keeps matches on word boundaries.
    topic_patterns = (
        ("delivery issue", r"\b(?:delivery|late)\b"),
        ("pricing", r"\b(?:price|expensive)\b"),
        ("product quality", r"\b(?:quality|broken)\b"),
        ("customer service", r"\b(?:support|customer service)\b"),
    )
    for topic, pattern in topic_patterns:
        if re.search(pattern, text):
            return topic
    return "other"

# Tag each cleaned review with its keyword-derived topic.
df['topic'] = df['clean_text'].map(extract_topic)

In [None]:
# Persist the reviews together with their sentiment and topic labels (no index column).
labeled_export_columns = ['timestamp', 'rating', 'clean_text', 'sentiment', 'topic']
df[labeled_export_columns].to_csv('labeled_reviews.csv', index=False)

Install Hugging Face Transformers and PyTorch
Run the cell below once, either in Colab or in your local environment:

In [None]:
!pip install transformers
!pip install torch

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

# Pretrained Sentiment Model

In [None]:
from transformers import pipeline

# Load sentiment model. The checkpoint and revision are pinned to the ones the
# library reports falling back to when none is given, so results stay
# reproducible even if the library's default model changes.
sentiment_pipe = pipeline(
    "sentiment-analysis",
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
    revision="714eb0f",
)

# Test on one row
sentiment_pipe("This product is amazing. I love it!")

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Device set to use cpu


[{'label': 'POSITIVE', 'score': 0.9998857975006104}]

In [None]:
# Use on a sample first; the size is capped at the number of available rows so
# the cell also works on frames with fewer than 20 reviews.
sample_df = df.sample(min(20, len(df)), random_state=42).copy()

# Apply sentiment analysis in one batched call. truncation=True lets the
# tokenizer cut over-long reviews at the model's token limit, instead of
# slicing the first 512 characters and discarding text the model could use.
predictions = sentiment_pipe(sample_df['clean_text'].tolist(), truncation=True)
sample_df['sentiment'] = [prediction['label'].lower() for prediction in predictions]
sample_df[['clean_text', 'sentiment']].head()

Unnamed: 0,clean_text,sentiment
12933,ive tried all sorts of bottled and canned pizz...,negative
516,these were nasty they were so greasy and too r...,negative
17180,this was my second purchase of this product my...,positive
25607,we have bought baklawa from shatila many times...,positive
24352,these are so delicious and not too bad for you...,positive


 Topic Classification

In [None]:
def extract_topic(text):
    """Assign a coarse topic to a review from keyword rules.

    Note: this redefines the `extract_topic` declared earlier in the notebook,
    with a shorter "delivery" label and "help" as a customer-service keyword.

    Rules are checked in order and the first match wins. Keywords are matched
    as whole words (case-insensitively), so "late" does not fire on
    "chocolate" and "help" does not fire on "helpful".

    Args:
        text (str): Review text.

    Returns:
        str: One of "delivery", "pricing", "product quality",
        "customer service", or "other" when no rule matches.
    """
    text = text.lower()
    # (topic, pattern) pairs in priority order; \b keeps matches on word boundaries.
    topic_patterns = (
        ("delivery", r"\b(?:late|delivery)\b"),
        ("pricing", r"\b(?:price|expensive)\b"),
        ("product quality", r"\b(?:broken|quality)\b"),
        ("customer service", r"\b(?:support|help)\b"),
    )
    for topic, pattern in topic_patterns:
        if re.search(pattern, text):
            return topic
    return "other"

# Tag each sampled review with its keyword-derived topic.
sample_df['topic'] = sample_df['clean_text'].map(extract_topic)

In [None]:
# Persist the labelled sample (sentiment and topic columns included) without the index.
labeled_sample_columns = ['timestamp', 'rating', 'clean_text', 'sentiment', 'topic']
sample_df[labeled_sample_columns].to_csv("labeled_reviews_free.csv", index=False)