In [None]:
# Colab-only cell: prompt for a file upload from the local machine.
# The recorded output shows the chosen file being saved as balanced30k.csv,
# which is the file name the next cell reads.
from google.colab import files
import pandas as pd
uploaded = files.upload()

Saving balanced30k.csv to balanced30k.csv


In [None]:
# The first CSV column is an unnamed, previously saved row index (it would
# otherwise be loaded as a data column called "Unnamed: 0"), so use it as the
# DataFrame index instead of carrying it around as a feature.
df = pd.read_csv("balanced30k.csv", index_col=0)
df.head()

Unnamed: 0,Title,review/score,review/text,categories,word_count,label
0,The Good Earth,2.0,The book The Good Earth is about the rise and ...,Juvenile Fiction,262,0
1,Like Water for Chocolate,4.0,"Don't know why this book sticks in my mind, bu...",Fiction,55,2
2,Shadowy Horses,4.0,By far the best of the latest crop of Romantic...,Fiction,247,2
3,"The Hobbitt, or there and back again; illustra...",5.0,"Classic tale, great illustrations. A known qua...",Fiction,23,2
4,Shamanspace,1.0,"For the past year, I had come across Steve Ayl...",Fiction,226,0


In [None]:
# Preprocessing with basic text cleanup
import math
import re

def clean_for_bert(text):
    """Return `text` as a single-line string with normalized whitespace.

    Every run of whitespace (spaces, tabs, newlines) is collapsed to one
    space and leading/trailing whitespace is removed. Punctuation and casing
    are left untouched.

    Missing values (``None`` or a float NaN) yield an empty string rather
    than the literal text "None"/"nan", so a missing review is never passed
    on as if it were a word.

    Args:
        text: The raw review; non-string values are converted with ``str``.

    Returns:
        The cleaned string (possibly empty).
    """
    if text is None or (isinstance(text, float) and math.isnan(text)):
        return ""
    text = re.sub(r'\s+', ' ', str(text))  # normalize spaces
    return text.strip()

In [None]:
# Store a whitespace-normalized copy of every review next to the raw text,
# then preview the first rows to check the new column.
df['cleaned_text'] = df['review/text'].map(clean_for_bert)
df.head()

Unnamed: 0,Title,review/score,review/text,categories,word_count,label,cleaned_text
0,The Good Earth,2.0,The book The Good Earth is about the rise and ...,Juvenile Fiction,262,0,The book The Good Earth is about the rise and ...
1,Like Water for Chocolate,4.0,"Don't know why this book sticks in my mind, bu...",Fiction,55,2,"Don't know why this book sticks in my mind, bu..."
2,Shadowy Horses,4.0,By far the best of the latest crop of Romantic...,Fiction,247,2,By far the best of the latest crop of Romantic...
3,"The Hobbitt, or there and back again; illustra...",5.0,"Classic tale, great illustrations. A known qua...",Fiction,23,2,"Classic tale, great illustrations. A known qua..."
4,Shamanspace,1.0,"For the past year, I had come across Steve Ayl...",Fiction,226,0,"For the past year, I had come across Steve Ayl..."


In [None]:
# Split the dataset
from sklearn.model_selection import train_test_split

# Feed the models the whitespace-normalized reviews produced by
# clean_for_bert; previously the raw 'review/text' column was used here and
# the 'cleaned_text' column was computed but never consumed.
X = df['cleaned_text']
y = df['label']

# First hold out 30% of the rows, stratified so the label proportions are kept.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
# Then cut the held-out part in half: 15% validation and 15% test overall.
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

In [None]:
!pip install transformers



In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline
import torch

# Hub identifier of the first zero-shot baseline (Twitter RoBERTa sentiment).
# NOTE(review): the prediction cell further down parses this model's labels as
# an integer after the last underscore -- confirm against the model's config.
model_name = "cardiffnlp/twitter-roberta-base-sentiment"

# Load tokenizer and model separately (both from the same checkpoint) so they
# can be handed to the pipeline explicitly.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

In [None]:
# Create pipeline with truncation enabled
classifier = TextClassificationPipeline(
    model=model,
    tokenizer=tokenizer,
    # top_k=None returns the score of every class for each input; it replaces
    # the deprecated return_all_scores=True. Downstream code takes the max by
    # score, so the order of the returned entries does not matter.
    top_k=None,
    truncation=True,   # cut reviews that exceed max_length instead of failing
    max_length=512,
    device=0 if torch.cuda.is_available() else -1   # first GPU if present, else CPU
)

Device set to use cpu


In [None]:
# Run the classifier over the test reviews in slices of `batch_size` texts,
# collecting one prediction entry per review in `results`.
from tqdm import tqdm  # progress bar only; does not affect the predictions

batch_size = 16
texts_to_predict = X_test.tolist()
results = []

batch_starts = range(0, len(texts_to_predict), batch_size)
for start in tqdm(batch_starts):
    batch = texts_to_predict[start:start + batch_size]
    results.extend(classifier(batch))

100%|██████████| 282/282 [47:02<00:00, 10.01s/it]


In [None]:
# Turn each review's per-class scores into a single integer class id:
# keep the highest-scoring entry and read the number after the last
# underscore of its label (e.g. a label ending in "_2" becomes 2).
y_pred = []
for class_scores in results:
    top_class = max(class_scores, key=lambda entry: entry['score'])
    y_pred.append(int(top_class['label'].rsplit('_', 1)[-1]))

In [None]:
# Per-class precision/recall/F1 of the predictions against the test labels
from sklearn.metrics import classification_report

print("\n Zero-Shot Twitter Transformer Performance on Test Set:")
report_text = classification_report(y_test.tolist(), y_pred, zero_division=0)
print(report_text)


 Zero-Shot Twitter Transformer Performance on Test Set:
              precision    recall  f1-score   support

           0       0.66      0.75      0.70      1500
           1       0.46      0.29      0.36      1500
           2       0.62      0.76      0.68      1500

    accuracy                           0.60      4500
   macro avg       0.58      0.60      0.58      4500
weighted avg       0.58      0.60      0.58      4500



In [None]:
# Using pre-trained multilingual review-based sentiment model.
# This rebinds `model_name`, `tokenizer` and `model`, replacing the checkpoint
# loaded earlier in the notebook.
# NOTE(review): the mapping cell further down expects labels containing
# "N star(s)" -- confirm against the model's config.
model_name = "nlptown/bert-base-multilingual-uncased-sentiment"

# Load tokenizer and model separately (both from the same checkpoint) so they
# can be handed to the pipeline explicitly.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/953 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/669M [00:00<?, ?B/s]

In [None]:
# Create pipeline with truncation enabled
classifier = TextClassificationPipeline(
    model=model,
    tokenizer=tokenizer,
    # top_k=None returns the score of every class for each input; it replaces
    # the deprecated return_all_scores=True. Downstream code takes the max by
    # score, so the order of the returned entries does not matter.
    top_k=None,
    truncation=True,   # cut reviews that exceed max_length instead of failing
    max_length=512,
    device=0 if torch.cuda.is_available() else -1   # first GPU if present, else CPU
)

Device set to use cpu


In [None]:
# Run the classifier over the test reviews in slices of `batch_size` texts,
# collecting one prediction entry per review in `results`.
from tqdm import tqdm  # progress bar only; does not affect the predictions

batch_size = 16
texts_to_predict = X_test.tolist()
results = []

batch_starts = range(0, len(texts_to_predict), batch_size)
for start in tqdm(batch_starts):
    batch = texts_to_predict[start:start + batch_size]
    results.extend(classifier(batch))

100%|██████████| 282/282 [48:33<00:00, 10.33s/it]


In [None]:
# Map 1 and 2 stars to 0, 3 stars to 1, 4 and 5 stars to 2
def map_star_label(star_label):
    """Collapse a star-rating label into the 3-class sentiment id.

    Args:
        star_label: Label text containing "1 star", "2 stars", "3 stars",
            "4 stars" or "5 stars".

    Returns:
        0 (negative) for 1-2 stars, 1 (neutral) for 3 stars,
        2 (positive) for 4-5 stars.

    Raises:
        ValueError: If the label contains none of the five star ratings.
            This used to fall through to "positive", which would silently
            mislabel predictions that came from a model with a different
            label scheme.
    """
    if "1 star" in star_label or "2 stars" in star_label:
        return 0  # Negative
    if "3 stars" in star_label:
        return 1  # Neutral
    if "4 stars" in star_label or "5 stars" in star_label:
        return 2  # Positive
    raise ValueError(f"Unrecognized star label: {star_label!r}")

# For every review keep the highest-scoring star class and collapse it to
# the 0/1/2 sentiment id used by the test labels.
y_pred = []
for class_scores in results:
    top_class = max(class_scores, key=lambda entry: entry['score'])
    y_pred.append(map_star_label(top_class['label']))

In [None]:
# Per-class precision/recall/F1 of the predictions against the test labels
print("\n Zero-shot Review-Based Transformer Performance on Test Set:")
report_text = classification_report(y_test.tolist(), y_pred, zero_division=0)
print(report_text)


 Zero-shot Review-Based Transformer Performance on Test Set:
              precision    recall  f1-score   support

           0       0.74      0.79      0.76      1500
           1       0.68      0.50      0.58      1500
           2       0.73      0.87      0.79      1500

    accuracy                           0.72      4500
   macro avg       0.72      0.72      0.71      4500
weighted avg       0.72      0.72      0.71      4500

