In [None]:
# This script was generated using AI.
# Reason: the dataset is large, so it was difficult to process on a CPU. The cloud GPU had a 3-hour limit, which was insufficient to process this dataset.
# So we used the local GPU on one of our machines to process the dataset.

import torch
# Report whether a CUDA device is usable and, if so, which one.
cuda_available = torch.cuda.is_available()
print(cuda_available)                          # Should print True when a CUDA GPU is usable
if cuda_available:
    print(torch.cuda.get_device_name(0))       # Should print your GPU name
else:
    # Querying the device name without a CUDA device raises, so report it instead.
    print("No CUDA device available")

True
NVIDIA GeForce RTX 3050 Laptop GPU


In [4]:
import json
import torch
import hashlib
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm

# Tunable batch sizes
READ_BATCH_SIZE = 500       # Number of records to read from input before predicting
PREDICTION_BATCH_SIZE = 64  # Increase if you have enough GPU memory

# Run on the GPU when one is available, otherwise fall back to the CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load the tokenizer and the classification model from the same checkpoint
model_name = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Move the weights to the chosen device and switch to inference mode
model.to(device)
model.eval()

# Define dataset and prediction function
class TextDataset(Dataset):
    """Dataset over a list of texts that are tokenized once, at construction time.

    Args:
        texts: the strings to encode.
        tokenizer: callable invoked as
            ``tokenizer(texts, truncation=True, padding=True, max_length=..., return_tensors='pt')``;
            its result must support ``['input_ids']`` and ``.items()``.
        max_length: maximum sequence length forwarded to the tokenizer.
    """

    def __init__(self, texts, tokenizer, max_length=512):
        tokenizer_options = dict(
            truncation=True,
            padding=True,
            max_length=max_length,
            return_tensors='pt',
        )
        self.encodings = tokenizer(texts, **tokenizer_options)

    def __len__(self):
        # One row of input_ids per encoded text.
        row_count = self.encodings['input_ids'].shape[0]
        return row_count

    def __getitem__(self, idx):
        # Slice every encoded field at the same position.
        item = {}
        for field, values in self.encodings.items():
            item[field] = values[idx]
        return item

def predict_sentiment_batch(texts):
    """Classify each text as "Negative", "Neutral" or "Positive".

    The texts are tokenized via ``TextDataset`` and fed to the module-level
    ``model`` in chunks of ``PREDICTION_BATCH_SIZE`` on ``device``.

    Args:
        texts: list of strings to classify.

    Returns:
        A list with one sentiment label per input text. An empty input
        returns an empty list without calling the tokenizer or the model.
    """
    if len(texts) == 0:
        return []

    sentiment_map = {0: "Negative", 1: "Neutral", 2: "Positive"}
    dataset = TextDataset(texts, tokenizer)
    dataloader = DataLoader(dataset, batch_size=PREDICTION_BATCH_SIZE)
    predictions = []

    with torch.no_grad():
        for batch in dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            logits = model(**batch).logits
            # Softmax is monotonic, so the arg-max of the raw logits is the
            # predicted class; no need to normalise first.
            # tolist() transfers the whole batch of class ids in one step
            # instead of one .item() call per prediction.
            class_ids = torch.argmax(logits, dim=-1).tolist()
            predictions.extend(sentiment_map[class_id] for class_id in class_ids)

    return predictions

# File paths
input_path = 'dataset_df.json'
output_path = 'sentiment_output_streamed.json'


def load_processed_hashes(path):
    """Collect the MD5 hex digest of the "text" field of each record in a JSON Lines file.

    Blank lines, lines that are not valid JSON (for example a final line cut
    short by an interrupted run) and records without a string "text" field
    are ignored rather than aborting the resume.

    Args:
        path: path of the JSON Lines file written by a previous run.

    Returns:
        A set of hex digests; empty when the file does not exist.
    """
    hashes = set()
    try:
        with open(path, 'r', encoding='utf-8') as outf:
            for line in outf:
                line = line.strip()
                if not line:
                    continue
                try:
                    record = json.loads(line)
                except json.JSONDecodeError:
                    # Unparseable line: treat the record as not yet processed.
                    continue
                text = record.get('text') if isinstance(record, dict) else None
                if isinstance(text, str):
                    hashes.add(hashlib.md5(text.encode('utf-8')).hexdigest())
    except FileNotFoundError:
        pass  # New file will be created
    return hashes


# Step 1: Load already-processed line hashes
existing_hashes = load_processed_hashes(output_path)

# Step 2: Stream the input, skip already processed lines, append predictions
#
# NOTE: resume is keyed on the MD5 of the text alone, so an input record whose
# text equals that of a record already present in the output is skipped.

def _needs_leading_newline(path):
    """Return True when `path` exists, is non-empty and does not end with a newline."""
    try:
        with open(path, 'rb') as fh:
            fh.seek(0, 2)  # 2 == SEEK_END
            if fh.tell() == 0:
                return False
            fh.seek(-1, 2)
            return fh.read(1) != b'\n'
    except FileNotFoundError:
        return False


def _write_batch(records, texts, outfile):
    """Predict sentiment for `texts`, tag the matching `records` and append them as JSON lines."""
    sentiments = predict_sentiment_batch(texts)
    for record, sentiment in zip(records, sentiments):
        record["sentiment"] = sentiment
        json.dump(record, outfile, ensure_ascii=False)
        outfile.write('\n')
    # Push the finished batch to disk so an interruption loses as little as possible.
    outfile.flush()


batch_texts = []
batch_records = []
skipped_records = 0  # valid JSON lines that carry no usable "text" field

# If a previous run stopped mid-write, the output ends with a partial line;
# terminate it so new records are not fused onto it.
repair_partial_line = _needs_leading_newline(output_path)

with open(input_path, 'r', encoding='utf-8') as infile, open(output_path, 'a', encoding='utf-8') as outfile:
    if repair_partial_line:
        outfile.write('\n')

    for line in tqdm(infile, desc="Processing"):
        line = line.strip()
        if not line:
            continue
        try:
            record = json.loads(line)
        except json.JSONDecodeError:
            continue

        # A record without a string "text" cannot be classified; skip it
        # instead of aborting the whole run.
        text = record.get("text") if isinstance(record, dict) else None
        if not isinstance(text, str):
            skipped_records += 1
            continue

        text_hash = hashlib.md5(text.encode('utf-8')).hexdigest()
        if text_hash in existing_hashes:
            continue  # Skip already processed line

        batch_texts.append(text)
        batch_records.append(record)

        if len(batch_texts) >= READ_BATCH_SIZE:
            _write_batch(batch_records, batch_texts, outfile)
            batch_texts.clear()
            batch_records.clear()

    # Process leftover records
    if batch_texts:
        _write_batch(batch_records, batch_texts, outfile)

if skipped_records:
    print("Skipped records without a text field:", skipped_records)


Processing: 586714it [5:52:51, 27.71it/s] 


In [2]:
import pandas as pd
import json

# Read the streamed predictions (one JSON object per line) into a DataFrame.
# The path uses a forward slash so it resolves on Windows as well as on
# POSIX systems; a backslash separator only works on Windows.
data = []
with open('full_data/sentiment_output_streamed.json', 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if line:  # skip blank lines
            try:
                data.append(json.loads(line))
            except json.JSONDecodeError:
                # Report and drop lines that are not valid JSON.
                print("Skipping invalid line:", line)
sentiment_df = pd.DataFrame(data)

In [3]:
# Preview the loaded reviews together with their predicted sentiment
sentiment_df

Unnamed: 0,user_id,business_id,stars,useful,funny,cool,text,date,name,review_count,categories,address,city,state,postal_code,sentiment
0,QzCEzH3R7Z6erOGLr3t55Q,0pMj5xUAecW9o1P35B0AMw,5.0,1,0,1,Great staff always helps and always nice. Alwa...,2017-05-26 13:10:24,Wawa,8,"Food, Coffee & Tea, Gas Stations, Restaurants,...",2544 W Main Street,Norristown,PA,19403,Positive
1,3MpDvy5gEdsbZh9-p92dHg,8QnuWGVNBhzyYXGSeRdi4g,4.0,0,0,0,After my ROTD yesterday of a different Sweet ...,2013-10-24 19:24:33,Sweet Cece's,17,"Food, Ice Cream & Frozen Yogurt",7114 Hwy 70 S,Nashville,TN,37221,Positive
2,bCla27ma_6i_QFrGkILKrQ,sLgnx_WFCjEoPsS6NwU70Q,5.0,0,0,0,Our family returned for breakfast again this w...,2014-10-27 16:31:37,Le Peep,259,"Event Planning & Services, Salad, Caterers, Am...",3036 N Eagle Rd,Meridian,ID,83646,Positive
3,UsBxLh14sUpO8SdeqIiGOA,Wy8Hswf2cLQGRZN6armkag,1.0,1,0,0,"If I could give it a zero, I would. I order a ...",2011-08-24 23:07:08,Jack in the Box,86,"Restaurants, Fast Food, Mexican, Tacos, Burger...",6875 Hollister Ave,Goleta,CA,93117,Negative
4,mEOMAeEonZoUx2nPM3v6fg,f-WhNOSwN1aB4nRFekf01g,4.0,0,0,0,Id you haven't been to the Smoothie King cente...,2015-03-19 00:30:09,Smoothie King,50,"Arts & Entertainment, Ticket Sales, Food, Juic...",1501 Dave Dixon Drive Space 101-102,New Orleans,LA,70113,Positive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
586656,6YRmtPVvnD_mexQifxAJsg,IkjBNJvMZhea1c3j2H2Ahw,3.0,0,0,0,"Besides it being super busy often, usually, I ...",2022-01-10 12:51:07,Starbucks,31,"Food, Coffee & Tea",2350 South Grand Ave,St. Louis,MO,63104,Positive
586657,6Hmh8UC5K0oBgp3_D5doDQ,BxveKq0rKp52EWooIDK54w,2.0,3,1,0,Where do i begin. Overall the food was fine. I...,2019-12-08 19:24:12,Denny's,88,"Breakfast & Brunch, American (Traditional), Re...",3155 E Fairview Ave,Meridian,ID,83642,Negative
586658,UfevNSM_H14XXWZFlHYPoA,TNtcjnta11CpDebuBNdoug,2.0,0,1,0,Our waiter was good but a bit rushed. I didn't...,2017-10-22 20:55:12,Red Robin Gourmet Burgers and Brews,83,"Home Services, American (New), Contractors, Am...",130 Gravois Bluffs Cir,Fenton,MO,63026,Negative
586659,LHWtjTG7e1NzNPYUbUo-9w,rgeuy1qbw6Z8B6CSVANHIA,5.0,1,1,1,I've been to the other Federal Donuts location...,2012-10-13 14:39:37,Federal Donuts,1464,"Donuts, Sandwiches, Soul Food, Food, Coffee & ...",1632 Sansom St,Philadelphia,PA,19103,Positive


In [6]:
# Ordinal encoding of the predicted labels: Negative=1, Neutral=2, Positive=3
sentiment_score_map = dict(Negative=1, Neutral=2, Positive=3)

# Look up the score of every label and store it as a new column
sentiment_scores = sentiment_df["sentiment"].map(sentiment_score_map)
sentiment_df["sentiment_score"] = sentiment_scores

In [7]:
def convert_to_3_scale(score):
    """Collapse a 1-5 rating onto a 3-point scale.

    1 or 2 -> 1, 3 -> 2, 4 or 5 -> 3. Any other value yields None.
    """
    if score in (1, 2):
        return 1
    if score == 3:
        return 2
    return 3 if score in (4, 5) else None

# Collapse the star rating onto the 3-point scale (only the "stars" column is converted here)
sentiment_df["stars_3_scale"] = sentiment_df["stars"].apply(convert_to_3_scale)

In [8]:
# Persist the scored frame as JSON Lines (orient='records', one record per line)
sentiment_df.to_json('sentiment_score.json', orient='records', lines=True)

In [9]:
from sklearn.metrics import accuracy_score

# Compare the collapsed star rating (reference) with the predicted sentiment score
stars_as_int = sentiment_df["stars_3_scale"].astype(int)
sentiment_as_int = sentiment_df["sentiment_score"].astype(int)

accuracy = accuracy_score(stars_as_int, sentiment_as_int)
print("Exact Match Accuracy:", accuracy)

Exact Match Accuracy: 0.8222619195753595
