# Ensemble Sentiment Analyser

## Author: Felipe Valencia

This project has the purpose to test the accuracy of several sentiment analysis libraries and to create an ensemble-like model to get the best outcome for classification of sentiment into a 5-star metric.

In [2]:
# Install libraries
#!pip install torch
#!pip install transformers

In [3]:
# Load libraries
import pandas as pd


In [4]:
# Read CSV

data_file = pd.read_csv("Datafiniti_Hotel_Reviews.csv")

In [5]:
# Convert ratings from float to integer

data_file['reviews.rating'] = data_file['reviews.rating'].astype(int)

# Convert text to string

data_file['reviews.text'] = data_file['reviews.text'].astype(str)

In [6]:
# Simplify the dataframe

data = data_file[['id', 'reviews.rating', 'reviews.text']]

In [7]:
data

Unnamed: 0,id,reviews.rating,reviews.text
0,AVwc252WIN2L1WUfpqLP,5,Our experience at Rancho Valencia was absolute...
1,AVwc252WIN2L1WUfpqLP,5,Amazing place. Everyone was extremely warm and...
2,AVwc252WIN2L1WUfpqLP,5,We booked a 3 night stay at Rancho Valencia to...
3,AVwdOclqIN2L1WUfti38,2,Currently in bed writing this for the past hr ...
4,AVwdOclqIN2L1WUfti38,5,I live in Md and the Aloft is my Home away fro...
...,...,...,...
9995,AVwd4TMv_7pvs4fz-Ers,3,It is hard for me to review an oceanfront hote...
9996,AVwdRp4DIN2L1WUfuGZZ,4,"I live close by, and needed to stay somewhere ..."
9997,AVwd1TbkByjofQCxs6FH,4,Rolled in 11:30 laid out heads down woke up to...
9998,AVwdHbizIN2L1WUfsXto,1,Absolutely terrible..I was told I was being gi...


In [8]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from tqdm import tqdm
import time
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
# Load model and tokenizer
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")

In [10]:
# Move to GPU if available
if torch.cuda.is_available():
    model = model.cuda()

def process_in_batches(texts, batch_size=64):
    sentiments = []
    texts_list = texts.tolist()
    
    for i in tqdm(range(0, len(texts_list), batch_size)):
        batch = texts_list[i:i + batch_size]
        inputs = tokenizer(
            batch,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt"
        )
        
        if torch.cuda.is_available():
            inputs = {k: v.cuda() for k, v in inputs.items()}
        
        with torch.no_grad():
            logits = model(**inputs).logits
            probabilities = torch.nn.functional.softmax(logits, dim=-1)
            positive_probs = probabilities[:, 1].cpu()
            
            for prob in positive_probs:
                prob = prob.item()
                if prob > 0.8:
                    sentiments.append(5)
                elif prob > 0.6:
                    sentiments.append(4)
                elif prob > 0.4:
                    sentiments.append(3)
                elif prob < 0.4:
                    sentiments.append(2)
                else:
                    sentiments.append(1)
    
    return sentiments

In [11]:
# Process the reviews
print("Starting sentiment analysis...")
start_time = time.time()

valid_reviews = data['reviews.text'].dropna()
valid_indices = valid_reviews.index
sentiments = process_in_batches(valid_reviews)

# Update the DataFrame
data['distilbert.sentiment'] = ''
data.loc[valid_indices, 'distilbert.sentiment'] = sentiments

print(f"Processing completed in {(time.time() - start_time) / 60:.2f} minutes")

Starting sentiment analysis...


100%|██████████| 157/157 [20:45<00:00,  7.93s/it]

Processing completed in 20.76 minutes



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['distilbert.sentiment'] = ''


In [12]:
data

Unnamed: 0,id,reviews.rating,reviews.text,distilbert.sentiment
0,AVwc252WIN2L1WUfpqLP,5,Our experience at Rancho Valencia was absolute...,5
1,AVwc252WIN2L1WUfpqLP,5,Amazing place. Everyone was extremely warm and...,5
2,AVwc252WIN2L1WUfpqLP,5,We booked a 3 night stay at Rancho Valencia to...,5
3,AVwdOclqIN2L1WUfti38,2,Currently in bed writing this for the past hr ...,2
4,AVwdOclqIN2L1WUfti38,5,I live in Md and the Aloft is my Home away fro...,5
...,...,...,...,...
9995,AVwd4TMv_7pvs4fz-Ers,3,It is hard for me to review an oceanfront hote...,5
9996,AVwdRp4DIN2L1WUfuGZZ,4,"I live close by, and needed to stay somewhere ...",5
9997,AVwd1TbkByjofQCxs6FH,4,Rolled in 11:30 laid out heads down woke up to...,5
9998,AVwdHbizIN2L1WUfsXto,1,Absolutely terrible..I was told I was being gi...,2


In [13]:
# Save DataFrame to CSV
data.to_csv('output_DistilBert.csv', index=False)  # Set index=False to avoid saving row indices