<a href="https://colab.research.google.com/github/chineidu/NLP-Tutorial/blob/main/notebook/Projects/Data-mining/02-sentiments-analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sentiment Analysis

```sh
!pip install transformers[torch]
!pip install datasets evaluate
!pip install sentence-transformers pinecone-client
```

## Install Dependencies

In [1]:
!pip install transformers[torch]
!pip install datasets evaluate
!pip install sentence-transformers pinecone-client

Collecting accelerate>=0.20.3 (from transformers[torch])
  Downloading accelerate-0.24.1-py3-none-any.whl (261 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.4/261.4 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.24.1
Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m 

In [2]:
# Built-in library
import re
import json
from typing import Any, Optional, TypeAlias, Union
import logging
import warnings

# Standard imports
import numpy as np
from pprint import pprint
import pandas as pd
import polars as pl
from rich import print
import torch

# Visualization
import matplotlib.pyplot as plt


# Pandas settings
pd.options.display.max_rows = 1_000
pd.options.display.max_columns = 1_000
pd.options.display.max_colwidth = 600

warnings.filterwarnings("ignore")

# Black code formatter (Optional)
# %load_ext lab_black

# auto reload imports
# %load_ext autoreload
# %autoreload 2

## Load And Prepare Dataset

- I'll be using a dataset containing approx. 90k hotel reviews by customers.
- The dataset can be loaded using HuggingFace datasets.

In [3]:
from datasets import load_dataset, Dataset


PATH: str = "ashraq/hotel-reviews"
reviews_data: Dataset = load_dataset(PATH, split="train")
reviews_data

Downloading readme:   0%|          | 0.00/548 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/6.10M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/93757 [00:00<?, ? examples/s]

Dataset({
    features: ['review_date', 'hotel_name', 'review'],
    num_rows: 93757
})

In [4]:
RANDOM_STATE: int = 123

print(reviews_data[:3])

In [5]:
Example: TypeAlias = dict[str, Any]

def get_text_length(example: Example) -> Example:
  """This returns the length of the review. """
  key: str = "review"
  result: Example = {"review_length": [len(x) for x in example.get(key)]}
  return result

In [6]:
reviews_data: Dataset = reviews_data.map(get_text_length, batched=True)
reviews_data

Map:   0%|          | 0/93757 [00:00<?, ? examples/s]

Dataset({
    features: ['review_date', 'hotel_name', 'review', 'review_length'],
    num_rows: 93757
})

In [7]:
# Check the length of the reviews: sort by `review_length` in descending order
# and print the N largest values.
N: int = 5
result: Example = reviews_data.sort("review_length", reverse=True)[:N]
print(result.get("review_length"))

In [8]:
# Convert to tabular data
df: pd.DataFrame = reviews_data.to_pandas()

df_pl: pl.DataFrame = pl.from_pandas(data=df)
df_pl

review_date,hotel_name,review,review_length
str,str,str,i64
"""8/3/2017""","""Park Plaza Cou…",""" Extra bed was…",220
"""8/3/2017""","""Park Plaza Cou…",""" Just the loca…",27
"""8/3/2017""","""Park Plaza Cou…",""" Around the co…",384
"""8/2/2017""","""Park Plaza Cou…",""" I wish you ha…",33
"""8/2/2017""","""Park Plaza Cou…",""" You re always…",270
"""8/2/2017""","""Park Plaza Cou…",""" Bit of a wait…",40
"""8/2/2017""","""Park Plaza Cou…",""" The staff wer…",118
"""8/2/2017""","""Park Plaza Cou…",""" Housekeeping …",697
"""8/2/2017""","""Park Plaza Cou…",""" The location …",226
"""8/2/2017""","""Park Plaza Cou…",""" Breakfast was…",269


In [9]:
df_pl.describe(percentiles=[0.05, 0.10, 0.95])

describe,review_date,hotel_name,review,review_length
str,str,str,str,f64
"""count""","""93757""","""93757""","""93757""",93757.0
"""null_count""","""0""","""0""","""0""",0.0
"""mean""",,,,108.006496
"""std""",,,,140.482189
"""min""","""1/1/2016""","""Blakemore Hyde…",""" """,1.0
"""max""","""9/9/2016""","""Strand Palace …","""90""",1966.0
"""median""",,,,64.0
"""5%""",,,,9.0
"""10%""",,,,15.0
"""95%""",,,,342.0


```python
# Convert back to HF dataset
reviews_data_1: Dataset = Dataset.from_pandas(df=df_pl.to_pandas())
reviews_data_1
```

## Initialize Sentiment Analysis Model

- I'll be using a finetuned [RoBERTa](https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment-latest?text=This+hotel+is+not+so+great.+It+lacks+basic+facilities.) model.

In [10]:
# Set up the labels. The label names and ids are obtained from the model card
label2id: dict[str, int] = {'negative': 0, 'neutral': 1, 'positive': 2}
# Inverse mapping (id -> label name), derived from `label2id`
id2label: dict[int, str] = {_id:_label for _label, _id in label2id.items()}

print(f"label2id: {label2id}")
print(f"id2label: {id2label}")

In [11]:
from transformers import (AutoTokenizer,
                          AutoModelForSequenceClassification,
                          pipeline,)


MODEL_CHECKPOINT: str = "cardiffnlp/twitter-roberta-base-sentiment-latest"
TASK: str = "sentiment-analysis"

# Load the model from HuggingFace Hub
model = AutoModelForSequenceClassification.from_pretrained(MODEL_CHECKPOINT,
                                                           id2label=id2label,
                                                           label2id=label2id,)

# Load the tokenizer from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

config.json:   0%|          | 0.00/929 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

In [12]:
# Check if GPU is available
device: str = "cuda" if torch.cuda.is_available() else "cpu"

print(f"Device: {device}")

In [13]:
# Setup the classifier pipeline
clf = pipeline(task=TASK, model=model, device=device, tokenizer=tokenizer)

In [14]:
# Test the classifier pipeline
text: str = reviews_data[250].get("review")

print(text)

In [15]:
clf(text)

[{'label': 'positive', 'score': 0.9884264469146729}]

<br><hr>

## Initialize Retriever

- **Retriever**:
  - In NLP, a `retriever` is a component that identifies and retrieves relevant documents or passages from a large corpus of text.

- Applications:
  - Question answering
  - Information retrieval
  - Text summarization

- `Sentence-transformer` model will be used as the `retriever`.

- I'll be using [all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) as the retriever because of its small size (***~90 MB***).
- For higher accuracy, you can use a more powerful transformer like [all-mpnet-base-v2](https://huggingface.co/sentence-transformers/all-mpnet-base-v2).

In [16]:
from sentence_transformers import SentenceTransformer


RETRIEVER_CHECKPOINT: str = "sentence-transformers/all-MiniLM-L6-v2"
# Load the model from HuggingFace Hub
retriever = SentenceTransformer(RETRIEVER_CHECKPOINT,
                                device=device)

retriever

.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
)

In [17]:
retriever.get_max_seq_length(), retriever.get_sentence_embedding_dimension()

(256, 384)

### Initialize Pinecone Index

- I'll be using [Pinecone](https://www.pinecone.io/) as the vector DB.
- The dimension is `384` which was obtained by running:

```python
retriever.get_sentence_embedding_dimension()
# Returns: 384
```
- The metric used is `cosine` similarity.

In [18]:
import os
from getpass import getpass

import pinecone


# Credentials must not be hard-coded in a shared notebook: read the API key
# from the PINECONE_API_KEY environment variable and fall back to an
# interactive (non-echoing) prompt when it is not set.
YOUR_API_KEY: str = os.environ.get("PINECONE_API_KEY") or getpass("Pinecone API key: ")
# The environment/region can be overridden via PINECONE_ENVIRONMENT.
YOUR_ENV: str = os.environ.get("PINECONE_ENVIRONMENT", "us-west4-gcp-free")
# The index dimension must match the retriever's embedding size.
DIMENSION: int = retriever.get_sentence_embedding_dimension()
METRIC: str = "cosine"
INDEX_NAME: str = "sentiment-analysis"

pinecone.init(api_key=YOUR_API_KEY, environment=YOUR_ENV)

In [19]:
# Create the index on first use only; an existing index is reused as-is
if INDEX_NAME not in pinecone.list_indexes():
  pinecone.create_index(name=INDEX_NAME, dimension=DIMENSION, metric=METRIC)

# Handle used for all subsequent upserts and queries against the index
index = pinecone.Index(index_name=INDEX_NAME)

### Generate Embeddings

- Generate embeddings for all the reviews in the dataset.
- The embeddings, the sentiment label and score will be included in the Pinecone index as metadata.
- The metadata will be used to understand the customer opinions.

In [20]:
def get_sentiment(reviews: list[str]) -> tuple[list[str], list[float]]:
  """Classify a batch of reviews with the module-level `clf` pipeline.

  Args:
    reviews: Review texts to classify.

  Returns:
    A `(labels, scores)` tuple of two lists, each holding one entry per
    result returned by `clf`: the predicted label and its score.
  """
  # Truncate long inputs so a single over-length review cannot crash the batch.
  # NOTE(review): 512 is the usual RoBERTa-base sequence limit -- confirm
  # against the checkpoint's config if the model is swapped.
  result: list[dict[str, Any]] = clf(reviews, truncation=True, max_length=512)
  label: list[str] = [x.get("label") for x in result]
  score: list[float] = [x.get("score") for x in result]
  return (label, score)

In [21]:
print(df_pl.head())

In [22]:
# Take the first three reviews as a plain Python list (column -> list[str])
sample: list[str] = df_pl.get_column("review").head(3).to_list()
print(sample)

In [23]:
get_sentiment(reviews=sample)

(['negative', 'neutral', 'neutral'],
 [0.910322904586792, 0.7918533682823181, 0.5664960741996765])

### To Do

- Convert the review dates to timestamp which can be used to filter query results for a given period.
- This is helpful to understand customer sentiment over a specific period.

In [24]:
from dateutil.parser import parse


def get_timestamp(dates: list[str] | tuple[str, ...]) -> list[float]:
  """Convert date strings to POSIX timestamps.

  Args:
    dates: Date strings in any format `dateutil.parser.parse` understands.

  Returns:
    One timestamp (seconds since the epoch, as a float) per input date,
    in the same order as `dates`.
  """
  # NOTE(review): `parse` is called without explicit timezone handling, so the
  # result presumably depends on the machine's local timezone -- confirm if
  # the timestamps are compared across environments.
  return [parse(date_str).timestamp() for date_str in dates]

In [25]:
get_timestamp(dates=["25-10-2023"])[0]

1698192000.0

In [26]:
date: tuple[str] = df_pl.select(pl.col("review_date")).row(50)

print(f"Date: {date}\nTimestamp: {get_timestamp(dates=date)[0]}")

In [27]:
dates: list[str] = df_pl.select(pl.col("review_date")).slice(10,15).to_series().to_list()

print(get_timestamp(dates=dates))

In [42]:
# Scratch check: copy the first five rows and show them as a list of
# per-row dictionaries (one dict per row, keyed by column name).
A = df_pl.slice(0, 5).clone()
A.to_dicts()

[{'review_date': '8/3/2017',
  'hotel_name': 'Park Plaza County Hall London',
  'review': ' Extra bed was the worst breakfast queue was really terrible It s easy to tell people to come at a specific time though you have to arrange it somehow Parking is far away and hard to come back because of road diversions ',
  'review_length': 220},
 {'review_date': '8/3/2017',
  'hotel_name': 'Park Plaza County Hall London',
  'review': ' Just the location and view',
  'review_length': 27},
 {'review_date': '8/3/2017',
  'hotel_name': 'Park Plaza County Hall London',
  'review': ' Around the corner from the London eye and used it for a Trafalgar trip as the pickup for our tour was around the corner Walked down to the Borough market and took a river cruise to Greenwich Plent of eating spots close by as well Over the bridge was Westminster Abbey Parliament House and Big Ben and walked to Trafalgar Square as well also a laundry available which is very helpful ',
  'review_length': 384},
 {'review_dat

[{'review_date': '8/3/2017',
  'hotel_name': 'Park Plaza County Hall London',
  'review': ' Extra bed was the worst breakfast queue was really terrible It s easy to tell people to come at a specific time though you have to arrange it somehow Parking is far away and hard to come back because of road diversions ',
  'review_length': 220},
 {'review_date': '8/3/2017',
  'hotel_name': 'Park Plaza County Hall London',
  'review': ' Just the location and view',
  'review_length': 27},
 {'review_date': '8/3/2017',
  'hotel_name': 'Park Plaza County Hall London',
  'review': ' Around the corner from the London eye and used it for a Trafalgar trip as the pickup for our tour was around the corner Walked down to the Borough market and took a river cruise to Greenwich Plent of eating spots close by as well Over the bridge was Westminster Abbey Parliament House and Big Ben and walked to Trafalgar Square as well also a laundry available which is very helpful ',
  'review_length': 384},
 {'review_dat

In [None]:
# Encode the entire data
from tqdm.auto import tqdm


# we will use batches of 64
batch_size: int = 64
n_rows: int = df_pl.height

for i in tqdm(range(0, n_rows, batch_size)):
    # Find end of batch (exclusive row index)
    i_end: int = min(i + batch_size, n_rows)

    # Extract batch. NOTE: polars `DataFrame.slice` takes (offset, length),
    # not (start, end). Passing `i_end` as the second argument made every
    # batch grow with `i`, so far more rows than needed were embedded and
    # classified on each iteration.
    batch: pl.DataFrame = df_pl.slice(i, i_end - i)

    # Generate embeddings for batch
    batch_reviews: list[str] = batch.get_column("review").to_list()
    emb = retriever.encode(batch_reviews).tolist()
    # Convert review_date to timestamp to enable period filters
    batch_dates: list[str] = batch.get_column("review_date").to_list()
    timestamps: list[float] = get_timestamp(batch_dates)
    # Get sentiment label and score for reviews in the batch
    label, score = get_sentiment(reviews=batch_reviews)

    # Add the new columns in a single horizontal concat
    extra_cols: pl.DataFrame = pl.DataFrame(
        {"timestamp": timestamps, "label": label, "score": score}
    )
    batch = pl.concat([batch, extra_cols], how="horizontal")

    # Get metadata: one dict per row (original columns + timestamp/label/score)
    meta = batch.to_dicts()
    # Create unique IDs from the global row position
    ids: list[str] = [f"{idx}" for idx in range(i, i_end)]
    # Add all to upsert list as (id, embedding, metadata) tuples
    to_upsert = list(zip(ids, emb, meta))
    # Upsert/insert these records to pinecone
    _ = index.upsert(vectors=to_upsert)

# Check that we have all vectors in index
index.describe_index_stats()

  0%|          | 0/1465 [00:00<?, ?it/s]

In [150]:
import polars as pl

# Standalone demo of adding a column via a horizontal concat.
# Both 'batch' and 'timestamp' are Polars DataFrames here.
# NOTE: this rebinds the names 'batch' and 'timestamp' with sample data.
data_batch = {'col1': [1, 2, 3], 'col2': ['a', 'b', 'c']}
data_timestamp = {'timestamp': [100, 200, 300]}

# Create Polars DataFrames
batch = pl.DataFrame(data_batch)
timestamp = pl.DataFrame(data_timestamp)

# Add the 'timestamp' column to the 'batch' DataFrame
batch = pl.concat([batch, timestamp], how="horizontal")

# Display the result
print(batch)
