<a href="https://colab.research.google.com/github/chineidu/NLP-Tutorial/blob/main/notebook/Projects/Data-mining/02-sentiments-analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sentiment Analysis

```sh
!pip install transformers[torch]
!pip install datasets evaluate
!pip install sentence-transformers pinecone-client
```

## Install Dependencies

In [2]:
!pip install transformers[torch]
!pip install datasets evaluate
!pip install sentence-transformers pinecone-client

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pinecone-client
  Downloading pinecone_client-2.2.4-py3-none-any.whl (179 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.4/179.4 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece (from sentence-transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m41.3 MB/s[0m eta [36m0:00:00[0m
Collecting loguru>=0.5.0 (from pinecone-client)
  Downloading loguru-0.7.2-py3-none-any.whl (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.5/62.5 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
Collecting dnspython>=

In [45]:
# Built-in library
import re
import json
from typing import Any, Optional, TypeAlias, Union
import logging
import warnings

# Standard imports
import numpy as np
from pprint import pprint
import pandas as pd
import polars as pl
from rich import print
import torch

# Visualization
import matplotlib.pyplot as plt


# Pandas display limits: up to 1,000 rows/columns and 600 characters per cell.
pd.set_option("display.max_rows", 1_000)
pd.set_option("display.max_columns", 1_000)
pd.set_option("display.max_colwidth", 600)

# Suppress all warnings for the rest of the session.
warnings.filterwarnings(action="ignore")

# Black code formatter (Optional)
# %load_ext lab_black

# auto reload imports
# %load_ext autoreload
# %autoreload 2

## Load And Prepare Dataset

- I'll be using a dataset of roughly 94k (93,757) hotel reviews written by customers.
- The dataset is hosted on the Hugging Face Hub as `ashraq/hotel-reviews` and can be loaded with the `datasets` library.

In [20]:
from datasets import load_dataset, Dataset


# Hugging Face Hub identifier of the hotel-reviews dataset.
PATH: str = "ashraq/hotel-reviews"
# Only the "train" split is requested, so a single Dataset is returned.
reviews_data: Dataset = load_dataset(path=PATH, split="train")
reviews_data

Dataset({
    features: ['review_date', 'hotel_name', 'review'],
    num_rows: 93757
})

In [7]:
# Seed constant; it is not referenced in the cells visible here — presumably
# reserved for later sampling/shuffling (TODO confirm).
RANDOM_STATE: int = 123

# Preview the first three rows (slicing a Dataset yields a dict of columns).
print(reviews_data[:3])

In [21]:
Example: TypeAlias = dict[str, Any]

def get_text_length(example: Example) -> Example:
  """This returns the length of the review. """
  key: str = "review"
  result: Example = {"review_length": [len(x) for x in example.get(key)]}
  return result

In [22]:
# Add a `review_length` column; batched=True hands the function a batch of rows per call.
reviews_data: Dataset = reviews_data.map(function=get_text_length, batched=True)
reviews_data

Map:   0%|          | 0/93757 [00:00<?, ? examples/s]

Dataset({
    features: ['review_date', 'hotel_name', 'review', 'review_length'],
    num_rows: 93757
})

In [31]:
# Check the length of the longest reviews
N: int = 5
# Sort by review length (longest first) and keep the top N rows as a dict of columns.
result: Example = reviews_data.sort("review_length", reverse=True)[:N]
print(result.get("review_length"))

In [32]:
# Hugging Face Dataset -> pandas DataFrame
df: pd.DataFrame = reviews_data.to_pandas()

# pandas DataFrame -> Polars DataFrame
df_pl: pl.DataFrame = pl.from_pandas(df)
df_pl

review_date,hotel_name,review,review_length
str,str,str,i64
"""8/3/2017""","""Park Plaza Cou…",""" Extra bed was…",220
"""8/3/2017""","""Park Plaza Cou…",""" Just the loca…",27
"""8/3/2017""","""Park Plaza Cou…",""" Around the co…",384
"""8/2/2017""","""Park Plaza Cou…",""" I wish you ha…",33
"""8/2/2017""","""Park Plaza Cou…",""" You re always…",270
"""8/2/2017""","""Park Plaza Cou…",""" Bit of a wait…",40
"""8/2/2017""","""Park Plaza Cou…",""" The staff wer…",118
"""8/2/2017""","""Park Plaza Cou…",""" Housekeeping …",697
"""8/2/2017""","""Park Plaza Cou…",""" The location …",226
"""8/2/2017""","""Park Plaza Cou…",""" Breakfast was…",269


In [35]:
# Summary statistics, including the 5th, 10th and 95th percentiles of each numeric column.
df_pl.describe(percentiles=[0.05, 0.10, 0.95])

describe,review_date,hotel_name,review,review_length
str,str,str,str,f64
"""count""","""93757""","""93757""","""93757""",93757.0
"""null_count""","""0""","""0""","""0""",0.0
"""mean""",,,,108.006496
"""std""",,,,140.482189
"""min""","""1/1/2016""","""Blakemore Hyde…",""" """,1.0
"""max""","""9/9/2016""","""Strand Palace …","""90""",1966.0
"""median""",,,,64.0
"""5%""",,,,9.0
"""10%""",,,,15.0
"""95%""",,,,342.0


In [38]:
# Keep only the reviews that are at least THRESHOLD characters long.
THRESHOLD: int = 10
df_pl = df_pl.filter(pl.col("review_length").ge(THRESHOLD))

# Re-check the summary statistics after filtering.
df_pl.describe(percentiles=[0.05, 0.10, 0.95])

describe,review_date,hotel_name,review,review_length
str,str,str,str,f64
"""count""","""88350""","""88350""","""88350""",88350.0
"""null_count""","""0""","""0""","""0""",0.0
"""mean""",,,,114.143622
"""std""",,,,142.442105
"""min""","""1/1/2016""","""Blakemore Hyde…",""" 007 room had …",10.0
"""max""","""9/9/2016""","""Strand Palace …",""" zig zag aroun…",1966.0
"""median""",,,,70.0
"""5%""",,,,15.0
"""10%""",,,,21.0
"""95%""",,,,353.0


In [41]:
# Convert the filtered Polars frame back to a HF dataset (going through pandas)
reviews_data_1: Dataset = Dataset.from_pandas(df=df_pl.to_pandas())
reviews_data_1

Dataset({
    features: ['review_date', 'hotel_name', 'review', 'review_length'],
    num_rows: 88350
})

## Initialize Sentiment Analysis Model

- I'll be using a fine-tuned [RoBERTa](https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment-latest?text=This+hotel+is+not+so+great.+It+lacks+basic+facilities.) model (`cardiffnlp/twitter-roberta-base-sentiment-latest`).

In [42]:
from transformers import (AutoTokenizer,
                          AutoModelForSequenceClassification,
                          pipeline,)


# Hub id of the sentiment checkpoint, and the task name later passed to `pipeline`.
MODEL_CHECKPOINT: str = "cardiffnlp/twitter-roberta-base-sentiment-latest"
TASK: str = "sentiment-analysis"

# Load the sequence-classification model from HuggingFace Hub
model = AutoModelForSequenceClassification.from_pretrained(MODEL_CHECKPOINT)

# Load the tokenizer for the same checkpoint from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

config.json:   0%|          | 0.00/929 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

In [44]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device: str = "cuda"
else:
    device = "cpu"
device

NameError: ignored

In [None]:
# Build the classifier from the RoBERTa model and tokenizer loaded earlier and
# place it on the selected device; with only `task=` given, the pipeline would
# fall back to the task's default checkpoint instead of MODEL_CHECKPOINT.
clf = pipeline(task=TASK, model=model, tokenizer=tokenizer, device=device)

In [43]:
# Source inspection left over from development; uncomment to view it interactively.
# pipeline??