<a href="https://colab.research.google.com/github/chineidu/NLP-Tutorial/blob/main/notebook/Projects/Data-mining/Sentiment-analysis/03a-train-netflix-text-classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Netflix Sentiments Analysis

In [1]:
!pip install -qU transformers[torch] datasets evaluate \
        sentence-transformers seqeval qdrant-client


In [2]:
# Built-in library
import re
import json
from typing import Any, Optional, Union
import logging
import warnings

# Standard imports
import numpy as np
from pathlib import Path
import pandas as pd
import polars as pl
from pprint import pprint
from rich import print
import torch

# Visualization
import matplotlib.pyplot as plt


# Pandas settings
pd.options.display.max_rows = 1_000
pd.options.display.max_columns = 1_000
pd.options.display.max_colwidth = 600

warnings.filterwarnings("ignore")

# # Black code formatter (Optional)
# %load_ext lab_black

# # auto reload imports
# %load_ext autoreload
# %autoreload 2

## Load Data From Google Drive

In [3]:
from google.colab import drive


drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
from datasets import load_dataset, Dataset, DatasetDict


fp: str = "/content/drive/MyDrive/My doc/Deep Learning/Data/netflix_cleaned_data.jsonl"
data: Dataset = load_dataset(path="json", data_files=fp)

print(data)

In [5]:
# Split the data into train and validation set
from sklearn.model_selection import train_test_split


df: pl.DataFrame = pl.from_pandas(data.get("train").to_pandas())
df = (
    df.with_columns(pl.col("review_rating")
    .sub(1) # subtract 1 so that the range is 0 thru 4
    .alias("label"))
    .drop(columns=["review_rating"])
    )

df.head(2)

review_text,review_length,label
str,i64,i64
"""The app boots …",150,1
"""I like all mov…",37,4


In [6]:
# Note: The label MUST start at 0 otherwise, there'll be an error during training.
id2label: dict[str, Any] = {0: "very poor", 1: "poor", 2: "okay", 3: "very good", 4: "excellent"}
label2id: dict[str, Any] = {'very poor': 0, 'poor': 1, 'okay': 2, 'very good': 3, 'excellent': 4}

In [7]:
df.select(pl.col("label").unique())

label
i64
0
1
2
3
4


In [8]:
RANDOM_STATE: int = 123
TEST_SIZE = 0.2


X_train, X_split = train_test_split(df, test_size=TEST_SIZE, random_state=RANDOM_STATE)

X_train.shape, X_split.shape

((140472, 3), (35118, 3))

In [9]:
X_test, X_val = train_test_split(X_split, test_size=TEST_SIZE, random_state=RANDOM_STATE)

X_test.shape, X_val.shape

((28094, 3), (7024, 3))

In [10]:
# Create HuggingFace Dataset

dataset: DatasetDict = DatasetDict({"train": Dataset.from_pandas(X_train.to_pandas()),
                                    "test": Dataset.from_pandas(X_test.to_pandas()),
                                    "validation": Dataset.from_pandas(X_val.to_pandas()),
                                    })

dataset

DatasetDict({
    train: Dataset({
        features: ['review_text', 'review_length', 'label'],
        num_rows: 140472
    })
    test: Dataset({
        features: ['review_text', 'review_length', 'label'],
        num_rows: 28094
    })
    validation: Dataset({
        features: ['review_text', 'review_length', 'label'],
        num_rows: 7024
    })
})

### Import Model and Tokenizer From HuggingFace

In [11]:
from transformers import (AutoModelForSequenceClassification,
                          DistilBertForSequenceClassification,
                          AutoTokenizer,
                          DistilBertTokenizer,
                          )


MODEL_CHECKPOINT: str = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

In [12]:
def convert_to_lowerase(example: dict[str, Any]) -> dict[str, Any]:
  """Convert the text to lowercase."""
  result: dict[str, Any] = {"review_text": [(text).lower() for text in example.get("review_text")]}
  return result


def tokenize_data(example: dict[str, Any]) -> dict[str, Any]:
  """This is used to tokenize the data."""
  result: dict[str, Any] = tokenizer(example.get("review_text"), truncation=True)
  return result


In [13]:
dataset = dataset.map(convert_to_lowerase, batched=True)
tokenized_dataset = dataset.map(tokenize_data, batched=True,
                    remove_columns=['review_text', 'review_length'])

Map:   0%|          | 0/140472 [00:00<?, ? examples/s]

Map:   0%|          | 0/28094 [00:00<?, ? examples/s]

Map:   0%|          | 0/7024 [00:00<?, ? examples/s]

Map:   0%|          | 0/140472 [00:00<?, ? examples/s]

Map:   0%|          | 0/28094 [00:00<?, ? examples/s]

Map:   0%|          | 0/7024 [00:00<?, ? examples/s]

In [14]:
print(tokenized_dataset.get("train")[5])

In [15]:
# Number of unique labels
N_LABELS: int = len(tokenized_dataset.get("train").unique("label"))

N_LABELS

5

In [16]:
from transformers import DataCollatorWithPadding


data_collator: DataCollatorWithPadding = DataCollatorWithPadding(tokenizer=tokenizer)

# Apply the data collator
batch: torch.Tensor = data_collator([tokenized_dataset["train"][idx] for idx in range(2)])

print(batch)

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [17]:
# The 2nd idx has been padded with 0s
for row in batch.get("input_ids"):
    print(row)

In [18]:
from sklearn.metrics import recall_score, accuracy_score, f1_score


print(accuracy_score(y_true=[1, 2, 3, 4, 5], y_pred=[1, 1, 2, 4, 5]))
print(recall_score(y_true=[1, 2, 3, 4, 5], y_pred=[1, 1, 2, 4, 5], average="macro"))
print(f1_score(y_true=[1, 2, 3, 4, 5], y_pred=[1, 1, 2, 4, 5], average="macro"))

In [19]:
from sklearn.metrics import recall_score, accuracy_score
from datasets import load_metric


# metric = load_metric("seqeval")


def compute_metrics(eval_preds: tuple[Any, Any]) -> dict[str, Any]:
    """This is used to calculate the evaluation metrics."""
    logits, labels = eval_preds
    y_pred = np.argmax(logits, axis=-1)

    # Metrics
    accuracy: float = accuracy_score(y_true=labels, y_pred=y_pred)
    recall_val: float = recall_score(y_true=labels, y_pred=y_pred, average="macro")
    f1_score_val: float = f1_score(y_true=labels, y_pred=y_pred, average="macro")

    result: dict[str, Any] = {
        "accuracy": accuracy,
        "recall": recall_val,
        "f1_score": f1_score_val
      }

    return result

In [20]:
# Now we can just pass them to the AutoModelForTokenClassification.from_pretrained() method,
# and they will be set in the model’s configuration and then properly saved and uploaded to the Hub:

model: AutoModelForSequenceClassification = AutoModelForSequenceClassification.from_pretrained(
    MODEL_CHECKPOINT, num_labels=N_LABELS, id2label=id2label, label2id=label2id
)

model

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.weight', 'pre_classifier.bias', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [21]:
from huggingface_hub import notebook_login


# Login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [22]:
from transformers import TrainingArguments


OUTPUT_DIR: str = "distilbert-base-uncased-finetuned-netflix-ratings"
STRATEGY: str = "epoch"
LEARNING_RATE: float = 2e-5
NUM_EPOCHS: int = 3
WEIGHT_DECAY: float = 0.01


args: TrainingArguments = TrainingArguments(
    OUTPUT_DIR,
    evaluation_strategy=STRATEGY,
    save_strategy=STRATEGY,
    learning_rate=LEARNING_RATE,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size= 16,
    weight_decay=WEIGHT_DECAY,
    push_to_hub=True,
)

In [23]:
from transformers import Trainer



# Train the model!
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_dataset.get("train"),
    eval_dataset=tokenized_dataset.get("validation"),
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Recall,F1 Score
1,0.7777,0.765709,0.734909,0.449597,0.441423
2,0.699,0.760056,0.735194,0.466084,0.464319
3,0.6297,0.784785,0.73377,0.472868,0.472146


TrainOutput(global_step=26340, training_loss=0.7078060651241873, metrics={'train_runtime': 3159.1336, 'train_samples_per_second': 133.396, 'train_steps_per_second': 8.338, 'total_flos': 7866281146668960.0, 'train_loss': 0.7078060651241873, 'epoch': 3.0})

In [24]:
trainer.push_to_hub(commit_message="Training complete")

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

events.out.tfevents.1702825698.1eaed6a7c133.4675.0:   0%|          | 0.00/14.4k [00:00<?, ?B/s]

'https://huggingface.co/chineidu/distilbert-base-uncased-finetuned-netflix-ratings/tree/main/'

In [25]:
from transformers import pipeline


TASK: str = "text-classification"
MODEL_CHECKPOINT: str = f"chineidu/{OUTPUT_DIR}"
netflix_classifier: pipeline = pipeline(task=TASK, model=MODEL_CHECKPOINT)

config.json:   0%|          | 0.00/845 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [27]:
text: str = "The previous versions were better. This particular version is awful. Please, fix it!"

netflix_classifier(text)

[{'label': 'very poor', 'score': 0.8723748922348022}]