# Netflix Text Analysis

In [1]:
# Built-in library
import re
import json
from typing import Any, Optional, Union
import logging
import warnings

# Standard imports
import numpy as np
from pathlib import Path
import pandas as pd
import polars as pl
from pprint import pprint
from rich import print
import torch

# Visualization
import matplotlib.pyplot as plt


# Pandas display settings: raise the row / column / cell-width limits so wide
# frames are rendered without being truncated.
pd.options.display.max_rows = 1_000
pd.options.display.max_columns = 1_000
pd.options.display.max_colwidth = 600

# NOTE(review): this silences every warning for the whole session, including
# deprecation warnings raised by the libraries used in this notebook.
warnings.filterwarnings("ignore")

# Black code formatter (Optional)
%load_ext lab_black

# auto reload imports
%load_ext autoreload
%autoreload 2

## Load Data

In [2]:
# Location of the raw reviews file; the relative path is resolved against the
# notebook's working directory.
fp: Path = Path("../../../../data/NETFLIX_REVIEWS.parquet")

# Read the parquet file into a polars DataFrame and preview the first rows.
data: pl.DataFrame = pl.read_parquet(source=fp)
data.head(3)

Unnamed: 0_level_0,review_id,pseudo_author_id,author_name,review_text,review_rating,review_likes,author_app_version,review_timestamp
i64,str,str,str,str,i64,i64,str,str
0,"""7e73f80e-a8fd-…","""15261855397701…","""A Google user""","""Works great on…",5,1,"""1.2.0 build 81…","""2011-05-12 18:…"
1,"""dab55eca-c2a0-…","""23438294286543…","""A Google user""","""Works great on…",5,1,"""1.2.0 build 81…","""2011-05-12 18:…"
2,"""a3b8fa06-8b8f-…","""17447360460835…","""A Google user""","""Works great on…",5,0,"""1.5.2 build 38…","""2011-05-12 18:…"


In [3]:
# Keep only the columns used in the rest of the notebook. Selecting the wanted
# columns (instead of dropping the unwanted ones) keeps this cell idempotent:
# because `data` is reassigned, a second run of a `drop` would fail on columns
# that are already gone, whereas `select` simply yields the same frame again.
KEEP_COLUMNS: list[str] = [
    "review_id",
    "review_text",
    "review_rating",
    "author_app_version",
    "review_timestamp",
]
data = data.select(KEEP_COLUMNS)
data.head(3)

review_id,review_text,review_rating,author_app_version,review_timestamp
str,str,i64,str,str
"""7e73f80e-a8fd-…","""Works great on…",5,"""1.2.0 build 81…","""2011-05-12 18:…"
"""dab55eca-c2a0-…","""Works great on…",5,"""1.2.0 build 81…","""2011-05-12 18:…"
"""a3b8fa06-8b8f-…","""Works great on…",5,"""1.5.2 build 38…","""2011-05-12 18:…"


In [4]:
# Choose the compute device: start from the CPU and switch to CUDA only when
# torch reports an available GPU.
device: str = "cpu"
if torch.cuda.is_available():
    device = "cuda"

print(f"Device: {device}")

### To Do

- Can you predict the review rating based on a review text?

In [5]:
from transformers import pipeline


# Hugging Face pipeline settings: the task name and the Hub checkpoint to load.
TASK: str = "text-classification"
OUTPUT_DIR: str = "distilbert-base-uncased-finetuned-netflix-ratings"
MODEL_CHECKPOINT: str = f"chineidu/{OUTPUT_DIR}"
# `pipeline` is a factory function rather than a type, so the resulting object
# is annotated as Any.
# NOTE(review): the `device` selected earlier is not passed here — confirm
# whether the classifier is meant to run on it.
netflix_classifier: Any = pipeline(task=TASK, model=MODEL_CHECKPOINT)

2023-12-21 17:50:00.860173: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [6]:
# Two-way mapping between the numeric star rating (1-5) and its text label.
id2label: dict[int, str] = {
    1: "very poor",
    2: "poor",
    3: "okay",
    4: "very good",
    5: "excellent",
}
label2id: dict[str, int] = {
    "very poor": 1,
    "poor": 2,
    "okay": 3,
    "very good": 4,
    "excellent": 5,
}


def get_sentiment(
    reviews: list[str], classifier: Optional[Any] = None
) -> tuple[list[Optional[str]], list[Optional[int]]]:
    """Classify reviews and return the predicted labels and numeric ratings.

    Params:
    -------
        reviews: The review texts to classify.
        classifier: Optional callable that takes the list of reviews and
            returns one dict per review containing a "label" key. When
            omitted, the module-level `netflix_classifier` pipeline is used.

    Returns:
    --------
        A tuple `(labels, ratings)`. `labels` holds the predicted label of
        each review and `ratings` holds the matching value from `label2id`
        (None when the label is not one of the five known labels). The
        classifier's confidence score is not returned.
    """
    # Resolve the classifier lazily so a caller-supplied one never touches
    # the global pipeline.
    clf: Any = netflix_classifier if classifier is None else classifier
    result: list[dict[str, Any]] = clf(reviews)

    labels: list[Optional[str]] = [pred.get("label") for pred in result]
    ratings: list[Optional[int]] = [label2id.get(label) for label in labels]

    return (labels, ratings)

In [7]:
# Sample five reviews (rows 10-14). The column is sliced before it is converted
# to a Python list, so only those five rows are materialised instead of the
# whole column.
texts: list[str] = data.get_column("review_text").slice(10, 5).to_list()
texts

['Works great on Nexus S, but it needs subtitles support!',
 "Works on 3g even tho quality isn't the best but still...might be my new favorite app!",
 'Working great on Nexus S, picked up right where I left it watching.',
 'Works great. So far so good on Wifi. Nexus 1.',
 "Well I've tried it out on my Evo 4G and it works. Tried it through WiFi, and the quality is pretty decent.; it's very snappy. Then I tried it through 3G and I was thinking the worst, and while the quality is diminished; it still works. I don't live in an area with great signal but it works really well, though the pixelation can get bad at times. Haven't tried it on 4G but I'm expecting great results."]

In [8]:
# Raw pipeline output: one dict per review with the predicted label and its score.
netflix_classifier(texts)

[{'label': 'very good', 'score': 0.7804526686668396},
 {'label': 'very good', 'score': 0.4879053831100464},
 {'label': 'excellent', 'score': 0.8773263692855835},
 {'label': 'excellent', 'score': 0.7898459434509277},
 {'label': 'very good', 'score': 0.8486072421073914}]

In [9]:
# Actual ratings for rows 10-14 (the same rows sampled into `texts`). Slice the
# column first so only five values are converted to a Python list.
data.get_column("review_rating").slice(10, 5).to_list()

[4, 5, 5, 5, 4]

In [10]:
# Predicted labels and their numeric ratings for the sampled reviews
get_sentiment(reviews=texts)

(['very good', 'very good', 'excellent', 'excellent', 'very good'],
 [4, 4, 5, 5, 4])

<br>

## Upsert Data To Qdrant

In [11]:
from sentence_transformers import SentenceTransformer


ENCODER_CHECKPOINT: str = "sentence-transformers/all-MiniLM-L6-v2"
# Load the sentence-embedding model from the HuggingFace Hub onto the selected device
encoder: SentenceTransformer = SentenceTransformer(ENCODER_CHECKPOINT, device=device)

# Display the encoder's module stack
encoder

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
)

In [12]:
# Maximum input sequence length and the dimension of the sentence embeddings
encoder.get_max_seq_length(), encoder.get_sentence_embedding_dimension()

(256, 384)

In [13]:
import os
from getpass import getpass

from dotenv import load_dotenv, find_dotenv


_ = load_dotenv(find_dotenv())  # read local .env file
# API key taken from the environment; None when the variable is not set.
QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")

# Manual switch for cells that should not run by default. It guards the
# interactive key prompt here and is also checked by the batch-upload cell.
RUN_CELL: bool = False

if RUN_CELL:
    # Get API key without displaying it (overrides the value read from the environment)
    QDRANT_API_KEY: str = getpass("Please enter your API key: ")

In [14]:
from qdrant_client import QdrantClient, models
from qdrant_client.http.exceptions import UnexpectedResponse


# Connection settings for the Qdrant instance.
URL: str = "http://localhost:6333"
client = QdrantClient(
    url=URL,
    api_key=QDRANT_API_KEY,
)
# Vector size is defined by the encoder model; computed once and reused below.
DIMENSION: int = encoder.get_sentence_embedding_dimension()
METRIC: Any = models.Distance.COSINE
COLLECTION_NAME: str = "netflix-sentiment-analysis"

# Create a collection
try:
    client.create_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=models.VectorParams(
            size=DIMENSION,
            distance=METRIC,
        ),
        # NOTE(review): per the Qdrant optimizer docs a threshold of 0 disables
        # vector indexing; nothing in this notebook re-enables it after the
        # upload — confirm that is intended.
        optimizers_config=models.OptimizersConfigDiff(indexing_threshold=0),
        shard_number=2,
    )
except UnexpectedResponse as err:
    # Only tolerate the failure when the collection really is there already;
    # any other server-side error (bad config, auth, ...) must not be hidden.
    existing_names: set[str] = {
        collection.name for collection in client.get_collections().collections
    }
    if COLLECTION_NAME not in existing_names:
        raise
    print(err)

In [15]:
# Get collection details
collection_info: Any = client.get_collection(collection_name=COLLECTION_NAME)
print(collection_info)

### Generate Embeddings

- Generate embeddings for all the reviews in the dataset.
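- Each embedding is stored as a vector in the Qdrant collection; the review fields, the sentiment label (derived from the review rating) and a numeric timestamp are stored alongside it as payload (metadata).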
- The metadata will be used to understand the customer opinions.

In [16]:
# Preview the frame that the upload step reads from
print(data.head())

In [17]:
def retrieve_labels(ratings: list[int]) -> list[str]:
    """Translate numeric ratings into their text labels.

    Each rating is looked up in `id2label`; a rating with no entry in that
    mapping yields None at the same position.
    """
    return list(map(id2label.get, ratings))

In [18]:
# Quick check of the rating -> label mapping
retrieve_labels(ratings=[2, 3, 4, 5])

['poor', 'okay', 'very good', 'excellent']

In [19]:
# Inspect the raw string format of the first review timestamp
data.select("review_timestamp").to_series().to_list()[0]

'2011-05-12 18:50:37'

In [20]:
from datetime import tzinfo

from dateutil.parser import parse


def get_timestamp(dates: list[str], tz: Optional[tzinfo] = None) -> list[float]:
    """Convert date strings to Unix timestamps (seconds since the epoch).

    Params:
    -------
        dates: Date strings understood by `dateutil.parser.parse`,
            e.g. "2011-05-12 18:50:37".
        tz: Timezone to assume for strings that carry no UTC offset, e.g.
            `datetime.timezone.utc`. When None (the default) such strings are
            interpreted in the machine's local timezone, so the result depends
            on where the code runs. Pass an explicit timezone for values that
            must be reproducible across machines.

    Returns:
    --------
        One float timestamp per input string, in the same order.
    """
    timestamps: list[float] = []
    for date_str in dates:
        parsed = parse(date_str)
        # Strings that already include an offset keep it; only naive values
        # are pinned to the requested timezone.
        if tz is not None and parsed.tzinfo is None:
            parsed = parsed.replace(tzinfo=tz)
        timestamps.append(parsed.timestamp())
    return timestamps

In [21]:
# Sanity check on a single date string. The string carries no UTC offset, so
# the value returned depends on the local timezone of the machine.
d: str = "2011-05-12 18:50:37"

get_timestamp(dates=[d])

[1305222637.0]

### Qdrant Upsert

- Using “Upsert” with Qdrant Vector Database [Blog](https://redandgreen.co.uk/qdrant-upsert/ai-ml/).
- [Official Documentation](https://qdrant.tech/documentation/concepts/points/#upload-points).


In [22]:
# Encode the entire data (Batch Upload)
import uuid

from tqdm.auto import tqdm


batch_size: int = 500
counter: int = 0

if RUN_CELL:
    n_rows: int = data.height

    for i in tqdm(range(0, n_rows, batch_size)):

        # Extract the batch directly from the frame. `slice` stops at the end
        # of the data, and it avoids converting the entire frame to Python
        # rows on every iteration just to keep `batch_size` of them.
        batch: pl.DataFrame = data.slice(i, batch_size).rename(
            {"review_timestamp": "review_date"}  # renamed!
        )

        # Point IDs: the existing review_id values are reused
        # (presumably UUID strings — confirm against the dataset).
        IDs: list[str] = batch.get_column("review_id").to_list()

        # Generate embeddings for batch (one vector per review)
        batch_reviews: list[str] = batch.get_column("review_text").to_list()
        vectors: list[list[float]] = encoder.encode(
            batch_reviews, show_progress_bar=False
        ).tolist()

        # Text label derived from each review's rating
        batch_ratings: list[int] = batch.get_column("review_rating").to_list()
        labels: list[str] = retrieve_labels(ratings=batch_ratings)

        # Numeric timestamp of each review date, used for range filtering
        dates: list[str] = batch.get_column("review_date").to_list()
        timestamp: list[float] = get_timestamp(dates=dates)

        # Add new column(s)
        batch = batch.with_columns(
            [
                pl.Series("label", labels),
                pl.Series("timestamp", timestamp),
            ]
        )

        # Get metadata
        counter += batch_size
        if counter % 12_000 == 0:
            print(">>> [Creating payload] [Upserting data] <<<")
        payload_data: list[dict[str, Any]] = batch.to_dicts()

        # Upsert/insert these records to Qdrant
        _ = client.upsert(
            collection_name=COLLECTION_NAME,
            points=models.Batch(
                ids=IDs,
                payloads=payload_data,
                vectors=vectors,
            ),
        )

    # Reported only when the upload actually ran.
    print(">>> [Done] <<<")

### Note

- Due to the size of the data, I only upserted ~775.5k datapoints.
- It took ~1 hr 58 min to upsert.

### To Do

- Can you predict the review rating based on a review text?

<hr><br>

## Opinion Mining
- What are the reviewers saying about the app?
- Are there any patterns in sentiment over time or across different app versions?
- Automatically categorize reviews into different topics.
- What features are most indicative of a high/low review rating?

In [23]:
def count_sentiment(data: list[Any]) -> dict[Optional[str], int]:
    """Count how many search hits carry each sentiment label.

    Params:
    -------
        data: Search hits; each item must expose a `payload` attribute that is
            either a dict or None.

    Returns:
    --------
        A dict mapping each label to its number of occurrences, in order of
        first appearance. Hits with no payload or no "label" entry are counted
        under the key None.
    """
    from collections import Counter

    # Read the payload attribute directly instead of serialising the whole hit
    # to a dict first; treat a missing payload as an empty one.
    sentiments: list[Optional[str]] = [
        (hit.payload or {}).get("label") for hit in data
    ]
    return dict(Counter(sentiments))


import plotly.express as px


def visualize_result(sentiment: dict[str, Any], query: str) -> None:
    """Show a bar chart of the number of reviews per sentiment label.

    Params:
    -------
        sentiment: Mapping of label -> count. An entry whose key is None
            (reviews without a label) is left out of the chart.
        query: The search query; used as the chart title.

    Returns:
    --------
        None. The figure is rendered with `fig.show()`.
    """
    # Build the plotting frame directly from the dict. Filtering the None key
    # here replaces the drop-and-catch-KeyError dance and also copes with an
    # empty dict.
    rows: list[tuple[str, Any]] = [
        (label, count) for label, count in sentiment.items() if label is not None
    ]
    vis_data: pd.DataFrame = pd.DataFrame(rows, columns=["label", "count"])

    fig = px.bar(
        vis_data,
        x="label",
        y="count",
        color="label",
        text_auto=True,
        title=query,
    )
    fig.show()

### Q1 What are the reviewers saying about the app?

- e.g. are the customers satisfied with the app?
- do you like it? 

In [24]:
# Number of nearest reviews to retrieve per query
N: int = 5_000
query: str = "are the customers satisfied with the app?"
# Embed the question with the same encoder that produced the review vectors
query_vector: list[float] = encoder.encode(query).tolist()


# Similarity search over the whole collection (no payload filter)
hits: list[Any] = client.search(
    collection_name=COLLECTION_NAME,
    query_vector=query_vector,
    query_filter=None,
    limit=N,
)
# Each hit exposes `.payload` and `.score`; only the first five hits are
# printed to keep the output short.

print(hits[:5])

In [25]:
# Tally the sentiment labels of the retrieved reviews
sentiment: dict[str, int] = count_sentiment(hits)
sentiment

{'very good': 684,
 'poor': 92,
 'excellent': 3767,
 'okay': 225,
 'very poor': 231,
 None: 1}

In [26]:
# Bar chart of the label counts, titled with the query
visualize_result(sentiment=sentiment, query=query)

In [27]:
# Same search as the previous query cell, with a different question.
# `N` is reused from that cell, so it must have been run first.
query: str = "do you like it?"
query_vector: list[float] = encoder.encode(query).tolist()


hits: list[Any] = client.search(
    collection_name=COLLECTION_NAME,
    query_vector=query_vector,
    query_filter=None,
    limit=N,
)

print(hits[:5])
# Count the labels of the hits and plot them
sentiment: dict[str, int] = count_sentiment(hits)
visualize_result(sentiment=sentiment, query=query)

### Q2 Are there any patterns in sentiment over time? 

- e.g. between Jan 2018 and Jan 2019

In [28]:
# Summary of the timestamp column; since it is a string column, only count,
# null_count, min and max are populated.
data.select(pl.col("review_timestamp")).describe()

describe,review_timestamp
str,str
"""count""","""1531126"""
"""null_count""","""0"""
"""mean""",
"""std""",
"""min""","""2011-05-12 18:…"
"""25%""",
"""50%""",
"""75%""",
"""max""","""2023-11-15 22:…"


In [29]:
# Epoch value of the start of the 2018 window used in the filtered search
get_timestamp(["2018-01-01 00:00:00"])[0]

1514761200.0

In [30]:
# Number of nearest reviews to retrieve within the time window
N: int = 8_000
query: str = "are the customers satisfied with the app?"
query_vector: list[float] = encoder.encode(query).tolist()

# Time window, format: yyyy-mm-dd HH:MM:SS
start_date, stop_date = (
    "2018-01-01 00:00:00",
    "2019-01-01 00:00:00",
)
# Convert with the same helper used at upload time so the bounds are
# comparable with the stored "timestamp" payload values.
start_time: float = get_timestamp([start_date])[0]
stop_time: float = get_timestamp([stop_date])[0]

# Typed filter models instead of a raw dict: field names are validated when
# the filter is built rather than when the request is sent.
# Both bounds are inclusive.
query_filter: models.Filter = models.Filter(
    must=[
        models.FieldCondition(
            key="timestamp",
            range=models.Range(gte=start_time, lte=stop_time),
        ),
    ]
)

hits: list[Any] = client.search(
    collection_name=COLLECTION_NAME,
    query_vector=query_vector,
    query_filter=query_filter,
    limit=N,
)

print(hits[:15])

In [31]:
# Count the labels of the time-filtered hits and plot them
sentiment: dict[str, int] = count_sentiment(hits)
visualize_result(sentiment=sentiment, query=query)