# Netflix Text Analytics

<hr>

### Load Data From Table (Pyarrow Dataset)

```text
NETFLIX_REVIEWS
├── review_rating=0
│   └── 2418bd2b279a4829b6f69d61165509fd-0.parquet
├── review_rating=1
│   └── 2418bd2b279a4829b6f69d61165509fd-0.parquet
├── review_rating=2
│   └── 2418bd2b279a4829b6f69d61165509fd-0.parquet
├── review_rating=3
│   └── 2418bd2b279a4829b6f69d61165509fd-0.parquet
├── review_rating=4
│   └── 2418bd2b279a4829b6f69d61165509fd-0.parquet
└── review_rating=5
    └── 2418bd2b279a4829b6f69d61165509fd-0.parquet

```


```python
from pathlib import Path
import pyarrow.dataset as ds

# Specify the directory containing Parquet files
directory_path: Path = Path("../../../../data/NETFLIX_REVIEWS")

# Use PyArrow to read the partitioned Parquet dataset
table = ds.dataset(directory_path)

# Convert the PyArrow table to a Pandas DataFrame
data: pl.DataFrame = pl.scan_pyarrow_dataset(table).collect()

# Display the DataFrame
print(data)
```

In [1]:
# Built-in library
import re
import json
from typing import Any, Optional, TypeAlias, Union
import logging
import warnings

# Standard imports
import numpy as np
import numpy.typing as npt
from pprint import pprint
import pandas as pd
import polars as pl
from rich import print
import torch

# Visualization
import matplotlib.pyplot as plt


# Pandas settings
pd.options.display.max_rows = 1_000
pd.options.display.max_columns = 1_000
pd.options.display.max_colwidth = 600

warnings.filterwarnings("ignore")

# Black code formatter (Optional)
%load_ext lab_black

# auto reload imports
%load_ext autoreload
%autoreload 2

## Load Data

In [2]:
from pathlib import Path

# Specify the directory containing Parquet files
directory_path: Path = Path("../../../../data/NETFLIX_REVIEWS.parquet")

# Convert the PyArrow table to a Pandas DataFrame
data: pl.DataFrame = pl.read_parquet(
    directory_path,
).drop(columns=[""])

# Display the DataFrame
print(data)

<hr><br>

### Descriptive Statistics

- Distribution of rating.
- Number of unique authors.
- 5-summary stats of review ratings.
- Percentage of rating >= 3 and rating < 3
  

In [3]:
data.describe()

describe,review_id,pseudo_author_id,author_name,review_text,review_rating,review_likes,author_app_version,review_timestamp
str,str,str,str,str,f64,f64,str,str
"""count""","""1531126""","""1531126""","""1531126""","""1531126""",1531126.0,1531126.0,"""1531126""","""1531126"""
"""null_count""","""0""","""0""","""0""","""0""",0.0,0.0,"""0""","""0"""
"""mean""",,,,,3.91526,1.323521,,
"""std""",,,,,1.577103,32.524688,,
"""min""","""00000ca0-ac53-…","""10000001724899…","""!M***********e…","""""",0.0,0.0,"""""","""2011-05-12 18:…"
"""25%""",,,,,3.0,0.0,,
"""50%""",,,,,5.0,0.0,,
"""75%""",,,,,5.0,0.0,,
"""max""","""fffffb20-d555-…","""99999937773079…","""﴾G*********1﴿""","""🫶🏽""",5.0,8231.0,"""8.95.0 build 2…","""2023-11-15 22:…"


In [4]:
# Unique authors
data.select(pl.col("author_name").value_counts())

author_name
struct[2]
"{""gr*************el"",1}"
"{""Er********ka"",2}"
"{""va**********on"",1}"
"{""Az*****zu"",2}"
"{""Ha***********************ah"",1}"
"{""FI*********su"",1}"
"{""Pe*******mi"",1}"
"{""Ge************me"",1}"
"{""Mi*********ju"",1}"
"{""Am********th"",9}"


In [5]:
# Unique review rating
data.select(pl.col("review_rating").value_counts(sort=True))

review_rating
struct[2]
"{5,935781}"
"{1,282021}"
"{4,156718}"
"{3,93793}"
"{2,62790}"
"{0,23}"


In [6]:
# Drop review rating of 0
data = data.filter(pl.col("review_rating").ne(0))

# Unique review rating
data.select(pl.col("review_rating").value_counts(sort=True))

review_rating
struct[2]
"{5,935781}"
"{1,282021}"
"{4,156718}"
"{3,93793}"
"{2,62790}"


In [7]:
# Percentage of review rating >=3
data.filter(pl.col("review_rating").gt(2)).select(pl.col("review_rating")).shape[
    0
] / data.shape[0]

0.7747956865083537

In [8]:
# Percentage of review rating <3
data.filter(pl.col("review_rating").lt(3)).select(pl.col("review_rating")).shape[
    0
] / data.shape[0]

0.2252043134916462

In [9]:
# Unique author_app_version
print(data.select(pl.col("author_app_version").n_unique()))
data.select(pl.col("author_app_version").value_counts())

author_app_version
struct[2]
"{""5.8.0 build 24644"",16}"
"{""5.11.0 build 25361"",1}"
"{""8.10.0 build 10 40088"",1}"
"{""8.22.1 build 22 40193"",1934}"
"{""7.7.0 build 20 34181"",2160}"
"{""7.110.0 build 6 35520"",41}"
"{""8.67.1 build 11 50414"",955}"
"{""5.13.1 build 25871"",1460}"
"{""6.7.0 build 28708"",2}"
"{""6.17.0 build 18 31525"",57}"


In [10]:
data.select(pl.col("author_app_version"))

author_app_version
str
"""1.2.0 build 81…"
"""1.2.0 build 81…"
"""1.5.2 build 38…"
"""1.2.1 build 84…"
"""1.5.2 build 38…"
"""1.4.1 build 26…"
"""1.5.2 build 38…"
"""1.2.0 build 81…"
"""1.2.0 build 81…"
"""1.5.2 build 38…"


In [11]:
pattern: str = r"\d{1,4}\.\d{1,4}\.\d{1,4}\s?\w+"

data = data.with_columns(
    pl.col("author_app_version").str.extract(pattern, 0).alias("cleaned_app_version")
)

In [12]:
df: pl.DataFrame = (
    data.group_by(pl.col("cleaned_app_version"))
    .agg(pl.mean("review_rating"))
    .sort(by="cleaned_app_version", descending=False)
)
df.head(10)

cleaned_app_version,review_rating
str,f64
,4.076079
"""1.2.0 build""",4.211864
"""1.2.1 build""",3.819461
"""1.2.2 build""",3.67928
"""1.3.0 build""",3.689362
"""1.4.0 build""",3.036388
"""1.4.1 build""",3.344689
"""1.5.0 build""",3.676969
"""1.5.1 build""",3.417691
"""1.5.2 build""",3.149737


In [13]:
import plotly.express as px


fig = px.line(
    df,
    x="cleaned_app_version",
    y="review_rating",
    title=f"Average Review Rating of App Over Time",
)
fig.show()

In [14]:
data.head(2)

review_id,pseudo_author_id,author_name,review_text,review_rating,review_likes,author_app_version,review_timestamp,cleaned_app_version
str,str,str,str,i64,i64,str,str,str
"""7e73f80e-a8fd-…","""15261855397701…","""A Google user""","""Works great on…",5,1,"""1.2.0 build 81…","""2011-05-12 18:…","""1.2.0 build"""
"""dab55eca-c2a0-…","""23438294286543…","""A Google user""","""Works great on…",5,1,"""1.2.0 build 81…","""2011-05-12 18:…","""1.2.0 build"""


In [15]:
review_data: pl.DataFrame = data.select(["review_text", "review_rating"])

review_data.head()

review_text,review_rating
str,i64
"""Works great on…",5
"""Works great on…",5
"""Works great on…",5
"""Working perfec…",5
"""cm7 2.3.3 N1""",5


In [16]:
from datasets import Dataset


review_dataset: Dataset = Dataset.from_pandas(review_data.to_pandas())
review_dataset

Dataset({
    features: ['review_text', 'review_rating'],
    num_rows: 1531103
})

In [17]:
print(review_dataset[:3])

In [18]:
SEED: int = 45
N: int = 250_000

sample_data: Dataset = review_dataset.shuffle(seed=23).select(range(N))
sample_data

Dataset({
    features: ['review_text', 'review_rating'],
    num_rows: 250000
})

In [19]:
def calculate_length(example: dict[str, Any]) -> dict[str, Any]:
    """This returns the length of the reviews."""
    result: dict[str, Any] = {
        "review_length": [len(x) for x in example.get("review_text")]
    }
    return result

In [20]:
sample_data = sample_data.map(calculate_length, batched=True)
print(sample_data[:3])

Map:   0%|          | 0/250000 [00:00<?, ? examples/s]

In [21]:
# Drop reviews with fewer characters
sample_data = sample_data.filter(lambda x: x.get("review_length") > 10)
sample_data

Filter:   0%|          | 0/250000 [00:00<?, ? examples/s]

Dataset({
    features: ['review_text', 'review_rating', 'review_length'],
    num_rows: 175590
})

In [22]:
save_fp: Path = Path("../../../../data/netflix_cleaned_data.jsonl")

sample_data.to_json(save_fp)

Creating json from Arrow format:   0%|          | 0/176 [00:00<?, ?ba/s]

24094158