In [1]:
import json
import os
from pathlib import Path
from typing import Any, Literal

import outlines
import polars as pl
from llama_cpp import Llama
from openai import OpenAI
from pydantic import BaseModel

In [2]:
# https://www.kaggle.com/datasets/farhan999/tokopedia-product-reviews
dataset_path = Path("datasets/tokopedia-product-reviews-2019.csv")

df = pl.read_csv(dataset_path, ignore_errors=True)
df.head()

Unnamed: 0_level_0,text,rating,category,product_name,product_id,sold,shop_id,product_url
i64,str,i64,str,str,i64,i64,i64,str
1,"""Barang sesuai pesanan dan cepa…",5,"""pertukangan""","""Staples Dekorasi Staples Kayu …",418660637,1,1740837,"""https://www.tokopedia.com/shak…"
2,"""Barang bagus harga murah""",5,"""pertukangan""","""STAPLE GUN ATS 3 WAY TACKER - …",416032545,11,1477109,"""https://www.tokopedia.com/jura…"
3,"""Paket rapi...mantap....cepat..…",5,"""pertukangan""","""STAPLE GUN ATS 3 WAY TACKER - …",416032545,11,1477109,"""https://www.tokopedia.com/jura…"
4,"""ya saya puas dgn barangnya""",5,"""pertukangan""","""ALAT STAPLES TEMBAK &#40;AIR N…",102279869,5,771395,"""https://www.tokopedia.com/kama…"
5,"""Responya luar biasa b mantap""",5,"""pertukangan""","""Isi Refill Staples Jok Kulit M…",190679689,787,969999,"""https://www.tokopedia.com/mitr…"


In [3]:
llama = Llama.from_pretrained(
    repo_id="unsloth/gemma-3-4b-it-GGUF",
    filename="*Q4_K_M.gguf",
    n_gpu_layers=-1,  # offload to GPU
    n_ctx=4096,
    verbose=False,
    seed=42,
)
model_llama = outlines.from_llamacpp(llama)


or_client = OpenAI(
    base_url="https://openrouter.ai/api/v1/",
    api_key=os.getenv("OPENROUTER_API_KEY"),
)

model_or = outlines.from_openai(
    client=or_client,
    model_name="meta-llama/llama-3.3-70b-instruct:free",
)

llama_context: n_ctx_per_seq (4096) < n_ctx_train (131072) -- the full capacity of the model will not be utilized
ggml_metal_init: skipping kernel_get_rows_bf16                     (not supported)
ggml_metal_init: skipping kernel_set_rows_bf16                     (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_f32                   (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_f32_c4                (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_f32_1row              (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_f32_l4                (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_bf16                  (not supported)
ggml_metal_init: skipping kernel_mul_mv_id_bf16_f32                (not supported)
ggml_metal_init: skipping kernel_mul_mm_bf16_f32                   (not supported)
ggml_metal_init: skipping kernel_mul_mm_id_bf16_f16                (not supported)
ggml_metal_init: skipping kernel_flash_attn_ext_bf16_h64

In [4]:
class ProductReview(BaseModel):
    pros: list[str]
    cons: list[str]
    summary: str
    sentiment: Literal["positive", "negative", "neutral"]


In [5]:
def try_or_dump(txt: Any, cls: type[BaseModel]) -> BaseModel | None:
    try:
        return cls.model_validate_json(txt)
    except Exception:
        print("Validation failed:")
        print(txt)
        return None


def pprint(obj: BaseModel | None):
    data = json.dumps(obj.model_dump(), indent=2) if obj else None

    print(data)

In [6]:
review_llama_string = model_llama(
    "Review: The XPS 13 has great battery life and a stunning display, but it runs hot and the webcam is poor quality.",
    ProductReview,
)

review_llama = try_or_dump(review_llama_string, ProductReview)

pprint(review_llama)

Validation failed:
{ "pros": [ "Excellent battery life", "Beautiful display with
None


In [7]:
review_or_string = model_or(
    "Review: The XPS 13 has great battery life and a stunning display, but it runs hot and the webcam is poor quality.",
    ProductReview,
)

review_or = try_or_dump(review_or_string, ProductReview)

pprint(review_or)

{
  "pros": [
    "great battery life",
    "stunning display"
  ],
  "cons": [
    "runs hot",
    "poor quality webcam"
  ],
  "summary": "The XPS 13 has both positive and negative aspects. On the positive side, it offers great battery life and a stunning display. However, it has some drawbacks, including running hot and having a poor-quality webcam.",
  "sentiment": "neutral"
}


In [8]:
# error: cannot use Literal
# sentiment = model_or(
#     "Analyze: 'This product completely changed my life!'",
#     Literal["Positive", "Negative", "Neutral"]
# )
# sentiment

In [9]:
sentiment = model_llama(
    "Analyze: 'This product completely changed my life!'",
    Literal["Positive", "Negative", "Neutral"],
)
sentiment

'Positive'

In [10]:
group_rating = (
    df.sample(fraction=1.0, shuffle=True, seed=42)
    .with_columns(pl.col("text").str.replace_all(r"\n", " ").alias("text"))
    .group_by("rating", maintain_order=True)
    .agg(
        pl.col("text").first(),
    )
    .sort("rating")
)
group_rating.head()

rating,text
i64,str
1,"""Cacat produk nya gak bisa nyal…"
2,"""Barang baru dipakai beberapa h…"
3,"""Harga cukup murah di flash sal…"
4,"""Barang sudah sampai di rumah d…"
5,"""Barang ori sesuai pesanan, ter…"


In [11]:
for text in group_rating["text"]:
    print("Review: ", text)
    review_str = model_llama(text, ProductReview)
    review = try_or_dump(review_str, ProductReview)
    pprint(review)
    print("-" * 20)

Review:  Cacat produk nya gak bisa nyala padahal baru beli
Validation failed:
{ "pros" : ["Product defect,
None
--------------------
Review:  Barang baru dipakai beberapa hari tidak bisa di clear bekas coretannya.
Validation failed:
{ "pros" :["{" ],"c
None
--------------------
Review:  Harga cukup murah di flash sale, kulit sintetis, sy kira semua produk brodo genuine leather
Validation failed:
{ "pros": ["23-07-202
None
--------------------
Review:  Barang sudah sampai di rumah dengan selamat. Dari matras dan tas matrasnya juga sama persis seperti di gambar yg diiklankan. Overall puas belanja di lapak ini. Terima kasih yaaa~
Validation failed:
{ "pros" :["t" ] , "
None
--------------------
Review:  Barang ori sesuai pesanan, terima kasih
Validation failed:
{ "pros" : ["s","s
None
--------------------


In [12]:
for text in group_rating["text"]:
    print("Review: ", text)
    review_str = model_or(text, ProductReview)
    review = try_or_dump(review_str, ProductReview)
    pprint(review)
    print("-" * 20)

Review:  Cacat produk nya gak bisa nyala padahal baru beli
{
  "pros": [],
  "cons": [
    "Kualitas produk yang buruk",
    "Produk tidak berfungsi sebagaimana mestinya"
  ],
  "summary": "Saya baru saja membeli produk ini, tapi sangat kecewa karena produknya tidak bisa dinyalakan. Ini menunjukkan kualitas produk yang buruk dan tidak sesuai dengan harapan. Saya berharap bisa mendapatkan pengembalian uang atau produk pengganti yang berfungsi dengan baik.",
  "sentiment": "negative"
}
--------------------
Review:  Barang baru dipakai beberapa hari tidak bisa di clear bekas coretannya.
{
  "pros": [],
  "cons": [
    "Barang baru dipakai beberapa hari tidak bisa di clear bekas coretannya."
  ],
  "summary": "Barang baru yang digunakan beberapa hari sudah menunjukkan tanda-tanda aus, seperti bekas coretan yang tidak bisa dihilangkan, menandakan kualitas barang yang kurang baik.",
  "sentiment": "negative"
}
--------------------
Review:  Harga cukup murah di flash sale, kulit sintetis, sy 