In [1]:
import json
import os
from pathlib import Path
from typing import Literal

import instructor
import polars as pl
from llama_cpp import Llama
from openai import OpenAI
from pydantic import BaseModel


In [2]:
# https://www.kaggle.com/datasets/farhan999/tokopedia-product-reviews
dataset_path = Path("datasets/tokopedia-product-reviews-2019.csv")

df = pl.read_csv(dataset_path, ignore_errors=True)
df.head()

Unnamed: 0_level_0,text,rating,category,product_name,product_id,sold,shop_id,product_url
i64,str,i64,str,str,i64,i64,i64,str
1,"""Barang sesuai pesanan dan cepa…",5,"""pertukangan""","""Staples Dekorasi Staples Kayu …",418660637,1,1740837,"""https://www.tokopedia.com/shak…"
2,"""Barang bagus harga murah""",5,"""pertukangan""","""STAPLE GUN ATS 3 WAY TACKER - …",416032545,11,1477109,"""https://www.tokopedia.com/jura…"
3,"""Paket rapi...mantap....cepat..…",5,"""pertukangan""","""STAPLE GUN ATS 3 WAY TACKER - …",416032545,11,1477109,"""https://www.tokopedia.com/jura…"
4,"""ya saya puas dgn barangnya""",5,"""pertukangan""","""ALAT STAPLES TEMBAK &#40;AIR N…",102279869,5,771395,"""https://www.tokopedia.com/kama…"
5,"""Responya luar biasa b mantap""",5,"""pertukangan""","""Isi Refill Staples Jok Kulit M…",190679689,787,969999,"""https://www.tokopedia.com/mitr…"


In [3]:
llama = Llama.from_pretrained(
    repo_id="unsloth/gemma-3-4b-it-GGUF",
    filename="*Q4_K_M.gguf",
    n_gpu_layers=-1,  # offload to GPU
    n_ctx=4096,
    verbose=False,
    seed=42,
)

model_llama = instructor.patch(
    create=llama.create_chat_completion_openai_v1,
    mode=instructor.Mode.JSON_SCHEMA,
)

or_client = OpenAI(
    base_url="https://openrouter.ai/api/v1/",
    api_key=os.getenv("OPENROUTER_API_KEY"),
)
model_or = instructor.from_openai(client=or_client)
or_model_name = "meta-llama/llama-3.3-70b-instruct:free"

llama_context: n_ctx_per_seq (4096) < n_ctx_train (131072) -- the full capacity of the model will not be utilized
ggml_metal_init: skipping kernel_get_rows_bf16                     (not supported)
ggml_metal_init: skipping kernel_set_rows_bf16                     (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_f32                   (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_f32_c4                (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_f32_1row              (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_f32_l4                (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_bf16                  (not supported)
ggml_metal_init: skipping kernel_mul_mv_id_bf16_f32                (not supported)
ggml_metal_init: skipping kernel_mul_mm_bf16_f32                   (not supported)
ggml_metal_init: skipping kernel_mul_mm_id_bf16_f16                (not supported)
ggml_metal_init: skipping kernel_flash_attn_ext_bf16_h64

In [4]:
class ProductReview(BaseModel):
    pros: list[str]
    cons: list[str]
    summary: str
    sentiment: Literal["positive", "negative", "neutral"]


In [5]:
def try_or_dump(txt: str, cls: BaseModel):
    try:
        return cls.model_validate_json(txt)
    except Exception:
        print("Validation failed:")
        print(txt)
        return None


def pprint(obj: BaseModel | None):
    data = json.dumps(obj.model_dump(), indent=2) if obj else None
    print(data)

In [6]:
# review_llama = model_llama(
#     response_model=ProductReview,
#     messages=[
#         {
#             "role": "user",
#             "content": "The XPS 13 has great battery life and a stunning display, but it runs hot and the webcam is poor quality.",
#         }
#     ],
# )

# pprint(review_llama)

In [7]:
try:
    review_or = model_or.chat.completions.create(
        model=or_model_name,
        response_model=ProductReview,
        messages=[
            {
                "role": "user",
                "content": "The XPS 13 has great battery life and a stunning display, but it runs hot and the webcam is poor quality.",
            }
        ],
    )  # ty:ignore[no-matching-overload]

    pprint(review_or)
except Exception as e:
    print("Error:", e)

{
  "pros": [
    "great battery life",
    "stunning display"
  ],
  "cons": [
    "runs hot",
    "poor quality webcam"
  ],
  "summary": "The XPS 13 has both positive and negative aspects",
  "sentiment": "neutral"
}


In [8]:
# sentiment_llama = model_llama(
#     response_model=Literal["positive", "negative", "neutral"],
#     messages=[
#         {
#             "role": "user",
#             "content": "This product completely changed my life!",
#         }
#     ],
# )

# sentiment_llama

In [9]:
sentiment_or = model_or.chat.completions.create(
    model=or_model_name,
    response_model=Literal["positive", "negative", "neutral"],
    messages=[
        {
            "role": "user",
            "content": "This product completely changed my life!",
        }
    ],
)  # ty:ignore[no-matching-overload]

sentiment_or

'positive'

In [10]:
group_rating = (
    df.sample(fraction=1.0, shuffle=True, seed=42)
    .with_columns(pl.col("text").str.replace_all(r"\n", " ").alias("text"))
    .group_by("rating", maintain_order=True)
    .agg(
        pl.col("text").first(),
    )
    .sort("rating")
)
group_rating.head()

rating,text
i64,str
1,"""Cacat produk nya gak bisa nyal…"
2,"""Barang baru dipakai beberapa h…"
3,"""Harga cukup murah di flash sal…"
4,"""Barang sudah sampai di rumah d…"
5,"""Barang ori sesuai pesanan, ter…"


In [11]:
for text in group_rating["text"]:
    print("Review: ", text)
    try:
        review = model_or.chat.completions.create(
            model=or_model_name,
            response_model=ProductReview,
            messages=[
                {
                    "role": "user",
                    "content": text,
                }
            ],
        )  # ty:ignore[no-matching-overload]
    except Exception:
        review = None
    pprint(review)
    print("-" * 20)

Review:  Cacat produk nya gak bisa nyala padahal baru beli
{
  "pros": [],
  "cons": [
    "produk tidak bisa nyala"
  ],
  "summary": "Produk baru tidak bisa nyala",
  "sentiment": "negative"
}
--------------------
Review:  Barang baru dipakai beberapa hari tidak bisa di clear bekas coretannya.
{
  "pros": [],
  "cons": [
    "tidak bisa di clear bekas coretannya"
  ],
  "summary": "Barang baru dipakai beberapa hari tidak bisa di clear bekas coretannya",
  "sentiment": "negative"
}
--------------------
Review:  Harga cukup murah di flash sale, kulit sintetis, sy kira semua produk brodo genuine leather
{
  "pros": [
    "Harga cukup murah di flash sale"
  ],
  "cons": [
    "Kulit sintetis, bukan genuine leather"
  ],
  "summary": "Produk ini memiliki harga yang murah, tapi menggunakan kulit sintetis bukan genuine leather",
  "sentiment": "negative"
}
--------------------
Review:  Barang sudah sampai di rumah dengan selamat. Dari matras dan tas matrasnya juga sama persis seperti di gam

In [12]:
for text in group_rating["text"]:
    print("Review: ", text)
    try:
        review = model_llama(
            response_model=ProductReview,
            messages=[
                {
                    "role": "user",
                    "content": text,
                }
            ],
        )
    except Exception:
        review = None
    pprint(review)
    print("-" * 20)

Review:  Cacat produk nya gak bisa nyala padahal baru beli
None
--------------------
Review:  Barang baru dipakai beberapa hari tidak bisa di clear bekas coretannya.
None
--------------------
Review:  Harga cukup murah di flash sale, kulit sintetis, sy kira semua produk brodo genuine leather
None
--------------------
Review:  Barang sudah sampai di rumah dengan selamat. Dari matras dan tas matrasnya juga sama persis seperti di gambar yg diiklankan. Overall puas belanja di lapak ini. Terima kasih yaaa~
None
--------------------
Review:  Barang ori sesuai pesanan, terima kasih
None
--------------------
