In [1]:
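# OPTIONAL: uncomment to supply MLflow tracking credentials (only needed if the tracking server requires authentication).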
# import os
# os.environ["MLFLOW_TRACKING_USERNAME"] = ""
# os.environ["MLFLOW_TRACKING_PASSWORD"] = ""

In [2]:
# OPTIONAL: uncomment to log DSPy calls to an MLflow tracking server.
# import mlflow
# mlflow.set_tracking_uri("http://10.20.20.102:8009")
# mlflow.set_experiment("DSPy Quickstart")
# mlflow.dspy.autolog()

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from typing import Literal, Optional

import dspy
import pandas as pd

from rich import print as print_pretty

## Data Loading

In [4]:
# Read the product catalogue; `lines=True` means one JSON object per line.
df = pd.read_json(
    path_or_buf="../data/clean/ecom-products.jsonl",
    lines=True,
)
# Show the first five rows as this cell's output.
df.head(5)

Unnamed: 0,id,product_name,initial_price,final_price,seller_name,description,url,extra_descripton
0,6f6d353462836c7b0a641413a78eec014dd9528bffb1d3...,MSI THIN 15 B12UC RTX3050 I5-12450H 8GB 512GB ...,14399000,9799000,AGRES ID,PROMO SPECIAL !!\nFREE SPEAKER HOME THEATER (S...,https://www.tokopedia.com/agresid/msi-thin-15-...,
1,2cf4400e5eb525758f9f4e4e31a9976ec20343299065bd...,MSI KATANA 15 B13VEK i7-13620H 16GB 1TB SSD RT...,19999000,16999000,MSI Official Store,Katana 15 B13VEK-1851ID - INCLUDE OHS2021\n\nB...,https://www.tokopedia.com/msi-official/msi-kat...,
2,d0c5989db9761d9b0b1eca7a0d8f3d1266cdef99a25755...,MSI KATANA 15 B13VFK i7-13620H 16GB 1TB SSD RT...,22599000,18699000,MSI Official Store,SKU\tKatana 15 B13VFK-1850D \n\nBundle office ...,https://www.tokopedia.com/msi-official/msi-kat...,
3,1f7a82c3880c3b54b98408ed062fea43e9aaa27ab15577...,MSI Thin 15 B12UC i7-12650H RTX 3050 4GB 512GB...,14999000,11199000,MSI Official Store,SKU\tThin 15 B12UC-2405ID\n\nSpesifikasi:\n\nD...,https://www.tokopedia.com/msi-official/msi-thi...,
4,a6e5daed20d7e527d85a2d32d37775cf68bdb07f137624...,MSI Thin 15 B13VE i7-13620H RTX4050 16GB 512GB...,17999000,15699000,MSI Official Store,SKU\tThin 15 B13VE-2406ID\n\nWarna\t:Cosmos Gr...,https://www.tokopedia.com/msi-official/msi-thi...,


In [5]:
def get_product_name(i: int, frame: Optional[pd.DataFrame] = None) -> str:
    """Return the product name of the row at position ``i``.

    Args:
        i: Zero-based row position (``iloc`` semantics, so negative values
            count from the end).
        frame: Table to read from; it must have a ``product_name`` column.
            When omitted, the notebook-level ``df`` is used.

    Returns:
        The ``product_name`` value of the selected row.

    Raises:
        IndexError: If ``i`` is outside the table.
        KeyError: If the table has no ``product_name`` column.
    """
    source = df if frame is None else frame
    # Select the column by label instead of by position so the lookup keeps
    # returning the name even if columns are added or reordered upstream.
    return source["product_name"].iloc[i]

def get_product_desc(i: int, frame: Optional[pd.DataFrame] = None) -> str:
    """Return the product name and description of row ``i`` as one text.

    The name comes first, followed by a newline and the description. If the
    description is missing (null/NaN), only the name is returned instead of
    failing on the string concatenation.

    Args:
        i: Zero-based row position (``iloc`` semantics).
        frame: Table to read from; it must have ``product_name`` and
            ``description`` columns. When omitted, the notebook-level ``df``
            is used.

    Returns:
        The combined text for the selected row.

    Raises:
        IndexError: If ``i`` is outside the table.
        KeyError: If a required column is absent.
    """
    source = df if frame is None else frame
    # Columns are addressed by label rather than by position so the result
    # does not silently change when the column order does.
    name = source["product_name"].iloc[i]
    description = source["description"].iloc[i]
    if pd.isna(description):
        return name
    return name + "\n" + description

## DSPy Init

In [7]:
# Language model handle: model id plus the base URL of the local server.
# The API key is passed as an empty string.
lm = dspy.LM(
    "ollama_chat/llama3.1:latest",
    api_base="http://localhost:7869",
    api_key="",
)
# Register it as the default LM for every DSPy module created below.
dspy.configure(lm=lm)

## Product Categorization

In [8]:
# Signature for the product-category classifier: one text input
# (`description`) and one output (`category`) restricted to three labels.
# NOTE(review): DSPy is understood to use the class docstring as the task
# instruction in the prompt, so treat it as runtime text - confirm before
# rewording it.
class ProductCategoryClassification(dspy.Signature):
    """Classify electronic product category of a given description."""

    # Free-text product description (the notebook passes a product name here).
    description: str = dspy.InputField()
    # Predicted label; must be exactly one of the listed literals.
    category: Literal['LAPTOP', 'COMPUTER', 'OTHER'] = dspy.OutputField()

In [9]:
# Load the few-shot examples; `lines=True` means one JSON object per line.
df_product_category_fewshot = pd.read_json("../data/inputs/fewshot-product-category.jsonl", lines=True)

# `description` is the only input field of ProductCategoryClassification.
# Every other column (the label, and presumably a `reasoning` text) stays on
# the label side, so `example.inputs()` yields only what the classifier
# accepts as arguments.
product_category_trainset = [
    dspy.Example(**record).with_inputs("description")
    for record in df_product_category_fewshot.to_dict(orient="records")
]

In [10]:
# Optimizer that compiles a program against the labelled examples.
optimizer = dspy.LabeledFewShot()

# No `hint` argument is ever passed to this classifier in the notebook, so the
# hint-aware variant adds nothing; use plain ChainOfThought, the same module
# type the specification extractor uses.
clf_product_category = dspy.ChainOfThought(ProductCategoryClassification)
# Compiled counterpart of the classifier, built from the few-shot trainset.
clf_product_category_opt = optimizer.compile(clf_product_category, trainset=product_category_trainset)

In [11]:
# Row 1021 is the sample listing used to compare the two classifiers.
product_name = get_product_name(1021)

# Baseline prediction first, then the few-shot one, then the input text.
print_pretty(
    dict(
        [
            ("vanilla", clf_product_category(description=product_name)),
            ("few-shot", clf_product_category_opt(description=product_name)),
            ("text", product_name),
        ]
    )
)

## Specification Extraction

In [15]:
# Signature for the specification extractor: one text input (`description`)
# and six optional text outputs describing the laptop.
# NOTE(review): DSPy is understood to use the class docstring and the `desc`
# texts as part of the prompt, so they are runtime text - confirm before
# rewording them.
class LaptopSpecification(dspy.Signature):
    """Extracts laptop technical specification."""

    # Product text to extract from (the notebook passes name + description).
    description: str = dspy.InputField()

    brand: Optional[str] = dspy.OutputField(
        desc="Brand or manufacturer, for example: Acer, ASUS, Merdeka, Libera, Lenovo, MSI, SPC, etc. If the device has no recognizable brand, return Unknown",
    )
    # Uses `desc=` like every other field (it was `description=` before).
    model: Optional[str] = dspy.OutputField(
        desc="Model or type, for example: Thin 15 B12UC, Katana 15, Cyborg 15, X454YA, P214, C120, A514, etc.",
    )
    processor: Optional[str] = dspy.OutputField(
        desc="CPU brand, model number, and generation"
    )
    memory: Optional[str] = dspy.OutputField(
        desc="Total memory or RAM, in GigaBytes (GB). If the memory is in TeraBytes (TB), convert it to GB first"
    )
    storage: Optional[str] = dspy.OutputField(
        desc="Total disk storage including hard drives, SSDs, and other mass storage devices",
    )
    graphics_card: Optional[str] = dspy.OutputField(
        desc="GPU name, for example: Nvidia RTX 3060, Intel UHD Graphics, etc.",
    )

In [16]:
# Build the extractor module from the LaptopSpecification signature.
spec_extractor = dspy.ChainOfThought(LaptopSpecification)

In [17]:
# Use the first catalogue row (name + description) as the extraction sample.
product_desc = get_product_desc(0)

# Show the extracted fields next to the text they were extracted from.
print_pretty(
    dict(
        [
            ("extracted", spec_extractor(description=product_desc)),
            ("text", product_desc),
        ]
    )
)