In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col
from pyspark.sql.types import StringType
import requests
import json
import logging

In [2]:
# Ollama API endpoint
OLLAMA_ENDPOINT = "http://host.docker.internal:11434/api/generate"

# Configure logging at INFO level and create the logger used by this notebook
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [3]:
# Create (or reuse) the Spark session for this notebook
spark = (
    SparkSession.builder
    .appName("ReadCSV")
    .getOrCreate()
)

# Read the products CSV (header row, inferred column types) and keep a
# single row
df = (
    spark.read
    .csv("recommendation_system.products.csv", header=True, inferSchema=True)
    .limit(1)
)


In [5]:
# UDF that calls the Ollama API to generate a product description
@udf(returnType=StringType())
def generate_description(name):
    """Generate a product description for ``name`` via the Ollama HTTP API.

    Args:
        name: Value of the product-name column for the current row.

    Returns:
        str: One of
            * ``"No name provided"`` when ``name`` is ``None`` or otherwise
              falsy;
            * the JSON-encoded ``"response"`` field of the API reply, with
              line breaks in the text replaced by spaces (the JSON-encoded
              placeholder ``"No response found"`` if the field is missing);
            * ``"Error generating description"`` when the request fails, the
              reply body cannot be decoded as JSON, or the decoded body is
              not a JSON object.

    Request and decoding failures are logged and mapped to the error string
    instead of being raised.
    """
    if not name:  # None or empty name: nothing to send to the model
        return "No name provided"

    logger.info(f"Generating description for: {name}")

    payload = {
        "model": "llama2:7b",
        "prompt": f"Generate product description based on the name: {name}",
        "stream": False,
        "format": "json"
    }

    try:
        response = requests.post(
            OLLAMA_ENDPOINT,
            json=payload,
            timeout=10
        )
        response.raise_for_status()
        response_json = response.json()
    except (requests.RequestException, ValueError) as e:
        # Depending on the installed `requests` version, response.json() can
        # signal an undecodable body with a ValueError subclass that is not a
        # RequestException, so both are handled here.
        logger.error(f"Request failed: {e}")
        return "Error generating description"

    if not isinstance(response_json, dict):
        # Only a JSON object has a "response" field to read.
        logger.error(f"Unexpected API response: {response_json!r}")
        return "Error generating description"

    logger.info(f"Full API response: {json.dumps(response_json, indent=2)}")

    text = response_json.get("response", "No response found")
    if isinstance(text, str):
        # Flatten line breaks BEFORE encoding: json.dumps() escapes a newline
        # as the two characters backslash + "n", so a replace("\n", " ") on
        # the encoded string can never match.
        text = text.replace("\n", " ")
    return json.dumps(text)

In [None]:
# Add the generated description as a new column
df = df.withColumn("new_description", generate_description(col("name")))

# Optional preview of the new column
# df.select("name", "new_description").show(truncate=False)

# Write the result as CSV; coalesce(1) puts the data in one partition so a
# single output file is produced
(
    df.coalesce(1)
    .write
    .mode("overwrite")
    .option("header", True)
    .option("quote", "\"")
    .option("escape", "\"")
    .csv("product_output")
)

# Stop the Spark session
spark.stop()