# Experimental Sandbox

This sandbox notebook is designed for **exploring, testing, and debugging** the NLP pipeline.
Use the sections below to:
- Verify project paths
- Load modules dynamically
- Run classification functions interactively
- Experiment with data snippets
- Prototype new prompts
---

### Importing packages and setting up NLP modules

In [None]:
import os
from pathlib import Path
from datetime import datetime
import pandas as pd

# Add the project root (the parent of the current working directory) to
# sys.path so the `process.*` packages can be imported from this sandbox.
import sys

PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), ".."))
# Guarded so that re-running this cell does not stack duplicate sys.path entries.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

NOTEBOOK_DIR = Path.cwd()

# All data/output locations are derived from the project root.
NLP_ROOT = Path(PROJECT_ROOT)
INGEST_DATA_DIR = NLP_ROOT / "ingest" / "data"
OUTPUT_DIR = NLP_ROOT / "output"

print("NOTEBOOK_DIR:", NOTEBOOK_DIR)
print("PROJECT_ROOT:", PROJECT_ROOT)
print("INGEST_DATA:", INGEST_DATA_DIR)
print("OUTPUT_DIR :", OUTPUT_DIR)




NOTEBOOK_DIR: /Users/jjun44/Documents/GTRI/CDCF/cdcf_web_scraping_vape_products_project/nlp/sandbox
PROJECT_ROOT: /Users/jjun44/Documents/GTRI/CDCF/cdcf_web_scraping_vape_products_project/nlp
INGEST_DATA: /Users/jjun44/Documents/GTRI/CDCF/cdcf_web_scraping_vape_products_project/nlp/ingest/data
OUTPUT_DIR : /Users/jjun44/Documents/GTRI/CDCF/cdcf_web_scraping_vape_products_project/nlp/output


In [None]:
# Now import via the full package path

# Product features
from process.regex.classify import classify_regex_df
from process.product_type.classify import classify_product_category_df
from process.cbd.classify import classify_cbd_df
from process.tfn.classify import classify_tfn_df
# Flavor features
from process.flavor_classify.classify import classify_flavor_df
from process.other_flavor_classify.classify import classify_other_flavor_df

### Core functions for use

In [10]:
def load_dataset(filename: str, dtype=str) -> pd.DataFrame:
    """Read a CSV from ``INGEST_DATA_DIR`` and report its path and shape.

    Args:
        filename: File name, resolved relative to ``INGEST_DATA_DIR``.
        dtype: Forwarded unchanged to ``pandas.read_csv`` (defaults to ``str``).

    Returns:
        The loaded DataFrame.
    """
    csv_path = INGEST_DATA_DIR / filename
    print(f"Loading {csv_path}")
    loaded = pd.read_csv(csv_path, dtype=dtype)
    print(f"Shape: {loaded.shape}")
    return loaded

def preview_df(df: pd.DataFrame, n: int = 5):
    """Show a quick overview of a DataFrame: first rows, column names, null counts.

    Args:
        df: Frame to inspect.
        n: Number of leading rows to display.
    """
    first_rows = df.head(n)
    display(first_rows)

    column_names = df.columns.tolist()
    print("\nColumns:\n", column_names)

    print("\nNull counts:")
    # Per-column null totals, largest first, limited to the top 20 columns.
    nulls_per_column = df.isna().sum()
    worst_first = nulls_per_column.sort_values(ascending=False)
    display(worst_first.head(20))

#### You can run either the product pipeline or the flavor pipeline on different datasets. Adjust the steps to your needs, and feel free to use a custom function!
##### NOTE: Your custom function must be declared before running the cell below and added to the sandbox_custom_pipeline function.

In [None]:
def sandbox_product_pipeline(df: pd.DataFrame) -> pd.DataFrame:
    """Run the product classifiers over ``df`` one after another.

    The frame's shape is printed before the first step and after the last.
    Each step receives the frame returned by the previous one.

    Args:
        df: Input product frame.

    Returns:
        The frame returned by the last classifier step.
    """
    print(f"Initial shape: {df.shape}")
    ### TODO: REMOVE ANY PROCESSING STEPS NOT NEEDED FOR TESTING ###
    staged = classify_regex_df(df)
    staged = classify_product_category_df(staged)
    staged = classify_cbd_df(staged)
    staged = classify_tfn_df(staged)
    print(f"Final shape: {staged.shape}")
    return staged

def sandbox_flavor_pipeline(df: pd.DataFrame) -> pd.DataFrame:
    """Run the flavor classifiers over ``df`` one after another.

    The frame's shape is printed before the first step and after the last.

    Args:
        df: Input flavor frame.

    Returns:
        The frame returned by the last classifier step.
    """
    print(f"Initial shape: {df.shape}")
    ### TODO: REMOVE ANY PROCESSING STEPS NOT NEEDED FOR TESTING ###
    staged = classify_flavor_df(df)
    staged = classify_other_flavor_df(staged)
    print(f"Final shape: {staged.shape}")
    return staged

def sandbox_custom_pipeline(df: pd.DataFrame) -> pd.DataFrame:
    """Placeholder pipeline for user-defined steps.

    As written it applies no processing: it prints the frame's shape twice
    and hands the same frame back.

    Args:
        df: Input frame.

    Returns:
        ``df`` itself.
    """
    print(f"Initial shape: {df.shape}")
    ### TODO: ADD CUSTOM PROCESSING STEPS HERE ###
    print(f"Final shape: {df.shape}")
    return df

### Set-up and run code

#### PRODUCT

In [23]:
# Replace with the actual sample file name (resolved inside INGEST_DATA_DIR).
SAMPLE_PRODUCT_FILE = "sample_10_products.csv"

# Load the sample with str dtype, then inspect it before running any pipeline.
product_df = load_dataset(filename=SAMPLE_PRODUCT_FILE, dtype=str)
preview_df(df=product_df)

Loading /Users/jjun44/Documents/GTRI/CDCF/cdcf_web_scraping_vape_products_project/nlp2/ingest/data/sample_10_products.csv
Shape: (10, 15)


Unnamed: 0,id,product_name,description,site_name,site_category,site_tag,brand,eliquid_contents,nicotine_level_text,NICOTINE_LEVEL,NIC_LEVEL_123,ELIQUID_CONTENT,PRODUCT_CATEGORY,CBD,NICOTINE_FREE
0,12829,Rincoe Neso ET Pod System Kit 1400mAh 30W,Rincoe Neso ET Pod System Kit PACKAGE LIST\nRi...,VAPE_SOURCING,starter-kit,rincoe-neso-et-kit,,5ml,,,,3.5ml,Closed System,0,
1,13285,Diamond Glass DG-1002 Water Pipe - (Clear Mans...,Diamond Glass DG-1002 Water Pipe - (Clear Mans...,VAPE_SOURCING,bongs-water-pipes,diamond-glass-dg-1002-water-pipe,Diamond Glass,,,,,,Accessories,0,
2,9042,Pod Juice Strawberry Apple Watermelon Freeze,Pod Juice Strawberry Apple Watermelon Freeze P...,VAPE_WH,e-liquid,pod-juice-strawberry-apple-watermelon-freeze,,,,Levels,3mg / 6mg / 12mg,,E-Liquid,0,0.0
3,13035,ThunderHead Creations Blaze Solo RDA Glass Cap,ThunderHead Creations Blaze Solo RDA Glass Cap...,VAPE_SOURCING,accessories,thunderhead-creations-blaze-solo-rda-glass-cap,ThunderHead Creations,,,,,,Accessories,0,
4,7564,Esco Bars Mesh 2500 Lychee Mango,Highlights \nBattery: 1000mAh\nNicotine: 5%\n2...,GETPOP,all,esco-bars-mesh-2500-lychee-mango,,6ml,0.05,0.05,,6ml,Disposable System,0,0.0



Columns:
 ['id', 'product_name', 'description', 'site_name', 'site_category', 'site_tag', 'brand', 'eliquid_contents', 'nicotine_level_text', 'NICOTINE_LEVEL', 'NIC_LEVEL_123', 'ELIQUID_CONTENT', 'PRODUCT_CATEGORY', 'CBD', 'NICOTINE_FREE']

Null counts:


NIC_LEVEL_123          9
nicotine_level_text    8
NICOTINE_LEVEL         7
NICOTINE_FREE          7
brand                  5
eliquid_contents       4
ELIQUID_CONTENT        4
id                     0
product_name           0
description            0
site_name              0
site_category          0
site_tag               0
PRODUCT_CATEGORY       0
CBD                    0
dtype: int64

In [None]:
# The pipeline receives a copy of product_df rather than the loaded frame itself.
product_df_processed = sandbox_product_pipeline(df=product_df.copy())
display(product_df_processed.head(n=5))

#### FLAVOR

In [None]:
SAMPLE_FLAVOR_FILE = "sample_10_products_flavor.csv"

# Only the file load is guarded. A FileNotFoundError raised later, inside the
# preview or the pipeline, is a different problem and must not be reported as
# a missing sample file.
try:
    flavor_df = load_dataset(SAMPLE_FLAVOR_FILE, dtype=str)
except FileNotFoundError:
    print(f"Missing flavor file: {SAMPLE_FLAVOR_FILE}")
else:
    preview_df(flavor_df)
    flavor_df_processed = sandbox_flavor_pipeline(flavor_df.copy())
    display(flavor_df_processed.head())

In [None]:
# One timestamp shared by every file written from this cell.
ts = datetime.now().strftime("%Y%m%d_%H%M%S")

# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists before writing.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Each frame is saved only if the corresponding cell above has been run.
if 'product_df_processed' in globals():
    out_path = OUTPUT_DIR / f"sandbox_product_output_{ts}.csv"
    product_df_processed.to_csv(out_path, index=False)
    print("Saved product output:", out_path)

if 'flavor_df_processed' in globals():
    out_path = OUTPUT_DIR / f"sandbox_flavor_output_{ts}.csv"
    flavor_df_processed.to_csv(out_path, index=False)
    print("Saved flavor output:", out_path)

## Scratch Space

Use extra cells below for ad-hoc regex tweaks, prompt prototypes, or row-level inspection. A template is provided for use.

In [None]:
# PROMPT SANDBOX — MODIFY THIS SECTION TO CHANGE MODEL BEHAVIOR
# Tips:
# - Edit instructions to adjust how strict/loose the classifier should be.
# - Add or remove examples to influence how the model generalizes.
# - Keep the JSON schema consistent so the parser doesn’t break.
# - Use relevant data features (i.e. product_name, description) to give the model full context.
# - This template is filled with str.format(): write every literal brace as {{ or }}.
#   Only {product_name} and {description} are placeholders.

my_prompt = """
You are an expert vape-product materials classifier. Your job is to determine the 
primary **material** of a vape device based on its product name and description.

### VALID MATERIAL CATEGORIES
- METAL
- GLASS
- OTHER

### CLASSIFICATION INSTRUCTIONS
1. Choose *only one* of the material categories.
2. Consider tank, body, housing, and main components.
3. Use key signals such as:
   - Metal: stainless steel, alloy, aluminum, titanium
   - Glass: glass tank, Pyrex, clear glass chamber
   - Other: plastic, silicone, unknown, disposable not specifying materials

### EXAMPLE FORMAT
### Example 1
Product Name: "Stainless Steel Pod System"
Description: "Full metal body with refillable tank."
Output:
{{
  "material": "METAL",
  "confidence": "high",
  "reasoning": "Contains 'stainless steel' and 'metal body', clear metal signals."
}}

### Example 2
Product Name: "Glass Tank Cartridge"
Description: "Clear Pyrex chamber for oils."
Output:
{{
  "material": "GLASS",
  "confidence": "high",
  "reasoning": "Mentions 'glass' and 'Pyrex', strong glass indicators."
}}

### OUTPUT FORMAT (MUST BE VALID JSON)
{{
    "material": "METAL | GLASS | OTHER",
    "confidence": "high | low",
    "reasoning": "short explanation referencing key words"
}}

### PRODUCT INFORMATION
Product Name: "{product_name}"
Description: "{description}"

JSON Output:
"""

In [None]:
import json, requests, pandas as pd
import logging

# Model name and local generate endpoint used by the classification helpers.
API_URL = "http://localhost:11434/api/generate"
MODEL = "llama3.1:8b"  

# Basic logging setup: INFO level, with timestamp and level in each record.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

def create_prompt(product_name, description=""):
    """Create the classification prompt for one product.

    Fills the ``my_prompt`` template's ``product_name`` and ``description``
    placeholders via ``str.format``.

    Args:
        product_name: Value substituted for ``{product_name}``.
        description: Value substituted for ``{description}``.

    Returns:
        The filled template, preceded by a newline plus four spaces and
        followed by a newline.
    """
    # TODO: Change prompt as desired
    filled_template = my_prompt.format(product_name=product_name, description=description)
    return "\n    " + filled_template + "\n"


def classify_prompt(product_name, description=""):
    """Send one product to the model and return its parsed classification.

    The prompt is built with ``create_prompt`` and POSTed to ``API_URL`` with
    ``MODEL``, asking for a non-streamed JSON response.

    Args:
        product_name: Product name inserted into the prompt.
        description: Product description inserted into the prompt.

    Returns:
        A dict with the keys ``"category"``, ``"confidence"`` and
        ``"reasoning"``. ``"category"`` carries the label the model returned
        under ``"material"`` (the key the prompt template asks for), falling
        back to a ``"category"`` key if that is what the model produced.
        Request failures, unparseable JSON and any other error are logged and
        reported as ``"category": "Error"`` with ``"confidence": "low"``.
    """
    prompt = create_prompt(product_name, description)

    payload = {
        "model": MODEL,
        "prompt": prompt,
        "stream": False,
        "format": "json"  # Request structured output
    }

    # Bound before the try block so the JSON error handler can always log it.
    raw_result = ""

    try:
        response = requests.post(API_URL, json=payload, timeout=60)
        response.raise_for_status()

        raw_result = response.json().get("response", "{}")

        # Clean markdown fences if Ollama adds them. Strip first so that
        # surrounding whitespace cannot hide a fence from the checks below.
        cleaned = raw_result.strip()
        if cleaned.startswith("```json"):
            cleaned = cleaned[7:]
        elif cleaned.startswith("```"):
            cleaned = cleaned[3:]
        if cleaned.endswith("```"):
            cleaned = cleaned[:-3]

        result_data = json.loads(cleaned.strip())

        # The prompt's schema names the label "material"; reading only
        # "category" would turn every successful answer into "Error".
        label = result_data.get("material", result_data.get("category", "Error"))

        return {
            "category": label,
            "confidence": result_data.get("confidence", "low"),
            "reasoning": result_data.get("reasoning", "No reasoning provided.")
        }

    except requests.exceptions.RequestException as e:
        logging.error(f"API request failed for '{product_name}': {e}")
        return {"category": "Error", "confidence": "low", "reasoning": str(e)}

    except json.JSONDecodeError as e:
        logging.error(f"Failed to parse JSON response for '{product_name}': {e}")
        logging.debug(f"Raw response: {raw_result}")
        return {"category": "Error", "confidence": "low", "reasoning": "Invalid JSON returned by model."}

    except Exception as e:
        logging.error(f"Unexpected error for '{product_name}': {e}")
        return {"category": "Error", "confidence": "low", "reasoning": str(e)}

def classify_df(df, name_col="product_name", description_col="description"):
    """Classify every row of ``df`` with ``classify_prompt``.

    Args:
        df: Frame with one product per row.
        name_col: Column holding the product name.
        description_col: Column holding the product description.

    Returns:
        ``df`` with its index reset, with one column per key of the dicts
        returned by ``classify_prompt`` appended on the right.
    """
    def _blank_if_missing(value):
        # Missing cells arrive as None or float NaN. Sending them on would
        # put the literal text "None"/"nan" into the prompt.
        if value is None:
            return ""
        if isinstance(value, float) and value != value:  # NaN is the only float unequal to itself
            return ""
        return value

    results = []

    for _, row in df.iterrows():
        # row.get falls back to "" when the column itself is absent.
        name = _blank_if_missing(row.get(name_col, ""))
        desc = _blank_if_missing(row.get(description_col, ""))
        results.append(classify_prompt(name, desc))

    result_df = pd.DataFrame(results)
    return pd.concat([df.reset_index(drop=True), result_df], axis=1)


In [None]:
# Run the code on a sample dataframe
product_df = load_dataset(SAMPLE_PRODUCT_FILE, dtype=str)
labeled_df = classify_df(product_df)
# Rich display of the first rows, like the other result cells in this
# notebook, instead of a plain-text print of the whole frame.
display(labeled_df.head())