In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
input_walk = os.walk('/kaggle/input')
for root, _subdirs, file_names in input_walk:
    full_paths = (os.path.join(root, name) for name in file_names)
    for path in full_paths:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

1. Notebook: 01_data_prep

Ingest raw comments, clean and de-duplicate them, and save the result as the Bronze table.

In [None]:
# 01_data_prep

# Install dependencies once per cluster
# %pip install -q sentence-transformers requests mlflow

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when
import pandas as pd

# Reuse the active Spark session if there is one, otherwise create it.
spark = SparkSession.builder.getOrCreate()

SOURCE_TABLE = "gold_dev.unified_posts_comments_testing"
BRONZE_TABLE = "ml_bronze.comments_clean"

# Columns carried over from the source table, in output order.
SOURCE_COLUMNS = [
    "platform", "post_id", "post_user_id", "post_content", "media_type", "media_url",
    "post_created_time", "post_load_timestamp", "comment_load_timestamp", "comment_id",
    "comment_user_id", "comment_text", "comment_created_time", "load_timestamp",
    "post_sentiment_score_positive", "post_sentiment_score_negative",
    "post_sentiment_score_neutral", "post_predicted_sentiment",
    "comment_sentiment_score_positive", "comment_sentiment_score_negative",
    "comment_sentiment_score_neutral", "comment_predicted_sentiment",
    "createdOn", "createdBy", "updatedOn", "updatedBy",
    "businessEntityId", "businessEntityName",
    "businessFunctionId", "businessFunctionName", "keywords_list", "category",
    "date_key", "ReviewerName", "siteID",
]

# 1. Read source table
bronze = spark.read.table(SOURCE_TABLE).selectExpr(*SOURCE_COLUMNS)

# 2. Clean & de-dup
# Null comment text becomes an empty string; rows sharing both comment_text
# and comment_id are collapsed to one.
comment_text = col("comment_text")
text_or_empty = when(comment_text.isNotNull(), comment_text).otherwise("")
bronze_filled = bronze.withColumn("comment_text", text_or_empty)
bronze_clean = bronze_filled.dropDuplicates(["comment_text", "comment_id"])

# 3. Write Bronze (replaces any existing table contents), then preview 5 rows
bronze_writer = bronze_clean.write.mode("overwrite")
bronze_writer.saveAsTable(BRONZE_TABLE)
display(bronze_clean.limit(5))

2. Notebook: 02_feature_engineering

SBERT + GPT category mapping and feature engineering for sales-weather tasks (LIVC1703). Saves the result as the Silver table.

In [None]:
# 02_feature_engineering

import mlflow
from pyspark.sql import SparkSession
from pyspark.sql.functions import pandas_udf, PandasUDFType
import pandas as pd
import requests, time
from sentence_transformers import SentenceTransformer, util

# Reuse the active Spark session if there is one, otherwise create it.
spark = SparkSession.builder.getOrCreate()

# Load cleaned comments into a pandas DataFrame
bronze_table = spark.read.table("ml_bronze.comments_clean")
df = bronze_table.toPandas()

# 1) SBERT Mapping
# Every category is described by a short keyword string. Each comment gets the
# category whose description scores the highest cosine similarity against it.
cat_defs = {
    "Food Quality": "taste,freshness,quality of food",
    "Ambiance": "atmosphere,lighting,seating,music",
    "Service": "staff behavior,speed,hospitality",
    "Value for Money": "pricing,portion size",
    "Communication": "ordering and communication",
}
cats = [name for name in cat_defs]
texts = [cat_defs[name] for name in cats]

model = SentenceTransformer('all-MiniLM-L6-v2')
cat_emb = model.encode(texts, convert_to_tensor=True)

comment_texts = df['comment_text'].tolist()
embs = model.encode(comment_texts, convert_to_tensor=True, batch_size=64)

# argmax over dim=1 picks, per comment, the index of the best-scoring category.
scores = util.cos_sim(embs, cat_emb)
best = scores.argmax(dim=1).cpu().numpy()
df['sbert_cat'] = [cats[idx] for idx in best]

# 2) GPT Mapping (Retail Category)
# Local imports: this cell's import block does not bring in `os` or `warnings`.
import os
import warnings

# Endpoint and key are read from the environment so that no credential is
# stored in the notebook source. The endpoint fallback is a placeholder only.
AZURE_ENDPOINT = os.environ.get("AZURE_OPENAI_ENDPOINT", "https://...openai.azure.com")
DEPLOYMENT = "gpt-35-turbo"
API_VER = "2024-02-01"
API_KEY = os.environ.get("AZURE_OPENAI_API_KEY", "")

url = f"{AZURE_ENDPOINT}/openai/deployments/{DEPLOYMENT}/chat/completions?api-version={API_VER}"
hdr = {"Content-Type": "application/json", "api-key": API_KEY}

# TODO: fill in the retail categories the model may choose from.
# (The original placeholder `[...your list...]` was not valid Python.)
retail_cats = []
# Rendered as a "- item" bullet list, one category per line, for the prompt.
fmt_cats = "\n".join(f"- {c}" for c in retail_cats)

# Be loud about unfilled template values without aborting the cell.
if not API_KEY:
    warnings.warn("AZURE_OPENAI_API_KEY is not set; GPT classification requests will fail.")
if not retail_cats:
    warnings.warn("retail_cats is empty; fill in the category list before classifying.")

def classify_retail(comment):
    """Ask the Azure OpenAI deployment to pick one retail category for a comment.

    Args:
        comment: Comment text. Missing values (``None``/NaN or any non-string)
            and blank strings are not sent to the API.

    Returns:
        The model's reply with surrounding whitespace stripped, ``"NoComment"``
        for missing or blank input, or ``"Error"`` if all three attempts fail.
    """
    # pandas may hand over None/NaN for missing text; treat those like blanks.
    if not isinstance(comment, str) or not comment.strip():
        return "NoComment"

    p = f"Categories:\n{fmt_cats}\nComment: “{comment}”\nReply only with one category."
    payload = {"messages": [{"role": "user", "content": p}], "max_tokens": 20, "temperature": 0.0}

    attempts = 3
    for attempt in range(attempts):
        try:
            # Without a timeout a single stalled connection would block forever.
            r = requests.post(url, headers=hdr, json=payload, timeout=30)
            if r.ok:
                return r.json()["choices"][0]["message"]["content"].strip()
        except (requests.RequestException, KeyError, IndexError,
                TypeError, ValueError, AttributeError):
            # Network failure or unexpected response shape: count it as a
            # failed attempt and fall through to the retry logic below.
            pass
        # Back off before retrying, but do not sleep after the final attempt.
        if attempt < attempts - 1:
            time.sleep(2)
    return "Error"

# Run the GPT classifier over the comments, row by row, and keep its reply.
retail_labels = df['comment_text'].apply(classify_retail)
df['retail_cat'] = retail_labels

# 3) Persist Silver
# Convert back to a Spark DataFrame, replace the Silver table, preview 5 rows.
silver = spark.createDataFrame(df)
silver_writer = silver.write.mode("overwrite")
silver_writer.saveAsTable("ml_silver.comments_features")
display(silver.limit(5))

3. Notebook: 03_train_register

Train simple classifier (e.g., logistic regression on SBERT+features), log with MLflow, register model

In [None]:
# 03_train_register

import mlflow, mlflow.sklearn
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.linalg import Vectors

# Imports used by the code below, gathered in one place.
# - pandas is used for `pd.DataFrame` but was not imported in this notebook.
# - sklearn's LogisticRegression is imported after the pyspark.ml class of the
#   same name, so from here on the name refers to the sklearn estimator.
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

spark = SparkSession.builder.getOrCreate()

# Read Silver data
df = spark.read.table("ml_silver.comments_features").toPandas()

# Example: train a model to predict SBERT category from SBERT embeddings
# (In practice, you'd save embeddings in DF. Here, re-encode small sample.)
mod = SentenceTransformer('all-MiniLM-L6-v2')
embs = mod.encode(df['comment_text'].tolist())
X = pd.DataFrame(embs)
y = df['sbert_cat']

# Hold out 20% of the rows for evaluation; fixed seed for a repeatable split.
Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.2, random_state=42)

# MLflow experiment
mlflow.set_experiment("SentimentCategorization")
with mlflow.start_run():
    clf = LogisticRegression(max_iter=200)
    clf.fit(Xtr, ytr)
    # Mean accuracy on the held-out split.
    acc = clf.score(Xte, yte)
    mlflow.log_metric("accuracy", acc)
    # Logs the fitted model as a run artifact and registers it under the given name.
    mlflow.sklearn.log_model(clf, "sbert_clf", registered_model_name="SentimentCatModel")
    print("Registered model with accuracy:", acc)

4. Notebook: 04_inference

Load registered model, score new data, write predictions to Gold table

In [None]:
# 04_inference

import mlflow
import pandas as pd
from pyspark.sql import SparkSession

# Reuse the active Spark session if there is one, otherwise create it.
spark = SparkSession.builder.getOrCreate()

# Load model
# NOTE(review): this URI asks the registry for the version in the "Production"
# stage — confirm a SentimentCatModel version has been promoted to that stage
# before this notebook runs.
model_uri = "models:/SentimentCatModel/Production"
clf = mlflow.sklearn.load_model(model_uri)

# Load new comments from Silver
from sentence_transformers import SentenceTransformer
silver_table = spark.read.table("ml_silver.comments_features")
df = silver_table.toPandas()

# Embed the comment text with the sentence-transformer model.
embedder = SentenceTransformer('all-MiniLM-L6-v2')
comment_texts = df['comment_text'].tolist()
X = embedder.encode(comment_texts)

# Predict
predicted = clf.predict(X)
df['predicted_cat'] = predicted

# Write to Gold (replaces any existing table contents), then preview 5 rows
gold_df = spark.createDataFrame(df)
gold_writer = gold_df.write.mode("overwrite")
gold_writer.saveAsTable("gold_dev.sentiment_predictions")
display(gold_df.limit(5))

5. MLOps Pipeline (Azure DevOps YAML)

This CI pipeline builds, tests, and deploys the notebooks as Databricks jobs and registers the trained model.

In [None]:
# azure-pipelines.yml
# Three sequential stages: Build (import + test run) -> Train -> Deploy.
# Every job runs on a fresh agent, so each one installs the Databricks CLI
# and passes the workspace credentials to the CLI through the step environment.
trigger:
- main

variables:
  DATABRICKS_HOST: $(databricksHost)
  DATABRICKS_TOKEN: $(databricksToken)

stages:
- stage: Build
  jobs:
  - job: Test_Notebooks
    # A bare string after `pool:` names an agent pool; a Microsoft-hosted
    # image has to be requested with `vmImage`.
    pool:
      vmImage: ubuntu-latest
    steps:
    - task: UsePythonVersion@0
      inputs:
        versionSpec: '3.x'
    - script: python -m pip install --upgrade databricks-cli
      displayName: "Install Databricks CLI"
    - script: |
        # run notebooks in headless mode to catch errors
        # --overwrite so notebooks changed since the last run are re-imported
        databricks workspace import_dir --overwrite . /Repos/$(Build.Repository.Name)
        databricks runs submit --json-file ci/config_test_run.json
      displayName: "Test Notebooks on Databricks"
      env:
        # Secret variables must be mapped explicitly to reach the script.
        DATABRICKS_HOST: $(databricksHost)
        DATABRICKS_TOKEN: $(databricksToken)

- stage: Train
  dependsOn: Build
  jobs:
  - job: Train_Model
    pool:
      vmImage: ubuntu-latest
    steps:
    - task: UsePythonVersion@0
      inputs:
        versionSpec: '3.x'
    - script: python -m pip install --upgrade databricks-cli
      displayName: "Install Databricks CLI"
    - script: |
        databricks runs submit --json-file ci/config_train_run.json
      displayName: "Train & Register Model"
      env:
        DATABRICKS_HOST: $(databricksHost)
        DATABRICKS_TOKEN: $(databricksToken)

- stage: Deploy
  dependsOn: Train
  jobs:
  - job: Deploy_Inference
    pool:
      vmImage: ubuntu-latest
    steps:
    - task: UsePythonVersion@0
      inputs:
        versionSpec: '3.x'
    - script: python -m pip install --upgrade databricks-cli
      displayName: "Install Databricks CLI"
    # NOTE(review): confirm the installed CLI accepts --overwrite for
    # `jobs create`; if not, look up the job id and use `jobs reset` instead.
    - script: |
        databricks jobs create --json-file ci/config_inference_job.json --overwrite
      displayName: "Schedule Inference Job"
      env:
        DATABRICKS_HOST: $(databricksHost)
        DATABRICKS_TOKEN: $(databricksToken)

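Example ci/config_test_run.json (note: `$(Build.Repository.Name)` is an Azure Pipelines macro and is not expanded inside a checked-in JSON file; in this config and the two below, replace it with the actual repository path or substitute it in a pipeline step before calling the CLI)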

In [None]:
{
  "run_name": "CI Test Run",
  "existing_cluster_id": "<CLUSTER_ID>",
  "notebook_task": {
    "notebook_path": "/Repos/$(Build.Repository.Name)/01_data_prep"
  }
}

Example ci/config_train_run.json

In [None]:
{
  "run_name": "Train Sentiment Model",
  "existing_cluster_id": "<CLUSTER_ID>",
  "notebook_task": {
    "notebook_path": "/Repos/$(Build.Repository.Name)/03_train_register"
  }
}

Example ci/config_inference_job.json

In [None]:
{
  "name": "Daily Sentiment Inference",
  "existing_cluster_id": "<CLUSTER_ID>",
  "schedule": {
    "quartz_cron_expression": "0 0 0 * * ? *",
    "timezone_id": "UTC"
  },
  "notebook_task": {
    "notebook_path": "/Repos/$(Build.Repository.Name)/04_inference"
  }
}

What You’ll Have:
	1.	Bronze/Silver/Gold tables in your Databricks metastore.
	2.	SBERT + GPT mapping pipelines.
	3.	MLflow-managed model with registration and versioning.
	4.	Databricks Jobs scheduled daily for inference.
	5.	CI/CD via Azure DevOps (or adapt to GitHub Actions) to enforce code quality and automate retraining/deployment.

You can plug in your own cluster IDs, tokens, and keys. This framework delivers a production-grade MLOps workflow on Azure Databricks.