# AI Assisted Customer Review Triage System

Author: Christopher Bacani, 2026 

### Dataset Import

In [30]:
# Uncomment and run the command below to install the packages this workflow
# requires.

# pip install -r requirements.txt

In [31]:
from pathlib import Path
import pandas as pd

# Locate the dataset relative to the directory the notebook is run from.
PROJECT_ROOT = Path(".")
DATA_PATH = PROJECT_ROOT.joinpath("data", "DisneylandReviews.csv")

# Echo the resolved path so the reader can confirm where the CSV is expected.
DATA_PATH

PosixPath('data/DisneylandReviews.csv')

In [32]:
# Read the raw reviews CSV into a DataFrame, decoding the file as latin-1.
# NOTE(review): presumably latin-1 is needed because the file is not valid UTF-8 — confirm.
reviews_df = pd.read_csv(DATA_PATH, encoding="latin-1")
# Print (rows, columns), then preview the first five rows.
print(reviews_df.shape)
reviews_df.head()

(42656, 6)


Unnamed: 0,Review_ID,Rating,Year_Month,Reviewer_Location,Review_Text,Branch
0,670772142,4,2019-4,Australia,If you've ever been to Disneyland anywhere you...,Disneyland_HongKong
1,670682799,4,2019-5,Philippines,Its been a while since d last time we visit HK...,Disneyland_HongKong
2,670623270,4,2019-4,United Arab Emirates,Thanks God it wasn t too hot or too humid wh...,Disneyland_HongKong
3,670607911,4,2019-4,Australia,HK Disneyland is a great compact park. Unfortu...,Disneyland_HongKong
4,670607296,4,2019-4,United Kingdom,"the location is not in the city, took around 1...",Disneyland_HongKong


#### Quick data clean

In [33]:
# Clean the review text in one pass:
#   1. drop rows with no review text at all,
#   2. coerce the text to str and trim surrounding whitespace,
#   3. drop rows whose text is empty after trimming (whitespace-only reviews
#      would otherwise survive as "" and be sent on for analysis),
#   4. renumber the rows 0..n-1.
reviews_df = (
    reviews_df
    .dropna(subset=["Review_Text"])
    .assign(Review_Text=lambda df: df["Review_Text"].astype(str).str.strip())
    .loc[lambda df: df["Review_Text"].ne("")]
    .reset_index(drop=True)
)

# Print (rows, columns) after cleaning, then preview the first five rows.
print(reviews_df.shape)
reviews_df.head()

(42656, 6)


Unnamed: 0,Review_ID,Rating,Year_Month,Reviewer_Location,Review_Text,Branch
0,670772142,4,2019-4,Australia,If you've ever been to Disneyland anywhere you...,Disneyland_HongKong
1,670682799,4,2019-5,Philippines,Its been a while since d last time we visit HK...,Disneyland_HongKong
2,670623270,4,2019-4,United Arab Emirates,Thanks God it wasn t too hot or too humid wh...,Disneyland_HongKong
3,670607911,4,2019-4,Australia,HK Disneyland is a great compact park. Unfortu...,Disneyland_HongKong
4,670607296,4,2019-4,United Kingdom,"the location is not in the city, took around 1...",Disneyland_HongKong


#### Creating Small Working Subset

In [34]:
# Draw a fixed, reproducible 20-review subset (seeded with 42) to keep API
# usage small while iterating, and renumber its rows 0..19.
sample_df = (
    reviews_df
    .sample(n=20, random_state=42)
    .reset_index(drop=True)
)

print(sample_df.shape)
sample_df.head()

(20, 6)


Unnamed: 0,Review_ID,Rating,Year_Month,Reviewer_Location,Review_Text,Branch
0,540713188,5,2017-9,Malta,Disneyland is so beautiful and large.To see al...,Disneyland_Paris
1,119781124,1,2011-10,Canada,"The lines for rides are too long. Yes, the fas...",Disneyland_California
2,576395715,5,2018-4,Australia,Loved Hong Kong Disneyland although it is much...,Disneyland_HongKong
3,310041955,5,2015-9,United States,Love Disneyland! We are annual pass holders an...,Disneyland_California
4,184009554,4,2013-11,United States,The California Adventure Park is much improved...,Disneyland_California


#### Setup of OpenAI Client

In [35]:
from openai import OpenAI
import os
from dotenv import load_dotenv

In [39]:
# Load variables from a local .env file (if one exists) into the environment.
load_dotenv()

api_key = os.getenv("OPENAI_API_KEY")

# Fail fast with an actionable message: without a key no later cell can call
# the API, so there is no point in continuing past this cell.
if not api_key:
    raise RuntimeError(
        "API key NOT found: set OPENAI_API_KEY in your environment or .env file."
    )
print("API key successfully loaded")

# Pass the key explicitly so the client uses exactly the value checked above.
client = OpenAI(api_key=api_key)
print("Client initialized successfully.")

API key successfully loaded
Client initialized successfully.


#### Testing API Call

In [43]:
# Smoke test: send one fixed summarization request to confirm the client and
# credentials work before building the triage function.
response = client.chat.completions.create(
    model = "gpt-4o-mini",
    messages=[
        {"role": "system", "content": "You are a concise and friendly assistant."},
        {"role": "user", "content": "Summarize this sentence: Disneyland was way too crowded and everything was so expensive but the food was very tasty and the experience was magical."}
    ],
    temperature = 0
)

# Show only the text of the first returned choice.
print(response.choices[0].message.content)

Disneyland was crowded and pricey, but the food was delicious and the experience magical.


#### Delineating risk flags

In [51]:
# Words and phrases that flag a review as risky on their own, independent of
# any model output. Matching is case-insensitive and substring-based.
RISK_KEYWORDS = [
    "injury",
    "hurt",
    "discrimination",
    "unsafe",
    "refund",
    "charged twice",
    "lawsuit",
    "legal",
    "harassment",
    "fraud",
    "scam"
]

def rule_based_risk(review_text):
    """Flag a review when it contains any phrase from RISK_KEYWORDS.

    Parameters
    ----------
    review_text : str
        Raw review text. Non-string values (e.g. None or a NaN from a
        DataFrame) are treated as "no text" instead of raising.

    Returns
    -------
    tuple
        ``(True, "Keyword trigger: <keyword>")`` for the first keyword, in
        RISK_KEYWORDS order, found in the text; ``(False, None)`` otherwise.
    """
    if not isinstance(review_text, str):
        return False, None

    # Lower-case and collapse every run of whitespace to a single space so a
    # multi-word keyword such as "charged twice" still matches when the review
    # contains repeated spaces or line breaks between the words.
    text = " ".join(review_text.lower().split())

    for keyword in RISK_KEYWORDS:
        # Substring test: "legal" also matches inside "illegal" or "legally".
        if keyword in text:
            return True, f"Keyword trigger: {keyword}"
    return False, None

#### Building `analyze_review()` function

In [65]:
import json
import re

def analyze_review(review_text):
    """Triage one review by combining the keyword rules with an LLM analysis.

    The review is first checked by ``rule_based_risk``. It is then sent to the
    chat model, which is asked for a JSON object holding a summary, a
    sentiment label and score, and its own risk flag and reason.

    Parameters
    ----------
    review_text : str
        The review to analyze.

    Returns
    -------
    dict
        Keys ``summary``, ``sentiment_label``, ``sentiment_score``,
        ``risk_flag`` and ``risk_reason``. ``risk_flag`` is always a bool and
        is True when either the keyword rules or the model flag the review;
        ``risk_reason`` prefers the keyword reason when the rules fired.
        If the model reply cannot be parsed as a JSON object, the raw reply is
        printed and the model-derived fields are None, so the result then
        reflects the keyword rules only.
    """
    rule_flag, rule_reason = rule_based_risk(review_text)
    prompt = f"""
    You are an AI customer support triage assistant.

    Analayze the following review and return ONLY valid JSON with the following fields:
    - summary (concise)
    - sentiment_label (positive, neutral, negative)
    - sentiment_score (number between -1 and 1)
    - model_risk_flag (true ONLY if the review includes safety concerns, legal threats, discrimination claims, fraud accusations, refund disputes, or harassment allegations. Otherwise false.)
    - model_risk_reason (short explanation if true, otherwise null)

    Review:
    \"\"\"{review_text}\"\"\"
    """

    response = client.chat.completions.create(
        model = "gpt-4o-mini",
        messages = [
        {"role": "system", "content": "You output strictly valid JSON only."},
        {"role": "user", "content": prompt}
        ],
        temperature = 0.2,
        # Ask the API for JSON mode so the reply is a single JSON object.
        response_format = {"type": "json_object"}
    )

    # The message content can be None; treat that as an empty reply.
    content = response.choices[0].message.content or ""

    # Remove markdown fences if present
    content = re.sub(r"```json|```", "", content).strip()

    try:
        parsed = json.loads(content)
    except json.JSONDecodeError:
        parsed = None

    # One malformed reply must not abort a whole batch: report it and fall
    # back to an empty model result so the keyword rules still apply.
    if not isinstance(parsed, dict):
        print("JSON parsing unsuccessful, raw output: ")
        print(content)
        parsed = {}

    model_flag = _coerce_risk_flag(parsed.get("model_risk_flag"))

    return {
        "summary": parsed.get("summary"),
        "sentiment_label": parsed.get("sentiment_label"),
        "sentiment_score": parsed.get("sentiment_score"),
        "risk_flag": rule_flag or model_flag,
        "risk_reason": rule_reason if rule_flag else parsed.get("model_risk_reason")
    }


def _coerce_risk_flag(value):
    """Interpret the model's risk flag as a real bool.

    A flag returned as text would otherwise be truthy even when it reads
    "false", so strings are compared by content; anything else uses bool().
    """
    if isinstance(value, str):
        return value.strip().lower() in {"true", "yes", "1"}
    return bool(value)


In [66]:
# Try the triage function on a single review: the first row of the cleaned data.
test_review = reviews_df.loc[0, "Review_Text"]

result = analyze_review(test_review)

# Display the returned triage dict.
result

{'summary': 'Disneyland Hong Kong has a familiar layout and enjoyable rides, despite being busy and hot during the visit.',
 'sentiment_label': 'positive',
 'sentiment_score': 0.8,
 'risk_flag': False,
 'risk_reason': None}

#### Batch test

In [67]:
from tqdm import tqdm

# Collect one triage dict per sampled review, in sample order.
results = []

# Each iteration calls analyze_review once, sequentially; tqdm shows progress.
# NOTE: `result` rebinds the name assigned by the single-review test cell.
for review in tqdm(sample_df["Review_Text"]):
    result = analyze_review(review)
    results.append(result)

# Preview the first three results.
results[:3]

100%|███████████████████████████████████████████| 20/20 [00:33<00:00,  1.68s/it]


[{'summary': 'Disneyland is beautiful and well-organized, requiring at least 3 days to fully enjoy.',
  'sentiment_label': 'positive',
  'sentiment_score': 0.9,
  'risk_flag': False,
  'risk_reason': None},
 {'summary': 'Long wait times for rides detract from the experience, leading to a decision to visit other parks.',
  'sentiment_label': 'negative',
  'sentiment_score': -0.7,
  'risk_flag': False,
  'risk_reason': None},
 {'summary': 'Positive experience at a smaller, well-organized Hong Kong Disneyland.',
  'sentiment_label': 'positive',
  'sentiment_score': 0.9,
  'risk_flag': False,
  'risk_reason': None}]

In [68]:
# Turn the list of triage dicts into a table: one row per review, one column
# per dict key.
triage_df = pd.DataFrame(results)

triage_df.head()

Unnamed: 0,summary,sentiment_label,sentiment_score,risk_flag,risk_reason
0,"Disneyland is beautiful and well-organized, re...",positive,0.9,False,
1,Long wait times for rides detract from the exp...,negative,-0.7,False,
2,"Positive experience at a smaller, well-organiz...",positive,0.9,False,
3,The reviewer expresses a strong affection for ...,positive,0.9,False,
4,California Adventure Park has improved with Ca...,neutral,0.2,False,


In [69]:
# Count how many sampled reviews were flagged as risky versus not.
triage_df["risk_flag"].value_counts()


risk_flag
False    20
Name: count, dtype: int64

In [64]:
# Show only the reviews whose risk_flag is True.
# NOTE(review): the saved output of this cell comes from an earlier run (its
# execution count precedes the batch cell's) and lists a flagged row, while
# the value_counts output above reports none — restart and run all to refresh.
triage_df[triage_df["risk_flag"].eq(True)]

Unnamed: 0,summary,sentiment_label,sentiment_score,risk_flag,risk_reason
16,The reviewer expresses strong dissatisfaction ...,negative,-0.9,True,Use of derogatory language and strong negative...


#### FAQ Retrieval (RAG)

In [70]:
# Small hand-written FAQ knowledge base. Each (question, answer) pair below
# becomes one {"question": ..., "answer": ...} record.
faq_data = [
    {"question": question, "answer": answer}
    for question, answer in (
        (
            "What is the refund policy for Disneyland tickets?",
            "Tickets are generally non-refundable unless purchased with flexible cancellation options.",
        ),
        (
            "What are the height requirements for rides?",
            "Each ride has specific height requirements for safety. Guests should check signage before queuing.",
        ),
        (
            "What should I do if I am double charged?",
            "Guests who believe they were double charged should contact customer service with their transaction details.",
        ),
        (
            "What should I do if someone is injured in the park?",
            "Guests should immediately notify park staff or visit first aid stations located throughout the park.",
        ),
        (
            "How long are wait times for rides?",
            "Wait times vary depending on season and demand. Guests can check the official app for live updates.",
        ),
    )
]

In [71]:
# Tabular view of the FAQ entries: one row per entry, with `question` and
# `answer` columns.
faq_df = pd.DataFrame(faq_data)
faq_df

Unnamed: 0,question,answer
0,What is the refund policy for Disneyland tickets?,Tickets are generally non-refundable unless pu...
1,What are the height requirements for rides?,Each ride has specific height requirements for...
2,What should I do if I am double charged?,Guests who believe they were double charged sh...
3,What should I do if someone is injured in the ...,Guests should immediately notify park staff or...
4,How long are wait times for rides?,Wait times vary depending on season and demand...


In [74]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Load the "all-MiniLM-L6-v2" sentence-embedding model; the saved output below
# shows its files being downloaded when this cell ran.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [75]:
# Embed every FAQ question once up front; rows follow faq_df's row order.
faq_embeddings = embedding_model.encode(faq_df["question"].tolist())

In [76]:
def match_faq(review_text, threshold = 0.65):
    """Find the FAQ entry whose question is most similar to a review.

    The review is embedded with ``embedding_model`` and compared against the
    precomputed ``faq_embeddings`` using cosine similarity.

    Parameters
    ----------
    review_text : str
        The review to match. Non-string or blank input yields no match.
    threshold : float, default 0.65
        Minimum cosine similarity the best FAQ must reach to count as a match.

    Returns
    -------
    dict or None
        ``{"question", "answer", "similarity"}`` for the best-scoring FAQ when
        its similarity is at least ``threshold``; otherwise None.
    """
    if not isinstance(review_text, str) or not review_text.strip():
        return None

    # encode() is given a one-item list so the result is a 2-D array, which is
    # the input shape cosine_similarity expects.
    review_embedding = embedding_model.encode([review_text])
    similarities = cosine_similarity(review_embedding, faq_embeddings)[0]

    best_idx = int(similarities.argmax())
    best_score = float(similarities[best_idx])
    if best_score < threshold:
        return None

    # faq_embeddings was built from faq_df["question"], so positions line up.
    best_row = faq_df.iloc[best_idx]
    return {
        "question": best_row["question"],
        "answer": best_row["answer"],
        "similarity": best_score,
    }