In [None]:
import kagglehub
import pandas as pd
import ollama
import json
from collections import defaultdict
from wordcloud import WordCloud
import matplotlib.pyplot as plt

In [None]:
# Fetch the Yelp restaurant reviews dataset through kagglehub and report
# the location that the download call returned.
path = kagglehub.dataset_download("farukalam/yelp-restaurant-reviews")
print(f"Path to dataset files: {path}")

Path to dataset files: /kaggle/input/yelp-restaurant-reviews


In [None]:
# Load the reviews CSV from the downloaded dataset directory.
df1 = pd.read_csv(f"{path}/Yelp Restaurant Reviews.csv")


In [None]:
# Number of reviews (taken from the top of df1) that are sent to the LLM.
limiting_rows = 50
# Number of free-text points sent to the LLM per standardization request.
collation_batch_size = 10

In [None]:

def analyze_reviews_dishes(review):
    """Extract structured feedback from one restaurant review with the `qwen3` model.

    The review is sent through `ollama.chat` together with a single tool
    definition (`extract_review_analysis`); the arguments of the first tool
    call issued by the model are used as the structured result.

    Args:
        review: Text of the review; it is interpolated into the prompt.

    Returns:
        A `(result, response)` tuple.
        - `result` is the tool-call arguments dict when the model issued a tool
          call, `{}` when it answered without one, and a placeholder dict
          (`rating` None, all lists empty) when an exception occurred.
        - `response` is the raw object returned by `ollama.chat`, or `None`
          when the call itself raised.
    """
    # Bound up front so the final `return` never references an unassigned
    # name when `ollama.chat` raises before producing a response.
    response = None
    try:
        response = ollama.chat(
            model='qwen3',
            messages=[{
                'role': 'user',
                'content': f"""Extract sentiment and detailed feedback from this restaurant review.
        Return the output as JSON with these exact keys:

        - rating (integer from 0 to 5; 0 = highly negative, 5 = highly positive)
        - pain_points (list of non-dish issues like 'slow service', 'dirty table')
        - positive_points (list of non-dish praise like 'quick service', 'friendly staff')
        - pain_point_dishes (list of specific food/dishes with problems like 'burger', 'pizza')
        - positive_point_dishes (list of specific food/dishes praised like 'ice cream', 'salad')

        Example:
        If the review says "The burger was cold, but service was good", then:
        - pain_point_dishes: ["burger"]
        - positive_points: ["good service"]

        Here is the review:
        \"\"\"{review}\"\"\"
        """
            }],
            tools=[{
                'type': 'function',
                'function': {
                    'name': 'extract_review_analysis',
                    'description': 'Extract sentiment score, dish-specific feedback, and general service points from a restaurant review',
                    'parameters': {
                        'type': 'object',
                        'properties': {
                            'rating': {'type': 'integer'},
                            'pain_points': {
                                'type': 'array',
                                'items': {'type': 'string'}
                            },
                            'positive_points': {
                                'type': 'array',
                                'items': {'type': 'string'}
                            },
                            'pain_point_dishes': {
                                'type': 'array',
                                'items': {'type': 'string'}
                            },
                            'positive_point_dishes': {
                                'type': 'array',
                                'items': {'type': 'string'}
                            }
                        },
                        'required': ['rating']
                    }
                }
            }]
        )

        message = response['message'] if 'message' in response else None
        # `tool_calls` may be missing, None or an empty list; all three mean
        # that the model answered without calling the tool.
        tool_calls = None
        if message and 'tool_calls' in message:
            tool_calls = message['tool_calls']
        tool_call = tool_calls[0] if tool_calls else None

        if tool_call:
            result = tool_call['function']['arguments']
            # Decode arguments delivered as a JSON string, mirroring the
            # handling in standardize_points_batched.
            if isinstance(result, str):
                result = json.loads(result)
        else:
            result = {}

    except Exception as e:
        result = {"rating": None, "pain_points": [],"pain_point_dishes":[], "positive_points": [],"positive_point_dishes":[]}
        print(f"Error processing review: {e}")

    return result,response


In [None]:

# Run the LLM extraction over the first `limiting_rows` reviews, keeping the
# parsed result and the raw response of every review in matching order.
review_dishes_total = []
llm_response_dishes_total = []
reviews_to_process = df1["Review Text"].iloc[:limiting_rows]
for done_count, review_text in enumerate(reviews_to_process, start=1):
    parsed_review, raw_response = analyze_reviews_dishes(review_text)
    review_dishes_total.append(parsed_review)
    llm_response_dishes_total.append(raw_response)
    # Progress marker after every tenth review.
    if done_count % 10 == 0:
        print(f"Completed {done_count}")


Completed 10
Completed 20
Completed 30
Completed 40
Completed 50


In [None]:
# Preview only the first few parsed results instead of dumping the whole list.
review_dishes_total[:5]

[{'pain_point_dishes': [],
  'pain_points': [],
  'positive_point_dishes': ['ice cream', 'cookies and creme ice cream'],
  'positive_points': [],
  'rating': 5},
 {'pain_point_dishes': ['ice cream'],
  'pain_points': [],
  'positive_point_dishes': ['pumpkin shake', 'pina colada', 'Banana Split'],
  'positive_points': ['Nice little local place'],
  'rating': 4},
 {'pain_point_dishes': [],
  'pain_points': [],
  'positive_point_dishes': [],
  'positive_points': ['friendly and helpful staff', 'allowed early ordering'],
  'rating': 5},
 {'pain_point_dishes': ['chocolate ice cream'],
  'pain_points': [],
  'positive_point_dishes': ['banana cream pie', 'chocolate ice cream'],
  'positive_points': ['great service'],
  'rating': 5},
 {'pain_point_dishes': [],
  'pain_points': [],
  'positive_point_dishes': ['cappuccino ice cream', 'cookie dough'],
  'positive_points': ['kid-friendly atmosphere'],
  'rating': 5},
 {'pain_point_dishes': [],
  'pain_points': [],
  'positive_point_dishes': ['ice c

In [None]:
# Preview a couple of raw responses only: each one embeds the model's full
# <think> trace, so displaying all of them floods the notebook output.
llm_response_dishes_total[:2]

[ChatResponse(model='qwen3', created_at='2025-06-12T17:36:40.999633023Z', done=True, done_reason='stop', total_duration=24655011818, load_duration=2818397814, prompt_eval_count=424, prompt_eval_duration=672530794, eval_count=574, eval_duration=21162571001, message=Message(role='assistant', content='<think>\nOkay, let\'s tackle this query. The user wants me to extract sentiment and detailed feedback from a restaurant review and return it in a specific JSON format. Let me start by understanding the example they provided. \n\nThe example review mentions a burger being cold, which is a pain point for a dish, and good service, which is a positive point. The JSON output lists the rating, pain_points, positive_points, pain_point_dishes, and positive_point_dishes. \n\nNow, looking at the actual review given: "All I can say is they have very good ice cream I would for sure recommend their cookies and creme ice cream it is very good". The user mentions ice cream and cookies and creme ice cream p

# Saving the quantitative response

In [None]:
# with open("review_dishes_total.json","w") as jsonfile:
#     json.dump(review_dishes_total,jsonfile,indent=4)

# Build a NEW dict per review that also carries the original text.
# list.copy() is shallow, so adding the key in place would also modify the
# dicts held by review_dishes_total.
review_dishes_total2 = [
    {**key_value_dict, "original_review": df1["Review Text"].iloc[i]}
    for i, key_value_dict in enumerate(review_dishes_total)
]

# Take an explicit copy of the processed rows: assigning a column to a plain
# slice of df1 raises pandas' SettingWithCopyWarning.
df2 = df1.iloc[:len(review_dishes_total2)].copy()

df2["quantitative_llm_response"] = review_dishes_total2
# df2.to_csv("Yelp Restaurant Reviews_Qunatitative.csv",index=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2["quantitative_llm_response"]= review_dishes_total2


# Combine and standardize positive and negative review points

In [None]:
# Flatten the per-review point lists into one list per sentiment; reviews
# without the key contribute nothing.
total_pain_points = [
    point
    for quant_dict in review_dishes_total2
    for point in quant_dict.get("pain_points", [])
]
total_positive_points = [
    point
    for quant_dict in review_dishes_total2
    for point in quant_dict.get("positive_points", [])
]

print(f"total pain points{len(total_pain_points)}  positive points{len(total_positive_points)}")


total pain points25  positive points61


In [None]:

def get_prompt(existing_mapping, new_points,label_type):
    """Build the LLM prompt that asks for standardized labels for `new_points`.

    Args:
        existing_mapping: Mapping produced so far (point -> label); embedded in
            the prompt as indented JSON.
        new_points: Iterable of point strings; joined with ", " in the prompt.
        label_type: Word inserted before "points" in the prompt text.

    Returns:
        The prompt as a single string.
    """
    mapping_json = json.dumps(existing_mapping, indent=2)
    points_listing = ', '.join(new_points)
    return f"""
    You are a smart assistant helping normalize restaurant feedback.

    Here is the current mapping of {label_type} points to standardized labels:
    {mapping_json}

    Now standardize these new points:
    {points_listing}

    Return a JSON dictionary mapping each point to a standardized label.
    If any point is similar to existing ones, use the same label.
    Only introduce new labels if truly distinct.
    """


In [None]:

def _coerce_label_mapping(candidate):
    """Return the point -> label dict carried by a decoded tool-call or JSON payload."""
    if not isinstance(candidate, dict):
        return {}
    inner = candidate.get("mapping")
    if isinstance(inner, dict):
        candidate = inner
    # Keep only string labels: reverse_mapping later uses labels as dict keys.
    return {str(point): label for point, label in candidate.items() if isinstance(label, str)}


def _parse_mapping_from_text(content):
    """Decode the JSON object embedded in a free-text model answer."""
    if not content or not content.strip():
        raise ValueError("model returned neither a tool call nor any text content")
    # The captured qwen3 responses start with a <think>...</think> trace;
    # only the text after the closing tag can hold the answer.
    answer = content.rpartition("</think>")[2]
    # Tolerate markdown fences or prose around the object by cutting from the
    # first "{" to the last "}".
    start, end = answer.find("{"), answer.rfind("}")
    if start == -1 or end < start:
        raise ValueError("no JSON object found in model response")
    return json.loads(answer[start:end + 1])


def standardize_points_batched(all_points, label_type="positive", batch_size=10):
    """Map free-text feedback points to standardized labels, batch by batch.

    Each batch is sent to the `qwen3` model together with the mapping built so
    far, so labels can be reused across batches. A batch whose response cannot
    be obtained or parsed is reported with `print` and skipped.

    Args:
        all_points: Sequence of point strings; duplicates are sent only once.
        label_type: Word passed to `get_prompt` to describe the points.
        batch_size: Number of distinct points per request.

    Returns:
        dict mapping each point (as echoed by the model) to its label.
    """
    mapping_dict = {}
    # The result is a dict, so repeated points add nothing but extra LLM
    # calls; dict.fromkeys removes them while keeping first-seen order.
    unique_points = list(dict.fromkeys(all_points))
    for i in range(0, len(unique_points), batch_size):
        batch = unique_points[i:i+batch_size]

        prompt = get_prompt(mapping_dict, batch,label_type)

        try:
            response = ollama.chat(
                model='qwen3',
                messages=[{'role': 'user', 'content': prompt}],
                tools=[{
                    'type': 'function',
                    'function': {
                        'name': 'standardize_feedback_points',
                        'description': 'Map free-text points to standardized feedback labels',
                        'parameters': {
                            'type': 'object',
                            'properties': {
                                'mapping': {
                                    'type': 'object',
                                    'additionalProperties': {'type': 'string'}
                                }
                            },
                            'required': ['mapping']
                        }
                    }
                }]
            )

            # `message` / `tool_calls` may be absent or None; treat both as empty.
            message = response.get('message') or {}
            tool_calls = message.get('tool_calls') or []
            if tool_calls:
                args = tool_calls[0]['function']['arguments']
                if isinstance(args, str):
                    args = json.loads(args)
                mapping_batch = _coerce_label_mapping(args)
            else:
                # Fallback: the model answered in plain text instead of calling the tool.
                mapping_batch = _coerce_label_mapping(
                    _parse_mapping_from_text(message.get('content'))
                )

            mapping_dict.update(mapping_batch)

        except Exception as e:
            print(f"Error in batch {i}-{i+batch_size}: {e}")
            continue

    return mapping_dict


In [None]:

# Normalize the free-text positive points into shared labels, then show the mapping.
mapping_dict_positive = standardize_points_batched(
    total_positive_points,
    label_type="positive",
    batch_size=collation_batch_size,
)
mapping_dict_positive

Error in batch 40-50: Expecting value: line 1 column 1 (char 0)
Error in batch 60-70: Expecting value: line 1 column 1 (char 0)


{'Nice little local place': 'Local Establishment',
 'accommodating': 'Accommodating Policies',
 'allowed early ordering': 'Accommodating Policies',
 'friendly and helpful staff': 'Friendly and Helpful Staff',
 'great family place': 'Family-Friendly Atmosphere',
 'great service': 'Excellent Service',
 'kid-friendly atmosphere': 'Family-Friendly Atmosphere',
 'long line indicates success': 'High Customer Demand',
 'rotating special flavor': 'Special Menu Items',
 'supporting local business': 'Local Establishment',
 'ample parking': 'Convenient Parking',
 'better service': 'Excellent Service',
 'decent prices': 'Competitive Pricing',
 'fast service': 'Excellent Service',
 'friendly and efficient service': 'Excellent Service',
 'friendly staff': 'Friendly and Helpful Staff',
 'superb taste': 'Superior Food Quality',
 'worth the drive': 'Good Value',
 'customizable service': 'Customizable Service',
 'efficient service': 'Excellent Service',
 'excellent service': 'Excellent Service',
 'good 

In [None]:
# Normalize the free-text pain points into shared labels, then show the mapping.
mapping_dict_pain = standardize_points_batched(
    total_pain_points,
    label_type="pain",
    batch_size=collation_batch_size,
)
mapping_dict_pain

Error in batch 20-30: Expecting value: line 1 column 1 (char 0)


{'cash only': 'Payment Methods Limitations',
 'crowded parking': 'Parking Difficulty',
 'disappointing unavailable flavors': 'Limited Menu Options',
 'inaccurate orders': 'Order Accuracy Issues',
 'inattentive service': 'Inattentive Service',
 'lack of customer attention': 'Inattentive Service',
 'long lines': 'Long Wait Times',
 'nothing too special': 'Average Food Quality',
 'poor attention to detail': 'Attention to Detail',
 'smell of cow dung': 'Unpleasant Odors',
 'items forgotten': 'Order Accuracy Issues',
 'limited seating': 'Limited Seating',
 'long wait': 'Long Wait Times',
 'no lid provided': 'Packaging Issues',
 'no refunds': 'Refund Policy Issues',
 'poor customer service': 'Inattentive Service',
 'rude manager': 'Staff Behavior Issues',
 'slow boba preparation': 'Service Speed Issues',
 'slow service': 'Service Speed Issues',
 'workers not attentive': 'Inattentive Service'}

In [None]:

def map_points(points, mapping):
    """Replace each point with its standardized label, matching case-insensitively.

    Args:
        points: List of point strings; any non-list value yields `[]`.
        mapping: dict of point -> standardized label.

    Returns:
        List with one entry per point: the mapped label, or the stripped,
        lower-cased point itself when the mapping has no entry for it.
    """
    if not isinstance(points, list):
        return []

    # Mapping keys keep whatever casing the model echoed back (for example
    # 'Nice little local place'), while lookups use the lower-cased point.
    # Index the keys in the same normalized form so those entries still match.
    # setdefault keeps the first label if two keys collapse to the same form.
    normalized_mapping = {}
    for original, label in mapping.items():
        normalized_mapping.setdefault(original.strip().lower(), label)

    mapped = []
    for point in points:
        key = point.strip().lower()
        # An exact lower-case key in `mapping` keeps precedence, as before.
        mapped.append(mapping.get(key, normalized_mapping.get(key, key)))
    return mapped


In [None]:

# Build a NEW dict per review with the standardized labels added.
# list.copy() is shallow: writing into its dicts would also change the dicts
# in review_dishes_total2 (and any DataFrame column holding them), making the
# "raw" and "standardized" results indistinguishable.
review_dishes_total3 = []
for quant_dict in review_dishes_total2:
    standardized_dict = dict(quant_dict)
    if "pain_points" in standardized_dict:
        standardized_dict["standardized_pain_points"] = map_points(standardized_dict["pain_points"],mapping_dict_pain)
    if "positive_points" in standardized_dict:
        standardized_dict["standardized_positive_points"] = map_points(standardized_dict["positive_points"],mapping_dict_positive)
    review_dishes_total3.append(standardized_dict)


In [None]:

# Persist the standardized per-review results as JSON.
with open("review_dishes_total_standardized.json","w") as jsonfile:
    json.dump(review_dishes_total3,jsonfile,indent=4)

# assign() returns a new DataFrame rather than writing into df2 in place, so
# no SettingWithCopyWarning is raised even if df2 is a slice of another frame.
df2 = df2.assign(quantitative_llm_response_standardized=review_dishes_total3)
df2.to_csv("Yelp Restaurant Reviews_Qunatitative_Standardized.csv",index=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2["quantitative_llm_response_standardized"]= review_dishes_total3


# Reduced *Labels*

In [None]:

def reverse_mapping(mapping_dict):
    """Invert a point -> label mapping into label -> list of points.

    Labels appear in the order they are first seen; the points under each
    label keep the iteration order of `mapping_dict`.
    """
    grouped = {}
    for original, standard in mapping_dict.items():
        grouped.setdefault(standard, []).append(original)
    return grouped


In [None]:

# Invert both point -> label mappings into label -> [points] groupings,
# then print the negative grouping followed by the positive one.
standarized_mapping_dict_pain = reverse_mapping(mapping_dict_pain)
standarized_mapping_dict_positive = reverse_mapping(mapping_dict_positive)

print(f"Negative Standardized labels{standarized_mapping_dict_pain}")
print(f"Positive Standardized labels{standarized_mapping_dict_positive}")



Negative Standardized labels{'Payment Methods Limitations': ['cash only'], 'Parking Difficulty': ['crowded parking'], 'Limited Menu Options': ['disappointing unavailable flavors'], 'Order Accuracy Issues': ['inaccurate orders', 'items forgotten'], 'Inattentive Service': ['inattentive service', 'lack of customer attention', 'poor customer service', 'workers not attentive'], 'Long Wait Times': ['long lines', 'long wait'], 'Average Food Quality': ['nothing too special'], 'Attention to Detail': ['poor attention to detail'], 'Unpleasant Odors': ['smell of cow dung'], 'Limited Seating': ['limited seating'], 'Packaging Issues': ['no lid provided'], 'Refund Policy Issues': ['no refunds'], 'Staff Behavior Issues': ['rude manager'], 'Service Speed Issues': ['slow boba preparation', 'slow service']}
Positive Standardized labels{'Local Establishment': ['Nice little local place', 'supporting local business'], 'Accommodating Policies': ['accommodating', 'allowed early ordering'], 'Friendly and Helpf

In [None]:
# Display the label -> [pain points] grouping built by reverse_mapping.
standarized_mapping_dict_pain

{'Payment Methods Limitations': ['cash only'],
 'Parking Difficulty': ['crowded parking'],
 'Limited Menu Options': ['disappointing unavailable flavors'],
 'Order Accuracy Issues': ['inaccurate orders', 'items forgotten'],
 'Inattentive Service': ['inattentive service',
  'lack of customer attention',
  'poor customer service',
  'workers not attentive'],
 'Long Wait Times': ['long lines', 'long wait'],
 'Average Food Quality': ['nothing too special'],
 'Attention to Detail': ['poor attention to detail'],
 'Unpleasant Odors': ['smell of cow dung'],
 'Limited Seating': ['limited seating'],
 'Packaging Issues': ['no lid provided'],
 'Refund Policy Issues': ['no refunds'],
 'Staff Behavior Issues': ['rude manager'],
 'Service Speed Issues': ['slow boba preparation', 'slow service']}

In [None]:
# Display the label -> [positive points] grouping built by reverse_mapping.
standarized_mapping_dict_positive

{'Local Establishment': ['Nice little local place',
  'supporting local business'],
 'Accommodating Policies': ['accommodating', 'allowed early ordering'],
 'Friendly and Helpful Staff': ['friendly and helpful staff',
  'friendly staff',
  'kind and friendly staff',
  'friendly service'],
 'Family-Friendly Atmosphere': ['great family place',
  'kid-friendly atmosphere'],
 'Excellent Service': ['great service',
  'better service',
  'fast service',
  'friendly and efficient service',
  'efficient service',
  'excellent service',
  'lines usually significantly shorter',
  'good timing between order and handout',
  'quick service',
  'good service',
  'timely service'],
 'High Customer Demand': ['long line indicates success'],
 'Special Menu Items': ['rotating special flavor',
  'good selection of flavors',
  'variety of drinks and desserts',
  'delicious drinks',
  'excellent boba',
  'great Thai tea flavor'],
 'Convenient Parking': ['ample parking', 'plenty of parking and seating'],
 'C

In [None]:

# Write each label -> [points] grouping to its own JSON file.
for out_name, grouped_labels in (
    ("standarized_mapping_dict_pain.json", standarized_mapping_dict_pain),
    ("standarized_mapping_dict_positive.json", standarized_mapping_dict_positive),
):
    with open(out_name, "w") as jsonfile:
        json.dump(grouped_labels, jsonfile, indent=4)
