# PoC Aspect NER and Sentiment Analysis on Restaurant Reviews data
---

# Set up

In [1]:
import os

from openai import OpenAI
from dotenv import load_dotenv

load_dotenv()

openai_client = OpenAI(
    # This is the default and can be omitted
    api_key=os.environ.get("OPENAI_API_KEY"),
)

# Test OpenAI API

In [24]:
system_prompt = """
You are a helpful expert Machine Learning annotator.
"""

prompt = """
Given these text review, your task is to generate structured training data for an NER task.

Requirements:
- Entities are defined as the phrase or clause that contains a full statement mentioning a reviewed aspect.
- List of useful aspects: FOOD, BEVERAGE, AMBIENCE, SERVICE, LOCATION, MUSIC, KITCHEN, PRICE, VIEW, GOOD_FOR
- The extracted phrases/clauses should be shortest possible while still containing enough information about ther sentiment regarding the aspect
- The extracted phrases/clauses should be a subset of the input text
- The extracted phrases/clauses can be overlapping with each other

Output should be a list of extracted phrases, clauses with annotated entities.

Example 1:
Input:
[
  {{"text": "But the staff was so horrible to us."}}
]

Output:
[
  {{
    "text": "But the staff was so horrible to us.",
    "entities": [
      ["staff was so horrible", "STAFF"],
      ["staff was so horrible", "SERVICE"],
    ]
  }}
]

Example 2:
Input:
[
  {{"text": "But the staff was so horrible to us."}}
]

Output:
[
  {{
    "text": "The food is uniformly exceptional, with a very capable kitchen which will proudly whip up whatever you feel like eating, whether it's on the menu or not.",
    "entities": [
      ["food is uniformly exceptional", "FOOD"],
      ["very capable kitchen", "KITCHEN"],
    ]
  }}
]

Example 3:
Input:
[
  {{"text": "I particularly love their yellowfun tuna and their mussel selection."}}
]

Output:
[
  {{
    "text": "I particularly love their yellowfun tuna and their mussel selection.",
    "entities": [
      ["love their yellowfun tuna", "FOOD"],
      ["love their yellowfun tuna and their mussel selection", "FOOD"],
    ]
  }}
]



Inputs:
{input_texts}
"""

In [36]:
input_texts = [
  "But the staff was so horrible to us.",
  "The food is uniformly exceptional, with a very capable kitchen which will proudly whip up whatever you feel like eating, whether it's on the menu or not.",
  "this little place has a cute interior decor and affordable city prices."
]
input_texts_formatted = [{"text": text} for text in input_texts]

response = openai_client.chat.completions.create(
    model="gpt-4o",
    messages=[
    {
      "role": "system",
      "content": [
        {
          "type": "text",
          "text": system_prompt
        }
      ]
    },
    {
      "role": "user",
      "content": [
        {
          "type": "text",
          "text": prompt.format(input_texts=str(input_texts_formatted))
        }
      ]
    }
    ],
    temperature=0,
    max_tokens=1024,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0
)

In [37]:
print(response.choices[0].message.content)

```json
[
  {
    "text": "But the staff was so horrible to us.",
    "entities": [
      ["staff was so horrible", "SERVICE"]
    ]
  },
  {
    "text": "The food is uniformly exceptional, with a very capable kitchen which will proudly whip up whatever you feel like eating, whether it's on the menu or not.",
    "entities": [
      ["food is uniformly exceptional", "FOOD"],
      ["very capable kitchen", "KITCHEN"]
    ]
  },
  {
    "text": "this little place has a cute interior decor and affordable city prices.",
    "entities": [
      ["cute interior decor", "AMBIENCE"],
      ["affordable city prices", "PRICE"]
    ]
  }
]
```


In [35]:
from typing import List

def get_openai_annotations(input_texts: List[str]):
    input_texts_formatted = [{"text": text} for text in input_texts]
    response = openai_client.chat.completions.create(
        model="gpt-4o",
        messages=[
        {
          "role": "system",
          "content": [
            {
              "type": "text",
              "text": system_prompt
            }
          ]
        },
        {
          "role": "user",
          "content": [
            {
              "type": "text",
              "text": prompt.format(input_texts=str(input_texts_formatted))
            }
          ]
        }
        ],
        temperature=0,
        max_tokens=1024,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0
    )
    return response.choices[0].message.content

In [39]:
input_texts = [
    "They did not have mayonnaise, forgot our toast, left out ingredients (ie cheese in an omelet), below hot temperatures and the bacon was so over cooked it crumbled on the plate when you touched it.",
    "The pizza is the best if you like thin crusted pizza.",
    "All the money went into the interior decoration, none of it went to the chefs."
]

result = get_openai_annotations(input_texts)
print(result)

```json
[
  {
    "text": "They did not have mayonnaise, forgot our toast, left out ingredients (ie cheese in an omelet), below hot temperatures and the bacon was so over cooked it crumbled on the plate when you touched it.",
    "entities": [
      ["did not have mayonnaise", "FOOD"],
      ["forgot our toast", "FOOD"],
      ["left out ingredients (ie cheese in an omelet)", "FOOD"],
      ["below hot temperatures", "FOOD"],
      ["bacon was so over cooked it crumbled on the plate", "FOOD"]
    ]
  },
  {
    "text": "The pizza is the best if you like thin crusted pizza.",
    "entities": [
      ["pizza is the best", "FOOD"],
      ["thin crusted pizza", "FOOD"]
    ]
  },
  {
    "text": "All the money went into the interior decoration, none of it went to the chefs.",
    "entities": [
      ["money went into the interior decoration", "AMBIENCE"],
      ["none of it went to the chefs", "KITCHEN"]
    ]
  }
]
```


# Load data

In [40]:
import pandas as pd

In [47]:
splits = {'train': 'data/train-00000-of-00001.parquet', 'test': 'data/test-00000-of-00001.parquet'}
train_df = pd.read_parquet("hf://datasets/tomaarsen/setfit-absa-semeval-restaurants/" + splits["train"])

In [51]:
semeval_texts = train_df['text'].unique()
len(semeval_texts)

2019

In [56]:
import numpy as np
np.random.seed(42)
sampled_texts = np.random.choice(semeval_texts, 30)
sampled_texts

array(['The place was nice and calm.', 'Their sake martini is wonderful.',
       'Great for groups, great for a date, great for early brunch or a nightcap.',
       'i recommend the thai popcorn :)',
       'Most of the servers are very attentive, friendly and quite attractive.',
       'My boyfriend had the New England Chowder it was good but I think the award should go to the Lobster Bisque.',
       'They are not helpful in the least and will give you the grand run around so by the time the event date rolls around you will not only regret chosing this place, but also become hostile!',
       'If you love seafood, you would love this place!',
       'Nice ambiance, nice little bar, good bartender, Francois, and good service.',
       'The food was bland oily.',
       'Over the years, it has always provided a pleasurable dining experience with quality food and wine.',
       "people are rude bit again it's new york!",
       'Great friendly service, Fast seating, Fast Delivery, Exce

In [67]:
import math
from tqdm.notebook import tqdm

CHUNK_SIZE = 10

def chunk_list(lst, k):
    """
    Break a list into chunks of size up to K and turn it into a generator.

    :param lst: List to be chunked
    :param k: Maximum size of each chunk
    :return: Generator yielding chunks of the list
    """
    for i in range(0, len(lst), k):
        yield lst[i:i + k]

results = []
for chunk in tqdm(chunk_list(sampled_texts, CHUNK_SIZE), total=math.ceil(len(sampled_texts) / CHUNK_SIZE)):
    result = get_openai_annotations(chunk)
    results.append(result)

  0%|          | 0/3 [00:00<?, ?it/s]

In [72]:
print(results[2])

```json
[
  {
    "text": "Warm, comfortable surroundings, nice appointments (witness the etched glass and brickwork separating the dining rooms).",
    "entities": [
      ["Warm, comfortable surroundings", "AMBIENCE"],
      ["nice appointments", "AMBIENCE"]
    ]
  },
  {
    "text": "The pesto pizza was excellent, thin-crust pizza with a nice amount of spicy Italian cheese that I'd never heard of before.",
    "entities": [
      ["pesto pizza was excellent", "FOOD"],
      ["thin-crust pizza with a nice amount of spicy Italian cheese", "FOOD"]
    ]
  },
  {
    "text": "The wine is always good, the tapas are always yummy, especially with the warm pita bread.",
    "entities": [
      ["wine is always good", "FOOD"],
      ["tapas are always yummy", "FOOD"],
      ["warm pita bread", "FOOD"]
    ]
  },
  {
    "text": "Pastrami or corned beef are juicy and piled high (ask for extra rye bread).",
    "entities": [
      ["Pastrami or corned beef are juicy and piled high", "FOOD"]
 

# Persist labeled samples

In [73]:
import json

In [81]:
def strip_result(text: str):
    return json.loads(text.strip("```json\n").strip("\n```"))

In [86]:
results_parsed = []
for text in results:
    parsed = strip_result(text)
    results_parsed.extend(parsed)
results_parsed

[{'text': 'The place was nice and calm.',
  'entities': [['place was nice and calm', 'AMBIENCE']]},
 {'text': 'Their sake martini is wonderful.',
  'entities': [['sake martini is wonderful', 'FOOD']]},
 {'text': 'Great for groups, great for a date, great for early brunch or a nightcap.',
  'entities': []},
 {'text': 'i recommend the thai popcorn :)',
  'entities': [['recommend the thai popcorn', 'FOOD']]},
 {'text': 'Most of the servers are very attentive, friendly and quite attractive.',
  'entities': [['servers are very attentive, friendly and quite attractive',
    'SERVICE']]},
 {'text': 'My boyfriend had the New England Chowder it was good but I think the award should go to the Lobster Bisque.',
  'entities': [['New England Chowder it was good', 'FOOD'],
   ['award should go to the Lobster Bisque', 'FOOD']]},
 {'text': 'They are not helpful in the least and will give you the grand run around so by the time the event date rolls around you will not only regret chosing this place, bu

In [87]:
sample_persist_path = '../data/sampled_results_openai.json'
with open(sample_persist_path, 'w') as f:
    json.dump(results_parsed, f)

In [88]:
results_parsed

[{'text': 'The place was nice and calm.',
  'entities': [['place was nice and calm', 'AMBIENCE']]},
 {'text': 'Their sake martini is wonderful.',
  'entities': [['sake martini is wonderful', 'FOOD']]},
 {'text': 'Great for groups, great for a date, great for early brunch or a nightcap.',
  'entities': []},
 {'text': 'i recommend the thai popcorn :)',
  'entities': [['recommend the thai popcorn', 'FOOD']]},
 {'text': 'Most of the servers are very attentive, friendly and quite attractive.',
  'entities': [['servers are very attentive, friendly and quite attractive',
    'SERVICE']]},
 {'text': 'My boyfriend had the New England Chowder it was good but I think the award should go to the Lobster Bisque.',
  'entities': [['New England Chowder it was good', 'FOOD'],
   ['award should go to the Lobster Bisque', 'FOOD']]},
 {'text': 'They are not helpful in the least and will give you the grand run around so by the time the event date rolls around you will not only regret chosing this place, bu

In [92]:
with open(sample_persist_path, 'r') as f:
    tmp = json.load(f)