# Set up

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.insert(0, '..')

# Load data

In [3]:
import pandas as pd

In [4]:
splits = {'train': 'data/train-00000-of-00001.parquet', 'test': 'data/test-00000-of-00001.parquet'}
train_df = pd.read_parquet("hf://datasets/tomaarsen/setfit-absa-semeval-restaurants/" + splits["train"])

In [5]:
semeval_texts = train_df['text'].unique()
len(semeval_texts)

2019

In [6]:
import numpy as np
np.random.seed(41)
sampled_texts = np.random.choice(semeval_texts, 33)
sampled_texts

array(["I've had the chicken with garlic sauce, chicken with black bean sauce, and hunan chicken.",
       "We ate out in the back patio, which is worth it as it's cool and the music is hear well there.",
       'Traditional French decour was pleasant though the hall was rather noisy - the restaurant was full and we had to raise our voices to be able to maintain a conversation.',
       'Each table has a pot of boiling water sunken into its surface, and you get platters of thin sliced meats, various vegetables, and rice and glass noodles.',
       'We were worried we would have trouble getting in, but somehow managed to have a short wait.',
       'my personal favorite is an everything bagel with lox spread, but all the bagles are unbeliavably good.',
       'The man that was hosting promised to save a table for our party of 7, then sat a party of 2 at the very table he was saving (mean while there were boths open all around).',
       'The main course had an average portion, and was d

# Prompt

In [7]:
import os
from ollama import Client

In [8]:
# LLM_SERVER_HOST = '192.168.100.16'
LLM_SERVER_HOST = '192.168.100.10'
LLM_SERVER_PORT = 11434
client = Client(host=f'http://{LLM_SERVER_HOST}:{LLM_SERVER_PORT}')

In [9]:
system_prompt_fp = '../src/prompt/v6.txt'
with open(system_prompt_fp, 'r') as f:
    system_prompt = f.read()

In [10]:
prompt = """
Input:
{input_texts}
"""

In [11]:
APPROX_CHARS_PER_TOKEN = 4
prompt_approx_token_counts = (len(system_prompt) + len(prompt)) / APPROX_CHARS_PER_TOKEN
print(f"{prompt_approx_token_counts=}")

prompt_approx_token_counts=2368.0


## Call LLM Extract

In [12]:
from src.wrapper.v1 import llm_extract, logger as p_logger
from src.utils.id.idfy import deterministic_hash
from tqdm.notebook import tqdm
import math
from loguru import logger

In [13]:
input_texts = []
for t in sampled_texts:
    # Strangely if add the hashed id then the LLM would mess up with its output structure
    # t_ = {"id": str(deterministic_hash(t)), "text": t}
    t_ = {"text": t}
    input_texts.append(t_)

logger.info(f"{len(input_texts)=}")

[32m2024-07-05 19:05:04.908[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [1mlen(input_texts)=33[0m


In [14]:
input_texts[0]

{'text': "I've had the chicken with garlic sauce, chicken with black bean sauce, and hunan chicken."}

In [15]:
# input_texts = [
#     {"text": "They did not have mayonnaise, forgot our toast, left out ingredients (ie cheese in an omelet), below hot temperatures and the bacon was so over cooked it crumbled on the plate when you touched it."},
#     {"text": "The pizza is the best if you like thin crusted pizza."},
#     {"text": "All the money went into the interior decoration, none of it went to the chefs."}
# ]
# input_texts = [
#     {"id": "302410369143631686124488347381689095554", "text": "They did not have mayonnaise, forgot our toast, left out ingredients (ie cheese in an omelet), below hot temperatures and the bacon was so over cooked it crumbled on the plate when you touched it."},
#     {"id": "33613397660052989608475259485429283361", "text": "The pizza is the best if you like thin crusted pizza."},
#     {"id": "4217358929495711855940033134574696129", "text": "All the money went into the interior decoration, none of it went to the chefs."}
# ]

CHUNK_SIZE=10

extracted_results = []
for i in tqdm(range(0, len(input_texts), CHUNK_SIZE), total=math.ceil(len(input_texts) / CHUNK_SIZE)):
    batch = input_texts[i:i + CHUNK_SIZE]
    extracted = llm_extract(batch, system_prompt)
    extracted_results.extend(list(extracted.values()))

  0%|          | 0/4 [00:00<?, ?it/s]

[32m2024-07-05 19:06:04.381[0m | [34m[1mDEBUG   [0m | [36msrc.utils.time.timer[0m:[36mtimed[0m:[36m23[0m - [34m[1mcall_llm runtime: 47.313s[0m
[32m2024-07-05 19:06:04.382[0m | [34m[1mDEBUG   [0m | [36msrc.wrapper.v1[0m:[36mllm_extract[0m:[36m62[0m - [34m[1m[OUTPUT] LLM Extracted successfully[0m
[32m2024-07-05 19:06:36.943[0m | [34m[1mDEBUG   [0m | [36msrc.utils.time.timer[0m:[36mtimed[0m:[36m23[0m - [34m[1mcall_llm runtime: 32.559s[0m
[32m2024-07-05 19:06:36.944[0m | [34m[1mDEBUG   [0m | [36msrc.wrapper.v1[0m:[36mllm_extract[0m:[36m62[0m - [34m[1m[OUTPUT] LLM Extracted successfully[0m
[32m2024-07-05 19:07:09.301[0m | [34m[1mDEBUG   [0m | [36msrc.utils.time.timer[0m:[36mtimed[0m:[36m23[0m - [34m[1mcall_llm runtime: 32.356s[0m
[32m2024-07-05 19:07:09.303[0m | [34m[1mDEBUG   [0m | [36msrc.wrapper.v1[0m:[36mllm_extract[0m:[36m62[0m - [34m[1m[OUTPUT] LLM Extracted successfully[0m
[32m2024-07-05 19:07:22.996

In [19]:
len(extracted_results)

33

In [22]:
# Check loguru logging output
import json

with open('llm_extract_output_2024-07-05_19-05-04_881767.log', 'r') as f:
    results = [json.loads(jline) for jline in f.readlines()]

result_list = []
for result in results:
    result_json = json.loads(result['record']['extra']['llm_extracted'])
    result_list.extend(list(result_json.values()))

In [25]:
result_list[8:13]

[{'text': 'The restaurant is rather small but we were lucky to get a table quickly.',
  'entities': [['restaurant is rather small', 'AMBIENCE', 0.4, -0.3],
   ['table quickly', 'SERVICE', 0.7, 0.5]]},
 {'text': 'It is nearly impossible to get a table, so if you ever have the chance to go here for dinner, DO NOT pass it up.',
  'entities': [['nearly impossible to get a table', 'SERVICE', 0.5, -0.6],
   ['DO NOT pass it up', 'FOOD', 0.9, 0.7]]},
 {'text': 'The service was poor.',
  'entities': [["could not catch our waiter's eye", 'SERVICE', 0.5, -0.6],
   ['he ignored us', 'SERVICE', 0.4, -0.7]]},
 {'text': 'Some servers are unfriendly.',
  'entities': [['some servers make you feel like they are doing you a favor',
    'SERVICE',
    0.3,
    -0.5]]},
 {'text': 'The prices are acceptable.',
  'entities': [['acceptable prices', 'PRICE', 0.7, 0.2]]}]

# Archive

## Convert output to hashed id