# Set up

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import json

import numpy as np

import sys
sys.path.insert(0, '..')

# Load data

In [3]:
import pandas as pd

In [4]:
splits = {'train': 'data/train-00000-of-00001.parquet', 'test': 'data/test-00000-of-00001.parquet'}
train_df = pd.read_parquet("hf://datasets/tomaarsen/setfit-absa-semeval-restaurants/" + splits["train"])

In [5]:
semeval_texts = train_df['text'].unique()
len(semeval_texts)

2019

# Prompt

In [6]:
import os
from ollama import Client

In [7]:
# LLM_SERVER_HOST = '192.168.100.16'
LLM_SERVER_HOST = '192.168.100.10'
LLM_SERVER_PORT = 11434
client = Client(host=f'http://{LLM_SERVER_HOST}:{LLM_SERVER_PORT}')

In [8]:
system_prompt_fp = '../src/prompt/v11.txt'
with open(system_prompt_fp, 'r') as f:
    system_prompt = f.read()

In [9]:
prompt = """
Input:
{input_texts}
"""

In [10]:
APPROX_CHARS_PER_TOKEN = 4
prompt_approx_token_counts = (len(system_prompt) + len(prompt)) / APPROX_CHARS_PER_TOKEN
print(f"{prompt_approx_token_counts=}")

prompt_approx_token_counts=2876.0


## Call LLM Extract

In [11]:
from src.wrapper.v1 import llm_extract
from src.utils.id.idfy import deterministic_hash
from tqdm.notebook import tqdm
import math
from loguru import logger

In [12]:
to_process = semeval_texts

In [13]:
REMOVE_TEXTS_IN_CACHED = True
PREV_LOG_OUTPUT_FP = 'llm_extract_output_cache.jsonl'

def build_cache_from_log_output(fp):
    with open(fp, 'r') as f:
        results = [json.loads(jline) for jline in f.readlines()]

    cache = set()
    for result in results:
        result_json = result['record']['extra']['llm_extracted']
        for rj in result_json.values():
            cache.add(rj['text'])

    return cache

if REMOVE_TEXTS_IN_CACHED:
    cached = build_cache_from_log_output(PREV_LOG_OUTPUT_FP)
    
    # remainings = [text for text in semeval_texts if text not in cached]
    remainings = set(semeval_texts) - cached
    logger.info(f"original: {len(semeval_texts)} - cached: {len(cached)} - remainings: {len(remainings)}")
    if len(semeval_texts) - len(cached) != len(remainings):
        logger.error("original - cached != remainings")
    to_process = np.array(list(remainings))

[32m2024-07-06 16:50:40.051[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m21[0m - [1moriginal: 2019 - cached: 66 - remainings: 1954[0m
[32m2024-07-06 16:50:40.052[0m | [31m[1mERROR   [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [31m[1moriginal - cached != remainings[0m


# Pipeline 

In [14]:
np.random.seed(41)
# SAMPLE_SIZE = 33
SAMPLE_SIZE = None

sampled_texts = to_process
if SAMPLE_SIZE:
    sampled_texts = np.random.choice(to_process, SAMPLE_SIZE)
sampled_texts[:5]

array(["The food at this place is 'gourmet' Indian cuisine.",
       'Really cool stauff inside.',
       'Ess-A-Bagel (either by Sty-town or midtown) is by far the best bagel in NY.',
       'Granted the space is smaller than most, it is the best service you will find in even the largest of restaurants.',
       "I like the ambience, it's very dark and original."], dtype='<U357')

In [15]:
input_texts = []
for t in sampled_texts:
    # Strangely if add the hashed id then the LLM would mess up with its output structure
    # t_ = {"id": str(deterministic_hash(t)), "text": t}
    t_ = {"text": t}
    input_texts.append(t_)

logger.info(f"{len(input_texts)=}")

[32m2024-07-06 16:50:40.091[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [1mlen(input_texts)=1954[0m


In [16]:
CHUNK_SIZE = 10

extracted_results = []
for i in tqdm(range(0, len(input_texts), CHUNK_SIZE), total=math.ceil(len(input_texts) / CHUNK_SIZE)):
    batch = input_texts[i:i + CHUNK_SIZE]
    extracted = llm_extract(batch, system_prompt)
    extracted_results.extend(list(extracted.values()))

  0%|          | 0/196 [00:00<?, ?it/s]

[32m2024-07-06 16:51:13.316[0m | [34m[1mDEBUG   [0m | [36msrc.utils.time.timer[0m:[36mtimed[0m:[36m23[0m - [34m[1mcall_llm runtime: 33.200s[0m
[32m2024-07-06 16:51:13.317[0m | [34m[1mDEBUG   [0m | [36msrc.wrapper.v1[0m:[36mllm_extract[0m:[36m117[0m - [34m[1m[OUTPUT] LLM Extracted successfully[0m
[32m2024-07-06 16:51:49.183[0m | [34m[1mDEBUG   [0m | [36msrc.utils.time.timer[0m:[36mtimed[0m:[36m23[0m - [34m[1mcall_llm runtime: 35.864s[0m
[32m2024-07-06 16:51:49.184[0m | [34m[1mDEBUG   [0m | [36msrc.wrapper.v1[0m:[36mllm_extract[0m:[36m117[0m - [34m[1m[OUTPUT] LLM Extracted successfully[0m
[32m2024-07-06 16:52:23.865[0m | [34m[1mDEBUG   [0m | [36msrc.utils.time.timer[0m:[36mtimed[0m:[36m23[0m - [34m[1mcall_llm runtime: 34.679s[0m
[32m2024-07-06 16:52:23.866[0m | [34m[1mDEBUG   [0m | [36msrc.wrapper.v1[0m:[36mllm_extract[0m:[36m117[0m - [34m[1m[OUTPUT] LLM Extracted successfully[0m
[32m2024-07-06 16:53:03.

In [19]:
len(extracted_results)

1584

In [18]:
extracted_results

[{'text': "The food at this place is 'gourmet' Indian cuisine.",
  'entities': [['food at this place', 'FOOD', 0.8, 0.5],
   ['gourmet', 'FOOD', 0.7, 0.4],
   ['Indian cuisine', 'CUISINE', 0.9, 0.6]]},
 {'text': 'Really cool stauff inside.',
  'entities': [['cool stauff inside', 'AMBIENCE', 0.8, 0.5]]},
 {'text': 'Ess-A-Bagel (either by Sty-town or midtown) is by far the best bagel in NY.',
  'entities': [['best bagel in NY', 'FOOD', 0.9, 0.7]]},
 {'text': 'Granted the space is smaller than most, it is the best service you will find in even the largest of restaurants.',
  'entities': [['best service', 'SERVICE', 0.8, 0.6],
   ['largest of restaurants', 'SERVICE', 0.7, 0.4]]},
 {'text': "I like the ambience, it's very dark and original.",
  'entities': [['ambience', 'AMBIENCE', 0.9, 0.6],
   ['dark and original', 'AMBIENCE', 0.8, 0.5]]},
 {'text': 'To celebrate a birthday, three of us went to Mare anticipating great food.',
  'entities': [['great food', 'FOOD', 0.9, 0.7]]},
 {'text': 'N

In [19]:
batch

[{'text': "Some of the workers ignore me and talk to the female customers, other times, they've skipped my order."},
 {'text': "If you've ever been along the river in Weehawken you have an idea of the top of view the chart house has to offer."},
 {'text': 'What is even better, is that the prices are very affordable as well, and the food is really good.'}]

# Check loguru logging output

In [17]:
with open('llm_extract_output_2024-07-06_14-33-55_820197.jsonl', 'r') as f:
    results = [json.loads(jline) for jline in f.readlines()]

In [18]:
[e['text'] for e in results[2]['record']['extra']['input_texts']]

['Meanwhile, the bartender continued to pour champagne from his reserve after we had finished our bottle and we enjoyed an amuse of turnip soup with pureed basil, gratis.',
 'The staff is incredibly helpful and attentive.',
 'When we sat, we got great and fast service.',
 'Very good service and very good prices.',
 'Example is the soup which was about 6 oz for $12 dollars and the mushrooms where $12 for about 1oz.',
 'BUt their best dish is thh Thai spiced curry noodles with shrimp - a dish that would cost $23.95 is most places, but it is $16 here.',
 "I've eaten thai many times, and am very familiar with the cuisine.",
 'I highly recommend visiting this restaurant and having dinner and drinks!',
 'While this is a pretty place in that overly cute French way, the food was insultingly horrible.',
 'Found service above average, but that could be because we were 13 of us.']

In [19]:
[e['text'] for e in json.loads(results[2]['record']['extra']['llm_extracted']).values()]

['Meanwhile, the bartender continued to pour champagne from his reserve after we had finished our bottle and we enjoyed an amuse of turnip soup with pureed basil, gratis.',
 'The staff is incredibly helpful and attentive.',
 'When we sat, we got great and fast service.',
 'Very good service and very good prices.',
 'Example is the soup which was about 6 oz for $12 dollars and the mushrooms where $12 for about 1oz.',
 'BUt their best dish is thh Thai spiced curry noodles with shrimp - a dish that would cost $23.95 is most places, but it is $16 here.',
 "I've eaten thai many times, and am very familiar with the cuisine."]

In [28]:
result_list = []
for result in results:
    result_json = json.loads(result['record']['extra']['llm_extracted'])
    result_list.extend(list(result_json.values()))

In [32]:
result_list[-3:]

[{'text': "Some of the workers ignore me and talk to the female customers, other times, they've skipped my order.",
  'entities': [['workers ignore me and talk to the female customers',
    'SERVICE',
    0.8,
    -0.7],
   ['skipped my order', 'SERVICE', 0.85, -0.6]]},
 {'text': "If you've ever been along the river in Weehawken you have an idea of the top of view the chart house has to offer.",
  'entities': [['top of view the chart house has to offer',
    'VIEW',
    0.9,
    0.85]]},
 {'text': 'What is even better, is that the prices are very affordable as well, and the food is really good.',
  'entities': [['prices are very affordable', 'PRICE', 0.8, 0.7],
   ['food is really good', 'FOOD', 0.9, 0.85]]}]

# Archive

## Convert output to hashed id

## Check cached