# Set up

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import json

import sys
# Put the parent of the current working directory first on the import path so
# that the `src.*` modules imported further down in this notebook can be found.
sys.path.insert(0, '..')

# Load data

In [3]:
import pandas as pd

In [4]:
# Parquet file of each split, given relative to the root of the dataset repository.
splits = {'train': 'data/train-00000-of-00001.parquet', 'test': 'data/test-00000-of-00001.parquet'}
# Only the train split is loaded; the path is appended to the `hf://datasets/...` repository URL.
train_df = pd.read_parquet("hf://datasets/tomaarsen/setfit-absa-semeval-restaurants/" + splits["train"])

In [5]:
# Distinct values of the `text` column; repeated sentences are collapsed to one entry.
semeval_texts = train_df['text'].unique()
# Number of distinct sentences available for processing.
len(semeval_texts)

2019

# Prompt

In [6]:
import os
from ollama import Client

In [7]:
# Address of the Ollama server. The defaults are the values that used to be
# hard-coded here; set the LLM_SERVER_HOST / LLM_SERVER_PORT environment
# variables to target another machine (e.g. 192.168.100.16) without editing
# the notebook.
LLM_SERVER_HOST = os.environ.get('LLM_SERVER_HOST', '192.168.100.10')
# Environment values are strings, so normalise the port back to an int.
LLM_SERVER_PORT = int(os.environ.get('LLM_SERVER_PORT', '11434'))
client = Client(host=f'http://{LLM_SERVER_HOST}:{LLM_SERVER_PORT}')

In [8]:
# Location of the system prompt text, relative to the current working directory.
system_prompt_fp = '../src/prompt/v8.txt'
# Decode explicitly as UTF-8 so the prompt text does not depend on the
# platform's locale encoding (assumes the file is saved as UTF-8 — TODO confirm).
with open(system_prompt_fp, 'r', encoding='utf-8') as f:
    system_prompt = f.read()

In [9]:
# User-message template. `{input_texts}` looks like a `str.format`-style
# placeholder; where it gets filled in is not visible in this notebook — TODO confirm.
prompt = """
Input:
{input_texts}
"""

In [10]:
# Rough prompt-size estimate: total characters divided by an assumed
# average of 4 characters per token.
# NOTE: `prompt` is still the unformatted template at this point, so the figure
# covers the system prompt plus the template scaffold only, not the input texts.
APPROX_CHARS_PER_TOKEN = 4
prompt_approx_token_counts = (len(system_prompt) + len(prompt)) / APPROX_CHARS_PER_TOKEN
print(f"{prompt_approx_token_counts=}")

prompt_approx_token_counts=2849.75


## Call LLM Extract

In [11]:
from src.wrapper.v1 import llm_extract_modelfile
from src.utils.id.idfy import deterministic_hash
from tqdm.notebook import tqdm
import math
from loguru import logger

In [12]:
# Start from every distinct sentence; this is the pool the sampling cell draws from.
to_process = semeval_texts

In [13]:
# When True, texts found in a previous run's log output are excluded from `to_process`.
REMOVE_TEXTS_IN_CACHED = False

def build_cache_from_log_output(fp):
    """Collect the texts that were already extracted in a previous run's log.

    Every non-blank line of ``fp`` must be a JSON object whose
    ``record.extra.llm_extracted`` field is itself a JSON-encoded mapping of
    ``{key: {"text": ..., ...}}`` entries.

    Args:
        fp: Path of the JSONL log file to read.

    Returns:
        set: The ``text`` value of every well-formed extracted entry.

    Raises:
        json.JSONDecodeError: If a non-blank line, or its ``llm_extracted``
            payload, is not valid JSON.
        KeyError: If a record lacks the ``record.extra.llm_extracted`` path.
    """
    cache = set()
    with open(fp, 'r') as f:
        # Stream the file line by line instead of materialising every line first.
        for jline in f:
            if not jline.strip():
                # Tolerate blank lines, which json.loads would reject.
                continue
            result = json.loads(jline)
            result_json = json.loads(result['record']['extra']['llm_extracted'])
            for rj in result_json.values():
                # The LLM sometimes returns entries whose "text" key is malformed
                # (the key and value fused into a single string). Such entries
                # cannot be matched to an input text, so they are left out of the
                # cache instead of aborting the whole scan with a KeyError.
                if isinstance(rj, dict) and 'text' in rj:
                    cache.add(rj['text'])

    return cache

if REMOVE_TEXTS_IN_CACHED:
    # Log file of the earlier run whose already-extracted texts should be skipped.
    prev_log_output_fp = 'llm_extract_output_2024-07-05_19-35-03_258376.jsonl'
    cached = build_cache_from_log_output(prev_log_output_fp)

    # Keep the remaining texts as an ordered list rather than a set:
    # `np.random.choice` in the sampling cell needs a 1-D sequence (a set is
    # rejected), and a stable order is required for the fixed seed to
    # reproduce the same sample. `cached` is a set, so each lookup is O(1).
    remainings = [text for text in semeval_texts if text not in cached]
    logger.info(f"original: {len(semeval_texts)} - cached: {len(cached)} - remainings: {len(remainings)}")
    to_process = remainings

# Check cached

# Pipeline 

In [14]:
import numpy as np
# Fix the global NumPy seed so the draw below is repeatable.
np.random.seed(41)
# Draw 33 texts from `to_process`.
# NOTE(review): `np.random.choice` samples with replacement by default, so the
# sample may contain duplicates — pass `replace=False` if distinct texts are intended.
sampled_texts = np.random.choice(to_process, 33)
# Preview the first five sampled texts.
sampled_texts[:5]

array(["I've had the chicken with garlic sauce, chicken with black bean sauce, and hunan chicken.",
       "We ate out in the back patio, which is worth it as it's cool and the music is hear well there.",
       'Traditional French decour was pleasant though the hall was rather noisy - the restaurant was full and we had to raise our voices to be able to maintain a conversation.',
       'Each table has a pot of boiling water sunken into its surface, and you get platters of thin sliced meats, various vegetables, and rice and glass noodles.',
       'We were worried we would have trouble getting in, but somehow managed to have a short wait.'],
      dtype=object)

In [15]:
# Wrap every sampled sentence in the {"text": ...} record shape sent to the LLM.
# NOTE: attaching a hashed id to each record, e.g.
#   {"id": str(deterministic_hash(text)), "text": text}
# was observed to make the LLM break its output structure, so only the text is sent.
input_texts = [{"text": text} for text in sampled_texts]

logger.info(f"{len(input_texts)=}")

[32m2024-07-06 14:03:28.358[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [1mlen(input_texts)=33[0m


In [17]:
# Number of texts sent to the LLM per call.
CHUNK_SIZE = 10

# Progress-bar length: one step per chunk, the last chunk may be shorter.
n_chunks = math.ceil(len(input_texts) / CHUNK_SIZE)

extracted_results = []
for start in tqdm(range(0, len(input_texts), CHUNK_SIZE), total=n_chunks):
    batch = input_texts[start:start + CHUNK_SIZE]
    extracted = llm_extract_modelfile(batch)
    # Only the per-text result objects are kept; the mapping keys are dropped.
    extracted_results.extend(extracted.values())

  0%|          | 0/4 [00:00<?, ?it/s]

[32m2024-07-06 14:04:34.268[0m | [34m[1mDEBUG   [0m | [36msrc.utils.time.timer[0m:[36mtimed[0m:[36m23[0m - [34m[1mcall_llm runtime: 49.936s[0m
[32m2024-07-06 14:04:34.270[0m | [34m[1mDEBUG   [0m | [36msrc.wrapper.v1[0m:[36mllm_extract_modelfile[0m:[36m108[0m - [34m[1m[OUTPUT] LLM Extracted successfully[0m


KeyError: 'text'

In [18]:
%debug

> [0;32m/home/dvquys/frostmourne/lets-build-mlsys/src/wrapper/v1.py[0m(114)[0;36m<listcomp>[0;34m()[0m
[0;32m    112 [0;31m        )
[0m[0;32m    113 [0;31m        [0minput_texts_texts[0m [0;34m=[0m [0;34m[[0m[0me[0m[0;34m[[0m[0;34m'text'[0m[0;34m][0m [0;32mfor[0m [0me[0m [0;32min[0m [0minput_texts[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 114 [0;31m        [0moutput_json_texts[0m [0;34m=[0m [0;34m[[0m[0me[0m[0;34m[[0m[0;34m'text'[0m[0;34m][0m [0;32mfor[0m [0me[0m [0;32min[0m [0moutput_json[0m[0;34m.[0m[0mvalues[0m[0;34m([0m[0;34m)[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    115 [0;31m        [0;32massert[0m [0mset[0m[0;34m([0m[0minput_texts_texts[0m[0;34m)[0m [0;34m==[0m [0mset[0m[0;34m([0m[0moutput_json_texts[0m[0;34m)[0m[0;34m,[0m [0;34m"Input texts do not match output texts"[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    116 [0;31m        [0massert_extracted_entity_is_su

ipdb>  output_json.values()


*** NameError: name 'output_json' is not defined


ipdb>  output_json


*** NameError: name 'output_json' is not defined


ipdb>  u


> [0;32m/home/dvquys/frostmourne/lets-build-mlsys/src/wrapper/v1.py[0m(114)[0;36mllm_extract_modelfile[0;34m()[0m
[0;32m    112 [0;31m        )
[0m[0;32m    113 [0;31m        [0minput_texts_texts[0m [0;34m=[0m [0;34m[[0m[0me[0m[0;34m[[0m[0;34m'text'[0m[0;34m][0m [0;32mfor[0m [0me[0m [0;32min[0m [0minput_texts[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 114 [0;31m        [0moutput_json_texts[0m [0;34m=[0m [0;34m[[0m[0me[0m[0;34m[[0m[0;34m'text'[0m[0;34m][0m [0;32mfor[0m [0me[0m [0;32min[0m [0moutput_json[0m[0;34m.[0m[0mvalues[0m[0;34m([0m[0;34m)[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    115 [0;31m        [0;32massert[0m [0mset[0m[0;34m([0m[0minput_texts_texts[0m[0;34m)[0m [0;34m==[0m [0mset[0m[0;34m([0m[0moutput_json_texts[0m[0;34m)[0m[0;34m,[0m [0;34m"Input texts do not match output texts"[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    116 [0;31m        [0massert_extracted_e

ipdb>  output_json


{'1': {'text': "I've had the chicken with garlic sauce, chicken with black bean sauce, and hunan chicken.", 'entities': [['chicken with garlic sauce', 'FOOD', 0.9, 0.8], ['chicken with black bean sauce', 'FOOD', 0.85, 0.7], ['hunan chicken', 'FOOD', 0.8, 0.6]]}, '2': {'text': "We ate out in the back patio, which is worth it as it's cool and the music is hear well there.", 'entities': [['back patio', 'AMBIENCE', 0.9, 0.7], ['cool', 'AMBIENCE', 0.8, 0.5], ['music is hear well there', 'MUSIC', 0.85, 0.6]]}, '3': {"text': 'Traditional French decour was pleasant though the hall was rather noisy - the restaurant was full and we had to raise our voices to be able to maintain a conversation.'": [['Traditional French decor', 'AMBIENCE', 0.8, 0.5], ['hall was rather noisy', 'AMBIENCE', 0.6, -0.4]]}, '4': {"text': 'Each table has a pot of boiling water sunken into its surface, and you get platters of thin sliced meats, various vegetables, and rice and glass noodles.'": [['pot of boiling water', '

ipdb>  output_json.values


<built-in method values of dict object at 0x7de8bf8444c0>


ipdb>  output_json.values()


dict_values([{'text': "I've had the chicken with garlic sauce, chicken with black bean sauce, and hunan chicken.", 'entities': [['chicken with garlic sauce', 'FOOD', 0.9, 0.8], ['chicken with black bean sauce', 'FOOD', 0.85, 0.7], ['hunan chicken', 'FOOD', 0.8, 0.6]]}, {'text': "We ate out in the back patio, which is worth it as it's cool and the music is hear well there.", 'entities': [['back patio', 'AMBIENCE', 0.9, 0.7], ['cool', 'AMBIENCE', 0.8, 0.5], ['music is hear well there', 'MUSIC', 0.85, 0.6]]}, {"text': 'Traditional French decour was pleasant though the hall was rather noisy - the restaurant was full and we had to raise our voices to be able to maintain a conversation.'": [['Traditional French decor', 'AMBIENCE', 0.8, 0.5], ['hall was rather noisy', 'AMBIENCE', 0.6, -0.4]]}, {"text': 'Each table has a pot of boiling water sunken into its surface, and you get platters of thin sliced meats, various vegetables, and rice and glass noodles.'": [['pot of boiling water', 'KITCHEN'

ipdb>  !type(output_json)


<class 'dict'>


ipdb>  output_json


{'1': {'text': "I've had the chicken with garlic sauce, chicken with black bean sauce, and hunan chicken.", 'entities': [['chicken with garlic sauce', 'FOOD', 0.9, 0.8], ['chicken with black bean sauce', 'FOOD', 0.85, 0.7], ['hunan chicken', 'FOOD', 0.8, 0.6]]}, '2': {'text': "We ate out in the back patio, which is worth it as it's cool and the music is hear well there.", 'entities': [['back patio', 'AMBIENCE', 0.9, 0.7], ['cool', 'AMBIENCE', 0.8, 0.5], ['music is hear well there', 'MUSIC', 0.85, 0.6]]}, '3': {"text': 'Traditional French decour was pleasant though the hall was rather noisy - the restaurant was full and we had to raise our voices to be able to maintain a conversation.'": [['Traditional French decor', 'AMBIENCE', 0.8, 0.5], ['hall was rather noisy', 'AMBIENCE', 0.6, -0.4]]}, '4': {"text': 'Each table has a pot of boiling water sunken into its surface, and you get platters of thin sliced meats, various vegetables, and rice and glass noodles.'": [['pot of boiling water', '

ipdb>  output_json.values()[0]


*** TypeError: 'dict_values' object is not subscriptable


ipdb>  list(output_json.values())[0]


*** Error in argument: '(output_json.values())[0]'


ipdb>  !list(output_json.values())[0]


{'text': "I've had the chicken with garlic sauce, chicken with black bean sauce, and hunan chicken.", 'entities': [['chicken with garlic sauce', 'FOOD', 0.9, 0.8], ['chicken with black bean sauce', 'FOOD', 0.85, 0.7], ['hunan chicken', 'FOOD', 0.8, 0.6]]}


ipdb>  for e in output_json.values(): print(e.keys())


dict_keys(['text', 'entities'])
dict_keys(['text', 'entities'])
dict_keys(["text': 'Traditional French decour was pleasant though the hall was rather noisy - the restaurant was full and we had to raise our voices to be able to maintain a conversation.'"])
dict_keys(["text': 'Each table has a pot of boiling water sunken into its surface, and you get platters of thin sliced meats, various vegetables, and rice and glass noodles.'"])
dict_keys(["text': 'We were worried we would have trouble getting in, but somehow managed to have a short wait.'"])
dict_keys(["text': 'my personal favorite is an everything bagel with lox spread, but all the bagles are unbeliavably good.'"])
dict_keys(["text': 'The man that was hosting promised to save a table for our party of 7, then sat a party of 2 at the very table he was saving (mean while there were boths open all around).'"])
dict_keys(["text': 'The main course had an average portion, and was decent overall.'"])
dict_keys(["text': 'The restaurant is ra

ipdb>  output


'{\n"1": {\n"text": "I\'ve had the chicken with garlic sauce, chicken with black bean sauce, and hunan chicken.",\n"entities": [\n["chicken with garlic sauce", "FOOD", 0.9, 0.8],\n["chicken with black bean sauce", "FOOD", 0.85, 0.7],\n["hunan chicken", "FOOD", 0.8, 0.6]\n]\n},\n"2": {\n"text": "We ate out in the back patio, which is worth it as it\'s cool and the music is hear well there.",\n"entities": [\n["back patio", "AMBIENCE", 0.9, 0.7],\n["cool", "AMBIENCE", 0.8, 0.5],\n["music is hear well there", "MUSIC", 0.85, 0.6]\n]\n},\n"3": {\n"text\': \'Traditional French decour was pleasant though the hall was rather noisy - the restaurant was full and we had to raise our voices to be able to maintain a conversation.\'"\n    : [\n      ["Traditional French decor", "AMBIENCE", 0.8, 0.5],\n      ["hall was rather noisy", "AMBIENCE", 0.6, -0.4]\n    ]\n},\n"4": {\n"text\': \'Each table has a pot of boiling water sunken into its surface, and you get platters of thin sliced meats, various ve

ipdb>  exit


In [17]:
len(extracted_results)

33

In [18]:
extracted_results

[{'text': "I've had the chicken with garlic sauce, chicken with black bean sauce, and hunan chicken.",
  'entities': [['chicken with garlic sauce', 'FOOD', 0.7, 0.5],
   ['chicken with black bean sauce', 'FOOD', 0.6, 0.4],
   ['hunan chicken', 'FOOD', 0.6, 0.3]]},
 {'text': "We ate out in the back patio, which is worth it as it's cool and the music is hear well there.",
  'entities': [['back patio', 'AMBIENCE', 0.7, 0.5],
   ['cool', 'AMBIENCE', 0.6, 0.4],
   ['music is hear well', 'AMBIENCE', 0.6, 0.3]]},
 {'text': 'Traditional French decour was pleasant though the hall was rather noisy - the restaurant was full and we had to raise our voices to be able to maintain a conversation.',
  'entities': [['Traditional French decour', 'AMBIENCE', 0.6, 0.4],
   ['hall was rather noisy', 'AMBIENCE', 0.5, -0.5],
   ['restaurant was full', 'SERVICE', 0.7, 0.3]]},
 {'text': 'Each table has a pot of boiling water sunken into its surface, and you get platters of thin sliced meats, various vegetables

In [19]:
batch

[{'text': "Some of the workers ignore me and talk to the female customers, other times, they've skipped my order."},
 {'text': "If you've ever been along the river in Weehawken you have an idea of the top of view the chart house has to offer."},
 {'text': 'What is even better, is that the prices are very affordable as well, and the food is really good.'}]

# Check loguru logging output

In [None]:
# Parse the JSONL log file: every line holds one JSON-encoded log record.
with open('llm_extract_output_2024-07-06_13-16-14_331055.jsonl', 'r') as log_file:
    results = list(map(json.loads, log_file))

In [30]:
[e['text'] for e in results[3]['record']['extra']['input_texts']]

["Some of the workers ignore me and talk to the female customers, other times, they've skipped my order.",
 "If you've ever been along the river in Weehawken you have an idea of the top of view the chart house has to offer.",
 'What is even better, is that the prices are very affordable as well, and the food is really good.']

In [31]:
[e['text'] for e in json.loads(results[3]['record']['extra']['llm_extracted']).values()]

["Some of the workers ignore me and talk to the female customers, other times, they've skipped my order.",
 "If you've ever been along the river in Weehawken you have an idea of the top of view the chart house has to offer.",
 'What is even better, is that the prices are very affordable as well, and the food is really good.']

In [28]:
# Flatten the logged extractions: decode each record's `llm_extracted` JSON
# payload and collect its values (one result object per text) into a single list.
result_list = [
    extracted_item
    for log_record in results
    for extracted_item in json.loads(log_record['record']['extra']['llm_extracted']).values()
]

In [32]:
result_list[-3:]

[{'text': "Some of the workers ignore me and talk to the female customers, other times, they've skipped my order.",
  'entities': [['workers ignore me and talk to the female customers',
    'SERVICE',
    0.8,
    -0.7],
   ['skipped my order', 'SERVICE', 0.85, -0.6]]},
 {'text': "If you've ever been along the river in Weehawken you have an idea of the top of view the chart house has to offer.",
  'entities': [['top of view the chart house has to offer',
    'VIEW',
    0.9,
    0.85]]},
 {'text': 'What is even better, is that the prices are very affordable as well, and the food is really good.',
  'entities': [['prices are very affordable', 'PRICE', 0.8, 0.7],
   ['food is really good', 'FOOD', 0.9, 0.85]]}]

# Archive

## Convert output to hashed id