# Set up

In [19]:
import json
import pandas as pd
from loguru import logger

# Load extraction results

In [48]:
VERSION = '20240706095040'
LLM_ASPECTS_EXTRACT_FP = f'../data/output/llm_extract_output_{VERSION}.jsonl'

In [21]:
fp = LLM_ASPECTS_EXTRACT_FP
with open(fp, 'r') as f:
    results = [json.loads(jline) for jline in f.readlines()]

llm_output = [record['record']['extra']['llm_extracted'] for record in results]

In [24]:
unnested = []
for output in llm_output:
    for record in output.values():
        entities = record['entities']
        for entity in entities:
            if not isinstance(entity, list) or len(entity) != 4:
                logger.error(
                    "\n".join(
                        [
                            f"Entity should be a list like ['text', 'label', confidence_score, sentiment_score]",
                            f"Observed {entity=} in {entities=} in {text=}",
                        ]
                    )
                )
                continue
            record_u = {'text': record['text']}
            extracted_text, label, confidence_score, sentiment_score = entity
            record_u = {
                **record_u,
                **dict(
                    extracted_text=extracted_text,
                    label=label,
                    confidence_score=confidence_score,
                    sentiment_score=sentiment_score
                )
            }
            unnested.append(record_u)

unnested_df = pd.DataFrame(unnested)
unnested_df

[32m2024-07-09 09:54:55.749[0m | [31m[1mERROR   [0m | [36m__main__[0m:[36m<module>[0m:[36m7[0m - [31m[1mEntity should be a list like ['text', 'label', confidence_score, sentiment_score]
Observed entity='SERVICE' in entities=[['prices are very high', 'OVERALL', 0.4, -0.6], ['terrible food and service', 'FOOD', 0.3, -0.7], 'SERVICE', 0.2, -0.8] in text='wait staff is not overly efficient'[0m
[32m2024-07-09 09:54:55.751[0m | [31m[1mERROR   [0m | [36m__main__[0m:[36m<module>[0m:[36m7[0m - [31m[1mEntity should be a list like ['text', 'label', confidence_score, sentiment_score]
Observed entity=0.2 in entities=[['prices are very high', 'OVERALL', 0.4, -0.6], ['terrible food and service', 'FOOD', 0.3, -0.7], 'SERVICE', 0.2, -0.8] in text='wait staff is not overly efficient'[0m
[32m2024-07-09 09:54:55.752[0m | [31m[1mERROR   [0m | [36m__main__[0m:[36m<module>[0m:[36m7[0m - [31m[1mEntity should be a list like ['text', 'label', confidence_score, sentiment_

Unnamed: 0,text,extracted_text,label,confidence_score,sentiment_score
0,The service is not consistently excellent -- j...,service is not consistently excellent,SERVICE,0.4,-0.3
1,The service is not consistently excellent -- j...,just decent,SERVICE,0.5,0.2
2,I went with 5 friends and we lingered at the t...,didn't feel rushed,AMBIENCE,0.6,0.4
3,"Food was very good as well, considering that w...",food was very good,FOOD,0.7,0.5
4,"Food was very good as well, considering that w...",pork belly that I ordered,FOOD,0.6,-0.2
...,...,...,...,...,...
3699,"I found the food, service and value exceptiona...",value is exceptional,VALUE,0.9,0.7
3700,The place is small and intimate and you may fe...,place is small and intimate,AMBIENCE,0.6,0.4
3701,The place is small and intimate and you may fe...,service is excellent,SERVICE,0.8,0.5
3702,"The wait staff is very friendly, if not overly...",wait staff is very friendly,SERVICE,0.7,0.3


In [29]:
analyze_df = (
    unnested_df
    .groupby('label')
    .agg(
        count_records=('text', 'count')
    )
    .sort_values(['count_records'], ascending=[False])
    .assign(
        perc_records=lambda df: df['count_records'] / df['count_records'].sum(),
        perc_records_cumsum=lambda df: df['perc_records'].cumsum()
    )
)
analyze_df

Unnamed: 0_level_0,count_records,perc_records,perc_records_cumsum
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
FOOD,1608,0.434125,0.434125
SERVICE,1047,0.282667,0.716793
AMBIENCE,490,0.132289,0.849082
BEVERAGE,210,0.056695,0.905778
PRICE,81,0.021868,0.927646
VALUE,78,0.021058,0.948704
LOCATION,37,0.009989,0.958693
STAFF,20,0.0054,0.964093
OVERALL,17,0.00459,0.968683
VIEW,17,0.00459,0.973272


In [31]:
COUNT_RECORDS_THRESHOLD = 10
analyze_df.loc[lambda df: df['count_records'].ge(COUNT_RECORDS_THRESHOLD)]

Unnamed: 0_level_0,count_records,perc_records,perc_records_cumsum
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
FOOD,1608,0.434125,0.434125
SERVICE,1047,0.282667,0.716793
AMBIENCE,490,0.132289,0.849082
BEVERAGE,210,0.056695,0.905778
PRICE,81,0.021868,0.927646
VALUE,78,0.021058,0.948704
LOCATION,37,0.009989,0.958693
STAFF,20,0.0054,0.964093
OVERALL,17,0.00459,0.968683
VIEW,17,0.00459,0.973272


# Rewriting label rules

In [41]:
from typing import List


class RuleRewriter:
    name_mapper = {"GENERAL": "OVERALL"}
    allowed_labels = set(
        [
            "FOOD",
            "SERVICE",
            "AMBIENCE",
            "BEVERAGE",
            "PRICE",
            "VALUE",
            "LOCATION",
            "STAFF",
            "OVERALL",
            "VIEW"
        ]
    )

    def __init__(self): ...

    def rewrite(self, entity: List):
        extracted_text, label, confidence_score, sentiment_score = entity
        if label in self.name_mapper:
            label = self.name_mapper[label]
        if label not in self.allowed_labels:
            return []
        return [extracted_text, label, confidence_score, sentiment_score]


In [46]:
llm_output_rewrited = []
rewriter = RuleRewriter()

for output in llm_output:
    output_rewrited = dict()
    for k, record in output.items():
        for entity in record['entities']:
            if not isinstance(entity, list) or len(entity) != 4:
                logger.error(
                    "\n".join(
                        [
                            f"Entity should be a list like ['text', 'label', confidence_score, sentiment_score]",
                            f"Observed {entity=} in {entities=} in {text=}",
                        ]
                    )
                )
                continue
            entities_rewrited = []
            entity_rewrited = rewriter.rewrite(entity)
            if entity_rewrited:
                entities_rewrited.append(entity_rewrited)
            entities_rewrited = {'text': record['text'], 'entities': entities_rewrited}
        output_rewrited[k] = entities_rewrited
    llm_output_rewrited.append(output_rewrited)

[32m2024-07-09 10:21:18.038[0m | [31m[1mERROR   [0m | [36m__main__[0m:[36m<module>[0m:[36m9[0m - [31m[1mEntity should be a list like ['text', 'label', confidence_score, sentiment_score]
Observed entity='SERVICE' in entities=[['wait staff is very friendly', 'SERVICE', 0.7, 0.3], ['wait staff is not overly efficient', 'SERVICE', 0.5, -0.2]] in text='wait staff is not overly efficient'[0m
[32m2024-07-09 10:21:18.040[0m | [31m[1mERROR   [0m | [36m__main__[0m:[36m<module>[0m:[36m9[0m - [31m[1mEntity should be a list like ['text', 'label', confidence_score, sentiment_score]
Observed entity=0.2 in entities=[['wait staff is very friendly', 'SERVICE', 0.7, 0.3], ['wait staff is not overly efficient', 'SERVICE', 0.5, -0.2]] in text='wait staff is not overly efficient'[0m
[32m2024-07-09 10:21:18.040[0m | [31m[1mERROR   [0m | [36m__main__[0m:[36m<module>[0m:[36m9[0m - [31m[1mEntity should be a list like ['text', 'label', confidence_score, sentiment_score]
O

# Persist

In [49]:
LLM_OUTPUT_REWRITED_FP = f'../data/output/llm_extract_output_{VERSION}_rewrited.json'
with open(LLM_OUTPUT_REWRITED_FP, 'w') as f:
    json.dump(llm_output_rewrited, f)