#  Show the tradeoffs between Spacy vs ChatGPT for entity recognition


Requirements
```
python -m spacy download en_core_web_sm
```
```python
import nltk
nltk.download('vader_lexicon')
```

In [1]:
import asyncio
import datetime
from typing import Dict, List, NamedTuple

import spacy
from nltk.sentiment import SentimentIntensityAnalyzer
from pydantic import BaseModel, Field

from promptedgraphs.config import Config, load_config
from promptedgraphs.extraction.entities_from_text import entities_from_text
from promptedgraphs.generation.data_from_model import generate
from promptedgraphs.llms.openai_chat import LanguageModel
from promptedgraphs.llms.usage import Usage
from promptedgraphs.models import EntityReference
from promptedgraphs.vis import ensure_entities, render_entities

In [2]:
load_config()

Config(name=Prompted Graphs, description=From Dataset Labeling to Deployment: The Power of NLP and LLMs Combined., version=0.3.1, openai_api_key=***************************************************)

In [3]:
async def label_sentiment(text_of_reviews, model=None, temperature=0):
    labels = {
        "POSITIVE": "A postive review of a product or service.",
        "NEGATIVE": "A negative review of a product or service.",
        "NEUTRAL": "A neutral review of a product or service.",
    }

    ents = []
    usage = Usage(model=model)
    async for msg in entities_from_text(
        name="sentiment",
        description="Sentiment Analysis of Customer Reviews",
        text=text_of_reviews,
        temperature=temperature,
        labels=labels,
        model=model,
        config=Config(),
        include_reason=False,
        usage=usage,
    ):
        ents.append(msg)
    return ents, usage


text_of_reviews = """
1. "I absolutely love this product. It's been a game changer!"
2. "The service was quite poor and the staff was rude."
3. "The item is okay. Nothing special, but it gets the job done."
""".strip()

## Two GPT approaches

In [4]:
gpt35_entities, gpt35_usage = await label_sentiment(
    text_of_reviews, model=LanguageModel.GPT35_turbo, temperature=0
)

render_entities(text_of_reviews, gpt35_entities)
display(gpt35_usage)

Usage(model=LanguageModel.GPT35_turbo, prompt_tokens=783, completion_tokens=125, duration=2.9928, cost=0.001061), compute_cost=0.000028), llm_cost=0.001033)

In [5]:
gpt4_entities, gpt4_usage = await label_sentiment(
    text_of_reviews, model=LanguageModel.GPT4
)

render_entities(text_of_reviews, gpt4_entities)
display(gpt4_usage)

Usage(model=LanguageModel.GPT4, prompt_tokens=783, completion_tokens=124, duration=11.6574, cost=0.031039), compute_cost=0.000109), llm_cost=0.030930)

# Spacy implementation

In [6]:
nlp = spacy.load("en_core_web_sm")


async def extract_entities_spacy(
    name: str, description: str, text: str, labels: Dict[str, str]
) -> List[EntityReference]:
    # Load spaCy model for sentence tokenization

    usage = Usage(model="spacy")
    usage.start()

    # Initialize the sentiment analyzer
    sia = SentimentIntensityAnalyzer()

    # Tokenize the text into sentences
    doc = nlp(text)
    entities = []

    for sentence in doc.sents:
        # Analyze sentiment of the sentence
        sentiment_score = sia.polarity_scores(sentence.text)

        # Determine the sentiment label
        if sentiment_score["neu"] >= 0.99:
            continue
        if sentiment_score["compound"] >= 0.05:
            sentiment_label = "POSITIVE"
        elif sentiment_score["compound"] <= -0.05:
            sentiment_label = "NEGATIVE"
        else:
            sentiment_label = "NEUTRAL"

        # Create EntityReference
        entity = EntityReference(
            start=sentence.start_char,
            end=sentence.end_char,
            label=sentiment_label,
            text=sentence.text,
            reason=str(sentiment_score),
        )
        entities.append(entity)

    usage.end()
    return entities, usage

In [7]:
ents_spacy, spacy_usage = await extract_entities_spacy(
    name="sentiment",
    description="Sentiment Analysis of Customer Reviews",
    text=text_of_reviews,
    labels={
        "POSITIVE": "A positive review of a product or service.",
        "NEGATIVE": "A negative review of a product or service.",
        "NEUTRAL": "A neutral review of a product or service.",
    },
)

In [8]:
import pandas as pd

costs = pd.DataFrame(
    [spacy_usage.dict(), gpt35_usage.dict(), gpt4_usage.dict()]
).set_index("model")

Model spacy not found in pricing table, using default pricing of 0
Model spacy not found in pricing table, using default pricing of 0


In [9]:
costs.loc[:, "cost multiple"] = (
    (costs["cost"] / costs["cost"].min())
    .fillna(1)
    .apply(lambda x: f"{x:.1f}".title() + "x")
)
costs.loc[:, "duration multiple"] = (
    (costs["duration"] / costs["duration"].min())
    .fillna(1)
    .apply(lambda x: f"{x:.1f}".title() + "x")
)

In [10]:
costs

Unnamed: 0_level_0,prompt_tokens,completion_tokens,duration,cost,llm_cost,compute_cost,cost multiple,duration multiple
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
spacy,0,0,0.018291,0.0,0.0,0.0,1.0x,1.0x
LanguageModel.GPT35_turbo,783,125,2.992781,0.001061,0.001033,2.8e-05,Infx,163.6x
LanguageModel.GPT4,783,124,11.657372,0.031039,0.03093,0.000109,Infx,637.3x


# Visualize labels across models

In [11]:
from promptedgraphs.vis import get_colors, get_fields

colors = get_colors(get_fields(gpt35_entities + gpt4_entities + ents_spacy))

In [12]:
render_entities(text_of_reviews, gpt35_entities, color_dict=colors)

In [13]:
render_entities(text_of_reviews, gpt4_entities, color_dict=colors)

In [14]:
render_entities(text_of_reviews, ents_spacy, color_dict=colors)