In [1]:
import json
import dotenv
import pandas as pd
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from pydantic import BaseModel, Field
from devtools import pprint

# Read the local ".env" file into the process environment before any client is created.
# NOTE(review): presumably this is where the OpenAI API key for ChatOpenAI comes from — confirm.
dotenv.load_dotenv(dotenv_path=".env")

True

In [2]:
# Load the 2023 feedback texts from the silver layer (de-duplicated, judging by the file name).
feedbacks_deduped = pd.read_parquet("data/silver/feedback-text-deduped-2023.parquet")

In [3]:
# Show the loaded feedback texts.
feedbacks_deduped

Unnamed: 0,feedback_text
0,Excellent value for money!
1,Not satisfied with the service.
2,The pricing is too high for what you get.
3,"Great product, but the delivery was late."
4,The delivery was fast and the product is good.
5,The customer service was very helpful.
6,Received a defective item.
7,Easy to use website and quick checkout.
8,I had issues with the website.
9,The quality of the product was below expectati...


### Aspect-Based Sentiment Analysis with Category and Opinion Extraction

In [4]:
# Label sets passed as `enum=` to the sentiment and category fields of the extraction schema.
sentiments = [
    "positive",
    "negative",
    "neutral",
]
categories = [
    "Product Quality",
    "Delivery",
    "Pricing",
    "Customer Service",
    "Website Usability",
    "Other",
]

In [5]:
# Prompt text for the extraction call; `{input}` is replaced by one feedback text.
# The quoted name is the name of the structured-output schema class bound to the models.
extraction_template = """
Extract the desired information from the following customer feedback.

Only extract the properties you were instructed to in the 'AspectBasedSentimentAnalysisWithCategoryAndOpinionExtraction' function.

Feedback:
{input}
"""

extraction_prompt = ChatPromptTemplate.from_template(extraction_template)


# Schema for a single extracted opinion: one aspect plus its opinion, sentiment and category.
# NOTE(review): documented with `#` comments on purpose. A class docstring on a pydantic model
# ends up in the model's JSON schema, which would change what is sent to the LLM — confirm
# before converting these comments into a docstring.
class AspectSentimentWithCategoryAndOpinion(BaseModel):
    # All four fields are required (`...` means "no default").
    # The `description` strings are part of the schema, i.e. they act as instructions to the model.
    aspect: str = Field(
        ...,
        description="A mentioned aspect (explicitly)"
    )
    opinion: str = Field(
        ...,
        description="The opinion related to the extracted aspect (explicitly)"
    )
    # The annotation is plain `str`; the allowed labels are only passed through `enum=`.
    # NOTE(review): whether values outside `sentiments` are rejected is not established here — confirm.
    sentiment: str = Field(
        ...,
        description="The sentiment related to the extracted aspect",
        enum=sentiments
    )
    # Same pattern as `sentiment`, using the `categories` list.
    category: str = Field(
        ...,
        description="The category related to the extracted aspect",
        enum=categories
    )


# Top-level structured-output schema: the list of per-aspect extractions for one feedback text.
# Its class name is quoted verbatim in the extraction prompt, so renaming it requires updating the prompt.
# NOTE(review): kept as `#` comments rather than a docstring, since a pydantic model docstring
# would be added to the schema sent to the LLM — confirm before changing.
class AspectBasedSentimentAnalysisWithCategoryAndOpinionExtraction(BaseModel):
    # Required field; this key is what `majority_voting_classifier` reads back out of the JSON answer.
    aspect_based_sentiments_with_categories: list[AspectSentimentWithCategoryAndOpinion] = Field(
        ...,
        description="A list of 'AspectSentimentWithCategoryAndOpinion' where each contains the sentiment, category and opinion related to a mentioned aspect"
    )

def _structured_extraction_llm(model_name):
    """Return a temperature-0 ChatOpenAI client for `model_name` bound to the extraction schema."""
    client = ChatOpenAI(temperature=0, model=model_name)
    return client.with_structured_output(
        AspectBasedSentimentAnalysisWithCategoryAndOpinionExtraction
    )


# One structured-output model per OpenAI model taking part in the vote.
gpt4o_extraction_llm = _structured_extraction_llm("gpt-4o")
gpt4omini_extraction_llm = _structured_extraction_llm("gpt-4o-mini")
gpt35turbo_extraction_llm = _structured_extraction_llm("gpt-3.5-turbo")

# Each chain feeds the shared prompt into one of the models.
gpt4o_extraction_chain = extraction_prompt | gpt4o_extraction_llm
gpt4omini_extraction_chain = extraction_prompt | gpt4omini_extraction_llm
gpt35turbo_extraction_chain = extraction_prompt | gpt35turbo_extraction_llm

In [6]:
def majority_voting_classifier(chain1, chain2, chain3, input):
    """Run three extraction chains on `input` and return the most common answer.

    Answers are compared as lower-cased JSON strings, so two answers that differ
    only in letter case count as the same vote. Because the winning string is
    parsed back from that lower-cased form, every string value in the returned
    extraction (including the category label) is lower case.

    Ties are broken deterministically in favour of the earliest chain: when all
    three answers differ, chain1's answer is returned.

    Args:
        chain1, chain2, chain3: objects exposing `invoke(input)` whose result
            has a `json()` method returning a JSON object string.
        input: the feedback text passed unchanged to every chain.

    Returns:
        A tuple `(extraction, votes, agreement)`:
        the value under "aspect_based_sentiments_with_categories" of the winning
        answer, the number of chains that produced it, and that number divided
        by the number of chains.
    """
    # Invoke every chain before serialising anything, as the original code did.
    responses = [chain.invoke(input) for chain in (chain1, chain2, chain3)]
    answers_list = [response.json().lower() for response in responses]

    # max() returns the first maximal element, so scanning the ordered list
    # (rather than a set, whose iteration order for strings changes between
    # interpreter runs) makes the tie-break reproducible.
    answer = max(answers_list, key=answers_list.count)
    votes = answers_list.count(answer)
    agreement = votes / len(answers_list)
    return json.loads(answer)["aspect_based_sentiments_with_categories"], votes, agreement


# Try the vote on a single feedback text before running it over the whole frame.
demo_feedback = "The delivery was fast and the product is good."
answer, votes, agreement = majority_voting_classifier(
    gpt4o_extraction_chain,
    gpt4omini_extraction_chain,
    gpt35turbo_extraction_chain,
    demo_feedback,
)

print(f"Number of votes on final answer: {votes} ({int(100 * agreement)}% agreement)")
pprint(answer)

Number of votes on final answer: 3 (100% agreement)
[
    {
        'aspect': 'delivery',
        'opinion': 'fast',
        'sentiment': 'positive',
        'category': 'delivery',
    },
    {
        'aspect': 'product',
        'opinion': 'good',
        'sentiment': 'positive',
        'category': 'product quality',
    },
]


In [7]:
def _vote_on_row(row):
    """Run the three-model vote on one row's `feedback_text`."""
    return majority_voting_classifier(
        chain1=gpt4o_extraction_chain,
        chain2=gpt4omini_extraction_chain,
        chain3=gpt35turbo_extraction_chain,
        input=row["feedback_text"],
    )


# `result_type="expand"` spreads the returned 3-tuple over the three new columns.
vote_columns = ["extraction", "votes", "agreement"]
feedbacks_deduped[vote_columns] = feedbacks_deduped.apply(
    _vote_on_row,
    axis=1,
    result_type="expand",
)

In [8]:
# Show the feedback texts together with the new extraction, votes and agreement columns.
feedbacks_deduped

Unnamed: 0,feedback_text,extraction,votes,agreement
0,Excellent value for money!,"[{'aspect': 'value for money', 'opinion': 'exc...",3,1.0
1,Not satisfied with the service.,"[{'aspect': 'service', 'opinion': 'not satisfi...",3,1.0
2,The pricing is too high for what you get.,"[{'aspect': 'pricing', 'opinion': 'too high fo...",3,1.0
3,"Great product, but the delivery was late.","[{'aspect': 'product', 'opinion': 'great', 'se...",3,1.0
4,The delivery was fast and the product is good.,"[{'aspect': 'delivery', 'opinion': 'fast', 'se...",2,0.666667
5,The customer service was very helpful.,"[{'aspect': 'customer service', 'opinion': 've...",3,1.0
6,Received a defective item.,"[{'aspect': 'item', 'opinion': 'defective', 's...",2,0.666667
7,Easy to use website and quick checkout.,"[{'aspect': 'website', 'opinion': 'easy to use...",2,0.666667
8,I had issues with the website.,"[{'aspect': 'website', 'opinion': 'had issues'...",2,0.666667
9,The quality of the product was below expectati...,"[{'aspect': 'product quality', 'opinion': 'bel...",2,0.666667


In [9]:
# Mean of the per-feedback agreement ratios, printed as a percentage rounded to two decimals.
print(f"average LLM agreement: {round(100 * feedbacks_deduped['agreement'].mean(), 2)}%")

average LLM agreement: 83.33%


In [10]:
def debug_classifier(chain1, chain2, chain3, input):
    """Pretty-print each chain's extracted aspect list for `input`, in chain order.

    All three chains are invoked before anything is printed. Nothing is returned;
    the function exists to compare the raw, un-normalised answers side by side.
    """
    responses = [chain.invoke(input) for chain in (chain1, chain2, chain3)]
    for response in responses:
        pprint(response.aspect_based_sentiments_with_categories)
    
# Print the three models' raw answers for a feedback text that received only 2 of 3 votes.
debug_classifier(
    chain1=gpt4o_extraction_chain,
    chain2=gpt4omini_extraction_chain,
    chain3=gpt35turbo_extraction_chain,
    input="Easy to use website and quick checkout.",
)

[
    AspectSentimentWithCategoryAndOpinion(
        aspect='website',
        opinion='easy to use',
        sentiment='positive',
        category='Website Usability',
    ),
    AspectSentimentWithCategoryAndOpinion(
        aspect='checkout',
        opinion='quick',
        sentiment='positive',
        category='Website Usability',
    ),
]
[
    AspectSentimentWithCategoryAndOpinion(
        aspect='website',
        opinion='Easy to use',
        sentiment='positive',
        category='Website Usability',
    ),
    AspectSentimentWithCategoryAndOpinion(
        aspect='checkout',
        opinion='quick',
        sentiment='positive',
        category='Website Usability',
    ),
]
[
    AspectSentimentWithCategoryAndOpinion(
        aspect='website',
        opinion='easy to use',
        sentiment='positive',
        category='Website Usability',
    ),
]


In [11]:
# Print the three models' raw answers for another feedback text that received only 2 of 3 votes.
debug_classifier(
    chain1=gpt4o_extraction_chain,
    chain2=gpt4omini_extraction_chain,
    chain3=gpt35turbo_extraction_chain,
    input="I had issues with the website.",
)

[
    AspectSentimentWithCategoryAndOpinion(
        aspect='website',
        opinion='had issues',
        sentiment='negative',
        category='Website Usability',
    ),
]
[
    AspectSentimentWithCategoryAndOpinion(
        aspect='website',
        opinion='issues',
        sentiment='negative',
        category='Website Usability',
    ),
]
[
    AspectSentimentWithCategoryAndOpinion(
        aspect='website',
        opinion='had issues',
        sentiment='negative',
        category='Website Usability',
    ),
]


In [12]:
feedbacks_silver = pd.read_parquet("data/silver/cleansed-feedbacks-2023.parquet")

# Attach the voted extraction to every feedback record via its text, then
# produce one row per extracted aspect. The index is reset so that the
# normalised aspect columns line up positionally in the concat below.
opinions_per_aspect = (
    feedbacks_silver
    .merge(feedbacks_deduped, on="feedback_text", how="inner")
    .drop(columns="feedback_text")
    .explode("extraction")
    .reset_index(drop=True)
)

# Turn each extraction dict into its own columns (aspect, opinion, sentiment, category).
aspect_fields = pd.json_normalize(opinions_per_aspect["extraction"])
granular_opinions = pd.concat(
    [opinions_per_aspect.drop(columns=["extraction"]), aspect_fields],
    axis=1,
)

In [14]:
# Show the per-aspect opinion table.
granular_opinions

Unnamed: 0,feedback_id,customer_id,date,quarter,votes,agreement,aspect,opinion,sentiment,category
0,1,4174,2023-06-05,2023Q2,3,1.000000,value for money,excellent,positive,pricing
1,2,4507,2023-10-02,2023Q4,3,1.000000,service,not satisfied,negative,customer service
2,3,1860,2023-08-10,2023Q3,3,1.000000,pricing,too high for what you get,negative,pricing
3,4,2294,2023-03-04,2023Q1,3,1.000000,product,great,positive,product quality
4,4,2294,2023-03-04,2023Q1,3,1.000000,delivery,late,negative,delivery
...,...,...,...,...,...,...,...,...,...,...
1311,998,3871,2023-09-17,2023Q3,2,0.666667,product,good,positive,product quality
1312,999,1797,2023-06-24,2023Q2,2,0.666667,website,easy to use,positive,website usability
1313,999,1797,2023-06-24,2023Q2,2,0.666667,checkout,quick,positive,website usability
1314,1000,4313,2023-02-27,2023Q1,2,0.666667,delivery,fast,positive,delivery


In [15]:
# Write the per-aspect table to the gold layer; `index=False` leaves the positional index out of the file.
granular_opinions.to_parquet("data/gold/granular-opinions-2023.parquet", index=False)