In [None]:
import json

# Load the comments from disk. The encoding is pinned to UTF-8 because the
# comments are Albanian text and open() would otherwise fall back to the
# platform's locale encoding, which can fail or mis-decode non-ASCII characters.
with open('comments.json', 'r', encoding='utf-8') as f:
    comments = json.load(f)

# Bare last expression so the notebook renders the loaded data.
comments

In [None]:
import pandas as pd

# Build a DataFrame from the loaded comments. Later cells read its 'text' and
# 'replies' columns, so each comment record is expected to provide those fields.
df = pd.DataFrame(comments)
df

In [ ]:
# Load the IPython extension named "dotenv" and run its %dotenv magic.
# NOTE(review): presumably this reads a local .env file so the OpenAI API key is
# available as an environment variable before the chat model is created — confirm.
%load_ext dotenv
%dotenv

from langchain_openai import ChatOpenAI

# Chat model shared by every LLM call in this notebook.
# NOTE(review): temperature=0 is presumably chosen to keep answers as repeatable as possible — confirm.
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)

In [None]:
from langchain_core.messages import HumanMessage, BaseMessage
from typing import List

# First-stage prompt. The comment text is substituted into the positional "{}"
# placeholder (via str.format), followed by a question asking whether the
# comment contains public-transport vocabulary.
prompt_template = """\
"{}"

The complaint above is written in Albanian.
Does any part of it include any words related to public transport in Albanian, such as "transporti publik", "autobuz", "furgon", "tren", "tramvaj", etc...?
"""

def get_messages_for_long_answer(comment) -> List[BaseMessage]:
    """Wrap one comment in the first-stage prompt.

    Args:
        comment: The comment text to substitute into ``prompt_template``.

    Returns:
        A single-element list holding the ``HumanMessage`` built from the
        filled-in prompt.
    """
    return [HumanMessage(prompt_template.format(comment))]

In [None]:
# Build the first-stage prompt for every comment. Each entry of the result is
# the message list returned by get_messages_for_long_answer for that row's 'text'.
messages_for_long_answer = df['text'].apply(get_messages_for_long_answer)
messages_for_long_answer

In [None]:
# Send all first-stage prompts to the model in one batch call; the pandas
# Series is converted to a plain list first. The next cell zips these answers
# with the prompts, so it relies on them being in the same order.
long_answers = llm.batch(list(messages_for_long_answer))
long_answers

In [None]:
# Follow-up instruction asking the model to collapse its free-form answer
# into a bare YES/NO.
summarization_message = HumanMessage("Summarize the above answer as just 'YES' or 'NO', don't use any punctuation or anything else.")

# One conversation per comment: the original prompt message(s), then the
# model's long answer, then the follow-up instruction.
messages_for_final_answer = [
    [*first_turn, model_reply, summarization_message]
    for first_turn, model_reply in zip(messages_for_long_answer, long_answers)
]
messages_for_final_answer

In [None]:
from langchain.output_parsers import BooleanOutputParser

# Pipe the model into a BooleanOutputParser so each reply is turned into a bool.
# NOTE(review): the parser is built with its defaults; presumably those match the
# 'YES'/'NO' wording requested by the summarization message — confirm.
final_answer_chain = llm | BooleanOutputParser()

# Run every second-stage conversation through the chain in one batch call.
final_answers = final_answer_chain.batch(messages_for_final_answer)
final_answers

In [None]:
# Attach the per-comment classification as a new column. This mutates df in
# place, so any earlier rendering of df no longer shows its current columns.
df['public_transport_related'] = final_answers  
df

In [None]:
# Count the replies attached to each top-level comment.
# NOTE(review): len() is applied to every 'replies' value, so this assumes each
# one is a sized container (e.g. a list) and is never missing — confirm.
df['n_replies'] = df['replies'].apply(len)
df

In [None]:
# Every top-level comment counts once, plus one for each of its replies.
n_total_comments = df.shape[0] + sum(df['n_replies'])

# Keep only the rows flagged as public-transport related; their replies are
# counted together with the parent comment.
is_public_transport = df['public_transport_related'] == True
df_public_transport = df[is_public_transport]
n_pt_comments = df_public_transport.shape[0] + sum(df_public_transport['n_replies'])

# Share of all comments (including replies) that fall under flagged rows,
# expressed as a percentage rounded to two decimals.
pt_share = n_pt_comments / n_total_comments
pt_percentage = round(pt_share * 100, 2)

print(
    f"Total comments: {n_total_comments}",
    f"Public transport related: {n_pt_comments} ({pt_percentage}%)",
    sep="\n",
)