In [26]:
import pandas as pd
import os
from dotenv import load_dotenv
import concurrent.futures

from langchain.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain.output_parsers.openai_tools import JsonOutputToolsParser
from operator import itemgetter
from langchain.schema.runnable import RunnableMap

In [22]:
# Load the OpenAI API key from a local key file into the environment variable
# that the OpenAI client reads.
# strip() drops surrounding whitespace such as the trailing newline most editors
# append; left in place it would become part of the key value.
with open("../../../../Keys/openai_natural_language_search_systembolaget.txt") as file:
    os.environ["OPENAI_API_KEY"] = file.read().strip()

In [161]:
# Prompt template used to tag one survey answer. It has four placeholders:
# {survey_description}, {question}, {themes} and {answer}.
# The sections below are separated by a blank line in the final prompt.
# NOTE(review): the wording contains typos ("relvant", "is describes as following");
# they are kept verbatim because this text is sent to the model.
system_template = "\n\n".join((
    # Role and general instructions.
    "You are an expert at tagging survey answers with themes of what they cover. "
    "Make sure that you pay close attention to what the answer is, and only use relvant themes as tags. "
    "One answer may have multiple themes tied to it. These should always be carefully chosen.",
    # Survey context.
    "The survey is describes as following:\n"
    "<survey_description>{survey_description}</survey_description>",
    # The question the answer belongs to (the trailing space is part of the prompt).
    "This answer is for the question:\n"
    "<question>{question}</question> ",
    # Themes the model may choose from.
    "These are the themes that you are allowed to use:\n"
    "<themes>{themes}</themes>",
    # The answer to tag.
    "The respondents answer: {answer}",
))

In [162]:
# Themes per survey question: maps the question text (the same string used as the
# answer column name) to a list of theme dicts. Each theme dict carries
# 'theme_name', 'theme_description', 'sentiment', 'frequency' and 'urgency';
# the tagging code in this notebook reads only 'theme_name' and 'theme_description'.
theme_dict = {'Är det något du vill kommentera kring dina svar?': [{'theme_name': 'Förbetalning',
   'theme_description': 'Många användare uttrycker missnöje med att behöva betala i förskott för sina beställningar. De föredrar att betala i butik vid upphämtning, vilket de anser vara mer bekvämt och säkert.',
   'sentiment': 'negative',
   'frequency': 5,
   'urgency': 4},
  {'theme_name': 'Betalningsmetoder',
   'theme_description': 'Användare efterfrågar fler betalningsalternativ, såsom Klarna och Apple Pay, samt möjligheten att spara betalkortsuppgifter för smidigare betalning.',
   'sentiment': 'neutral',
   'frequency': 4,
   'urgency': 3},
  {'theme_name': 'Tekniska problem',
   'theme_description': 'Flera användare rapporterar tekniska problem med appen, särskilt vid betalning med Swish och BankID. Problem inkluderar att fastna på webbsidan efter betalning och att appen inte återgår automatiskt efter genomförd betalning.',
   'sentiment': 'negative',
   'frequency': 4,
   'urgency': 3},
  {'theme_name': 'Användarvänlighet',
   'theme_description': 'Användare upplever att appen är krånglig och svår att navigera. De efterfrågar förbättringar i sökfunktionen och tydligare instruktioner för beställning och betalning.',
   'sentiment': 'negative',
   'frequency': 4,
   'urgency': 3},
  {'theme_name': 'Positiv feedback',
   'theme_description': 'Trots vissa problem är många användare positiva till den nya beställningsfunktionen i appen. De uppskattar möjligheten att beställa direkt i appen och tycker att det är en bra förbättring.',
   'sentiment': 'positive',
   'frequency': 3,
   'urgency': 2}]}

### Functions

In [225]:
# Shared chat model instance that _set_up_chain binds the tagging tool to.
# temperature=0 is requested — presumably to keep the tagging as repeatable as possible.
CHAT_MODEL = ChatOpenAI(temperature=0, model_name="gpt-4o")

In [226]:
def _set_up_tools(themes):
    
    # Function Description
    function_description = (
        "To identify the relevant themes present in the survey answer based on the survey description, "
        "the question asked to the respondent, and the themes included in the survey. This function also "
        "assesses the sentiment and urgency expressed by the respondent for each identified theme."
    )

    # Theme Items Schema
    theme_items = {
        "type": "object",
        "properties": {
            "theme": {
                "type": "string",
                "description": "The identified theme present in the survey answer.",
                "enum": [theme["theme_name"] for theme in themes]
            },
            "sentiment": {
                "type": "string",
                "description": "The overall sentiment expressed in the response about the specified theme.",
                "enum": ["positive", "negative", "neutral", "neither", "both"]
            },
            "urgency": {
                "type": "integer",
                "description": (
                    "A scale from 1-5 defining the overall urgency of this theme in the response. "
                    "A higher number indicates a stronger sense of urgency expressed by the respondent. "
                ),
                "enum": [1, 2, 3, 4, 5]
            }
        },
        "required": ["theme", "sentiment", "urgency"]
    }

    # Answer Theme Function Schema
    answer_theme_function = {
        "type": "function",
        "function": {
            "name": "answer_theme_identifier",
            "description": function_description,
            "parameters": {
                "type": "object",
                "properties": {
                    "themes": {
                        "type": "array",
                        "description": "List of relevant themes identified in the survey response.",
                        "items": theme_items
                    }
                },
                "required": ["themes"]
            }
        }
    }

    
    return answer_theme_function
    

In [227]:
def _set_up_chain(themes):
    """Assemble the LangChain pipeline that tags one survey answer.

    Args:
        themes: List of theme dicts (each with a "theme_name" key); forwarded
            to _set_up_tools to build the tool schema.

    Returns:
        A runnable that takes a dict with the keys "answer_id", "answer",
        "question", "themes" and "survey_description" and produces a dict
        with "answer_id" (passed through unchanged) and "data" (the tool
        calls parsed by JsonOutputToolsParser).
    """
    prompt = ChatPromptTemplate.from_template(system_template)
    tool = _set_up_tools(themes)

    # Force the model to call the tagging tool. Without tool_choice the model is
    # free to answer in plain text, in which case the parsed tool-call list can
    # be empty and code that reads the first tool call fails.
    model = CHAT_MODEL.bind_tools([tool], tool_choice=tool["function"]["name"])

    chain = RunnableMap({
        # Carried alongside the result so each output can be matched to its answer.
        "answer_id": itemgetter("answer_id"),
        "data": (
            {
                "answer": itemgetter("answer"),
                "question": itemgetter("question"),
                "themes": itemgetter("themes"),
                "survey_description": itemgetter("survey_description"),
            }
            | prompt
            | model
            | JsonOutputToolsParser()
        ),
    })

    return chain

In [228]:
def process_llm_output(llm_output):
    """Flatten the chain output into one row per (answer, theme) pair.

    Args:
        llm_output: Iterable of dicts with the keys "answer_id" and "data",
            where "data" is a list of parsed tool calls and each tool call
            holds its arguments under "args" (with a "themes" list of dicts).

    Returns:
        pandas.DataFrame: The theme dicts as rows, plus an "answer_id" column.
        Rows of each answer keep their own 0-based index (the index is not
        reset here). An empty DataFrame is returned when no themes were found.
    """
    frames = []
    for item in llm_output:
        # Gather the themes of every tool call. A response may contain no tool
        # call at all (empty "data") or more than one; neither case should
        # crash or lose rows.
        tagged_themes = [
            theme
            for tool_call in (item["data"] or [])
            for theme in ((tool_call.get("args") or {}).get("themes") or [])
        ]
        if not tagged_themes:
            continue

        answer_frame = pd.DataFrame(tagged_themes)
        answer_frame["answer_id"] = item["answer_id"]
        frames.append(answer_frame)

    if not frames:
        # pd.concat raises on an empty list, so return an empty frame instead.
        return pd.DataFrame()

    # Concatenate once instead of growing a DataFrame inside the loop.
    return pd.concat(frames)

In [229]:
def parse_answers(data, question, survey_description, themes, batch_size: int = 250, min_words: int = 4):
    """Tag the free-text answers of one survey question with themes via the LLM.

    Args:
        data: DataFrame holding the survey responses; its index labels are
            used as "answer_id" in the result.
        question: Name of the answer column; also the key into `themes` and
            the question text given to the prompt.
        survey_description: Short description of the survey for the prompt.
        themes: Dict mapping a question to a list of theme dicts with
            "theme_name" and "theme_description".
        batch_size: Number of answers sent to the chain per `batch` call.
        min_words: Minimum number of space-separated tokens an answer needs in
            order to be tagged (the default of 4 equals "more than 3 words").

    Returns:
        pandas.DataFrame: One row per (answer, theme) pair with a fresh
        0..n-1 index; empty when nothing was tagged.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        # A negative step would silently produce no batches at all.
        raise ValueError("batch_size must be a positive integer")

    # Exclude the "General" theme once, so the theme list in the prompt and
    # the enum of allowed values in the tool schema stay in agreement.
    allowed_themes = [theme for theme in themes[question] if theme["theme_name"] != "General"]
    theme_list = "".join(
        f"{theme['theme_name']} : {theme['theme_description']}\n" for theme in allowed_themes
    )

    # Keep only answers that exist and are long enough to be worth tagging.
    # str() guards against non-string cells (e.g. a purely numeric answer).
    answers = data.dropna(subset=[question])
    word_counts = answers[question].map(lambda answer: len(str(answer).split(" ")))
    answers = answers[word_counts >= min_words]

    chain = _set_up_chain(allowed_themes)

    # Send one batch of answers at a time to the LLM.
    batch_frames = []
    for start in range(0, len(answers), batch_size):
        chunk = answers.iloc[start:start + batch_size]
        batch = [
            {
                "answer": answer,
                "question": question,
                "answer_id": idx,
                "themes": theme_list,
                "survey_description": survey_description,
            }
            for idx, answer in chunk[question].items()
        ]

        processed_output = process_llm_output(chain.batch(batch))
        if not processed_output.empty:
            batch_frames.append(processed_output)

    if not batch_frames:
        return pd.DataFrame()

    # Single concat instead of growing a DataFrame batch by batch.
    return pd.concat(batch_frames, ignore_index=True)

In [230]:
# Survey metadata: the free-text column to analyse and a short description of
# the survey that is handed to the prompt.
question_column = "Är det något du vill kommentera kring dina svar?"
survey_description = "En enkät om användares åsikt kring en ny funktion i Systembolagets app där man kan beställa till butik."

# Survey responses, read from an Excel file in the working directory.
data = pd.read_excel("data.xlsx")

In [231]:
# Tag the answers in `question_column` with the themes from theme_dict.
# This sends the answers to the LLM in batches, so the cell calls the OpenAI API.
parsed_data = parse_answers(data, question_column, survey_description, theme_dict)

In [233]:
# Number of distinct answers tagged with each (theme, sentiment) combination.
parsed_data.groupby(["theme", "sentiment"])["answer_id"].nunique()

theme              sentiment
Användarvänlighet  both           1
                   negative      92
                   neutral       13
                   positive       7
Betalningsmetoder  negative      11
                   neutral       10
                   positive      13
Förbetalning       negative     115
                   neutral        8
                   positive       5
Positiv feedback   neutral        1
                   positive      91
Tekniska problem   negative      55
Name: answer_id, dtype: int64

In [234]:
# Show the rows where the theme "Användarvänlighet" was tagged with sentiment "both".
is_usability_theme = parsed_data["theme"] == "Användarvänlighet"
is_both_sentiment = parsed_data["sentiment"] == "both"
parsed_data[is_usability_theme & is_both_sentiment]

Unnamed: 0,theme,sentiment,urgency,answer_id
388,Användarvänlighet,both,3,1534


In [236]:
# Show every theme that was tagged on the answer with answer_id 1045.
belongs_to_answer = parsed_data["answer_id"] == 1045
parsed_data.loc[belongs_to_answer]

Unnamed: 0,theme,sentiment,urgency,answer_id
273,Tekniska problem,negative,3,1045
274,Positiv feedback,positive,2,1045


In [237]:
# Show the original answer text behind answer_id 1045.
# answer_id is the index label of the row in `data` (parse_answers takes it from
# the DataFrame index), so the lookup is label-based with .loc. Positional .iloc
# only gives the same row while `data` keeps its default 0..n-1 index.
data.loc[1045, question_column]

'Fick kontakta kundservice för hjälp med att beställa via appen. Fick hjälp direkt mkt bra. Var tvungen att aktivera funktionen beställa, sen gick det bra.'