In [16]:
import pandas as pd
import os
from tqdm import tqdm
from datetime import datetime
from dotenv import load_dotenv
import concurrent.futures

from langchain.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain.output_parsers.openai_tools import JsonOutputToolsParser

In [6]:
with open("../../../../Keys/openai_natural_language_search_systembolaget.txt") as file:
    os.environ["OPENAI_API_KEY"] = file.read()  

### Map template

In [None]:
os.environ["OPENAI_API_KEY"]

'sk-proj-LiSWlzQfCYjPVBpIFQu8T3BlbkFJ0iNqx1Ovwa57DJJOrp59'

In [292]:
batch_template = (
    "You are set to analyze the responses for a survey that is described as this:\n"
    "<survey_description>{survey_description}</survey_description>\n\n"

    "The following content is a set of survey answers for the question:"
    "<question>{question}</question>\n\n"

    "Answers:\n"
    "<answers>{answers}</answers>\n\n"

    "Instructions:\n"
    "Based on this list, please identify the main themes.\n"
    "The themes should be in a list. Every theme should have a short name and a well "
    "written description of user opinions.\n"
    "Pay close attention to the frequency and urgency of the answers.\n"
    "Include only insights that are prevalent in multiple answers and can be used to "
    "describe more general opinions.\n"
    "Always use the survey_theme_identifier function."
)

function_description = (
    "To identify the main themes present in the survey answers, based on the survey descirption "
    "and the question that has been asked to the respondent. Close attention must be paid the the "
    "frequency and urgency and the responses."
)

major_themes_items = {
    "type" : "object",
    "properties" : {
        "theme_name" : {
            "type" : "string",
            "description" : "A descriptive name for the identified theme in Swedish"
        },
        "theme_description" : {
            "type" : "string",
            "description" : "A longer description in Swedish that captures the overall opinion and variance of the identified theme."
        },
        "sentiment" : {
            "type" : "string",
            "description" : "What the overall sentiment is in the responses about the specified theme.",
            "enum" : ["positive", "negative", "neutral", "neither"]
        }
    },
     "required": ["theme_name", "theme_description", "sentiment"]
}

survey_function = {
    "type": "function",
    "function": {
        "name": "survey_theme_identifier",
        "description": function_description,
        "parameters": {
            "type": "object",
            "properties": {
                "major_themes": {
                    "type": "array",
                    "description": "List of distinct main themes in the survey responses.",
                    "items" : major_themes_items
                }
            },
            "required": ["major_themes"]
        }
    }
}

In [293]:
CHAT_MODEL = ChatOpenAI(temperature=0, model_name="gpt-4o")

In [294]:
model = CHAT_MODEL.bind_tools([map_function])
map_chain = map_prompt_template | model | JsonOutputToolsParser()

In [295]:
def get_batch_answers(data, survey_description, question_column):

    batch = []
    batch_size = 250
    data = data[~data[question_column].isna()]

    for i in range(0, len(data), batch_size):
        temp = data.iloc[i:i+batch_size]
        answers = "\n\n\n".join(temp[question_column])
        batch.append({
            "question": question_column, 
            "answers" : answers, 
            "survey_description" : survey_description
            })
        
    return batch

### Get data

In [296]:
data = pd.read_excel("data.xlsx")
survey_description = "En enkät om användares åsikt kring en ny funktion i Systembolagets app där man kan beställa till butik."
question_column = "Är det något du vill kommentera kring dina svar?"

In [297]:
batch_answers = get_batch_answers(data, survey_description, question_column)

### Reduce template

In [251]:
summary_template = (
    "You are set to analyze the responses for a survey that is described as this:\n"
    "<survey_description>{survey_description}</survey_description>\n\n"
    
    "The following is set of summaries of main themes for the question: "
    "<question>{question}</question> in the survey:\n"
    
    "<answer_summar>{answer_summary}</answer_summar>\n\n"
    
    "Instructions:"
    "Take these and distill it into a final, consolidated summary of the main themes.\n"
    "Make sure to make the list as relevant and consice as possible.\n"
    "Make sure that similar themes are consolidated into one.\n"
    "Every theme should have a name and a well written section on the details of the repondents opinions.\n"
    "Include only insights that can be used to describe more general opinions.\n"
    "The themes should be in a list in plain string format.\n"
    "Always include a theme called General, with an overall summary of the opinions.\n"
    "Always use the survey_theme_identifier function."
)

In [256]:
def get_string_summary(responses):
    collection = ""
    
    for run, response in enumerate(responses):
        collection+=f"# SUMMARY {run+1}:\n"
        for theme in response[0]["args"]["major_themes"]:
            theme_summary = (
                f"<theme_name>{theme['theme_name']}</theme_name>\n"
                f"<theme_description>{theme['theme_description']}</theme_description>\n"
                f"<information>Sentiment:{theme['sentiment']}\nFrequency: {theme['frequency']}/5\nUrgency: {theme['urgency']}/5</information>\n"
                "---\n"
            )
            collection+=f"{theme_summary}"
        
       #collection+="------\n"
        
    return collection

In [257]:
def _set_up_templates():
    batch_prompt = ChatPromptTemplate.from_template(batch_template)
    summary_prompt = ChatPromptTemplate.from_template(summary_template)
    
    return batch_prompt, summary_prompt


def _set_up_chains(batch_prompt, summary_prompt):
    
    model = CHAT_MODEL.bind_tools([survey_function]) 
    batch_chain = batch_prompt | model | JsonOutputToolsParser() 
    summary_chain = summary_prompt | model | JsonOutputToolsParser()

    return batch_chain, summary_chain

In [287]:
def get_question_summary(data, survey_description,  question):
    
    batch_prompt, summary_prompt = _set_up_templates()
    batch_chain, summary_chain = _set_up_chains(batch_prompt, summary_prompt)
    
    data = data.dropna(subset=question)
    data.loc[:,"words"] = data.loc[:,question].apply(lambda x:len(x.split(" ")))
    data = data[data["words"] > 3].copy()
    
    batch = get_batch_answers(data, survey_description, question)
    responses = batch_chain.batch(batch)    
    string_summary = get_string_summary(responses)
    
    question_summary = summary_chain.invoke({
        "answer_summary" : string_summary, 
        "survey_description": survey_description, 
        "question" : question
        })
    
    return question_summary[0]["args"]["major_themes"]

In [288]:
import concurrent.futures

def generate_theme_dict(data, survey_description, questions):
    theme_dict = {}

    # Using ThreadPoolExecutor to process each question in parallel
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # Create a future for each question
        futures = {executor.submit(get_question_summary, data, survey_description, question): question for question in questions}

        # As each future completes, update theme_dict
        for future in concurrent.futures.as_completed(futures):
            question = futures[future]
            try:
                question_summary = future.result()
                theme_dict[question] = question_summary
            except Exception as exc:
                print(f'{question} generated an exception: {exc}')

    return theme_dict

In [289]:
a = generate_theme_dict(data, survey_description, [question_column])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.loc[:,"words"] = data.loc[:,question].apply(lambda x:len(x.split(" ")))


In [291]:
with open()

{'Är det något du vill kommentera kring dina svar?': [{'theme_name': 'Förbetalning',
   'theme_description': 'Många användare uttrycker missnöje med att behöva betala i förskott för sina beställningar. De föredrar att betala i butik vid upphämtning, vilket de anser vara mer bekvämt och säkert.',
   'sentiment': 'negative',
   'frequency': 5,
   'urgency': 4},
  {'theme_name': 'Betalningsmetoder',
   'theme_description': 'Användare efterfrågar fler betalningsalternativ, såsom Klarna och Apple Pay, samt möjligheten att spara betalkortsuppgifter för smidigare betalning.',
   'sentiment': 'neutral',
   'frequency': 4,
   'urgency': 3},
  {'theme_name': 'Tekniska problem',
   'theme_description': 'Flera användare rapporterar tekniska problem med appen, särskilt vid betalning med Swish och BankID. Problem inkluderar att fastna på webbsidan efter betalning och att appen inte återgår automatiskt efter genomförd betalning.',
   'sentiment': 'negative',
   'frequency': 4,
   'urgency': 3},
  {'t