# Extract chat history data

## Connect postgersql instant

In [None]:
import psycopg2
import sqlalchemy
from sqlalchemy import MetaData, text
import os
from sqlalchemy import create_engine
import pandas as pd

DATABASE_URL = 'postgresql+psycopg2://postgres:postgres@localhost:5432'

engine = sqlalchemy.create_engine(DATABASE_URL, echo=True)

# print("Connection is", engine)

conn = psycopg2.connect(
    host="localhost",
    database="postgres",
    user="postgres",
    password="postgres"
)

cur = conn.cursor()

## Create table views

In [5]:
create_view = """
                    CREATE OR REPLACE VIEW v_user_messages
                    AS
                    SELECT text, agent, conversation_id, created_at, id, tool_plan, position FROM messages 
                    WHERE agent = 'USER'
                    ORDER BY created_at desc;
"""
# Execute the SQL query
cur.execute(create_view)

# Commit the transaction
conn.commit()

create_view = """
                    CREATE OR REPLACE VIEW v_chatbot_messages
                    AS
                    SELECT text, agent, conversation_id, created_at, id, tool_plan, position FROM messages 
                    WHERE agent = 'CHATBOT'
                    ORDER BY created_at desc;
"""
# Execute the SQL query
cur.execute(create_view)

# Commit the transaction
conn.commit()

## Query user input

In [8]:
# Get the questions from user
query = """SELECT 
            user_msgs.agent, 
            user_msgs.text,
            user_msgs.conversation_id,
            user_msgs.created_at,
            user_msgs.position
        FROM public.v_user_messages as user_msgs
        ORDER BY user_msgs.created_at desc;"""

# Execute the query
cur.execute(query)

In [9]:
# Fetch all results from the executed query
user_msg = cur.fetchall()
user_msg

[('USER',
  "What is the purpose of adding a header image to a Confluence space and how does it enhance the space's visual appeal and welcoming atmosphere?",
  '8117578e-d06a-4bfd-988f-b2eee28121f1',
  datetime.datetime(2024, 9, 17, 21, 44, 52, 305768),
  6),
 ('USER',
  'How do employee engagement and disengagement differ in terms of emotional commitment, motivation, and their relation to labor laws?',
  '8117578e-d06a-4bfd-988f-b2eee28121f1',
  datetime.datetime(2024, 9, 17, 21, 44, 38, 766823),
  5),
 ('USER',
  'What is the significance of identifying growth areas in self-assessment?',
  '8117578e-d06a-4bfd-988f-b2eee28121f1',
  datetime.datetime(2024, 9, 17, 21, 44, 17, 904776),
  4),
 ('USER',
  'What services does the Employee Assistance Program (EAP) provide for employees?',
  '8117578e-d06a-4bfd-988f-b2eee28121f1',
  datetime.datetime(2024, 9, 17, 21, 44, 1, 697199),
  3),
 ('USER',
  'What training programs are offered in data science by Tech Innovators Inc.?',
  '8117578e-d0

**Convert the retrieved data to a dataframe.**

In [10]:
# Assign the appropriate column names
user_column_names = ['agent', 'question', 'conversation_id', 'timestamp', 'position']

# Create a DataFrame
df_questions = pd.DataFrame([row for row in user_msg], columns=user_column_names)

In [11]:
df_questions

Unnamed: 0,agent,question,conversation_id,timestamp,position
0,USER,What is the purpose of adding a header image t...,8117578e-d06a-4bfd-988f-b2eee28121f1,2024-09-17 21:44:52.305768,6
1,USER,How do employee engagement and disengagement d...,8117578e-d06a-4bfd-988f-b2eee28121f1,2024-09-17 21:44:38.766823,5
2,USER,What is the significance of identifying growth...,8117578e-d06a-4bfd-988f-b2eee28121f1,2024-09-17 21:44:17.904776,4
3,USER,What services does the Employee Assistance Pro...,8117578e-d06a-4bfd-988f-b2eee28121f1,2024-09-17 21:44:01.697199,3
4,USER,What training programs are offered in data sci...,8117578e-d06a-4bfd-988f-b2eee28121f1,2024-09-17 21:43:49.247020,2
5,USER,What resources should be added for new hires i...,8117578e-d06a-4bfd-988f-b2eee28121f1,2024-09-17 21:43:34.977663,1
6,USER,What is Tech Innovators Inc.'s approach to wor...,8117578e-d06a-4bfd-988f-b2eee28121f1,2024-09-17 21:43:17.862142,0
7,USER,How does Tech Innovators Inc. promote employee...,119128da-543a-4e36-b7f3-41fa2bebb6cb,2024-09-17 18:18:48.844662,8
8,USER,What steps are needed to extract data from Con...,119128da-543a-4e36-b7f3-41fa2bebb6cb,2024-09-17 18:18:34.719956,7
9,USER,How do employee engagement and disengagement d...,119128da-543a-4e36-b7f3-41fa2bebb6cb,2024-09-17 18:18:21.461914,6


## Query system response

### Answers

In [12]:
# Get the questions from user
query = """SELECT 
            chatbot_msgs.agent, 
            chatbot_msgs.text,
            chatbot_msgs.conversation_id,
            chatbot_msgs.created_at,
            chatbot_msgs.id,
            chatbot_msgs.position
        FROM public.v_chatbot_messages as chatbot_msgs
        ORDER BY chatbot_msgs.created_at desc;"""

# Execute the query
cur.execute(query)

In [13]:
# Fetch all results from the executed query
chatbot_msg = cur.fetchall()
chatbot_msg

[('CHATBOT',
  '[openai]: Adding a header image to a Confluence space serves several important purposes. Firstly, it enhances the visual appeal of the space, making it more attractive and engaging for visitors. A well-chosen header image can reflect the theme or purpose of the space, creating a cohesive look and feel that aligns with the content presented.\n\nSecondly, a header image contributes to a welcoming atmosphere. It can evoke positive emotions and set the tone for the user experience, making visitors feel more comfortable and inclined to explore the content. By providing a visually appealing introduction, the header image helps to draw users in and encourages them to interact with the space more actively.\n\nOverall, incorporating a header image is an effective way to improve user engagement and create a more inviting environment within a Confluence space.',
  '8117578e-d06a-4bfd-988f-b2eee28121f1',
  datetime.datetime(2024, 9, 17, 21, 45, 1, 526985),
  '718f1b2d-af3a-4e0e-8b3

**Convert the retrieved data to a dataframe.**

In [14]:
# Assign the appropriate column names
chatbot_column_names = ['agent', 'answer', 'conversation_id', 'timestamp', 'msg_id','position']

# Create a DataFrame
df_response = pd.DataFrame([row for row in chatbot_msg], columns=chatbot_column_names)

In [15]:
df_response

Unnamed: 0,agent,answer,conversation_id,timestamp,msg_id,position
0,CHATBOT,[openai]: Adding a header image to a Confluenc...,8117578e-d06a-4bfd-988f-b2eee28121f1,2024-09-17 21:45:01.526985,718f1b2d-af3a-4e0e-8b3c-251b687b763a,6
1,CHATBOT,[openai]: Employee engagement and disengagemen...,8117578e-d06a-4bfd-988f-b2eee28121f1,2024-09-17 21:44:49.006283,b9a78e51-e43c-4f2e-b9aa-e937a3bfacee,5
2,CHATBOT,[openai]: Identifying growth areas in self-ass...,8117578e-d06a-4bfd-988f-b2eee28121f1,2024-09-17 21:44:27.890469,77467460-8250-4984-b81b-e40b3e576435,4
3,CHATBOT,[openai]: The Employee Assistance Program (EAP...,8117578e-d06a-4bfd-988f-b2eee28121f1,2024-09-17 21:44:09.997083,b0ffe91d-f84d-482d-adab-78646b7492f4,3
4,CHATBOT,[openai]: Tech Innovators Inc. offers a range ...,8117578e-d06a-4bfd-988f-b2eee28121f1,2024-09-17 21:43:58.972825,1717b0f9-9d53-441d-84b9-06299d6949b6,2
5,CHATBOT,[openai]: For new hires in the onboarding proc...,8117578e-d06a-4bfd-988f-b2eee28121f1,2024-09-17 21:43:46.027698,bd2f1bd6-5d08-47b4-ba1a-280b69c0f9c5,1
6,CHATBOT,[openai]: Tech Innovators Inc. has a zero-tole...,8117578e-d06a-4bfd-988f-b2eee28121f1,2024-09-17 21:43:29.170986,0d735817-2b82-449b-bb35-1a6b81be8148,0
7,CHATBOT,[openai]: Tech Innovators Inc. promotes employ...,119128da-543a-4e36-b7f3-41fa2bebb6cb,2024-09-17 18:18:58.087223,bed1ac5d-6164-4924-97f5-5111509a5f8e,8
8,CHATBOT,[openai]: To extract data from Confluence and ...,119128da-543a-4e36-b7f3-41fa2bebb6cb,2024-09-17 18:18:45.436811,5d697220-b0fb-4fe0-a6f4-93c618568b25,7
9,CHATBOT,[openai]: Employee engagement and disengagemen...,119128da-543a-4e36-b7f3-41fa2bebb6cb,2024-09-17 18:18:32.417639,d86faf67-d07c-497c-b760-39ff8eba131d,6


### Contexts

In [16]:
# Fetch the context for the response if present
query_documents = """SELECT text, conversation_id, message_id, document_id 
        FROM public.documents
            ;"""

# Execute the query
cur.execute(query_documents)

# Fetch all results from the executed query
doc_result = cur.fetchall()
doc_result

[('the Senior Director responsible for Analytics Delivery, your role is critical to the success of Tech Innovators Inc. By leveraging your strategic vision, technical expertise, and leadership skills,',
  '119128da-543a-4e36-b7f3-41fa2bebb6cb',
  '80276b01-5a07-4be7-bb6a-470fa629315a',
  '0'),
 ('the Senior Director responsible for Analytics Delivery, your role is critical to the success of Tech Innovators Inc. By leveraging your strategic vision, technical expertise, and leadership skills,',
  '119128da-543a-4e36-b7f3-41fa2bebb6cb',
  '80276b01-5a07-4be7-bb6a-470fa629315a',
  '1'),
 ('the Senior Director responsible for Analytics Delivery, your role is critical to the success of Tech Innovators Inc. By leveraging your strategic vision, technical expertise, and leadership skills,',
  '119128da-543a-4e36-b7f3-41fa2bebb6cb',
  '80276b01-5a07-4be7-bb6a-470fa629315a',
  '2'),
 ('the Senior Director responsible for Analytics Delivery, your role is critical to the success of Tech Innovators 

**Convert the retrieved data to a dataframe.**

In [17]:
# Assign the appropriate column names
docs_column_names = ['contexts', 'conversation_id', 'msg_id', 'doc_id']

# Create a DataFrame
df_docs = pd.DataFrame([row for row in doc_result], columns=docs_column_names)

In [18]:
# Assuming you have a DataFrame named 'df'

# Group by 'conversation_id' and 'msg_id' and combine 'content' as a list
df_contexts = df_docs.groupby(['conversation_id','msg_id'])['contexts'].apply(list).reset_index()

print(df_contexts)

                         conversation_id  \
0   119128da-543a-4e36-b7f3-41fa2bebb6cb   
1   119128da-543a-4e36-b7f3-41fa2bebb6cb   
2   119128da-543a-4e36-b7f3-41fa2bebb6cb   
3   119128da-543a-4e36-b7f3-41fa2bebb6cb   
4   119128da-543a-4e36-b7f3-41fa2bebb6cb   
5   119128da-543a-4e36-b7f3-41fa2bebb6cb   
6   119128da-543a-4e36-b7f3-41fa2bebb6cb   
7   119128da-543a-4e36-b7f3-41fa2bebb6cb   
8   119128da-543a-4e36-b7f3-41fa2bebb6cb   
9   8117578e-d06a-4bfd-988f-b2eee28121f1   
10  8117578e-d06a-4bfd-988f-b2eee28121f1   
11  8117578e-d06a-4bfd-988f-b2eee28121f1   
12  8117578e-d06a-4bfd-988f-b2eee28121f1   
13  8117578e-d06a-4bfd-988f-b2eee28121f1   
14  8117578e-d06a-4bfd-988f-b2eee28121f1   
15  8117578e-d06a-4bfd-988f-b2eee28121f1   

                                  msg_id  \
0   5337da38-0f84-4dd3-be38-3839ea9c16bf   
1   5d697220-b0fb-4fe0-a6f4-93c618568b25   
2   7f03ffa6-b3d7-4428-a2a1-5a0fe53cab95   
3   80276b01-5a07-4be7-bb6a-470fa629315a   
4   bed1ac5d-6164-4924-97f5-511

## Data for evaluation

In [22]:
# Perform a merge between chatbot answers dataframe and contexts dataframe base on both 'conversation_id' and 'msg_id'
df_response_contexts = pd.merge(df_response, df_contexts, on=['conversation_id', 'msg_id'], how='inner')

In [24]:
# Merge the resulting dataframe with user questions dataframe base on conversation_id and position
df_questions_response_contexts = pd.merge(df_questions, df_response_contexts, on=['conversation_id', 'position'], how='inner')

In [25]:
# Final result dataframe
df_questions_response_contexts

Unnamed: 0,agent_x,question,conversation_id,timestamp_x,position,agent_y,answer,timestamp_y,msg_id,contexts
0,USER,What is the purpose of adding a header image t...,8117578e-d06a-4bfd-988f-b2eee28121f1,2024-09-17 21:44:52.305768,6,CHATBOT,[openai]: Adding a header image to a Confluenc...,2024-09-17 21:45:01.526985,718f1b2d-af3a-4e0e-8b3c-251b687b763a,"[Description#F4F5F7In a sentence or two, descr..."
1,USER,How do employee engagement and disengagement d...,8117578e-d06a-4bfd-988f-b2eee28121f1,2024-09-17 21:44:38.766823,5,CHATBOT,[openai]: Employee engagement and disengagemen...,2024-09-17 21:44:49.006283,b9a78e51-e43c-4f2e-b9aa-e937a3bfacee,"[are motivated and committed, disengaged emplo..."
2,USER,What is the significance of identifying growth...,8117578e-d06a-4bfd-988f-b2eee28121f1,2024-09-17 21:44:17.904776,4,CHATBOT,[openai]: Identifying growth areas in self-ass...,2024-09-17 21:44:27.890469,77467460-8250-4984-b81b-e40b3e576435,[to identify strengths and areas for improveme...
3,USER,What services does the Employee Assistance Pro...,8117578e-d06a-4bfd-988f-b2eee28121f1,2024-09-17 21:44:01.697199,3,CHATBOT,[openai]: The Employee Assistance Program (EAP...,2024-09-17 21:44:09.997083,b0ffe91d-f84d-482d-adab-78646b7492f4,"[provides counseling services, legal advice, a..."
4,USER,What training programs are offered in data sci...,8117578e-d06a-4bfd-988f-b2eee28121f1,2024-09-17 21:43:49.247020,2,CHATBOT,[openai]: Tech Innovators Inc. offers a range ...,2024-09-17 21:43:58.972825,1717b0f9-9d53-441d-84b9-06299d6949b6,[role in making data-driven decisions and gain...
5,USER,What resources should be added for new hires i...,8117578e-d06a-4bfd-988f-b2eee28121f1,2024-09-17 21:43:34.977663,1,CHATBOT,[openai]: For new hires in the onboarding proc...,2024-09-17 21:43:46.027698,bd2f1bd6-5d08-47b4-ba1a-280b69c0f9c5,[customersContact usHow can someone reach out ...
6,USER,What is Tech Innovators Inc.'s approach to wor...,8117578e-d06a-4bfd-988f-b2eee28121f1,2024-09-17 21:43:17.862142,0,CHATBOT,[openai]: Tech Innovators Inc. has a zero-tole...,2024-09-17 21:43:29.170986,0d735817-2b82-449b-bb35-1a6b81be8148,[representative.Q12: What is the process for h...
7,USER,How does Tech Innovators Inc. promote employee...,119128da-543a-4e36-b7f3-41fa2bebb6cb,2024-09-17 18:18:48.844662,8,CHATBOT,[openai]: Tech Innovators Inc. promotes employ...,2024-09-17 18:18:58.087223,bed1ac5d-6164-4924-97f5-5111509a5f8e,"[IntroductionAt Tech Innovators Inc., we belie..."
8,USER,What steps are needed to extract data from Con...,119128da-543a-4e36-b7f3-41fa2bebb6cb,2024-09-17 18:18:34.719956,7,CHATBOT,[openai]: To extract data from Confluence and ...,2024-09-17 18:18:45.436811,5d697220-b0fb-4fe0-a6f4-93c618568b25,[IntroductionThis guide provides a step-by-ste...
9,USER,How do employee engagement and disengagement d...,119128da-543a-4e36-b7f3-41fa2bebb6cb,2024-09-17 18:18:21.461914,6,CHATBOT,[openai]: Employee engagement and disengagemen...,2024-09-17 18:18:32.417639,d86faf67-d07c-497c-b760-39ff8eba131d,"[are motivated and committed, disengaged emplo..."


# Save data for evaluation

**The contexts column of the final dataframe has a data type list. We need a function to serialize it before saving the dataframe to a csv file, and a de-serialize function to read a csv file with a list column**

In [26]:
import json
import pandas as pd

def serialize_list(value):
    """Serializes a list to a JSON string."""
    return json.dumps(value)

def deserialize_list(value):
    """Deserializes a JSON string back into a list."""
    return json.loads(value)

def save_dataframe_with_list_column(df, filename):
    """Saves a DataFrame with a list column to a CSV file, preserving the list structure.

    Args:
        df: The DataFrame to save.
        filename: The name of the output CSV file.
    """

    # Apply the serialization function to the list column
    df['contexts'] = df['contexts'].apply(serialize_list)

    # Save the DataFrame to CSV
    df.to_csv(filename, index=False)

def load_dataframe_with_list_column(filename):
    """Loads a DataFrame from a CSV file, restoring the list structure.

    Args:
        filename: The name of the input CSV file.

    Returns:
        The loaded DataFrame.
    """

    # Load the DataFrame
    df = pd.read_csv(filename)

    # Apply the deserialization function to the list column
    df['contexts'] = df['contexts'].apply(deserialize_list)

    return df

**Save the result dataframe to a csv file for evaluation.**

In [27]:
from from_root import from_root

file_name = "test_dataset_it_openai_deployment.csv"

save_dataframe_with_list_column(df_questions_response_contexts[['question', 'answer', 'contexts']], os.path.join(from_root(), "data-test/test_dataset/", file_name))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['contexts'] = df['contexts'].apply(serialize_list)
