# Generating Sample Data for DSPy

## Notebook Setup

In [1]:
# Importing the necessary Python libraries
import os
import json
import time

import pandas as pd

from langchain_core.prompts import ChatPromptTemplate, HumanMessagePromptTemplate
from langchain_openai import ChatOpenAI

In [2]:
# Reading the synthetic knowledge items and appending two blank columns
# that will later hold the generated question and answer for each row
df_kis = pd.read_csv('synthetic_knowledge_items.csv').assign(
    sample_question = '',
    sample_answer = '',
)

## LangChain Setup

In [3]:
# Creating the chat model client: the OpenAI-compatible client class is pointed
# at the Perplexity endpoint, with the API key read from the environment
chat_model = ChatOpenAI(
    model = 'llama-3.1-70b-instruct',
    base_url = 'https://api.perplexity.ai',
    api_key = os.environ['PERPLEXITY_API_KEY'],
)

In [4]:
# Defining the question-answer generation prompt
# NOTE: {ki_topic} and {ki_text} are the template's input variables. The doubled
# braces around the JSON example are brace escapes so that the rendered prompt
# shows a literal { and } (this assumes str.format-style rendering, which is
# LangChain's default "f-string" template format). No backslashes are used
# around the example: '\{' is an invalid escape sequence that leaves a stray
# backslash in the prompt, and a trailing backslash swallows the line break.
QA_GENERATION_PROMPT = '''Task:
Generate a single plausible question and answer pair based on the following knowledge item. Ensure that the question is one that would be commonly asked by a human.

Instructions:
    - Export the question and answer pair as a JSON object. Content must be able to be loaded with Python's json.loads() function.
    - Formulate an answer solely based on the information given in the knowledge item text.
    - Ensure that the answer is a response to the question as if it is a human answering another human's question.
    - Do not add any external knowledge.
    - Do not return any other text, like "Here is the question answer pair."
    - Please follow the example set below.

Example:
{{
  "question": "What is the capital of France?",
  "answer": "Paris"
}}

Knowledge item topic:
{ki_topic}

Knowledge item text:
{ki_text}
'''

# Building the prompt template: a single human message carrying the prompt text
qa_generation_prompt_template = ChatPromptTemplate.from_messages(
    messages = [HumanMessagePromptTemplate.from_template(template = QA_GENERATION_PROMPT)],
)

# Piping the prompt template into the chat model to form the generation chain
qa_generation_chain = qa_generation_prompt_template | chat_model

In [5]:
def _extract_qa_pair(content):
    '''
    Pulling a question-answer pair out of the raw text returned by the model.

    Inputs:
        - content (str): The raw response content, which may wrap the JSON object in extra text.

    Returns:
        - qa_pair (tuple or None): A (question, answer) tuple, or None when no usable pair was found.
    '''
    # Checking if the response content is empty
    if not content:
        print('Response content is empty')
        return None

    # Finding the first left curly brace and the last right curly brace
    start_index = content.find('{')
    end_index = content.rfind('}')

    # Both braces must exist and be in the right order; the check is made on the raw
    # indices (before adding 1) so that a missing right brace is actually detected
    if start_index == -1 or end_index < start_index:
        print('No JSON object found in response content')
        return None

    try:
        # Decoding only the text between the first left and last right curly brace
        data = json.loads(content[start_index:end_index + 1])
    except json.JSONDecodeError as e:
        print(f'Error decoding JSON: {e}')
        return None

    question = data.get('question', '')
    answer = data.get('answer', '')

    # A pair with a missing half is treated as a failed attempt rather than a success
    if not question or not answer:
        print('Response JSON is missing a question or an answer')
        return None

    return question, answer


def generate_sample_qas(row, chain = None, retries = 3, retry_delay = 1):
    '''
    Generating a new sample question-answer pair based on the knowledge item text.

    Inputs:
        - row (pd.Series): A row from the synthetic knowledge items DataFrame.
        - chain (optional): Object whose invoke(input = {...}) returns a response with a
          .content attribute. Defaults to the notebook-level qa_generation_chain.
        - retries (int): Maximum number of generation attempts.
        - retry_delay (float): Seconds to wait between two attempts.

    Returns:
        - row (pd.Series): The input row with the new question-answer pair, or with
          empty strings in both columns when every attempt failed.
    '''
    # Skipping rows that already have both a question and an answer
    # NOTE: NaN is truthy, so missing values must be stored as '' for this check to work
    if row['sample_question'] and row['sample_answer']:
        return row

    # Falling back to the notebook-level chain when none is supplied
    if chain is None:
        chain = qa_generation_chain

    # Extracting the knowledge item text and topic from the row
    ki_text = row['ki_text']
    ki_topic = row['ki_topic']

    # Starting from empty strings so that a row that never succeeds is still well-formed
    row['sample_question'] = ''
    row['sample_answer'] = ''

    for attempt in range(retries):
        try:
            # Generating the response and parsing it; any failure (API error,
            # unexpected content type, ...) counts as a failed attempt
            response = chain.invoke(input = {
                'ki_topic': ki_topic,
                'ki_text': ki_text
            })
            qa_pair = _extract_qa_pair(response.content)
        except Exception as e:
            print(f'Attempt {attempt + 1} failed: {e}')
            qa_pair = None

        if qa_pair is not None:
            row['sample_question'], row['sample_answer'] = qa_pair
            return row

        # Waiting before retrying, but not after the final attempt
        if attempt < retries - 1:
            time.sleep(retry_delay)

    print('All attempts failed. Setting sample_question and sample_answer to empty strings.')
    return row

In [6]:
# Running the generation function on every row of the knowledge items DataFrame
df = df_kis.apply(func = generate_sample_qas, axis = 1)

In [7]:
# Saving the knowledge items together with their generated question-answer pairs
df.to_csv(path_or_buf = 'synthetic_knowledge_items_with_qas.csv', index = False)

In [8]:
# Displaying the resulting DataFrame as the cell output
df

Unnamed: 0,ki_topic,ki_text,alt_ki_text,sample_question,sample_answer
0,Setting Up a Mobile Device for Company Email,**Setting Up a Mobile Device for Company Email...,"To set up a mobile device for company email, f...",What should I do if I encounter issues setting...,Ensure that your device has a stable internet ...
1,Resetting a Forgotten PIN,**Resetting a Forgotten PIN**\n\nIf you have f...,"If you have forgotten your PIN, you can reset ...",What should I do if I am unable to reset my PI...,If you are unable to reset your PIN using the ...
2,Configuring VPN Access for Remote Workers,**Configuring VPN Access for Remote Workers**\...,To configure VPN access for remote workers at ...,What should I do if I encounter issues connect...,Ensure that your device meets the minimum syst...
3,Troubleshooting Issues with Microsoft Office,**Troubleshooting Issues with Microsoft Office...,When troubleshooting issues with Microsoft Off...,What should I do if my Microsoft Office applic...,"First, try restarting the application. If that..."
4,Setting Up a Conference Call on Cisco Webex,"To set up a conference call on Cisco Webex, fo...","To set up a conference call on Cisco Webex, fo...",What should I do if I encounter issues joining...,Try restarting your browser or checking your i...
...,...,...,...,...,...
95,Setting Up a New User's Account in Dropbox,**Step 1: Create a New User Account**\n\nTo se...,"To set up a new user's account in Dropbox, fol...",What should I do if a new user is unable to ac...,Check that the new user has been assigned the ...
96,Creating a New IT Problem,**Creating a New IT Problem**\n\nAs an employe...,To create a new IT problem in Widgetco's IT se...,What information should I include when enterin...,You should provide a concise and detailed desc...
97,Troubleshooting Issues with a Slow-Performing ...,**Troubleshooting Issues with a Slow-Performin...,**Troubleshooting Issues with a Slow-Performin...,What is the first step to troubleshoot a slow-...,"The first step is to restart the computer, as ..."
98,Setting Up a Secure Connection to a Company-Is...,To set up a secure connection to a company-iss...,To set up a secure connection to a company-iss...,What should I do if I encounter issues during ...,Check that your device's firewall and antiviru...
