# Generate Synthetic Data

In [None]:
import os

from langchain.chat_models import AzureChatOpenAI
from langchain.embeddings import AzureOpenAIEmbeddings

# Connection settings shared by every Azure OpenAI client below; both values
# are read from the environment once.
_azure_connection = {
    "azure_endpoint": os.getenv("AZURE_OPENAI_ENDPOINT"),
    "api_version": os.getenv("AZURE_OPENAI_API_VERSION"),
}

# GPT-3.5 (16k) chat client; its deployment name comes from the environment.
gpt_35_16k = AzureChatOpenAI(
    azure_deployment=os.getenv("AZURE_DEPLOYMENT_NAME"),
    model="gpt-3.5-turbo-16k",
    **_azure_connection,
)

# GPT-4 chat client bound to the fixed "chat-gpt-4" deployment.
gpt_4 = AzureChatOpenAI(
    azure_deployment="chat-gpt-4",
    model="gpt-4",
    **_azure_connection,
)

# Embedding client (no explicit deployment is passed here).
embedding_model = AzureOpenAIEmbeddings(
    model="text-embedding-ada-002",
    **_azure_connection,
)

In [None]:
import random
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Module-level TF-IDF vectorizer. check_similarity() re-fits it on every pair
# of questions it compares, so no fitted state is meant to carry over.
vectorizer = TfidfVectorizer()

def check_similarity(row, dataframe):
    """Report whether ``row`` nearly duplicates a question in ``dataframe``.

    The question in ``row`` is compared with each question in ``dataframe``
    one pair at a time: the pair is TF-IDF vectorised on its own and the
    cosine similarity of the two resulting vectors is measured.

    Args:
        row: Record exposing a ``'question'`` entry.
        dataframe: DataFrame whose rows expose a ``'question'`` entry.

    Returns:
        True as soon as one pair scores above 0.9, otherwise False
        (including when ``dataframe`` has no rows).
    """
    max_allowed = 0.9

    def _pair_score(known_row):
        # Fit on just these two questions, so vocabulary and IDF weights come
        # from the pair being compared and nothing else.
        pair_matrix = vectorizer.fit_transform(
            [row['question'], known_row['question']]
        )
        return cosine_similarity(pair_matrix[0:1], pair_matrix[1:2])[0][0]

    # any() stops at the first match, like an early return from a loop.
    return any(
        _pair_score(known_row) > max_allowed
        for _, known_row in dataframe.iterrows()
    )

def check_dataframe(final_df, df):
    """Append to ``final_df`` the rows of ``df`` whose question is new.

    A candidate row is rejected when ``check_similarity`` reports that its
    question is too close to one already in ``final_df`` *or* to one accepted
    earlier from the same ``df``, so near-duplicates inside a single batch
    cannot both get in.

    Args:
        final_df: Accumulated DataFrame with a ``'question'`` column.
        df: Batch of candidate rows with a ``'question'`` column.

    Returns:
        ``final_df`` itself when no row was accepted; otherwise a new
        DataFrame made of ``final_df`` followed by the accepted rows, with a
        fresh 0..n-1 index.
    """
    accepted = []
    for _, row in df.iterrows():
        if check_similarity(row, final_df):
            continue
        # Also compare against rows accepted earlier in this same batch;
        # final_df alone does not contain them yet.
        if accepted and check_similarity(row, pd.DataFrame(accepted)):
            continue
        accepted.append(row)

    if not accepted:
        return final_df

    return pd.concat([final_df, pd.DataFrame(accepted)], ignore_index=True)


def generate_floats():
    """Draw four random weights, each rounded to one decimal place.

    The first weight is drawn from [0, 1]. Each later weight is drawn from
    between 0 and whatever is left of 1.0 after the earlier ones, or is 0.0
    once less than about one tenth is left. The weights may add up to less
    than 1.0.

    Returns:
        List of four floats.
    """
    # 1.0 - 0.9 evaluates to 0.09999999999999998 in binary floating point, so
    # this cut-off still counts "one tenth left" as enough to draw from.
    smallest_budget = 0.09999999999999998

    opening = round(random.uniform(0.0, 1.0), 1)
    shares = [opening]
    budget = 1.0 - opening

    while len(shares) < 4:
        if budget < smallest_budget:
            shares.append(0.0)
            continue
        share = round(random.uniform(0.0, budget), 1)
        budget -= share
        shares.append(share)

    return shares


def generate_distribution():
    """Build a random question-type distribution whose weights sum to 1.

    Candidate weights are drawn with ``generate_floats`` until a set adds up
    to 1.0; the four weights are then handed to the question types in random
    order.

    Returns:
        Dict mapping ``"simple"``, ``"multi_context"``, ``"reasoning"`` and
        ``"conditional"`` to one weight each.
    """
    # Compare with a tolerance: the weights are binary floats, so a valid set
    # can sum to 0.9999999999999999 or 1.0000000000000002 instead of exactly
    # 1.0. Waiting for the sum to be strictly greater than 1.0 would depend
    # on rounding error alone and might never happen.
    tolerance = 1e-9

    weights = generate_floats()
    while abs(sum(weights) - 1.0) > tolerance:
        weights = generate_floats()

    question_types = ("simple", "multi_context", "reasoning", "conditional")
    # Sampling all elements without replacement yields a random permutation,
    # i.e. each weight is assigned to exactly one question type.
    shuffled = random.sample(weights, k=len(weights))
    return dict(zip(question_types, shuffled))

In [None]:
from llama_index import SimpleDirectoryReader

# Load the source documents from the directory named by the PDF_DIR
# environment variable; `documents` is later passed to the test-set generator.
reader = SimpleDirectoryReader(input_dir=os.getenv("PDF_DIR"))
documents = reader.load_data()

In [None]:
from ragas.llms import LangchainLLM
from ragas.testset import TestsetGenerator

# Empty accumulator for the accepted questions.
# NOTE(review): the column list presumably mirrors what test_set.to_pandas()
# returns — confirm against the ragas version in use.
final_df = pd.DataFrame([], columns=[
    'question',
    'ground_truth_context',
    'ground_truth',
    'question_type',
    'episode_done'])

# Number of completed passes; only the commented-out generator toggle below
# would read it.
iteration = 0

# Keep generating batches until at least 100 sufficiently distinct questions
# have been collected. Whole batches are appended, so the final frame can end
# up with more than 100 rows.
# NOTE(review): nothing limits the number of passes — if batches stop
# contributing new questions this loop does not terminate.
while len(final_df) < 100:
    # Fresh random question-type mix for every pass.
    testset_distribution = generate_distribution()
    print(testset_distribution)
    # Fixed distribution kept for reference:
    # testset_distribution = {
    #     "simple": 0.6,
    #     "multi_context": 0.2,
    #     "reasoning": 0.1,
    #     "conditional": 0.1,
    # }
    
    # Disabled toggle that would alternate the generator model between passes:
    # if iteration % 2 == 0:
    #     generator_llm = LangchainLLM(llm=gpt_4)
    # else:
    generator_llm = LangchainLLM(llm=gpt_35_16k)
    critic_llm = LangchainLLM(llm=gpt_4)

    # gpt_35_16k is passed as the generator and gpt_4 as the critic.
    # NOTE(review): the meaning of chat_qa, chunk_size and threshold is
    # defined by ragas' TestsetGenerator — check its documentation before
    # changing these values.
    test_generator = TestsetGenerator(
        generator_llm=generator_llm,
        critic_llm=critic_llm,
        embeddings_model=embedding_model,
        testset_distribution=testset_distribution,
        chat_qa=0.0,
        chunk_size=512,
        threshold=5.0,
    )
    test_set = test_generator.generate(documents, test_size=100)
    test_df = test_set.to_pandas()
    # Keep only the rows whose question is not too similar to one already
    # collected.
    final_df = check_dataframe(final_df, test_df)

    iteration += 1

In [None]:
def rowIndex(row):
    """Return the label stored as ``row.name``.

    When called through ``DataFrame.apply(..., axis=1)`` that label is the
    row's index value.
    """
    label = row.name
    return label

def generate_expected(row):
    """Compose the two-line "expected answer" instruction for ``row``.

    Args:
        row: Record with ``'ground_truth'`` and ``'ground_truth_context'``
            entries, each an iterable of strings.

    Returns:
        A string whose first line lists the ground-truth answers joined by
        ``' or '`` and whose second line lists the contexts, separated from
        each other by four newlines.
    """
    accepted_answers = ' or '.join(row['ground_truth'])
    supporting_contexts = "\n\n\n\n".join(row['ground_truth_context'])

    answer_line = f"The answer must have the same meaning with this answer not less and not more: {accepted_answers}"
    context_line = f"Also, the answer must be relevant with these contexts: {supporting_contexts}"
    return answer_line + "\n" + context_line

def generate_scenario_name(row):
    """Build the scenario label ``"<FILE_NAME> - <rowIndex>"`` for ``row``.

    ``FILE_NAME`` is read from the environment on every call; when it is not
    set, the label starts with the text ``None``.

    Args:
        row: Record with a ``'rowIndex'`` entry.
    """
    file_label = os.getenv('FILE_NAME')
    position = row['rowIndex']
    return f"{file_label} - {position}"

# Derive three extra columns, one row at a time. Order matters: "scenario name"
# reads the "rowIndex" column created on the first line.
final_df["rowIndex"] = final_df.apply(rowIndex, axis=1)
final_df["expected"] = final_df.apply(generate_expected, axis=1)
final_df["scenario name"] = final_df.apply(generate_scenario_name, axis=1)

In [None]:
# Resolve the destination before writing: DataFrame.to_csv(None) writes no
# file and just returns the CSV text, so an unset variable would silently
# discard the generated test set. "OUTPUT_PATH" follows the naming of the other
# environment variables used in this notebook; the older spelling
# "OUTPUT PATH" (with a space) is still honoured as a fallback.
output_path = os.getenv("OUTPUT_PATH") or os.getenv("OUTPUT PATH")
if not output_path:
    raise RuntimeError(
        "Set the OUTPUT_PATH environment variable to the CSV file to write."
    )
final_df.to_csv(output_path, index=False)