# Analyzing synthetic annotations

## Loading the synthetic conversations

In [5]:
import os
import pandas as pd

# code adapted from https://www.geeksforgeeks.org/python-list-all-files-in-directory-and-subdirectories/
def files_from_dir_recursive(start_path='.'):
    """Return the paths of all files found under ``start_path``, recursively.

    Args:
        start_path: Directory to walk. Defaults to the current directory.

    Returns:
        A sorted list of file paths, each built by joining the directory in
        which the file was found with the file name. Directories themselves
        are not included. The list is empty when no files are found.
    """
    all_files = [
        os.path.join(root, file_name)
        for root, _dirs, file_names in os.walk(start_path)
        for file_name in file_names
    ]
    # os.walk yields entries in directory-listing order, which is not
    # guaranteed to be stable; sort so the result is reproducible across runs.
    return sorted(all_files)

In [4]:
# Collect the path of every file stored under the conversations output directory.
conversation_files = files_from_dir_recursive("../output/conversations")

In [46]:
import json
from tqdm.auto import tqdm


def import_conversations(conv_dir: str) -> pd.DataFrame:
    """Load every conversation file under ``conv_dir`` into one long DataFrame.

    Each file is parsed as JSON and is expected to hold a conversation object
    (or a list of such objects) with the keys ``"id"``, ``"user_prompts"`` and
    ``"logs"``. Every ``logs`` entry is indexed positionally: element 0 becomes
    the ``user`` column and element 1 becomes the ``message`` column.

    Args:
        conv_dir: Directory searched recursively for conversation files.

    Returns:
        A DataFrame indexed by conversation ``id`` with the columns
        ``user_prompts``, ``user`` and ``message``, one row per log entry.
        When no log entries are found the frame is empty but keeps the same
        index name and columns.

    Raises:
        json.JSONDecodeError: If a file under ``conv_dir`` is not valid JSON.
        KeyError: If a conversation lacks one of the required keys.
    """
    columns = ["id", "user_prompts", "user", "message"]
    records = []

    for file_path in files_from_dir_recursive(conv_dir):
        # Read as UTF-8 explicitly instead of relying on the platform default.
        with open(file_path, "r", encoding="utf-8") as fin:
            payload = json.load(fin)

        # Accept either a single conversation object or a list of them.
        conversations = payload if isinstance(payload, list) else [payload]
        for conv in conversations:
            conv_id = conv["id"]
            user_prompts = conv["user_prompts"]
            # A conversation with no log entries simply contributes no rows.
            for entry in conv["logs"]:
                records.append(
                    {
                        "id": conv_id,
                        "user_prompts": user_prompts,
                        "user": entry[0],
                        "message": entry[1],
                    }
                )

    # Build the frame once from all records; passing the column list keeps the
    # schema stable even when ``records`` is empty.
    return pd.DataFrame(records, columns=columns).set_index("id")
    
# Build the long-format conversation table and show it as the cell output.
conv_df = import_conversations(conv_dir="../output/conversations")
conv_df

Unnamed: 0_level_0,user_prompts,user,message
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
07c2345b-85eb-4e02-8b28-68ae86e50d72,[Model: LlamaModel. Prompt: You are Steve2001 ...,Steve2001,"Hey, I don't think that's a very accurate or r..."
07c2345b-85eb-4e02-8b28-68ae86e50d72,[Model: LlamaModel. Prompt: You are Steve2001 ...,moderator01,"Steve2001, thank you for sharing your thoughts..."
07c2345b-85eb-4e02-8b28-68ae86e50d72,[Model: LlamaModel. Prompt: You are Steve2001 ...,GeorgeBush78,"Hey Steve2001, I gotta disagree with ya there ..."
07c2345b-85eb-4e02-8b28-68ae86e50d72,[Model: LlamaModel. Prompt: You are Steve2001 ...,moderator01,"GeorgeBush78, thank you for sharing your persp..."
07c2345b-85eb-4e02-8b28-68ae86e50d72,[Model: LlamaModel. Prompt: You are Steve2001 ...,Steve2001,"Hey Moderator01, I appreciate your efforts to ..."
...,...,...,...
1cb01034-0560-4561-9930-7b58b0c68473,[Model: LlamaModel. Prompt: You are Steve2001 ...,GeorgeBush78,"Oh, come on Steve2001! You're just a typical, ..."
1cb01034-0560-4561-9930-7b58b0c68473,[Model: LlamaModel. Prompt: You are Steve2001 ...,Steve2001,"GeorgeBush78, you're not only ignorant but als..."
1cb01034-0560-4561-9930-7b58b0c68473,[Model: LlamaModel. Prompt: You are Steve2001 ...,GeorgeBush78,"Oh, come on Steve2001! You're just a delusiona..."
1cb01034-0560-4561-9930-7b58b0c68473,[Model: LlamaModel. Prompt: You are Steve2001 ...,Steve2001,"GeorgeBush78, you're not only ignorant but als..."


In [40]:
 # Scratch check: take element 0 of every `logs` value.
 # NOTE(review): `demo_file` is not defined in any cell visible here; it
 # presumably comes from an earlier or deleted cell. Confirm before relying on
 # Restart Kernel -> Run All.
 demo_file.logs.apply(lambda x: x[0])

0       Steve2001
0     moderator01
0    GeorgeBush78
0     moderator01
0       Steve2001
0     moderator01
0    GeorgeBush78
0     moderator01
0       Steve2001
0     moderator01
0    GeorgeBush78
0     moderator01
Name: logs, dtype: object

In [26]:
# NOTE(review): `demo_file` is not defined in any cell visible here; confirm
# where it comes from before relying on this cell.
# x[0] selects the first element of each `logs` value, so only that one entry
# is used: its item 0 is stored as "user" and its item 1 as "message".
# `import_conversations` instead explodes `logs` to get one row per entry.
# Both assignments add columns to `demo_file` in place.
demo_file["user"] = demo_file.logs.apply(lambda x: x[0][0])
demo_file["message"] = demo_file.logs.apply(lambda x: x[0][1])
demo_file

Unnamed: 0,id,user_prompts,logs,user,message
0,07c2345b-85eb-4e02-8b28-68ae86e50d72,[Model: LlamaModel. Prompt: You are Steve2001 ...,"[[Steve2001, Hey, I don't think that's a very ...",Steve2001,"Hey, I don't think that's a very accurate or r..."
