In [1]:
import glob
import pandas as pd

# Combining from JSON
NOTE: This is the old workflow; the project now uses only CSV files.

In [2]:
# Paths of every JSON file one directory below ./data/v3.
# sorted(): glob returns matches in filesystem-dependent order; sorting keeps the
# row order of the frames built from this list reproducible across machines.
human_story_details = sorted(glob.glob("./data/v3/*/*.json"))

In [5]:
def fetch_story(story_id):
    """Return the text of the first markdown story whose path contains ``story_id``.

    Parameters
    ----------
    story_id : str or other
        Identifier to look for as a substring of each ``./data/v3/*/*.md`` path.
        Any non-string value (e.g. ``None`` or NaN) is treated as "no id".

    Returns
    -------
    str or None
        The file contents decoded as UTF-8, or ``None`` when ``story_id`` is
        not a string.

    Raises
    ------
    IndexError
        If no markdown path contains ``story_id``.
    """
    if not isinstance(story_id, str):
        return None
    human_stories = glob.glob("./data/v3/*/*.md")
    # Substring match against the whole path; the first hit wins.
    path = [candidate for candidate in human_stories if story_id in candidate][0]
    # Context manager so the handle is closed even if the read fails.
    with open(path, 'r', encoding="utf8") as f:
        return f.read()

In [38]:
# Read every metadata JSON and stack the results into one frame.
# Each file is transposed so that what read_json parsed as columns become rows;
# the former index is then exposed as the "key" column.
dfs = []
for path in human_story_details:
    df = pd.read_json(path, encoding='utf-8').T
    dfs.append(df)

human_df = pd.concat(dfs).reset_index().rename(columns={'index': 'key'})
# Drop the rows keyed "gpt4". .copy() makes the filtered result an independent
# frame so the column assignments below do not trigger SettingWithCopyWarning.
human_df = human_df[human_df["key"] != "gpt4"].copy()
# Raw string: "\w" inside a normal string literal is an invalid escape sequence
# (a warning on current Python versions).
human_df["story_id"] = human_df["url"].str.extract(r'/comment/(\w+)')
# fetch_story returns None for rows whose URL did not yield an id.
human_df["text"] = human_df["story_id"].apply(fetch_story)

In [40]:
# Save the human-story frame as a UTF-8 CSV. index=False is not passed, so the
# DataFrame index is written as the first (unnamed) column.
human_df.to_csv("./data/human_stories_2.csv", encoding='utf-8')

In [126]:
# human_df = pd.read_csv("./data/human_stories.csv", encoding='cp1252')

In [16]:
# Every entry three levels below ./llm_story_generation_results_v1.
# sorted(): glob returns matches in filesystem-dependent order; sorting keeps the
# row order of the frame built from this list reproducible.
llm_stories = sorted(glob.glob("./llm_story_generation_results_v1/*/*/*"))

In [17]:
# Lookup from the raw model identifier to the short label used for display.
model_map = dict([
    ("meta-llama_Llama-2-7b-chat-hf", "Llama-2-7B"),
    ("meta-llama_Llama-2-13b-chat-hf", "Llama-2-13B"),
    ("lmsys_vicuna-33b-v1.3", "Vicuna-33B"),
    ("meta-llama_Llama-2-70b-chat-hf", "Llama-2-70B"),
    ("gpt-4", "GPT-4"),
])

In [18]:
# Build one row per generated-story JSON file and stack them into llm_df.
dfs = []
for path in llm_stories:
    # Each file is read as a Series and turned into a single-row frame.
    df = pd.read_json(path, typ='series').to_frame().T
    # Normalise separators before splitting: glob yields "\" on Windows and "/"
    # elsewhere, and splitting on "\" alone raises IndexError on POSIX paths.
    parts = path.replace("\\", "/").split("/")
    # Parent directory name -> strategy; text after the last "_" in the
    # file name (without ".json") -> story id.
    df["strategy"] = parts[-2]
    df["story_id"] = parts[-1].split("_")[-1].replace(".json", "")
    dfs.append(df)

# ignore_index=True: every single-row frame carries index label 0, so a plain
# concat would leave all rows sharing the same label.
llm_df = pd.concat(dfs, ignore_index=True)
# model_map.get yields None for model names that have no short label.
llm_df["model_short"] = llm_df["model_name"].apply(model_map.get)
llm_df["characters"] = llm_df["characters"].str.strip()
llm_df["output"] = llm_df["output"].str.strip()
llm_df = llm_df.rename(columns={
    "output": "text",
    "story_prompt": "prompt"
})
llm_df = llm_df.drop(columns="id")

In [37]:
# Save the LLM-story frame as a UTF-8 CSV. index=False is not passed, so the
# DataFrame index is written as the first (unnamed) column.
llm_df.to_csv("./data/llm_stories_2.csv", encoding='utf-8')

In [22]:
# Display the LLM frame's column labels (the bare last expression is the cell output).
llm_df.columns

Index(['model_name', 'prompt', 'characters', 'text', 'strategy', 'story_id',
       'model_short'],
      dtype='object')

In [23]:
# Display the human frame's column labels for comparison with llm_df.
human_df.columns

Index(['key', 'url', 'net_upvotes', 'id', 'story_id', 'text'], dtype='object')

In [24]:
# Stack human and LLM rows into one frame. pd.concat's default outer join keeps
# the union of columns; a column present in only one frame is NaN for rows
# coming from the other. Index labels are kept as-is (no ignore_index).
df = pd.concat([human_df, llm_df])

In [25]:
# df.to_csv("./data/stories.csv")

# Cleaning bad formatting

In [26]:
# Load the CSV file with appropriate encoding
# The file is decoded as ISO-8859-1 (Latin-1), under which every byte maps to
# exactly one character, so this read cannot fail with a decode error.
# NOTE(review): whether Latin-1 is the file's true encoding is not visible here — confirm.
file_path = './data/human_stories.csv'  # Update this path to your file location
df = pd.read_csv(file_path, encoding='ISO-8859-1')

In [27]:
# Define a mapping of common misencoded characters to their likely intended ASCII counterparts
# Keys are single characters; values are the ASCII replacement substituted by
# replace_characters().
# NOTE(review): text decoded as ISO-8859-1 only contains code points up to U+00FF,
# so if `df` comes from an ISO-8859-1 read, the curly-quote and dash keys below
# presumably never match and only the 'ý' entry has an effect — confirm the
# intended source encoding.
replacement_map = {
    'ý': "'",  # Assuming 'ý' was intended to be an apostrophe
    '“': '"',  # Opening curly quote to straight quote
    '”': '"',  # Closing curly quote to straight quote
    '‘': "'",  # Opening single curly quote to straight apostrophe
    '’': "'",  # Closing single curly quote to straight apostrophe
    '—': '-',  # Em dash to hyphen
    '–': '-'   # En dash to hyphen
}

# Function to replace characters based on the mapping
def replace_characters(text, mapping):
    """Replace every occurrence of each key of ``mapping`` in ``text`` with its value.

    Parameters
    ----------
    text : str or other
        The text to clean. Non-string values (e.g. ``None`` or NaN from an
        empty CSV cell) are returned unchanged instead of raising.
    mapping : dict
        ``{substring_to_replace: replacement}``; applied in the dict's
        iteration order, so a later entry sees the output of earlier ones.

    Returns
    -------
    str or other
        The text with all replacements applied, or ``text`` itself when it is
        not a string.
    """
    # Missing stories show up as None/NaN, which have no .replace method.
    if not isinstance(text, str):
        return text
    for wrong, correct in mapping.items():
        text = text.replace(wrong, correct)
    return text

# Run replace_characters over every entry of the 'text' column, using replacement_map
df['text'] = df['text'].apply(replace_characters, args=(replacement_map,))

# Function to correct sequences of apostrophes that might represent double quotes or other misinterpretations
def correct_sequences(text):
    """Collapse specific runs of apostrophes into a single double quote.

    Runs of six, then four, then three consecutive apostrophes are each
    replaced by one ``"``, in that order. A run of exactly two apostrophes is
    left untouched, and run lengths not listed are consumed left to right by
    the longest matching pattern (e.g. five apostrophes become ``"'``).

    Parameters
    ----------
    text : str or other
        The text to clean. Non-string values (e.g. ``None`` or NaN from an
        empty CSV cell) are returned unchanged instead of raising.

    Returns
    -------
    str or other
        The cleaned text, or ``text`` itself when it is not a string.
    """
    # Missing stories show up as None/NaN, which have no .replace method.
    if not isinstance(text, str):
        return text
    # Longest pattern first so a six-apostrophe run becomes one quote, not two.
    for run in ("''''''", "''''", "'''"):
        text = text.replace(run, '"')
    return text

# Run correct_sequences over every entry of the 'text' column
df['text'] = df['text'].map(correct_sequences)

# # Optionally, save the cleaned data back to a CSV file
# output_file_path = 'path_to_your_output_file.csv'  # Update this path to your desired output location
# df.to_csv(output_file_path, index=False, encoding='utf-8')

# print("Data cleaning complete and saved to:", output_file_path)


In [33]:
# Write the cleaned frame back to `file_path` as UTF-8, without the index column.
# NOTE(review): if `file_path` is still the human-stories path that was read with
# encoding='ISO-8859-1', this overwrites the input in a different encoding;
# re-running that load afterwards would decode UTF-8 bytes as ISO-8859-1.
# Confirm this is intended, or write to a separate output path.
df.to_csv(file_path, index=False, encoding='utf-8')

# Retry Analysis

In [8]:
import pandas as pd
import glob
import numpy as np

In [3]:
# Collect the retry-analysis CSVs, split by a substring of the file name:
# "writer_profile" versus "plan_write". glob does not guarantee any ordering.
writer_profile_files = glob.glob("./llm_story_generation_results_retry_count_analysis/*writer_profile*.csv")
plan_write_files = glob.glob("./llm_story_generation_results_retry_count_analysis/*plan_write*.csv")

In [9]:
# For each results file: print its path, the describe() summary of every column
# whose name contains "retry", and the mean number of whitespace-separated
# tokens in its "text" column. Finally print the plain mean of those per-file means.
average_lengths = []
for csv_path in [*writer_profile_files, *plan_write_files]:
    df = pd.read_csv(csv_path)
    retry_cols = [name for name in df.columns if "retry" in name]
    print(csv_path)
    summary = df.describe()
    print(summary[retry_cols])
    word_counts = df["text"].str.split().str.len()
    average_length = word_counts.mean()
    average_lengths.append(average_length)
    print(f"Average Length: {average_length}")

print(np.mean(average_lengths))

./llm_story_generation_results_retry_count_analysis\stories_TheBloke.Llama-2-13B-chat-GPTQ_writer_profile.csv
       length_retry_count  retry_count
count           60.000000         60.0
mean             9.483333          0.0
std             14.735512          0.0
min              0.000000          0.0
25%              2.000000          0.0
50%              5.000000          0.0
75%             12.250000          0.0
max            106.000000          0.0
Average Length: 499.73333333333335
./llm_story_generation_results_retry_count_analysis\stories_TheBloke.Llama-2-70B-chat-GPTQ_writer_profile.csv
       length_retry_count  retry_count
count           60.000000         60.0
mean             1.700000          0.0
std              3.082482          0.0
min              0.000000          0.0
25%              0.000000          0.0
50%              1.000000          0.0
75%              2.000000          0.0
max             19.000000          0.0
Average Length: 529.65
./llm_story_generati

# Combining CSVs

In [2]:
import pandas as pd
import glob

In [43]:
# Combine every per-run CSV into a single file.
# sorted(): glob returns matches in filesystem-dependent order; sorting makes
# the row order of the combined file reproducible.
stories = sorted(glob.glob("./llm_story_generation_results_v2/*.csv"))
# ignore_index=True: each CSV is read with its own 0..n-1 index, so a plain
# concat would repeat index labels across files.
df = pd.concat((pd.read_csv(s) for s in stories), ignore_index=True)
# The index is still written as the first (unnamed) column, as before.
df.to_csv("data/llm_stories_v2.csv")

# More cleanup

In [37]:
# Function to clean a single story
def clean_story(story, strings_to_remove):
    """Drop boilerplate lines from a story and normalise its whitespace.

    The story is split on ``"\\n"``; each line is stripped, and any line whose
    stripped form starts with one of ``strings_to_remove`` is discarded. The
    remaining stripped lines are re-joined with ``"\\n"`` and the result is
    stripped again.

    Parameters
    ----------
    story : str or other
        The raw story text. Non-string values (e.g. ``None`` or NaN from an
        empty CSV cell) are returned unchanged instead of raising.
    strings_to_remove : iterable of str
        Prefixes that mark a line for removal (case-sensitive).

    Returns
    -------
    str or other
        The cleaned story, or ``story`` itself when it is not a string.
    """
    # Missing stories show up as None/NaN, which have no .split method.
    if not isinstance(story, str):
        return story
    # str.startswith accepts a tuple of prefixes; build it once, not per line.
    prefixes = tuple(strings_to_remove)
    cleaned_lines = []
    for line in story.split('\n'):
        stripped_line = line.strip()
        if not stripped_line.startswith(prefixes):
            cleaned_lines.append(stripped_line)
    # Join the cleaned lines and then strip spaces and newlines from the entire text
    return '\n'.join(cleaned_lines).strip()

In [38]:
# Load the combined LLM stories CSV (no encoding is passed, so read_csv's default is used)
file_path = './data/llm_stories_v2.csv'  # Update this path to your file location
df = pd.read_csv(file_path)

In [39]:
# Line prefixes that mark a line as boilerplate; clean_story() drops any line
# whose stripped text starts with one of these (case-sensitive).
# Every entry must end with a comma: adjacent string literals are silently
# concatenated into one string, which then never matches.
strings_to_remove = [
    "```", "---", 
    "Title:", "**Title", 
    "Story:", "(Story)", "The Story:", "Story", "STORY", 
    "AI:", "AI-Sys", "AI Response", 
    "Computer:", "System:", "Assistant:",
    "Human:", "Human Response:", "[You]", 
    "Input:", "Response:", 
    "Output:", "**Output", 
    "Answer:", 
    "Note:",
    "ASSIGNMENT",
    "Prompt", "Premise:", 
    "###", "**", 
    "No need for extra details",
    "Do not provide any additional instructions",
    "Just the story.",
    "No comments or questions please.",
    "No introduction or summary.",
    "No greetings",
    "No need to write any other details.", 
    "No introduction or summary needed.",
    "No need for any additional information.", 
    "Ready to assist!", 
    "I have written the story below",
    "Here is my attempt",
    "Here is your story",
    "Here is a 500-word story", 
    "Here is the story I wrote based on the prompt:", 
    "I understand that you want me to write a story based on the given prompt. Here's my response:",
    "I have a lot of stories to read and I won't have time to read long comments. If you understand, just write the story.",
    "No worries, I've got this covered! Here's the story:", 
    "Here's the story:", 
    "Here is the story:", 
    "Here's your story:", 
    "Here is your story: ",
    "And here is the story:", 
    "And here's the story:", 
    "Here is your 500 word story based on the prompt:", 
    "Here's my attempt at writing a 500-word story", 
    "Here's a possible story:",
    ]
# Store the boilerplate-free version of each story in a new 'cleaned_text' column.
df['cleaned_text'] = df['text'].apply(clean_story, args=(strings_to_remove,))

In [41]:
# Save the frame, including the new 'cleaned_text' column, without the DataFrame index.
df.to_csv("./data/llm_stories_v2_cleaned.csv", index=False)