In [1]:
import glob
from pathlib import Path

import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Paths of all JSON files located in the immediate subdirectories of ./data/v3.
# Each of these files is later loaded with pd.read_json to build human_df.
human_story_details = glob.glob("./data/v3/*/*.json")

In [5]:
def fetch_story(story_id):
    """Return the text of the markdown file whose path contains ``story_id``.

    Candidate files are those matching ``./data/v3/*/*.md``. The first path
    (in glob order) that contains ``story_id`` as a substring is read as UTF-8.

    Parameters
    ----------
    story_id : str or any
        Identifier to look for in the file path. Non-string values (for
        example NaN/None coming from a failed regex extraction) are skipped.

    Returns
    -------
    str or None
        The file contents, or ``None`` when ``story_id`` is not a string.

    Raises
    ------
    IndexError
        If no markdown file path contains ``story_id``.
    """
    if not isinstance(story_id, str):
        return None
    human_stories = glob.glob("./data/v3/*/*.md")
    # NOTE(review): this is a substring match on the whole path, so an id that
    # is a prefix of another id (or of a directory name) could pick the wrong
    # file - confirm ids are unambiguous.
    path = next((p for p in human_stories if story_id in p), None)
    if path is None:
        # Same exception type the original list-index lookup raised, but with
        # a message that identifies the offending id.
        raise IndexError(f"no markdown file found for story_id {story_id!r}")
    # Context manager guarantees the handle is closed even if read() fails.
    with open(path, 'r', encoding="utf8") as f:
        return f.read()

In [38]:
# Load every JSON file listed in human_story_details, transpose it so the
# top-level JSON keys become rows, and stack the results into one frame.
# NOTE(review): presumably each file maps an entry key to a record holding
# url / net_upvotes / id - confirm against the data files.
dfs = [
    pd.read_json(path, encoding='utf-8').T
    for path in human_story_details
]

# The former index (the top-level JSON keys) is kept as the "key" column.
human_df = pd.concat(dfs).reset_index().rename(columns={'index': 'key'})

# Drop the rows keyed "gpt4". The explicit copy makes the column assignments
# below write to an independent frame instead of a slice of the unfiltered
# one (avoids SettingWithCopyWarning).
human_df = human_df[human_df["key"] != "gpt4"].copy()

# Raw string so that \w reaches the regex engine unchanged (a plain '\w' is an
# invalid string escape); expand=False returns a Series for the single group.
human_df["story_id"] = human_df["url"].str.extract(r'/comment/(\w+)', expand=False)

# Rows whose url did not match get a missing story_id, for which fetch_story
# returns None.
human_df["text"] = human_df["story_id"].apply(fetch_story)

In [40]:
# Persist human_df as a UTF-8 encoded CSV file.
human_df.to_csv("./data/human_stories_2.csv", encoding='utf-8')

In [126]:
# human_df = pd.read_csv("./data/human_stories.csv", encoding='cp1252')

In [16]:
# Paths of every entry exactly three levels below ./llm_story_generation_results_v1.
# The loading cell below reads each one as JSON and takes the parent directory
# name as the "strategy".
llm_stories = glob.glob("./llm_story_generation_results_v1/*/*/*")

In [17]:
# Short display labels for the values found in the "model_name" column;
# looked up with model_map.get when building llm_df["model_short"].
model_map = {
    "meta-llama_Llama-2-7b-chat-hf": "Llama-2-7B",
    "meta-llama_Llama-2-13b-chat-hf": "Llama-2-13B",
    "lmsys_vicuna-33b-v1.3": "Vicuna-33B",
    "meta-llama_Llama-2-70b-chat-hf": "Llama-2-70B",
    "gpt-4": "GPT-4",
}

In [18]:
# Build one single-row frame per file in llm_stories and stack them.
dfs = []
for path in llm_stories:
    # pathlib splits on the separator of the running platform, so this works
    # with both "\\" (Windows) and "/" (POSIX) paths returned by glob; the
    # previous split("\\") only worked on Windows.
    story_path = Path(path)
    df = pd.read_json(path, typ='series').to_frame().T
    # The directory directly containing the file names the strategy.
    df["strategy"] = story_path.parent.name
    # The story id is the part of the file name after the last "_",
    # with the ".json" suffix removed.
    df["story_id"] = story_path.name.split("_")[-1].replace(".json", "")
    dfs.append(df)

llm_df = pd.concat(dfs)
# Models missing from model_map get None as their short name.
llm_df["model_short"] = llm_df["model_name"].apply(model_map.get)
llm_df["characters"] = llm_df["characters"].str.strip()
llm_df["output"] = llm_df["output"].str.strip()
# Rename columns: "output" becomes "text" and "story_prompt" becomes "prompt".
llm_df = llm_df.rename(columns={
    "output": "text",
    "story_prompt": "prompt"
})
# The "id" column read from the JSON is dropped; story_id (taken from the
# file name) remains as the identifier column.
llm_df = llm_df.drop(columns="id")

In [37]:
# Persist llm_df as a UTF-8 encoded CSV file.
llm_df.to_csv("./data/llm_stories_2.csv", encoding='utf-8')

In [22]:
# Display the column labels of llm_df.
llm_df.columns

Index(['model_name', 'prompt', 'characters', 'text', 'strategy', 'story_id',
       'model_short'],
      dtype='object')

In [23]:
# Display the column labels of human_df.
human_df.columns

Index(['key', 'url', 'net_upvotes', 'id', 'story_id', 'text'], dtype='object')

In [24]:
# Stack the rows of human_df and llm_df into a single frame. Per the column
# listings shown for the two frames, only "story_id" and "text" exist in both;
# every other column belongs to just one of the two sources.
df = pd.concat([human_df, llm_df])

In [25]:
# df.to_csv("./data/stories.csv")

In [29]:
# Load the combined stories CSV from disk, decoding its bytes as ISO-8859-1.
# NOTE(review): the CSV exports visible in this notebook are all written as
# UTF-8; if this file is UTF-8 as well, decoding it as ISO-8859-1 garbles every
# non-ASCII character rather than restoring it - confirm the on-disk encoding.
file_path = './data/stories.csv'  # also the destination of the later df.to_csv(file_path, ...) write-back
df = pd.read_csv(file_path, encoding='ISO-8859-1')

In [31]:
# Substitution table used by the text clean-up: each key is a character to be
# replaced, each value is its plain-ASCII stand-in.
replacement_map = {
    # Treated as an apostrophe (an assumption about how it was mis-encoded).
    'ý': "'",
    # Curly double quotes (opening, closing) -> straight double quote.
    '“': '"',
    '”': '"',
    # Curly single quotes (opening, closing) -> straight apostrophe.
    '‘': "'",
    '’': "'",
    # Em dash and en dash -> hyphen.
    '—': '-',
    '–': '-',
}

def replace_characters(text, mapping):
    """Replace every occurrence of each key of ``mapping`` in ``text`` by its value.

    Replacements are applied one after another in the iteration order of
    ``mapping``.

    Parameters
    ----------
    text : str or any
        The text to clean. Non-string values (for example the float NaN that
        pandas uses for missing cells, or None) are returned unchanged
        instead of raising ``AttributeError``.
    mapping : dict[str, str]
        Substring -> replacement pairs.

    Returns
    -------
    str or any
        The cleaned text, or the original object when it is not a string.
    """
    # Missing cells are not strings and have no .replace(); pass them through.
    if not isinstance(text, str):
        return text
    for wrong, correct in mapping.items():
        text = text.replace(wrong, correct)
    return text

# Run each entry of the 'text' column through replace_characters, using
# replacement_map as the substitution table.
df['text'] = df['text'].apply(replace_characters, mapping=replacement_map)

def correct_sequences(text):
    """Collapse runs of straight apostrophes into a single double quote.

    Exactly six, then four, then three consecutive apostrophes are each
    replaced by one ``"``, in that order. Pairs of apostrophes are left alone,
    and a run of five becomes ``"'`` (four are replaced, one remains).

    Parameters
    ----------
    text : str or any
        The text to clean. Non-string values (for example NaN/None for a
        missing cell) are returned unchanged instead of raising
        ``AttributeError``.

    Returns
    -------
    str or any
        The corrected text, or the original object when it is not a string.
    """
    # Missing cells are not strings and have no .replace(); pass them through.
    if not isinstance(text, str):
        return text
    # Longest run first, so a long run is not consumed piecewise by a shorter
    # pattern.
    for run in ("''''''", "''''", "'''"):
        text = text.replace(run, '"')
    return text

# Pass each entry of the 'text' column through correct_sequences.
df['text'] = df['text'].map(correct_sequences)

# # Optionally, save the cleaned data back to a CSV file
# output_file_path = 'path_to_your_output_file.csv'  # Update this path to your desired output location
# df.to_csv(output_file_path, index=False, encoding='utf-8')

# print("Data cleaning complete and saved to:", output_file_path)


In [33]:
# Write the cleaned frame to file_path as UTF-8, without the index column.
# NOTE(review): file_path is the same file the frame was loaded from, so this
# overwrites the input in place and a later re-run of the load cell will read
# already-cleaned data - presumably intentional; confirm.
df.to_csv(file_path, index=False, encoding='utf-8')