In [1]:
# Flag to set if you've made manual changes to the GPT-annotated dataset outputs, and you don't want to accidentally overwrite it/cause problems by running the code that generates these outputs again
annotated_dataset_has_been_manually_edited = False

# You should also set this to true if you don't have the actual combined script files

In [2]:
# Process Text (raw) 
# But the functions will work on the later text, too.
import re
testval = 0

# Step 1: Remove Unwanted Strings
# Regex to match unwanted patterns enclosed in []
unwanted_pattern = re.compile(r"\[color index=\".*?\"\]|\[(?!name|line|%p).*?\]")
def remove_unwanted_strings(text):
    return unwanted_pattern.sub('', text)

# Step 2: Parsing the text
# I'll update the regular expressions to exclude the delimiters.
name_regex = re.compile(r"\[name\](.*?)\[line\]")
dialogue_regex = re.compile(r"\[line\](.*?)\[%p\]")
monologue_regex = re.compile(r"^(.*?)(?=\[%p\])")

def makecols(str):
    global testval
    """Returns a tuple of (speaker, dialogue) from a single line from the script"""
    name_results = name_regex.search(str)
    dialogue_results = dialogue_regex.search(str)
    if name_results is None:
        monologue_results = monologue_regex.search(str)
        return ('UNSPOKEN', monologue_results.group(1) if monologue_results else "")
    try: 
        return (name_results.group(1).strip(), dialogue_results.group(1).strip())
    except:
        print(f"This is the name_results: {name_results}.\nAnd this is the dialogue: {dialogue_results}")
        testval += 1
        return ('ERROR!', '')

def not_empty_monologue(tup):
    if (tup[0] == 'UNSPOKEN') and (tup[1] == '') or (tup[0] == "ERROR!"):
        return False
    return True

# Step 3: Final Processing
def process_script(filename):
    """Returns a list of tuples of (speaker, dialogue) from a script file, filters out empty monologue lines"""
    with open(filename, 'r') as f:
        raw_script = f.read()

    # Remove unwanted strings
    cleaned_script = remove_unwanted_strings(raw_script)

    # Split the cleaned_script into lines and filter out empty lines
    lines = [line.strip() for line in cleaned_script.split('\n') if line.strip()]

    # Process each line to make a tuple of (speaker, dialogue)
    script_tuples = list(map(makecols, lines))
    script_tuples = list(filter(not_empty_monologue, script_tuples))

    
    return script_tuples

In [3]:
if not annotated_dataset_has_been_manually_edited:
    script_tuples = process_script('combined_script.txt')
    print(script_tuples[:20])  # Just printing the first 10 for visualization


    script = process_script('combined_script.txt')
    print(testval)

This is the name_results: <re.Match object; span=(0, 19), match='[name]Rintaro[line]'>.
And this is the dialogue: None
[('Mayuri', '“Hey, what are you mumbling about?”'), ('UNSPOKEN', 'There’s no sound from the phone against my right ear. Only silence.'), ('UNSPOKEN', 'I am baking in the summer sun.'), ('UNSPOKEN', 'Sweat slowly slides down my chin and drips onto the asphalt.'), ('Mayuri', '“Okarin? Earth to Okarin!”'), ('UNSPOKEN', 'A girl is standing in front of me.'), ('UNSPOKEN', 'She calls my name with an inquisitive tilt of her head.'), ('UNSPOKEN', 'We are about to infiltrate deep into enemy territory. Yet despite the imminent risk of death, there is no hint of tension on her innocent, childlike features.'), ('UNSPOKEN', 'I cover my phone’s mouthpiece and turn to the girl with an index finger to my lips.'), ('Mayuri', '“You talking to someone?”'), ('UNSPOKEN', 'I nod and put my phone back to my ear.'), ('UNSPOKEN', 'Still no sound from the other side.'), ('UNSPOKEN', 'My contact

In [4]:
# remove_unwanted_strings("There’s no sound from the phone against my right ear. Only silence.[%p]")
makecols("There’s no sound from the phone against my right ear. Only silence.[%p]")
# monologue_regex.search("There’s no sound from the phone against my right ear. Only silence.[%p]")[1]

('UNSPOKEN',
 'There’s no sound from the phone against my right ear. Only silence.')

In [5]:
# CHARACTER CHOICE #
# Change the global values here if you want to change the character dataset being created, and the person the "user" is roleplaying as.

# The character the user is roleplaying as
user_char = "Rintaro"

# The characters whose lines the model will be trained on
model_chars = ["Kurisu", "Luka", "Faris", "Mayuri", "Itaru", "Suzuha",] # this is kinda outdated now that I'm training on all of them, but 

# NOTE: DOUBLE CHECK THAT THESE ARE RIGHT BEFORE RUNNING THE NOTEBOOK #

In [6]:
script_tuples[1]

('UNSPOKEN',
 'There’s no sound from the phone against my right ear. Only silence.')

In [7]:

# Define Tuple List Processors (before the example generation)
lines_merged = 0
lines_with_space_issues = 0
lines_with_bad_quotes = 0
from tqdm import tqdm # it's not machine learning if there's no progress bar


def remove_only_ellipsis_lines(tlist, index=9999):
    """Remove lines that only contain ellipsis."""
    return [(speaker, line) for speaker, line in tqdm(tlist) if line.replace('“','').replace('”','') != "..."]

def merge_consecutive_lines(tlist, index=9999):
    merged_tlist = []
    last_speaker = None
    global lines_merged
    for speaker, line in tqdm(tlist):
        line_filtered = line.replace("“",'').replace("”",'')
        if not merged_tlist or speaker != last_speaker:
            # New speaker or first dialogue, just add it to the list
            merged_tlist.append((speaker, line.replace("“",'').replace("”",'')))
        else:
            # Same speaker as before, concatenate the lines
            prev_speaker, prev_line = merged_tlist.pop()
            merged_tlist.append((prev_speaker, (prev_line + " " + line).replace("“",'').replace("”",'')))
            print(f"merged a line at index {index}. Prev speaker: {prev_speaker} Speaker: {speaker}")
            lines_merged += 1
        last_speaker = speaker
    return merged_tlist # why do this step here? Because I don't want to iterate over the dataset twice, and monologues should count when examples are being generated with the sliding window, so I can't remove them in the usual spot.


def add_space_after_punctuation(tlist, index=9999):
    corrected_tlist = []
    global lines_with_space_issues
    for speaker, line in tqdm(tlist):
        # Add a space wherever there is a punctuation mark followed by a letter, excluding ellipsis
        corrected_line = re.sub(r'([.,!?])(?<!\.\.\.)(\w)', r'\1 \2', line)
        if corrected_line != line:
            lines_with_space_issues += 1
            print("Added a space at index ", index)
        corrected_tlist.append((speaker, corrected_line))
    return corrected_tlist

def replace_odd_quote(tlist,index=9999):
    corrected_tlist = []
    global lines_with_bad_quotes
    for speaker, line in tqdm(tlist):
        corrected_line = line.replace("‘", "'").replace("’", "'")
        if corrected_line != line:
            lines_with_bad_quotes += 1
            print("replaced quote at index ", index)
        corrected_tlist.append((speaker, corrected_line))
    return corrected_tlist

def add_quotes_to_dialogue(tlist,index=9999):
    """Adds quotes to a pure-dialogue line. Do NOT use on action-annotated lines: only on the first initial tlists."""
    corrected_tlist = []
    for speaker, line in tqdm(tlist):
        if speaker != "UNSPOKEN":
            corrected_line = '"' + line + '"'
        else:
            corrected_line = line
        corrected_tlist.append((speaker, corrected_line))
    return corrected_tlist

def call_multiple_processors(*args):
    """returns a callback that calls all processing functions on the provided tuple list and return the new tuple list/
    Mapped over CONVERSATIONS, not pure script tuples
    """
    def processor(item):
        idx, tlist = item
        tuple_list = tlist.copy()
        for func in args:
            tuple_list = func(tuple_list,index=idx)
        return tuple_list
    return processor

In [8]:
if not annotated_dataset_has_been_manually_edited:
    # Call the tuple list processors
    script_tuples_no_ellipsis = remove_only_ellipsis_lines(script_tuples)
    print("Step 1 complete")
    script_tuples_no_consecutive = merge_consecutive_lines(script_tuples_no_ellipsis)
    print("Step 2 complete")
    script_tuples_punctuation_fixed = add_space_after_punctuation(script_tuples_no_consecutive)
    print("Step 3 complete")
    script_tuples_quote_fixed = replace_odd_quote(script_tuples_punctuation_fixed)
    print("Step 4 complete")
    script_tuples_quotes_added = add_quotes_to_dialogue(script_tuples_punctuation_fixed)
    print("All steps complete")

100%|██████████| 28191/28191 [00:00<00:00, 529448.01it/s]


Step 1 complete


100%|██████████| 27738/27738 [00:00<00:00, 548235.50it/s]


merged a line at index 9999. Prev speaker: UNSPOKEN Speaker: UNSPOKEN
merged a line at index 9999. Prev speaker: UNSPOKEN Speaker: UNSPOKEN
merged a line at index 9999. Prev speaker: UNSPOKEN Speaker: UNSPOKEN
merged a line at index 9999. Prev speaker: UNSPOKEN Speaker: UNSPOKEN
merged a line at index 9999. Prev speaker: UNSPOKEN Speaker: UNSPOKEN
merged a line at index 9999. Prev speaker: UNSPOKEN Speaker: UNSPOKEN
merged a line at index 9999. Prev speaker: UNSPOKEN Speaker: UNSPOKEN
merged a line at index 9999. Prev speaker: UNSPOKEN Speaker: UNSPOKEN
merged a line at index 9999. Prev speaker: UNSPOKEN Speaker: UNSPOKEN
merged a line at index 9999. Prev speaker: Rintaro Speaker: Rintaro
merged a line at index 9999. Prev speaker: UNSPOKEN Speaker: UNSPOKEN
merged a line at index 9999. Prev speaker: UNSPOKEN Speaker: UNSPOKEN
merged a line at index 9999. Prev speaker: UNSPOKEN Speaker: UNSPOKEN
merged a line at index 9999. Prev speaker: UNSPOKEN Speaker: UNSPOKEN
merged a line at index

100%|██████████| 14349/14349 [00:00<00:00, 458258.98it/s]


Added a space at index  9999
Added a space at index  9999
Added a space at index  9999
Added a space at index  9999
Added a space at index  9999
Added a space at index  9999
Added a space at index  9999
Added a space at index  9999
Added a space at index  9999
Added a space at index  9999
Added a space at index  9999
Added a space at index  9999
Added a space at index  9999
Added a space at index  9999
Added a space at index  9999
Added a space at index  9999
Added a space at index  9999
Added a space at index  9999
Added a space at index  9999
Added a space at index  9999
Added a space at index  9999
Added a space at index  9999
Added a space at index  9999
Added a space at index  9999
Added a space at index  9999
Added a space at index  9999
Added a space at index  9999
Added a space at index  9999
Added a space at index  9999
Added a space at index  9999
Added a space at index  9999
Added a space at index  9999
Added a space at index  9999
Added a space at index  9999
Added a space 

100%|██████████| 14349/14349 [00:00<00:00, 520704.50it/s]


replaced quote at index  9999
replaced quote at index  9999
replaced quote at index  9999
replaced quote at index  9999
replaced quote at index  9999
replaced quote at index  9999
replaced quote at index  9999
replaced quote at index  9999
replaced quote at index  9999
replaced quote at index  9999
replaced quote at index  9999
replaced quote at index  9999
replaced quote at index  9999
replaced quote at index  9999
replaced quote at index  9999
replaced quote at index  9999
replaced quote at index  9999
replaced quote at index  9999
replaced quote at index  9999
replaced quote at index  9999
replaced quote at index  9999
replaced quote at index  9999
replaced quote at index  9999
replaced quote at index  9999
replaced quote at index  9999
replaced quote at index  9999
replaced quote at index  9999
replaced quote at index  9999
replaced quote at index  9999
replaced quote at index  9999
replaced quote at index  9999
replaced quote at index  9999
replaced quote at index  9999
replaced q

100%|██████████| 14349/14349 [00:00<00:00, 2181056.32it/s]

All steps complete





Notice that the tuple list processing functions don't do things that need to be repeated for the reading of the annotated script stuff. They're one-off operations. Thus they do not need to be abstracted further

In [9]:
# Create conversations from raw text
from transformers import AutoTokenizer
from tqdm import tqdm # it's not machine learning if there's no progress bar

tokenizer = AutoTokenizer.from_pretrained("Gryphe/MythoMax-L2-13b")

def generate_examples(script, tokenizer, model_char_count_min=1, window_length=10, user_char_count_min=1, max_lines_without_model_char=10):
    """Extracts useful conversations from the script according to a specific algorithm:

    1. A conversation is defined as a sequence of lines where the model_char speaks at least model_char_count_min times and the user_char speaks at least user_char_count_min times.
    2. A conversation ends when the model_char has not spoken for max_lines_without_model_char lines.
    3. A conversation is saved and a new one started if it is longer than window_length lines.
    
    """
    # MAX_TOKENS = 1500  # This produced really really good examples, but they were too large for GPT-4 to annotate while remembering its instructions, so I had to reduce it
    MAX_TOKENS = 700 # WORKED EARLIER
    # MAX_TOKENS = 800 # EXPERIMENT TO GET LONGER EXAMPLES

    examples = []
    sliding_window = []
    example = []
    model_char_counter = 0
    user_char_counter = 0
    lines_without_model_char = 0
    making_conversation = False

    for dialogue in tqdm(script):
        speaker, line = dialogue

        if len(sliding_window) == window_length:
            sliding_window.pop(0)  # Remove first element

        sliding_window.append(dialogue)

        # Check if there are more than model_char_count_min spoken lines from model_char across sliding_window
        model_char_counter = sum(1 for d in sliding_window if d[0] in model_chars)
        user_char_counter = sum(1 for d in sliding_window if d[0] == user_char)

        if speaker in model_chars:
            lines_without_model_char = 0  # Reset count
        else:
            lines_without_model_char += 1  # Increment count
            
        can_start_conversation = model_char_counter >= model_char_count_min and user_char_counter >= user_char_count_min
        should_stop_conversation = making_conversation and (len(tokenizer.encode(' '.join([d[1] for d in example]))) > MAX_TOKENS or lines_without_model_char > max_lines_without_model_char)
        
        if making_conversation:
            if should_stop_conversation: # making conversation and should stop
                examples.append(example)
                example = []
                sliding_window = []
                model_char_counter = 0
                user_char_counter = 0
                lines_without_model_char = 0
                making_conversation = False
            else: # making conversation and should not stop
                example.append(dialogue)
        elif can_start_conversation: # not making conversation and should start, by appending an example to conversation as well as the entire sliding window
            start_appending = False
            for d in sliding_window:
                speaker, _ = d  # Extract the speaker from the tuple
                if not start_appending:
                    if speaker in ["UNSPOKEN", user_char] or speaker in model_chars:
                        start_appending = True  # Start appending from this point onward
                if start_appending:
                    example.append(d)
            sliding_window = []
            making_conversation = True

    if example:  # Add last example if it's non-empty
        examples.append(example)

    return examples

In [10]:
if not annotated_dataset_has_been_manually_edited:
    created_examples_script = generate_examples(script_tuples_quotes_added, tokenizer,)
    print(len(created_examples_script))

100%|██████████| 14349/14349 [00:07<00:00, 1917.57it/s]

496





In [11]:
created_examples_script[0]

[('Mayuri', '"Hey, what are you mumbling about?"'),
 ('UNSPOKEN',
  'There’s no sound from the phone against my right ear. Only silence. I am baking in the summer sun. Sweat slowly slides down my chin and drips onto the asphalt.'),
 ('Mayuri', '"Okarin? Earth to Okarin!"'),
 ('UNSPOKEN',
  'A girl is standing in front of me. She calls my name with an inquisitive tilt of her head. We are about to infiltrate deep into enemy territory. Yet despite the imminent risk of death, there is no hint of tension on her innocent, childlike features. I cover my phone’s mouthpiece and turn to the girl with an index finger to my lips.'),
 ('Mayuri', '"You talking to someone?"'),
 ('UNSPOKEN',
  'I nod and put my phone back to my ear. Still no sound from the other side. My contact is wise to maintain silence. The whole area could be bugged.'),
 ('Rintaro',
  '"...No, I was just talking to someone. Everything’s fine. I’m about to infiltrate the assembly hall."'),
 ('UNSPOKEN',
  'Still no reply. Looks li

In [12]:
# NOTE: DATA INSPECTION CELL #
if not annotated_dataset_has_been_manually_edited:
    created_examples_script_modified = [i + [("NOTE", "---NEW_CONV---")] for i in created_examples_script]
    created_examples_script_flattened = [item for sublist in created_examples_script_modified for item in sublist]
    with open("script_dump.txt", "w") as f:
        f.write('\n'.join([l[0] + ": " + l[1] for l in created_examples_script_flattened]))

    # find out how many examples only have one kurisu line:
    potentially_bad_examples = [i for i in created_examples_script if len([j for j in i if j[0] in model_chars]) <= 2]
    created_examples_script_processed = [i for i in created_examples_script if len([j for j in i if j[0] in model_chars]) >= 2]
    
    # Take the first of those examples with only one kurisu line and print it out:
    print(len(potentially_bad_examples))
    print(potentially_bad_examples[2])

42
[('UNSPOKEN', 'So far, the Future Gadget Laboratory has completed a total of eight inventions. As I explained to Alpacaman, the lab’s primary goal is to develop weapons for the war against the Dark Dominion, led by the Organization, that rules the world from the shadows. At present, we haven’t completed any inventions of that sort. On the contrary. We haven’t even figured out what we should make. But along the way, we have managed to create some ingenious Future-ish Gadgets as a byproduct of our research. It is a fundamental truth of science that great inventions are often created by accident. In other words, serendipity. Allow me to introduce our glorious Future Gadgets. Gadget No. 1, the Bit Particle Gun. Gadget No. 2, the Bamboo Helicam. Gadget No. 3, Could This Be Ora Ora!?. Gadget No. 4, Moad Snake. Gadget No. 5, ’Once Again I’ve Made A Worthless Object’ by Goemon. Gadget No. 6, the Cyalume Saber. Gadget No. 7, Ghost in the Ball. They can all be seen on the website Daru made, s

OK so brief inspection of the script: the conversations are too short and sparse, I need to increase the number of lines without Kurisu for S;G as it is monologue heavy.

I'll have to make sure in my annotation prompt that the AI does not add actions to simple thoughts on the part of Okabe. Or maybe it should... so that the model gets used to continuing a train of thought?

Results of additional inspection: convs with only 2 Kurisu examples might be removal material; lines with 3 should be kept definitely; lines with 1 are being removed anyway so it doesn't make sense to have min_kurisu_lines be lower than 2

Even with the monologue added back in, the phone context is missing. Oh well.

Will need to make clear that unspoken can narrate both actions and Okabe's thoughts.

Smaller window stops waste at the start of a conversation, smaller max tokens stop waste at end of a conversation, but a smaller window makes it more likely that an example is missed (problem mitigated somewhat now that I'm doing the line merging BEFORE the example generation). A smaller max token size means that some of the really long conversations that are really really good get lost.

In [13]:
# Uncomment if you want to read in the conversations from preexisting files
import os

# Previous function to read dialogue from a single file
def read_dialogue(file_path):
    speaker_line_tuples = []
    
    with open(file_path, 'r') as file:
        for line in file:
            line = line.strip()
            if not line:
                continue
            split_line = line.split(":", 1)
            if len(split_line) == 2:
                speaker, line_text = split_line
                speaker = speaker.strip()
                line_text = line_text.strip()
                speaker_line_tuples.append((speaker, line_text))
    
    return speaker_line_tuples

# Function to read dialogues from all files in a directory and sort them by index
def read_all_dialogues(directory_path):
    # List to store all dialogues (each dialogue is a list of tuples)
    all_dialogues = []
    
    # Get list of all filenames in the directory
    filenames = [f for f in os.listdir(directory_path) if f.endswith('_conversation.txt')]
    
    # Sort filenames by their numerical index
    filenames.sort(key=lambda x: int(x.split('_')[0]))
    
    for filename in filenames:
        file_path = os.path.join(directory_path, filename)
        dialogue = read_dialogue(file_path)
        all_dialogues.append(dialogue)
    
    return all_dialogues

# if not annotated_dataset_has_been_manually_edited:
#     directory_path = "./conversations/"
#     created_examples_script = read_all_dialogues(directory_path)


In [14]:
# Define Tuple List to Training example format
def generate_training_examples(conversation):
    training_examples = []
    temp_dialogue = []
    for idx, dialogue in enumerate(conversation):
        speaker, _ = dialogue
        temp_dialogue.append(dialogue)
        if speaker in model_chars or speaker == "Okabe" or speaker == "Rintaro" and idx != 0:
            training_examples.append(temp_dialogue.copy())  # Add up to and including current line
    return training_examples

In [15]:
# Create training examples
if not annotated_dataset_has_been_manually_edited:
    training_data_conversations = list(map(generate_training_examples, created_examples_script_processed))

    print(len(training_data_conversations))

467


In [16]:
if not annotated_dataset_has_been_manually_edited:
    # DEBUG # see first element of training convs
    training_data_conversations[59]
    # NOT DEBUG # and filter out the examples that are too small to make sense
    training_data_conversations_filtered = list(filter(lambda x: len(x) > 2, training_data_conversations))
    training_data_conversations_bad = list(filter(lambda x: len(x) <= 2, training_data_conversations))
    # len(processed_conversations)
    print(training_data_conversations_filtered[99][-1])
    print(len(training_data_conversations_filtered))

[('Kurisu', '"!!"'), ('Rintaro', '"I am not Okabe! I am HOUOUIN KYOUMA! How many times do I have to tell you, Christina!?"'), ('Kurisu', '"...God, you’re so full of yourself."'), ('Rintaro', '"From now on, call me Hououin. That or Kyouma."'), ('Kurisu', '"No."'), ('UNSPOKEN', 'Kurisu snaps at me and turns away. Are those tears I see in her eyes?'), ('Rintaro', '"Are you... crying?"'), ('Kurisu', '"I’m not crying! It’s not like I was scared of you for a second there or anything, and I definitely wasn’t relieved when you started talking like an idiot again, okay!?"'), ('UNSPOKEN', 'She stalks over to the window and stares outside. I see her wipe her eyes with the back of her hand. Seems she was crying after all.'), ('Rintaro', '"Daru, why is my assistant crying?"'), ('Itaru', '"Uh, she said so herself just now."'), ('Rintaro', '"All I did was correct her on my name."'), ('Itaru', '"Maybe she only acts strong but is actually weak on the inside. That would be pretty moe."'), ('UNSPOKEN', '

In [17]:
import openai
from tqdm import tqdm
openai.api_key = "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" # don't even try it, by the time I open this I will have deleted this key

In [18]:
openai_scenario_prompt = [
                {"role": "system", "content": """You are an expert scenario-writing and prompt-engineering AI. Your task is to write the context for an interaction between characters from the visual novel Steins;Gate in a "scenario" — a 5-sentence summary about what's happened until the point the interaction STARTS at (writing under the assumption that the reader knows who the characters are and what some of their general traits are). You should use the lines provided to help determine the factual and emotional context behind a given scene.

Think step-by-step, and explain your plan to write an accurate and compelling scenario for the provided context before you actually write the scenario.

Here are two principles you should incorporate into your scenario:
1. Your first sentence should explain the context of the scene: where it takes place, what exactly that place is (in general terms) and what each of the characters are doing there. Focus on named characters when it comes to motivations.
2. End with a statement(s) that describe where the scene is going — specifically, what each of the named characters is trying to do in the scene.

Here's an example of what a scenario might look like: 
\"\"\"In a dark, seemingly abandoned place, genius scientist Kurisu finds herself alone with Okabe, a fellow scientist and friend who she's come to know well through their shared work in inventing. They've been through many experiences together, each one deepening their understanding of the other. In the gloom, Kurisu occupies herself with mending a tear in Okabe's coat, a task that seems oddly domestic in the otherwise strange setting, while Okabe talks to her as casually as he can manage. As Kurisu sews, she struggles to initiate a conversation with Okabe about a decision she knows he's been wrestling with — a decision about which of his friends he needs to let die — hoping to provide some sort of comfort or guidance; Okabe, meanwhile, attempts to cling to any normalcy he can. This scene's emotions are dark and depressing: it is a dreary, bitter struggle where two people attempt to keep their heads above metaphorical water, and eventually fail.\"\"\"

Note in the above example how immense focus is placed on the emotional content and context of the scene,  as well as the relationships between characters.

[To help orient you as you determine which part of the plot a conversation is taking in,  here is a plot summary of Steins;Gate:

Okabe Rintaro, a "mad scientist," meets genius Kurisu at a time travel lecture. They argue, and he later finds her apparently dead. Texting this to his friend Daru activates a prototype time machine, altering the timeline.

Kurisu turns out to be alive. Okabe Rintaro and friends, including Mayuri and Daru, discover their "Phone Microwave" sends texts, or "D-mails," to the past. They use D-mails to fulfill wishes for friends like Moeka, Faris, Luka, and Suzuha. Kurisu joins the lab and helps improve the time machine.

Okabe alone remembers original timelines due to his "Reading Steiner" ability. They also create "Time Leaps," sending memories to the past. However, SERN discovers them, raids the lab, and kills Mayuri. Rintaro time-leaps repeatedly but can't save her.

To fix things, Okabe, aided by Kurisu, undoes all the D-mails, causing personal pain. They grow closer, but Rintaro realizes the first D-mail about Kurisu's "death" caused Mayuri's fate. Undoing it means sacrificing Kurisu, which he reluctantly does at Kurisu's request (after trying many alternatives) to save Mayuri.

Time-traveler Suzuha then contacts Okabe, urging him to prevent World War 3 by saving Kurisu. Okabe accidentally kills Kurisu himself, but gets advice from his future self on reaching a timeline—Steins Gate—where both friends live. He succeeds by faking Kurisu's death.

In the Steins Gate timeline, Okabe and Kurisu encounter each other, experiencing déjà vu from past timelines.]

Take special care to write a scenario that would make sense to someone ignorant of the overall plot of Steins;Gate. IE, you are not just trying to write a scenario that makes sense only when viewed alongside the plot summary; you are writing something that gives adequate context to a scene by itself alone. Instead of using Steins;Gate specific terminology, you will use generic words and explanations to give context to a scene.

Note that UNSPOKEN lines can either be narration about what's happening, or Okabe's thoughts; they're all from Okabe's point of view, however.

One last pointer: keep the language simple. Which characters are where, under what circumstances, and what each of them feel and will do. And what the general emotion of the scene is. The scene itself will do most of the talking. Keep the scenario 5 sentences long at most. Instead of mentioning events in the far future, you will concentrate on the event at hand and the things that led up to it."""},
            ]

In [34]:
# A FUNCTION THAT LETS YOU CALL OPENAI ON ALL THE EXAMPLES
import openai
import os

def write_context_to_file(training_data_example, destination_directory, example_index): # for easier inspection
    """Writes a training example (conversation, the full thing) to a file in the destination directory, so that the input for a scenario can be inspected"""
    full_conversation = training_data_example[-1]
    context = '\n'.join([f'{speaker}: {line}' for speaker, line in full_conversation])
    
    filename = os.path.join(destination_directory, f'{example_index:03d}_conversation.txt') # I'm paying for the tokens, I damn well want to see them

    # Write the scenario to the file
    with open(filename, 'w') as f_1:
        f_1.write(context)

if not annotated_dataset_has_been_manually_edited:
    for idx, content in enumerate(training_data_conversations_filtered):
        """Write all training examples to indexed files"""
        write_context_to_file(content, 'conversations', idx)
    

def create_scenario(training_data_example, destination_directory, example_index):
    """Creates a scenario for a training example and writes it to a file in the destination directory"""
    full_conversation = training_data_example[-1]
    context = '\n'.join([f'{speaker}: {line}' for speaker, line in full_conversation])

    if not os.path.exists(os.path.join(destination_directory, f'{example_index:03d}.txt')):
        prompt = openai_scenario_prompt + [{
            "role" : "user",
            "content" : f"""Context: \"\"\"{context.replace("Rintaro:","Okabe:")}\"\"\"
            
Remember, your goal is to write a scenario that describes the scene.
First, Brainstorm and think through each of the points below. Establish where in the timeline the provided scene most likely falls.
Then write "Scenario:" followed by text that...
1. Describes the location and the characters present at the start of the scene.
2. Describes the relationships between characters.
3. Describes the current emotional and mental states of each character at the start of the scene.
4. Explains quickly any significant events that have happened leading up to the scene, and explains in more detail what events are happening DURING the scene. Defines any special Steins;Gate-specific terms.
5. States what each character's goal is in the scene.
6. Finally, it states what the overall mood of the scene is ("upbeat", "depressing", etc.).
7. If the lines shown cover multiple scenes, describe the emotions of each scene and the context of each scene.

Special note 1: The scenario you write should set up the scene, not summarize it, and not hint at its conclusion. It describes the moment up to Kurisu's first message.
Special note 2: DON'T actually metion that this is happening during Steins;Gate, and don't describe elements of the plot in your scenario that aren't related to the ongoing scene. Be FOCUSED.
Special note 3: if Steins;Gate-specific terms appear in the scene (e.g., PhoneWave, SERN, jellymen), define them in the scenario — in a way that doesn't disrupt its flow too much. List all such special terms in your planning phase.
Do not forget to brainstorm and think through everything before writing "Scenario:" and then the scenario."""
            }]
        if example_index == 0:
            print("------".join([d["content"] for d in prompt]))
        response = openai.ChatCompletion.create(
            model="gpt-4",
            temperature=0.7,
            # top_p = 0.9,
            messages=prompt
        )
    
        scenario = response['choices'][0]['message']['content']
        
        filename_cot_debug = os.path.join(destination_directory, f'{example_index:03d}_cot_debug.txt') # I'm paying for the tokens, I damn well want to see them

        # Write the scenario to the file
        with open(filename_cot_debug, 'w') as f_1:
            f_1.write(scenario)

        try:
            # # Assume "Scenario:\n" is followed by the actual scenario
            scenario = re.search('Scenario:(.*)', scenario, re.DOTALL).group(1)
        except: # incase of regex failure, just skip and hope that the second pass will catch it
            return

        # Create a filename based on the example index
        filename = os.path.join(destination_directory, f'{example_index:03d}.txt')

        # Write the scenario to the file
        with open(filename, 'w') as f_2:
            f_2.write(scenario)
    else:
        print(f"Skipping {example_index:03d} because it already exists.")

In [35]:
# NOTE WARNING -- THIS CELL SPENDS MONEY IF YOU HAVE NOT GENERATED TRAINING EXAMPLES YET.
# If you have not generated training examples yet, this cell will generate them and then generate scenarios for them.

if not annotated_dataset_has_been_manually_edited:

    for idx, content in enumerate(tqdm(training_data_conversations_filtered)):
        # write_context_to_file(content, 'contexts', idx)
        create_scenario(content, 'scenarios', idx)

    print("\nBeginning Second Pass...\n")

    for idx, content in enumerate(tqdm(training_data_conversations_filtered)): # run it again to catch everything that failed the first time. The fact that already-generated scenarios are skipped means this doesn't cost any unneeded money.
        # write_context_to_file(content, 'contexts', idx)
        create_scenario(content, 'scenarios', idx)

  0%|          | 0/467 [00:00<?, ?it/s]

Skipping 000 because it already exists.
Skipping 001 because it already exists.
Skipping 002 because it already exists.
Skipping 003 because it already exists.
Skipping 004 because it already exists.
Skipping 005 because it already exists.
Skipping 006 because it already exists.
Skipping 007 because it already exists.
Skipping 008 because it already exists.
Skipping 009 because it already exists.
Skipping 010 because it already exists.
Skipping 011 because it already exists.
Skipping 012 because it already exists.
Skipping 013 because it already exists.
Skipping 014 because it already exists.
Skipping 015 because it already exists.
Skipping 016 because it already exists.
Skipping 017 because it already exists.
Skipping 018 because it already exists.
Skipping 019 because it already exists.
Skipping 020 because it already exists.
Skipping 021 because it already exists.
Skipping 022 because it already exists.
Skipping 023 because it already exists.
Skipping 024 because it already exists.


 18%|█▊        | 86/467 [00:33<02:26,  2.59it/s]

Skipping 086 because it already exists.
Skipping 087 because it already exists.
Skipping 088 because it already exists.
Skipping 089 because it already exists.
Skipping 090 because it already exists.
Skipping 091 because it already exists.
Skipping 092 because it already exists.
Skipping 093 because it already exists.
Skipping 094 because it already exists.
Skipping 095 because it already exists.
Skipping 096 because it already exists.
Skipping 097 because it already exists.
Skipping 098 because it already exists.
Skipping 099 because it already exists.
Skipping 100 because it already exists.
Skipping 101 because it already exists.
Skipping 102 because it already exists.
Skipping 103 because it already exists.
Skipping 104 because it already exists.
Skipping 105 because it already exists.
Skipping 106 because it already exists.
Skipping 107 because it already exists.
Skipping 108 because it already exists.
Skipping 109 because it already exists.
Skipping 110 because it already exists.


 28%|██▊       | 131/467 [01:10<03:12,  1.74it/s]

Skipping 131 because it already exists.
Skipping 132 because it already exists.
Skipping 133 because it already exists.
Skipping 134 because it already exists.
Skipping 135 because it already exists.
Skipping 136 because it already exists.
Skipping 137 because it already exists.
Skipping 138 because it already exists.
Skipping 139 because it already exists.
Skipping 140 because it already exists.
Skipping 141 because it already exists.
Skipping 142 because it already exists.
Skipping 143 because it already exists.
Skipping 144 because it already exists.
Skipping 145 because it already exists.
Skipping 146 because it already exists.
Skipping 147 because it already exists.
Skipping 148 because it already exists.
Skipping 149 because it already exists.
Skipping 150 because it already exists.
Skipping 151 because it already exists.
Skipping 152 because it already exists.
Skipping 153 because it already exists.
Skipping 154 because it already exists.
Skipping 155 because it already exists.


 51%|█████     | 236/467 [01:48<01:43,  2.22it/s]

Skipping 236 because it already exists.
Skipping 237 because it already exists.
Skipping 238 because it already exists.
Skipping 239 because it already exists.
Skipping 240 because it already exists.
Skipping 241 because it already exists.
Skipping 242 because it already exists.
Skipping 243 because it already exists.
Skipping 244 because it already exists.
Skipping 245 because it already exists.
Skipping 246 because it already exists.
Skipping 247 because it already exists.
Skipping 248 because it already exists.
Skipping 249 because it already exists.
Skipping 250 because it already exists.
Skipping 251 because it already exists.
Skipping 252 because it already exists.
Skipping 253 because it already exists.
Skipping 254 because it already exists.
Skipping 255 because it already exists.
Skipping 256 because it already exists.
Skipping 257 because it already exists.
Skipping 258 because it already exists.
Skipping 259 because it already exists.
Skipping 260 because it already exists.


100%|██████████| 467/467 [02:29<00:00,  3.12it/s]


Skipping 423 because it already exists.
Skipping 424 because it already exists.
Skipping 425 because it already exists.
Skipping 426 because it already exists.
Skipping 427 because it already exists.
Skipping 428 because it already exists.
Skipping 429 because it already exists.
Skipping 430 because it already exists.
Skipping 431 because it already exists.
Skipping 432 because it already exists.
Skipping 433 because it already exists.
Skipping 434 because it already exists.
Skipping 435 because it already exists.
Skipping 436 because it already exists.
Skipping 437 because it already exists.
Skipping 438 because it already exists.
Skipping 439 because it already exists.
Skipping 440 because it already exists.
Skipping 441 because it already exists.
Skipping 442 because it already exists.
Skipping 443 because it already exists.
Skipping 444 because it already exists.
Skipping 445 because it already exists.
Skipping 446 because it already exists.
Skipping 447 because it already exists.


100%|██████████| 467/467 [00:00<00:00, 33978.18it/s]

Skipping 000 because it already exists.
Skipping 001 because it already exists.
Skipping 002 because it already exists.
Skipping 003 because it already exists.
Skipping 004 because it already exists.
Skipping 005 because it already exists.
Skipping 006 because it already exists.
Skipping 007 because it already exists.
Skipping 008 because it already exists.
Skipping 009 because it already exists.
Skipping 010 because it already exists.
Skipping 011 because it already exists.
Skipping 012 because it already exists.
Skipping 013 because it already exists.
Skipping 014 because it already exists.
Skipping 015 because it already exists.
Skipping 016 because it already exists.
Skipping 017 because it already exists.
Skipping 018 because it already exists.
Skipping 019 because it already exists.
Skipping 020 because it already exists.
Skipping 021 because it already exists.
Skipping 022 because it already exists.
Skipping 023 because it already exists.
Skipping 024 because it already exists.





In [36]:
# Get scenarios back into notebook data from files

# read off every scenario, and make a list of them that (should) line up with the training data
# no guarantee of that if the training data has been manually modified. In that case, the flag at the top of the notebook will be true, and we'll use a different version of this
def make_scenario_list():
    scenario_list = []
    idx = 0
    while True:
        filename = f"scenarios/{idx:03d}.txt"
        if os.path.exists(filename):
            with open(filename, "r") as f:
                scenario_list.append(f.read())
            idx += 1
            print("made scenario " + str(idx))
        else:
            print("Stopped at scenario " + str(idx))
            print("Because " + filename + " doesn't exist")
            break
    return scenario_list
scenarios = make_scenario_list()

made scenario 1
made scenario 2
made scenario 3
made scenario 4
made scenario 5
made scenario 6
made scenario 7
made scenario 8
made scenario 9
made scenario 10
made scenario 11
made scenario 12
made scenario 13
made scenario 14
made scenario 15
made scenario 16
made scenario 17
made scenario 18
made scenario 19
made scenario 20
made scenario 21
made scenario 22
made scenario 23
made scenario 24
made scenario 25
made scenario 26
made scenario 27
made scenario 28
made scenario 29
made scenario 30
made scenario 31
made scenario 32
made scenario 33
made scenario 34
made scenario 35
made scenario 36
made scenario 37
made scenario 38
made scenario 39
made scenario 40
made scenario 41
made scenario 42
made scenario 43
made scenario 44
made scenario 45
made scenario 46
made scenario 47
made scenario 48
made scenario 49
made scenario 50
made scenario 51
made scenario 52
made scenario 53
made scenario 54
made scenario 55
made scenario 56
made scenario 57
made scenario 58
made scenario 59
made s

In [37]:
os.path.exists("./scenarios/086.txt")

True

In [38]:
def format_list(strings):
    if not strings:
        return ""

    if len(strings) == 1:
        return strings[0]
    
    if len(strings) == 2:
        return f"{strings[0]} or {strings[1]}"

    last = strings[-1]
    rest = ', '.join(strings[:-1])

    return f"{rest}, or {last}"

def list_of_chars_in_string(string):
    return format_list(list(filter(lambda c: (c + ":") in string ,model_chars,)))

In [39]:
annotation_prompt = [
                {"role": "system", "content": """You are an expert roleplaying AI with deep understanding of internet roleplay formats and extensive writing ability. 

- Your task is to convert raw text from the Visual Novel Steins;Gate into a roleplay format. 
- You will add physical actions done by the characters to their lines in a compelling, narrative way, that makes sense in the context of the scene you're modifying. 
- Actions should be surrounded by *asterisks*, and spoken things should be surrounded by "double quotes".
- You may also find it useful to add non-action, non-dialogue text to characters' responses, (such as 'she says' or other such generic connective terms) to make sentences make sense.
- Do not change which character speaks any given line.

All lines should be adapted to be in the first person, e.g., *I do X*.

Some lines are very important and have a lot of narrative/emotional weight. You may dramatically overhaul some lines to be stunning anchors of the scene. Here's an example:

\"\"\"
Kurisu: "Sigh... You still haven't made up your mind? You like Mayuri, don't you?"
\"\"\"

The context behind that line is that Kurisu is asking Okabe whose life he is going to save — hers or Mayuri's — near the end of the Visual Novel. It's in the middle of a very tense and emotional scene. You could enhance it to become: 

\"\"\"
Kurisu: *I hesitate, my fingers tracing a pattern on the cold concrete beneath me, as if it could somehow help me find the right words. My breath catches, and I feel a sting in my eyes. It's a vulnerability I seldom let myself feel.* "Sigh... Have you still not made up your mind?" *I search Okabe's face, looking for an answer, my voice trembling but firm.* "You like Mayuri, don't you?" *I muster every ounce of courage to ask the question, needing clarity in this whirlwind of emotions.*
\"\"\"

You can write as much as you want, conciseness is not a priority. Try to add at least 2 sentences of *actions* to every line you are given.

Some detailed instructions:
- Spoken words in the original should be left intact — you're adding to the script, not undoing it.
- Add actions and novel-like connective text to make dialogue more roleplay-like.
- Ensure continuity at all costs. If you ended up cutting or changing something for some reason, make sure that the lines after it make sense.
- Remember to write all lines in first person, no matter who the speaker is. E.g., do Mayuri: "Tutturu!" *I say as I walk into the Lab, a warm smile on my face. I'm so happy to see everyone~! I hope everyone is having a good day today~* instead of Mayuri: "Tutturu!" *She cheerfully greets everyone as she enters the room, exuding her usual cheerful aura.* Additionally, as in this example, make sure the thoughts/*actions* of each character match the personalities of the character.
- Outline the roleplay scene before writing.
    - Analyze the dialogue to understand what's happening physically.
    - Brainstorm character actions to reveal emotions and thoughts.
- Start roleplay text with "Roleplay:".
- For interrupted sentences, split and insert the interrupting action.
    - Example: Okabe: "I'll find a way to--" *I don't get the chance to finish my sentence* \nKurisu: *I snatch Okabe's phone out of his hands.*\nOkabe: "What are you doing!?" *I stammer at Kurisu.*
    - The interrupting character should not say new things—they should only do *actions.*
    - Also note that characters write in first person, but refer to other characters by their names or pronouns.
- Add *actions* to the ENTIRE scene.
- If a scene transition (for instance, Okabe and Kurisu leaving an assembly hall) can be explained with an *action*, add one that makes the transition between scenes manageable. 
- If there is any sort of transition between scenes, turn it into a long, descriptive *action* by Okabe that explains what happened in the interlude. However, whenever a character is speaking or taking *action*, their line must start with "TheirName:" in that format exactly. Anything else will break the script that is parsing your outputs so be careful about this."""},]

In [40]:
# def get_tokens_of_conv(conv):
#     """Gets the number of tokens in a conversation"""
#     return len(tokenizer.encode(' '.join([d[1] for d in conv[-1]])))

# # training_data_conversations_filtered[1][-1]

# get_tokens_of_conv(training_data_conversations_filtered[1])

In [41]:
# print('\n\n'.join([f'{speaker}: {line}' for speaker, line in training_data_conversations_filtered[1][-1]]))

In [42]:
def annotate_conversation(training_data_example, destination_directory, example_index):
    full_conversation = training_data_example[-1]
    context = '\n\n'.join([f'{speaker}: {line}' for speaker, line in full_conversation])

    scenario = scenarios[example_index]

    if not os.path.exists(os.path.join(destination_directory, f'{example_index:03d}.txt')):
        prompt = annotation_prompt + [{
            "role" : "user",
            "content" : f"""Dialogue for reformatting:
\"\"\"
{context.replace("Rintaro:", "Okabe:").strip()}
\"\"\"

## Be sure to remember:
1. Lines from UNSPOKEN represent narration of actions or thoughts from Okabe's POV. If it makes sense for one of these to be turned into an *action* by a character, do so. Be sure to always use first person.
2. Don't leave any UNSPOKEN lines on lines by themselves; all actions and narrations must be part of a character's line. Please, do not write "UNSPOKEN" at all in your response.
3. Every line you write in the roleplay must have a Character: saying it.
4. BE SURE to write the CORRECT speaker for any given line, I have seen a few cases where you accidentally switch who is saying a line and that messes up the whole scene. This applies to thoughts and actions too: Okabe should never *think* in the middle of other characters' lines, for instance. This avoids breaking the roleplay rule where people shouldn't act out other people's characters for them.
5. Remember that all lines should be adapted to be in the first person, e.g., *I do X*.
6. IMPORTANT: the writing should be powerful and nuanced, conveying small details that reveal characters' motivations, personalities, and thoughts, often without overtly stating them — it should be deep and poetic at once, and ought to make up for the lack of a visual element in plain text.  Write powerfully, but not pretentiously.
7. In your planning stage, explicitly mention the archetypes/personalities of each of the characters involved, and take brief notes on what word choices/writing styles you'll write their *actions and thoughts* in, considering that information.
8. Be varied in the *actions* you add. DO NOT just focus on facial expressions and the eyes: sounds, slight movements, sighs, recoiling or leaning forward... instead of being repetitive, be DIVERSE AND COMPELLING in your writing. Characters might interact with items in the area around them; you can embellish here and add things that aren't explicitly mentioned in the original scene, so long as it makes sense. And again, match the personalities of the characters involved: dramatic characters are flamboyant; shy characters are often quiet, etc. However characters have nuance: even flamboyant characters can be depressed and worn out under trying circumstances, and you should look out for this, always using the emotion most fitting for the situation and character.
9. All *actions* and *thoughts* should be written in first person for the character that's doing/thinking them. E.g., Mayuri: *I clap my hands together, my eyes wide and bright with anticipation.* "Let's try sending more!"
10. Do NOT forget to write out your plan/thoughts/brainstorming before outputting the final roleplay, and mention in this plan which 1–2 lines will be anchors, and what thematic direction you'll take them in."""
        }]
        if example_index == 0: # make sure not going catastrophically wrong
            print("------".join([d["content"] for d in prompt]))
        
        response = openai.ChatCompletion.create(
            model="gpt-4",
            temperature=0.7,
            top_p=0.9,
            messages=prompt
        )
    
        annotation = response['choices'][0]['message']['content']
        print(annotation)

        filename_cot_debug = os.path.join(destination_directory, f'{example_index:03d}_cot_debug.txt') # I'm paying for the tokens, I damn well want to see them
        try:
            annotation_filtered = re.search('Roleplay:(.*)', annotation, re.DOTALL).group(1)

            # Write the scenario to the file
            with open(filename_cot_debug, 'w') as f_1:
                f_1.write(annotation)

            # Assume "Scenario:\n" is followed by the actual scenario

            # Create a filename based on the example index
            filename = os.path.join(destination_directory, f'{example_index:03d}.txt')

            # Write the scenario to the file
            with open(filename, 'w') as f_2:
                f_2.write(annotation_filtered)
        except:
            print("ERROR in regex, GPT probably screwed up")
    else:
        print(f"Skipping {example_index:03d} because it already exists.")

In [45]:
if not annotated_dataset_has_been_manually_edited:
    
    # Create annotated training examples (same # of them as training examples and scenarios)
    for idx, example in enumerate(training_data_conversations_filtered):
        try:
            annotate_conversation(example, 'annotated_convs', idx)
        except:
            pass # prevent timeouts from screwing me

    # Create annotated training examples (same # of them as training examples and scenarios)
    for idx, example in enumerate(training_data_conversations_filtered):
        try:
            annotate_conversation(example, 'annotated_convs', idx)
        except:
            pass

Skipping 000 because it already exists.
Skipping 001 because it already exists.
Skipping 002 because it already exists.
Skipping 003 because it already exists.
Skipping 004 because it already exists.
Skipping 005 because it already exists.
Skipping 006 because it already exists.
Skipping 007 because it already exists.
Skipping 008 because it already exists.
Skipping 009 because it already exists.
Skipping 010 because it already exists.
Skipping 011 because it already exists.
Skipping 012 because it already exists.
Skipping 013 because it already exists.
Skipping 014 because it already exists.
Skipping 015 because it already exists.
Skipping 016 because it already exists.
Skipping 017 because it already exists.
Skipping 018 because it already exists.
Skipping 019 because it already exists.
Skipping 020 because it already exists.
Skipping 021 because it already exists.
Skipping 022 because it already exists.
Skipping 023 because it already exists.
Skipping 024 because it already exists.


In [28]:
len(scenarios)

85

In [None]:
# Check that all annotated files exist

for i in range(0,20): # you will want to modify the end value here to make it fit your dataset
    if not os.path.exists(f"./annotated_convs/{i:03d}.txt"):
        print("Problem in ", i)

In [None]:
# Define a different read function modified for annotated filenames:
def read_all_dialogues_annotated(directory_path):
    # List to store all dialogues (each dialogue is a list of tuples)
    all_dialogues = []
    
    # Get list of all filenames in the directory
    filenames = [f for f in os.listdir(directory_path) if not f.endswith('_cot_debug.txt')]
    
    # Sort filenames by their numerical index
    filenames.sort(key=lambda x: int(x.split('.')[0]))
    
    for filename in filenames:
        file_path = os.path.join(directory_path, filename)
        dialogue = read_dialogue(file_path)
        all_dialogues.append(dialogue)
    
    return all_dialogues

In [None]:
# Get annotated conversations back into notebook data from files

# # read off every annotated conversation, and make a list of them that lines up with the training data
annotated_conversations = read_all_dialogues_annotated("./annotated_convs")

# Reset global data analysis variables
lines_with_bad_quotes = 0
lines_merged = 0
lines_with_space_issues = 0

processed_annotated_conversations = list(map(call_multiple_processors(remove_only_ellipsis_lines,merge_consecutive_lines,add_space_after_punctuation,replace_odd_quote,), enumerate(annotated_conversations)))

print(lines_with_space_issues,lines_merged,lines_with_bad_quotes)
print(processed_annotated_conversations[5])

In [None]:
def extract_anchors(text):
    # Split the text by newline to get individual lines
    lines = text.split('\n')
    
    # Pattern for extracting the speaker, line number, and content
    pattern = r"(\w+) \((\d+)\): (.+)"
    
    # List to store the results
    result = []
    
    for line in lines:
        match = re.match(pattern, line)
        if match:
            # Extract the speaker, line number, and content from the matched groups
            speaker = match.group(1)
            line_number = int(match.group(2))  # Convert line number to integer
            content = match.group(3)
            
            # Append the values to the result list
            result.append((speaker, line_number, content))
    
    return result

In [None]:
# Anchor Annotate

# Steps: same as before but this time we format the conv with line numbers
# and replace the lines whose numbers match the newly-generated anchors

# For each thing in processed_annotated_conversations, call this on it
def create_anchors(training_data_example, destination_directory, example_index):
    anchor_prompt = """You are an expert creative writing AI with deep understanding of internet roleplay formats and masterful writing ability. 

You will be given a modified scene from the Visual Novel Steins;Gate. Each line spoken is numbered. Some lines in this scene are very important and have a lot of narrative/emotional weight. Your goal is to pick one or two of the numbered lines, based on relevance and content, and rewrite the *actions* of those lines to make them stunning anchors of the entire scene. This will be accomplished through a mixture of embellishment, creativity, and expansion of the original line. 

All lines should be adapted to be in the first person, e.g., *I do X*.

Here's an example. Consider the line:

\"\"\"
Kurisu: "Sigh... You still haven't made up your mind? You like Mayuri, don't you?" *She looks away, seemingly frustrated but also concerned.*
\"\"\"

The context behind that line is that Kurisu is asking Okabe whose life he is going to save — hers or Mayuri's — near the end of the Visual Novel. It's in the middle of a very tense and emotional scene. You could enhance it to become: 

\"\"\"
Kurisu: *I hesitate, my fingers tracing a pattern on the cold concrete beneath me, as if it could somehow help me find the right words. My breath catches, and I feel a sting in my eyes. It's a vulnerability I seldom let myself feel.* "Sigh... Have you still not made up your mind?" *I search his face, looking for an answer, my voice trembling but firm.* "You like Mayuri, don't you?" *I muster every ounce of courage to ask the question, needing clarity in this whirlwind of emotions.*
\"\"\"

Maintain the Character (number): line format of the line(s) you change."""
    full_conversation = training_data_example[-1]
    context = '\n\n'.join([f'{l[0]} ({i + 1}): {l[1]}' for i, l in enumerate(training_data_example)])

    scenario = scenarios[example_index]

    if not os.path.exists(os.path.join(destination_directory, f'{example_index:03d}.txt')):
        prompt = [{
            "role" : "system", "content" : anchor_prompt
            },
                    {
            "role" : "user",
            "content" : f"""Scenario/setting: \"\"\"{scenario}\"\"\"

Text to add some good anchors to:
\"\"\"
{context.replace("Rintaro:", "Okabe:").strip()}
\"\"\"

## Be sure to remember:
Recall that your task is to find 4 important lines that can be lengthened and embellished significantly, so as to give greater impact to important parts of a scene through absolutely stellar prose that makes up for the lack of a visual element in the plain text. The writing should be powerful and nuanced, conveying small details that reveal characters' motivations, personalities, and thoughts, without overtly stating them — it should be deep and poetic at once. About 4–5 sentences long (60-70 words per enhanced line).
1. Write using a variety of words and immense stylistic flair appropriate to the scene. Be creative and prioritize making the new lines compelling instead of 100% accurate to the original. Alternate short, snappy responses with long and detailed prose to give the text a good rhythm.
2. BE SURE to write the CORRECT speaker for any given line, I have seen a few cases where you accidentally switch who is saying a line and that messes up the whole scene. Also, do not break the important roleplay rule whereby characters should not act on behalf of other characters: Okabe should not think or take *actions* during one of Kurisu's lines, for instance. No one but the character whose line it is should speak on that line.
3. IMPORTANT: the writing should be powerful and nuanced, conveying small details that reveal characters' motivations, personalities, and thoughts, often without overtly stating them — it should be deep and poetic at once, and ought to make up for the lack of a visual element in plain text. Write powerfully, but not pretentiously.

Before you begin, you should plan out and brainstorm your approach. In your planning stage, explicitly identify lines you are going to radically enhance with extra-long actions to serve as the "anchors" of the scene. These anchors, after you add your extensive, prose-like *actions* to them, should end up being at least 60 WORDS long, and very compelling. Mention in your plans which 4 lines will be anchors, and what thematic direction you'll take them in.

Don't rewrite the entire text: just add your new high-quality lines, with the correct line number. You don't even need to rewrite the lines you're going to enhance in their entirety in your planning phase; you can just list their line number.
"""
            }]
        if example_index == 0: # make sure not going catastrophically wrong
            print("------".join([d["content"] for d in prompt]))
        response = openai.ChatCompletion.create(
            model="gpt-4",
            temperature=0.7,
            n=1,
            top_p=0.9,
            messages=prompt
        )
    
        anchors = response['choices'][0]['message']['content']
        
        filename = os.path.join(destination_directory, f'{example_index:03d}.txt') # process for getting lines from cot file is exact same time complexity as getting from non-cot file. Only going to save one type
        with open(filename, 'w') as f_1:
                f_1.write(anchors)
    else:
        print(f"Skipping {example_index:03d} because it already exists.")
        
        

In [None]:
processed_annotated_conversations[0]

In [None]:
# make anchors
if not annotated_dataset_has_been_manually_edited:
    for idx, annotated_conv in enumerate(processed_annotated_conversations):
        create_anchors(annotated_conv,"anchors",idx)
        
# modify processed examples to have the new anchor lines
# happens regardless of whether we're doing this the first time or not
for idx, annotated_conv in enumerate(processed_annotated_conversations):
    filename = f"anchors/{idx:03d}.txt"
    with open(filename, "r") as f:
        newlines = extract_anchors(f.read())
        for line in newlines: # modify processed_annotated_conversations in place. I don't have to reread things from files now.
            processed_annotated_conversations[idx][line[1]] = (line[0], line[2])

In [None]:
# We'll do this in the training script, not the data creation script
# for annovated_conv in processed_annotated_conversations:
#     for idx, line in enumerate(annotated_conv):
#         annotated_conv[idx] = (line[0],line[1].replace("*",""))

In [None]:
processed_annotated_conversations[0]

In [None]:
processed_annotated_conversations[0] # The modification in-place works

In [None]:
# merge lines and such again, in case of errors. Watch for output here, there shouldn't be any.
processed_annotated_conversations = list(map(call_multiple_processors(remove_only_ellipsis_lines,merge_consecutive_lines,add_space_after_punctuation,replace_odd_quote), enumerate(processed_annotated_conversations)))

In [None]:
# In case this hasn't been run already, due to the flag being set (the number of training examples in the annotated dataset/scenario files may be different than the number generated by the script if the user deleted some manually) we can run it here
if annotated_dataset_has_been_manually_edited:
        def make_scenario_list_trainingdata():
                scenario_list = []
                for idx, content in enumerate(processed_annotated_conversations):
                        with open(f"scenarios/{idx:03d}.txt", "r") as f:
                                scenario_list.append(f.read())
                return scenario_list

        scenarios = make_scenario_list(processed_annotated_conversations)

In [None]:
# print(annotated_conversations)

In [None]:
# Whole process text loop again, turn into tuple list
# Create training examples (again)
training_data_conversations_annotated = list(map(generate_training_examples, processed_annotated_conversations))
training_data_conversations_annotated = [[subsublist for subsublist in sublist if len(subsublist) > 1] for sublist in training_data_conversations_annotated]


In [None]:
# Helper that creates JSON object for a training example at a certain index (annotated history, annotated completion, scenario)
def create_json_object(annotated_conversation, example_index):
    last_speaker, last_line = annotated_conversation[-1]
    return { # or something like this
        "history": '\n'.join([f'{speaker}: {line}' for speaker, line in annotated_conversation[:-1]]), # Since spoken lines probably don't have newlines, we can safely split at newlines to get the speakers back from the json
        "completion": f'{last_line}',
        "speaker": annotated_conversation[-1][0],
        "scenario": scenarios[example_index],
    }

In [None]:
# Turn annotated conversation into list of json objects for eventual use in the training script

final_examples = []
for idx, conv in enumerate(training_data_conversations_annotated): # conv is a list of lists of tuples
    for ex in conv: # ex is a list of tuples
        final_examples.append(create_json_object(ex,idx))

In [None]:
# Inspect
final_examples[0]

In [None]:
import json

with open('final_dataset.json','w') as f:
    f.write(json.dumps(final_examples, indent=2))
