In [54]:
# Process Text (raw) 
# But the functions will work on the later text, too.
import re
testval = 0

# Step 1: Remove Unwanted Strings
# Regex to match unwanted patterns enclosed in []
unwanted_pattern = re.compile(r"\[color index=\".*?\"\]|\[(?!name|line|%p).*?\]")
def remove_unwanted_strings(text):
    return unwanted_pattern.sub('', text)

# Step 2: Parsing the text
# I'll update the regular expressions to exclude the delimiters.
name_regex = re.compile(r"\[name\](.*?)\[line\]")
dialogue_regex = re.compile(r"\[line\](.*?)\[%p\]")
monologue_regex = re.compile(r"^(.*?)(?=\[%p\])")

def makecols(str):
    global testval
    """Returns a tuple of (speaker, dialogue) from a single line from the script"""
    name_results = name_regex.search(str)
    dialogue_results = dialogue_regex.search(str)
    if name_results is None:
        monologue_results = monologue_regex.search(str)
        return ('UNSPOKEN', monologue_results.group(1) if monologue_results else "")
    try: 
        return (name_results.group(1).strip(), dialogue_results.group(1).strip())
    except:
        print(f"This is the name_results: {name_results}.\nAnd this is the dialogue: {dialogue_results}")
        testval += 1
        return ('ERROR!', '')

def not_empty_monologue(tup):
    if (tup[0] == 'UNSPOKEN') and (tup[1] == ''):
        return False
    return True

# Step 3: Final Processing
def process_script(filename):
    """Returns a list of tuples of (speaker, dialogue) from a script file, filters out empty monologue lines"""
    with open(filename, 'r') as f:
        raw_script = f.read()

    # Remove unwanted strings
    cleaned_script = remove_unwanted_strings(raw_script)

    # Split the cleaned_script into lines and filter out empty lines
    lines = [line.strip() for line in cleaned_script.split('\n') if line.strip()]

    # Process each line to make a tuple of (speaker, dialogue)
    script_tuples = list(map(makecols, lines))
    script_tuples = list(filter(not_empty_monologue, script_tuples))

    
    return script_tuples

script_tuples = process_script('combined_script.txt')
print(script_tuples[:20])  # Just printing the first 10 for visualization


script = process_script('combined_script.txt')
print(testval)

This is the name_results: <re.Match object; span=(0, 19), match='[name]Rintaro[line]'>.
And this is the dialogue: None
[('???', '“Hey, what are you mumbling about?”'), ('UNSPOKEN', 'There’s no sound from the phone against my right ear. Only silence.'), ('UNSPOKEN', 'I am baking in the summer sun.'), ('UNSPOKEN', 'Sweat slowly slides down my chin and drips onto the asphalt.'), ('???', '“Okarin? Earth to Okarin!”'), ('UNSPOKEN', 'A girl is standing in front of me.'), ('UNSPOKEN', 'She calls my name with an inquisitive tilt of her head.'), ('UNSPOKEN', 'We are about to infiltrate deep into enemy territory. Yet despite the imminent risk of death, there is no hint of tension on her innocent, childlike features.'), ('UNSPOKEN', 'I cover my phone’s mouthpiece and turn to the girl with an index finger to my lips.'), ('???', '“You talking to someone?”'), ('UNSPOKEN', 'I nod and put my phone back to my ear.'), ('UNSPOKEN', 'Still no sound from the other side.'), ('UNSPOKEN', 'My contact is wise 

In [55]:
# remove_unwanted_strings("There’s no sound from the phone against my right ear. Only silence.[%p]")
makecols("There’s no sound from the phone against my right ear. Only silence.[%p]")
# monologue_regex.search("There’s no sound from the phone against my right ear. Only silence.[%p]")[1]

('UNSPOKEN',
 'There’s no sound from the phone against my right ear. Only silence.')

In [56]:
# CHARACTER CHOICE #
# Change the global values here if you want to change the character dataset being created, and the person the "user" is roleplaying as.

# The character the user is roleplaying as
user_char = "Rintaro"

# The character the model is going to be trained to be
model_char = "Kurisu"

# NOTE: DOUBLE CHECK THAT THESE ARE RIGHT BEFORE RUNNING THE NOTEBOOK #

In [57]:
# Define Tuple List Processors (before the example generation)
def remove_only_ellipsis_lines(tlist):
    """Remove lines that only contain ellipsis."""
    return [(speaker, line) for speaker, line in tlist if line.replace('“','').replace('”','') != "..."]

def merge_consecutive_lines(tlist):
    merged_tlist = []
    last_speaker = None
    for speaker, line in tlist:
        line_filtered = line.replace("“",'').replace("”",'')
        if not merged_tlist or speaker != last_speaker:
            # New speaker or first dialogue, just add it to the list
            merged_tlist.append((speaker, line.replace("“",'').replace("”",'')))
        else:
            # Same speaker as before, concatenate the lines
            prev_speaker, prev_line = merged_tlist.pop()
            merged_tlist.append((prev_speaker, (prev_line + " " + line).replace("“",'').replace("”",'')))
        last_speaker = speaker
    return merged_tlist # why do this step here? Because I don't want to iterate over the dataset twice, and monologues should count when examples are being generated with the sliding window, so I can't remove them in the usual spot.


def add_space_after_punctuation(tlist):
    corrected_tlist = []
    for speaker, line in tlist:
        # Add a space wherever there is a punctuation mark followed by a letter, excluding ellipsis
        corrected_line = re.sub(r'([.,!?])(?<!\.\.\.)(\w)', r'\1 \2', line)
        corrected_tlist.append((speaker, corrected_line))
    return corrected_tlist

def replace_odd_quote(tlist):
    corrected_tlist = []
    for speaker, line in tlist:
        corrected_line = line.replace("‘", "'").replace("’", "'")
        corrected_tlist.append((speaker, corrected_line))
    return corrected_tlist

In [58]:
# Call the tuple list processors
script_tuples_no_ellipsis = remove_only_ellipsis_lines(script_tuples)
script_tuples_no_consecutive = merge_consecutive_lines(script_tuples_no_ellipsis)
script_tuples_punctuation_fixed = add_space_after_punctuation(script_tuples_no_consecutive)
script_tuples_quote_fixed = replace_odd_quote(script_tuples_punctuation_fixed)

Notice that the tuple list processing functions don't do things that need to be repeated for the reading of the annotated script stuff. They're one-off operations. Thus they do not need to be abstracted further

In [59]:
# Create conversations from raw text
from transformers import AutoTokenizer
from tqdm import tqdm # it's not machine learning if there's no progress bar

tokenizer = AutoTokenizer.from_pretrained("Gryphe/MythoMax-L2-13b")

def generate_examples(script, tokenizer, model_char_count_min=1, window_length=10, user_char_count_min=1, max_lines_without_model_char=10):
    """Extracts useful conversations from the script according to a specific algorithm:

    1. A conversation is defined as a sequence of lines where the model_char speaks at least model_char_count_min times and the user_char speaks at least user_char_count_min times.
    2. A conversation ends when the model_char has not spoken for max_lines_without_model_char lines.
    3. A conversation is saved and a new one started if it is longer than window_length lines.
    
    """
    # MAX_TOKENS = 1500  # This produced really really good examples, but they were too large for GPT-4 to annotate while remembering its instructions, so I had to reduce it
    MAX_TOKENS = 900

    examples = []
    sliding_window = []
    example = []
    model_char_counter = 0
    user_char_counter = 0
    lines_without_model_char = 0
    making_conversation = False

    for dialogue in tqdm(script):
        speaker, line = dialogue

        if len(sliding_window) == window_length:
            sliding_window.pop(0)  # Remove first element

        sliding_window.append(dialogue)

        # Check if there are more than model_char_count_min spoken lines from model_char across sliding_window
        model_char_counter = sum(1 for d in sliding_window if d[0] == model_char)
        user_char_counter = sum(1 for d in sliding_window if d[0] == user_char)

        if speaker == model_char:
            lines_without_model_char = 0  # Reset count
        else:
            lines_without_model_char += 1  # Increment count
            
        can_start_conversation = model_char_counter >= model_char_count_min and user_char_counter >= user_char_count_min
        should_stop_conversation = making_conversation and (len(tokenizer.encode(' '.join([d[1] for d in example]))) > MAX_TOKENS or lines_without_model_char > max_lines_without_model_char)
        
        if making_conversation:
            if should_stop_conversation: # making conversation and should stop
                examples.append(example)
                example = []
                sliding_window = []
                model_char_counter = 0
                user_char_counter = 0
                lines_without_model_char = 0
                making_conversation = False
            else: # making conversation and should not stop
                example.append(dialogue)
        elif can_start_conversation: # not making conversation and should start, by appending an example to conversation as well as the entire sliding window
            start_appending = False
            for d in sliding_window:
                speaker, _ = d  # Extract the speaker from the tuple
                if not start_appending:
                    if speaker in ["UNSPOKEN", user_char, model_char]:
                        start_appending = True  # Start appending from this point onward
                if start_appending:
                    example.append(d)
            sliding_window = []
            making_conversation = True

    if example:  # Add last example if it's non-empty
        examples.append(example)

    return examples

In [60]:
created_examples_script = generate_examples(script_tuples_quote_fixed, tokenizer,)
print(len(created_examples_script))
# with open("script_dump.txt", "w") as f:
#     f.write(''.join())
# print(created_examples_script)

100%|██████████| 14351/14351 [00:03<00:00, 3857.87it/s]

239





In [61]:
# NOTE: DATA INSPECTION CELL #

created_examples_script_modified = [i + [("NOTE", "---NEW_CONV---")] for i in created_examples_script]
created_examples_script_flattened = [item for sublist in created_examples_script_modified for item in sublist]
with open("script_dump.txt", "w") as f:
    f.write('\n'.join([l[0] + ": " + l[1] for l in created_examples_script_flattened]))

# find out how many examples only have one kurisu line:
potentially_bad_examples = [i for i in created_examples_script if len([j for j in i if j[0] == "Kurisu"]) == 1]

# Take the first of those examples with only one kurisu line and print it out:
print(len(potentially_bad_examples))
potentially_bad_examples[0]

19


[('UNSPOKEN',
  "I try to warn her, but Mayuri is no longer there. She's gone. Mayuri disappears like this every so often. And each time she does, I wonder for one terrible second if she really has vanished from the world. I look around and see Mayuri standing at a distance. She's gazing up at the sky through the buildings. ...Here we go again. She's stopped in the middle of the street, so other pedestrians look at her, wondering what she's doing. But she doesn't notice their stares. She slowly reaches out to the sky, as if entranced. And then she freezes in that position. This is one of Mayuri's habits. I call it Stardust Handshake. Mayuri says that she's liked looking at the night sky ever since she was little. Her reason is romantic -- or perhaps childish."),
 ('Mayuri', 'I feel like I can reach the stars.'),
 ('UNSPOKEN',
  "When I asked her about it one day, she just smiled and gave that answer. At first she only reached out to the night sky, but lately, she's been doing it withou

OK so brief inspection of the script: the conversations are too short and sparse, I need to increase the number of lines without Kurisu for S;G as it is monologue heavy.

I'll have to make sure in my annotation prompt that the AI does not add actions to simple thoughts on the part of Okabe. Or maybe it should... so that the model gets used to continuing a train of thought?

Results of additional inspection: convs with only 2 Kurisu examples might be removal material; lines with 3 should be kept definitely; lines with 1 are being removed anyway so it doesn't make sense to have min_kurisu_lines be lower than 2

Even with the monologue added back in, the phone context is missing. Oh well.

Will need to make clear that unspoken can narrate both actions and Okabe's thoughts.

Smaller window stops waste at the start of a conversation, smaller max tokens stop waste at end of a conversation, but a smaller window makes it more likely that an example is missed (problem mitigated somewhat now that I'm doing the line merging BEFORE the example generation). A smaller max token size means that some of the really long conversations that are really really good get lost.

In [62]:
# Define Tuple List to Training example format
def generate_training_examples(conversation):
    training_examples = []
    temp_dialogue = []
    for idx, dialogue in enumerate(conversation):
        speaker, _ = dialogue
        temp_dialogue.append(dialogue)
        if speaker == 'Kurisu' and idx != 0:
            training_examples.append(temp_dialogue.copy())  # Add up to and including current line
    return training_examples

In [63]:
created_examples_script[59]

[('UNSPOKEN',
  "Moeka doesn't seem that interested, but she nods. We set the PhoneWave (name subject to change) to forward to my phone."),
 ('Kurisu', 'Thursday last week... so about five days. Is that okay?'),
 ('UNSPOKEN',
  'Kurisu puts on her lab coat before checking the phone attached to the PhoneWave (name subject to change).'),
 ('Itaru',
  'The drawing was five days ago. It needs to arrive before that, right?'),
 ('Kurisu',
  "We'll send it to one week ago. 7 days is 168 hours, so let's round up a bit and enter 170#."),
 ('Mayuri', 'Ummm, 1, 7, 0, #...'),
 ('UNSPOKEN',
  "Moeka has finished typing the email into my phone. We replaced the number 40 with 37. This way, we won't get 1st or 2nd prize."),
 ('Rintaro', "It'll have to do."),
 ('UNSPOKEN', "We can only send 36 bytes. There's only so much we can say."),
 ('Mayuri', "This mail's kinda shady, huh? Ehehe!"),
 ('Moeka', "...I'll redo it."),
 ('UNSPOKEN', 'I stop Moeka from erasing the mail.'),
 ('Rintaro',
  "It's fine. I d

In [64]:
# Create training examples
training_data_conversations = list(map(generate_training_examples, created_examples_script))


In [65]:
len(training_data_conversations)

239

In [66]:
# DEBUG # see first element of training convs
training_data_conversations[59]

[[('UNSPOKEN',
   "Moeka doesn't seem that interested, but she nods. We set the PhoneWave (name subject to change) to forward to my phone."),
  ('Kurisu', 'Thursday last week... so about five days. Is that okay?')],
 [('UNSPOKEN',
   "Moeka doesn't seem that interested, but she nods. We set the PhoneWave (name subject to change) to forward to my phone."),
  ('Kurisu', 'Thursday last week... so about five days. Is that okay?'),
  ('UNSPOKEN',
   'Kurisu puts on her lab coat before checking the phone attached to the PhoneWave (name subject to change).'),
  ('Itaru',
   'The drawing was five days ago. It needs to arrive before that, right?'),
  ('Kurisu',
   "We'll send it to one week ago. 7 days is 168 hours, so let's round up a bit and enter 170#.")]]

In [67]:
training_data_conversations_filtered = list(filter(lambda x: len(x) >= 1, training_data_conversations))
# len(processed_conversations)
print(training_data_conversations_filtered[99][-1])
print(len(training_data_conversations_filtered))

[('Kurisu', 'Okabe!'), ('UNSPOKEN', "Kurisu's advances on me with a stern glare."), ('Kurisu', "If you want to give me stupid nicknames, that's fine -- I mean, it's not, but... What you said just now is below the belt. I didn't take you for the kind of guy who thinks it's funny to hurt people's feelings, but I guess I was wrong. You're a real jerk, Okabe."), ('Rintaro', 'Huh?'), ('UNSPOKEN', 'What did I do to deserve this tongue-lashing?'), ('Kurisu', 'Apologize. Apologize to Urushibara-san right now.'), ('Rintaro', "Wait a second. I don't have to apologize for anything. I just said the tru--"), ('Luka', "It's okay."), ('UNSPOKEN', 'Lukako gets up from the sofa. His head is still hanging, and his voice is barely audible.'), ('Luka', "So that's how Okabe-san sees me..."), ('Rintaro', "No no no! That's not how I see you, Lukako. You're a guy, remember!? Or are you denying reality too!?"), ('Kurisu', "You're the one denying reality!"), ('Mayuri', "That's right! Luka-chan's a girl, Okarin.

In [68]:
import openai
openai.api_key = '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'

In [88]:
openai_scenario_prompt = [
                {"role": "system", "content": """You are an expert scenario-writing and prompt-engineering AI. Your task is to write the context for an interaction between Kurisu and Okabe from the visual novel Steins;Gate in a "scenario" -- a 5-sentence summary about what's happened until the point the interaction STARTS at (writing under the assumption that the reader knows who Kurisu is, and what some of her general traits are). You should use the lines provided to help determine the context behind a given scene.

Remember to keep the scenario at most three sentences long. Your goal is to describe, in generic terms, the conversation's SETTING, at the START of the conversation (as well as what Kurisu is feeling and trying to do) instead of being to summarize it. This context should make sense if the reader only had access to the first line said by Kurisu in the conversation (and the lines immediately preceding it).

Think step-by-step, and explain your plan to write an accurate and compelling scenario for the provided context before you actually write the scenario.

Here are two roleplay prompt engineering principles you should incorporate into your scenario:
1. Your first sentence should explain the context of the scene: where it takes place, what exactly that place is (in general terms) and what each of the characters are doing there. Focus on Rintaro and Kurisu when it comes to motivations.
2. End with a statement that describes where the scene is going, specifically, what Kurisu is trying to do.

Note that we're using the naming conventions of the visual novel here, so it's 'PhoneWave (name subject to change)', in case that ever comes up.

[To help orient you as you determine which part of the plot a conversation is taking in,  here is a plot summary of Steins;Gate:

Rintaro, a "mad scientist," meets genius Kurisu at a time travel lecture. They argue, and he later finds her apparently dead. Texting this to his friend Daru activates a prototype time machine, altering the timeline.

Kurisu turns out to be alive. Rintaro and friends, including Mayuri and Daru, discover their "Phone Microwave" sends texts, or "D-mails," to the past. They use D-mails to fulfill wishes for friends like Moeka, Faris, Luka, and Suzuha. Kurisu joins the lab and helps improve the time machine.

Rintaro alone remembers original timelines due to his "Reading Steiner" ability. They also create "Time Leaps," sending memories to the past. However, SERN discovers them, raids the lab, and kills Mayuri. Rintaro time-leaps repeatedly but can't save her.

To fix things, Rintaro, aided by Kurisu, undoes all D-mails, causing personal pain. They grow closer, but Rintaro realizes the first D-mail about Kurisu's "death" caused Mayuri's fate. Undoing it means sacrificing Kurisu, which he reluctantly does to save Mayuri.

Time-traveler Suzuha then contacts Rintaro, urging him to prevent World War 3 by saving Kurisu. Rintaro accidentally kills her himself, but gets advice from his future self on reaching a timeline—Steins Gate—where both friends live. He succeeds by faking Kurisu's death.

In the Steins Gate timeline, Rintaro and Kurisu encounter each other, experiencing déjà vu from past timelines.]

Take special care to write a scenario that would make sense to someone ignorant of the overall plot of Steins;Gate. IE, you are not just trying to write a scenario that makes sense only when viewed alongside the plot summary; you are writing something that gives adequate context to a scene by itself alone. Instead of using Steins;Gate specific terminology, you will use generic words and explanations to give context to a scene.

Note that UNSPOKEN lines can either be narration about what's happening, or Okabe's thoughts; they're all from Okabe's point of view, however.

One last pointer: keep the language simple. Which characters are where, under what circumstances, and what Kurisu is feeling and will do. The scene itself will do most of the talking. Keep the scenario 5 sentences long at most. Instead of mentioning events in the far future, you will concentrate on the event at hand and the things that led up to it."""},
    {
        "role": "user",
        "content": """Context: \"\"\"Okabe: Who the hell am I!? Someone who knows you for a fraud, that’s who! You stole your theory from John Titor! And you call yourself an inventor!?
Nakabachi: S-someone throw this man out!
Okabe: You're the one we should throw out, Doctor! Have you no shame!? You have no right to call yourself an inventor!
Nakabachi: Shut your mouth, you little pest!
UNSPOKEN: Just then, someone grabs my arm from behind. Quite convinced it's an official here to throw me out, I turn around to glare him down.
Okabe: Unhand me, you... huh?
UNSPOKEN: It's a girl about my age. Her intense stare seems to challenge me. I take a step back. Her face looks somehow familiar. Where have I seen her before?
Okabe: Ah...
UNSPOKEN: We haven't met, but I know her face. It's Makise Kurisu. A few days ago, my friend Daru showed me a magazine article titled Girl Genius Gives Lecture in Akihabara. The article was about a 17-year-old girl who had just graduated from an American university. Her thesis was even published in a major scientific journal. Girl Genius, Makise Kurisu. I recognize the stubborn-looking girl from her photograph. She's even wearing the exact same scowl. What business could such a genius have with me? She takes a quick look around the room, then turns back to me with a stern expression.
Kurisu: Could you come with me for a moment?
UNSPOKEN: What's with the attitude? She's obviously not staff, and there's no way that the Makise Kurisu would be working with someone like Doctor Nakabachi. Which means... no!
Okabe: Y-you're with the Organization!?
Kurisu: Huh?
Okabe: If their tendrils have gotten this far, then I've made a grave mistake.
Kurisu: Stop fooling around and come with me.
UNSPOKEN: My outburst has already attracted too much attention. Nakabachi, in particular, looks like he wants to rip my head off. It must be mortifying to be exposed as a fraud by a bright young man like myself. Anyway, I mustn’t draw any more attention to myself. If the Organization gets wind of my presence here, it could endanger Mayuri -- to say nothing of these ignorant civilians. I let Makise Kurisu lead me out of the assembly hall.
Okabe: Try anything and people are sure to notice. What will your superiors say then?
Kurisu: What are you talking about?\"\"\""""
    },
    {
        "role" : "assistant",
        "content": """Plan:
1. Start by describing the setting and the ongoing event - a lecture where Okabe causes a scene. 
2. Describe what kind of person Okabe is. Nakabachi is not involved in the main conversation with Kurisu, so I will not explain who he is.
3. Mention Kurisu's relation to Okabe, that they are meeting for the first time.
4. Describe Kurisu's current state, slightly puzzled and intrigued by Okabe's outburst and behavior.
5. Conclude by stating Kurisu's intention - to find out what Okabe was trying to tell her.
6. Do not mention anything that occurs after the first few lines Kurisu speaks.

Scenario:
In the midst of a heated lecture, Okabe, an eccentric young man, stirs up an argument, accusing the speaker of plagiarism. Kurisu is in the crowd observing the spectacle, and believes Okabe had tried to tell her something earlier (Okabe believes that he's never met Kurisu). This is the first time Kurisu and Okabe meet, and she finds Okabe's erratic behavior puzzling yet intriguing. Intrigued by Okabe's strange behavior and apparent knowledge of her, Kurisu decides to approach him during the lecture with the intention of uncovering what he was trying to communicate to her earlier."""
    },
            ]

In [89]:
# A FUNCTION THAT LETS YOU CALL OPENAI ON ALL THE EXAMPLES
import openai
import os

def write_context_to_file(training_data_example, destination_directory, example_index): # for easier inspection
    """Writes a training example (conversation, the full thing) to a file in the destination directory, so that the input for a scenario can be inspected"""
    full_conversation = training_data_example[-1]
    context = '\n'.join([f'{speaker}: {line}' for speaker, line in full_conversation])
    
    filename = os.path.join(destination_directory, f'{example_index:03d}_conversation.txt') # I'm paying for the tokens, I damn well want to see them

    # Write the scenario to the file
    with open(filename, 'w') as f_1:
        f_1.write(context)

for idx, content in enumerate(training_data_conversations_filtered):
    """Write all training examples to indexed files"""
    write_context_to_file(content, 'conversations', idx)
    

def create_scenario(training_data_example, destination_directory, example_index):
    """Creates a scenario for a training example and writes it to a file in the destination directory"""
    full_conversation = training_data_example[-1]
    context = '\n'.join([f'{speaker}: {line}' for speaker, line in full_conversation])

    if not os.path.exists(os.path.join(destination_directory, f'{example_index:03d}_cot_debug.txt')):
        response = openai.ChatCompletion.create(
            model="gpt-4",
            temperature=0.7,
            # top_p = 0.9,
            messages=openai_scenario_prompt + [{
                "role" : "user",
            "content" : f"""Context (do not forget to mention in your final response how well Kurisu knows all of the characters present in this scene): \"\"\"{context.replace("Rintaro:","Okabe:")}\"\"\"
            
Remember that:
1. The scenario you write should assume complete knowledge of who Kurisu is (it should not introduce her), but must state her relation to the other characters in the scene.
2. The scenario you write should set up the scene, not summarize it, and not hint at its conclusion. It describes the moment up to Kurisu's first message.
3. DON'T actually metion that this is happening during Steins;Gate, and don't describe elements of the plot that aren't related to the ongoing scene. Be FOCUSED.
4. Follow the rough format (don't copy the words verbatim, but the order should generally be preserved: Because of [brief background explaining that scene that can be understood by a generic reader] Kurisu is at [setting] along with [other character(s) who are directly involved in the interactions between Kurisu and Okabe]. Kurisu knows [characters] [well or not well, positively or negatively]. Kurisu is trying to [do something (that involves Okabe)].
5. If the lines shown cover multiple scenes, focus only on the first one that involves both Okabe and Kurisu speaking.
6. You don't have to introduce who Kurisu or Okabe are, but you should briefly mention who the other characters are.

Don't forget to plan out your final answer before you write it. At least 4 bullet points of thought process are recommended."""
            }]
        )
    
        scenario = response['choices'][0]['message']['content']
        
        filename_cot_debug = os.path.join(destination_directory, f'{example_index:03d}_cot_debug.txt') # I'm paying for the tokens, I damn well want to see them

        # Write the scenario to the file
        with open(filename_cot_debug, 'w') as f_1:
            f_1.write(scenario)

        # # Assume "Scenario:\n" is followed by the actual scenario
        scenario = re.search('Scenario:(.*)', scenario, re.DOTALL).group(1)

        # Create a filename based on the example index
        filename = os.path.join(destination_directory, f'{example_index:03d}.txt')

        # Write the scenario to the file
        with open(filename, 'w') as f_2:
            f_2.write(scenario)
    else:
        print(f"Skipping {example_index:03d} because it already exists.")

In [90]:
# NOTE WARNING -- THIS CELL SPENDS MONEY IF YOU HAVE NOT GENERATED TRAINING EXAMPLES YET.
# If you have not generated training examples yet, this cell will generate them and then generate scenarios for them.

for idx, content in enumerate(tqdm(training_data_conversations[:3])):
    # write_context_to_file(content, 'contexts', idx)
    create_scenario(content, 'scenarios', idx)

print("\nBeginning Second Pass...\n")

for idx, content in enumerate(tqdm(training_data_conversations[:3])): # run it again to catch everything that failed the first time. The fact that already-generated scenarios are skipped means this doesn't cost any unneeded money.
    # write_context_to_file(content, 'contexts', idx)
    create_scenario(content, 'scenarios', idx)

100%|██████████| 3/3 [00:53<00:00, 17.75s/it]



Beginning Second Pass...



100%|██████████| 3/3 [00:00<00:00, 8251.09it/s]

Skipping 000 because it already exists.
Skipping 001 because it already exists.
Skipping 002 because it already exists.





In [91]:
# Get scenarios back into notebook data from files

# read off every scenario, and make a list of them that lines up with the training data
def make_scenario_list(training_data_conversations):
    scenario_list = []
    for idx, content in enumerate(training_data_conversations):
        with open(f"scenarios/{idx:03d}.txt", "r") as f:
            scenario_list.append(f.read())
    return scenario_list

scenarios = make_scenario_list(training_data_conversations[:3])

In [92]:
annotation_prompt = [
                {"role": "system", "content": """For the rest of this conversation, you are an expert roleplaying AI with deep understanding of internet roleplay formats. I need your assistance in annotating a dataset of lines from the Visual Novel Steins;Gate. Since Steins;Gate is a visual novel, the dialogue is written without much indication of the physical actions any characters are performing. You are going to add physical actions done by the characters to their voice lines in a compelling, narrative way, that make senses in the context of the scene you're modifying. Actions should be surrounded by asterisks, and things the charactes say should be surrounded by double quotes (""). You may also find it useful to add non-action, non-dialogue text to characters' responses, (such as 'she says' or other such generic connective terms) to make sentences make sense.
 
In case I was unclear about what I mean by "physical actions the characters are taking", consider this example I just made up on the spot:

```
Character: "You fool," *character rolls her eyes in exasperation, raising her voice* "this is fundamental internet roleplay stuff, of course I know how it works!" she says.
```

Write character responses in the tense shown in the example above, except for Okabe Rintaro, whose lines should be adapted to be in the first person. So for instance:

```
Okabe: "This is the choice of Steins Gate!" *I strike a pose.* "Muahahaha!" I finish my exposition by laughing manaically.
```
Some detailed instructions:
* Keep spoken lines intact; these are in "quotes."
* Add actions and novel-like connective text to make dialogue more roleplay-like.
    * Make it logical, narratively compelling, but not verbose.
* Refer to Okabe Rintaro in other characters' actions the same way the unspoken lines do -- as "me".
    * Example: Kurisu: "...So you talk to yourself." *She raises an eyebrow, clearly questioning my sanity.*
* Use characters' names for actions when not referring to Okabe.
    * Example: Kurisu: "Can't be too careful around the perv duo," *She shoots a glare at Daru.*
* Outline the roleplay scene before writing.
    * Analyze the dialogue to understand what's happening physically.
    * Brainstorm character actions to reveal emotions and thoughts.
* Start roleplay text with "Roleplay:".
* Follow Steins;Gate VN naming conventions, e.g., 'PhoneWave (name subject to change)'.
* For Okabe's actions, use character's name or pronoun instead of "your."
    * Example: I wonder what she's doing (not I wonder what you're doing).
* For interrupted sentences, split and insert the interrupting action.
    * Example: Okabe "I'll find a way to--\nKurisu: *snatches phone*\nOkabe "What are you doing!?" *I stammer.*
    * The interrupting character should not say new lines; only add actions.
* If unsure about adding an action for an interruption, leave the line as-is.
* Add *actions* to the ENTIRE scene, even minor ones will do.
* If a scene transition (for instance, Okabe and Kurisu leaving an assembly hall) can be explained with an *action*, add one that makes the transition between scenes manageable."""},
    {
        "role": "user",
        "content": """Scenario/setting: \"\"\"In the midst of a heated lecture, Okabe, an eccentric young man, stirs up an argument, accusing the speaker of plagiarism. Kurisu is in the crowd observing the spectacle, and believes Okabe had tried to tell her something earlier (Okabe believes that he's never met Kurisu). This is the first time Kurisu and Okabe meet, and she finds Okabe's erratic behavior puzzling yet intriguing. Intrigued by Okabe's strange behavior and apparent knowledge of her, Kurisu decides to approach him during the lecture with the intention of uncovering what he was trying to communicate to her earlier.\"\"\"

Dialogue for reformatting:
\"\"\"
Okabe: Who the hell am I!? Someone who knows you for a fraud, that's who! You stole your theory from John Titor! And you call yourself an inventor!?

Nakabachi: S-someone throw this man out!

Okabe: You're the one we should throw out, Doctor! Have you no shame!? You have no right to call yourself an inventor!

Nakabachi: Shut your mouth, you little pest!

UNSPOKEN: Just then, someone grabs my arm from behind. Quite convinced it's an official here to throw me out, I turn around to glare him down.

Okabe: Unhand me, you... huh?

UNSPOKEN: It's a girl about my age. Her intense stare seems to challenge me. I take a step back. Her face looks somehow familiar. Where have I seen her before?

Okabe: Ah...

UNSPOKEN: We haven't met, but I know her face. It's Makise Kurisu. A few days ago, my friend Daru showed me a magazine article titled Girl Genius Gives Lecture in Akihabara. The article was about a 17-year-old girl who had just graduated from an American university. Her thesis was even published in a major scientific journal. Girl Genius, Makise Kurisu. I recognize the stubborn-looking girl from her photograph. She's even wearing the exact same scowl. What business could such a genius have with me? She takes a quick look around the room, then turns back to me with a stern expression.

Kurisu: Could you come with me for a moment?

UNSPOKEN: What's with the attitude? She's obviously not staff, and there's no way that the Makise Kurisu would be working with someone like Doctor Nakabachi. Which means... no!

Okabe: Y-you're with the Organization!?

Kurisu: Huh?

Okabe: If their tendrils have gotten this far, then I've made a grave mistake.

Kurisu: Stop fooling around and come with me.

UNSPOKEN: My outburst has already attracted too much attention. Nakabachi, in particular, looks like he wants to rip my head off. It must be mortifying to be exposed as a fraud by a bright young man like myself. Anyway, I mustn't draw any more attention to myself. If the Organization gets wind of my presence here, it could endanger Mayuri -- to say nothing of these ignorant civilians. I let Makise Kurisu lead me out of the assembly hall.
\"\"\""""
    },
    {
        "role" : "assistant",
        "content": """Plan: Okabe is the main speaker for most of this scene, passionately accusing Nakabachi of plagiarism. His lines should be filled with emotion and conviction. Towards the end of the scene, he becomes more cautious and wary when interacting with Kurisu. His actions should reflect this change in demeanor. Kurisu on the other hand, remains calm and collected throughout, her actions should reflect her confusion and curiosity about Okabe's behavior. I will be careful to ensure all UNSPOKEN lines are either incorporated into a character's line, or are removed. It makes sense to drop UNSPOKEN lines that are only exposition.

Roleplay:

Okabe: "Who the hell am I!?" *I stand tall, pointing an accusing finger at the man on the podium.* "Someone who knows you for a fraud, that's who! You stole your theory from John Titor! And you call yourself an inventor!?" 

Nakabachi: "S-someone throw this man out!" *He stammers, nervously looking at the crowd for support.*

Okabe: "You're the one we should throw out, Doctor!" *I take a step forward, my voice ringing out in the silent room.* "Have you no shame!? You have no right to call yourself an inventor!"

Nakabachi: "Shut your mouth, you little pest!" *He points a shaking finger at me, his face red with anger.*

Kurisu: *Just then, she hand grabs my arm from behind.* 

Okabe: *I whip around, ready to face another adversary.* "Unhand me, you... huh?" *My words falter as I find myself looking into the eyes of a young girl. Recognition dawns on me as I realize who she is.* "Ah..."

Kurisu: "Could you come with me for a moment?" *She asks, her gaze stern and unyielding.*

Okabe: "Y-you're with the Organization!?" *I step back, my eyes widening in shock and suspicion.*

Kurisu: "Huh?" *Her brows furrow in confusion, clearly not understanding your reaction.*

Okabe: "If their tendrils have gotten this far, then I've made a grave mistake." *I clutch my coat's lapel dramatically, scanning the room for potential threats.*

Kurisu: "Stop fooling around and come with me." *She insists, her patience clearly wearing thin.*

Okabe: *Seeing no other option, I allow her to lead me away from the crowd.*"""
    },]

In [93]:
def get_tokens_of_conv(conv):
    """Gets the number of tokens in a conversation"""
    return len(tokenizer.encode(' '.join([d[1] for d in conv[-1]])))

# training_data_conversations_filtered[1][-1]

get_tokens_of_conv(training_data_conversations_filtered[1])

335

In [94]:
print('\n\n'.join([f'{speaker}: {line}' for speaker, line in training_data_conversations_filtered[1][-1]]))

Rintaro: Guh!

UNSPOKEN: This is bad. Ordinary methods don't work on Makise Kurisu, the genius girl. On the contrary, she's the one psyching me out! Damn. Looks like I'll have to make a tactical retreat. If I can just find an opening! Suddenly, Kurisu steps up to me with a serious expression. She stares right at me, her huge eyes blazing with strength of will. Such fire. I can't look away. Could someone with such pure eyes really be an Organization agent?

Kurisu: What were you trying to tell me earlier?

UNSPOKEN: Earlier?

Rintaro: What are you talking about?

Kurisu: About fifteen minutes ago. Before the conference started.

UNSPOKEN: Nonsense. This is the first time we've met. I was with Mayuri and that Upa toy fifteen minutes ago.

Kurisu: You were trying to tell me something, right? You looked really upset.

UNSPOKEN: Is this a trap? It does seem like one of the Organization's dirty tricks. But would this girl do something that underhanded?

Kurisu: You looked like you were going

In [95]:
def annotate_conversation(training_data_example, destination_directory, example_index):
    full_conversation = training_data_example[-1]
    context = '\n\n'.join([f'{speaker}: {line}' for speaker, line in full_conversation])

    scenario = scenarios[example_index]

    if not os.path.exists(os.path.join(destination_directory, f'{example_index:03d}_cot_debug.txt')):
        response = openai.ChatCompletion.create(
            model="gpt-4",
            temperature=0.7,
            # top_p=0.9,
            messages= annotation_prompt+ [{
                "role" : "user",
            "content" : f"""Scenario/setting: \"\"\"{scenario}\"\"\"

Dialogue for reformatting: 
\"\"\"
{context.replace("Rintaro:", "Okabe:")}
\"\"\"

Remember:
1. Lines from UNSPOKEN represent narration of actions or thoughts from Rintaro's POV. If it makes sense for one of these to be turned into an *action* by a character, do so.
2. Write compellingly. You can add actions (BUT NOT VOICELINES) to characters that weren't there before so long as they don't break the continuity of the scene, if it makes your writing more compelling.
3. Don't forget to plan out your response first. 
4. Don't leave any UNSPOKEN lines on lines by themselves; all actions and narrations must be part of a character's line.
5. Instead of changing any lines characters say, you will use actions to elegantly tie the scene together (but absolutely do not change any lines characters say).
6. Be sure to get the speaker of any given line right, I've seen a few cases where you accidentally switch who is saying a line and that messes up the whole scene.
7. Add actions to every line you can, even near the end, and even if it's short.
8. Be sure to cover the entire scene, not skipping anything, regardless of the content of the scene. If a character: says it, you MUST include it in your output.
9. Every line you write in the roleplay must have a Character: saying it.
"""
            }]
        )
    
        annotation = response['choices'][0]['message']['content']
        
        filename_cot_debug = os.path.join(destination_directory, f'{example_index:03d}_cot_debug.txt') # I'm paying for the tokens, I damn well want to see them

        # Write the scenario to the file
        with open(filename_cot_debug, 'w') as f_1:
            f_1.write(annotation)

        # Assume "Scenario:\n" is followed by the actual scenario
        annotation = re.search('Roleplay:(.*)', annotation, re.DOTALL).group(1)

        # Create a filename based on the example index
        filename = os.path.join(destination_directory, f'{example_index:03d}.txt')

        # Write the scenario to the file
        with open(filename, 'w') as f_2:
            f_2.write(annotation)
    else:
        print(f"Skipping {example_index:03d} because it already exists.")

In [96]:
# Create annotated training examples (same # of them as training examples and scenarios)
for idx, example in enumerate(training_data_conversations[:3]):
    annotate_conversation(example, 'annotated_convs', idx)

In [79]:
# Get annotated conversations back into notebook data from files

# read off every annotated conversation, and make a list of them that lines up with the training data
def make_annotated_conversation_list(training_data_conversations):
    annotated_conversation_list = []
    for idx, content in enumerate(training_data_conversations):
        with open(f"annotated_convs/{idx:03d}.txt", "r") as f:
            # Split the cleaned_script into lines and filter out empty lines
            lines = [line.strip() for line in f.read().split('\n') if line.strip()]
            # Process each line to make a tuple of (speaker, dialogue)
            script_tuples = list(map(makecols, lines)) # Issue here is that makecols looks for the things that denote speaker and whatnot that were present in the original script, but not GPT's outputs. Look for everything before the first colon on a new line, using the regex you got from GPT4 a while ago. Look through your conversations.
            # script_tuples = list(filter(not_empty_monologue, script_tuples))
            # script_tuples = lines
            annotated_conversation_list.append(script_tuples)

    return annotated_conversation_list

annotated_conversations = make_annotated_conversation_list(training_data_conversations)

FileNotFoundError: [Errno 2] No such file or directory: 'annotated_convs/003.txt'

In [None]:
print(annotated_conversations)

[[('UNSPOKEN', ''), ('UNSPOKEN', ''), ('UNSPOKEN', ''), ('UNSPOKEN', ''), ('UNSPOKEN', ''), ('UNSPOKEN', ''), ('UNSPOKEN', ''), ('UNSPOKEN', ''), ('UNSPOKEN', ''), ('UNSPOKEN', ''), ('UNSPOKEN', ''), ('UNSPOKEN', ''), ('UNSPOKEN', ''), ('UNSPOKEN', ''), ('UNSPOKEN', ''), ('UNSPOKEN', ''), ('UNSPOKEN', ''), ('UNSPOKEN', ''), ('UNSPOKEN', ''), ('UNSPOKEN', ''), ('UNSPOKEN', ''), ('UNSPOKEN', ''), ('UNSPOKEN', ''), ('UNSPOKEN', ''), ('UNSPOKEN', ''), ('UNSPOKEN', ''), ('UNSPOKEN', ''), ('UNSPOKEN', ''), ('UNSPOKEN', ''), ('UNSPOKEN', ''), ('UNSPOKEN', ''), ('UNSPOKEN', ''), ('UNSPOKEN', ''), ('UNSPOKEN', ''), ('UNSPOKEN', ''), ('UNSPOKEN', ''), ('UNSPOKEN', ''), ('UNSPOKEN', ''), ('UNSPOKEN', ''), ('UNSPOKEN', ''), ('UNSPOKEN', ''), ('UNSPOKEN', ''), ('UNSPOKEN', ''), ('UNSPOKEN', ''), ('UNSPOKEN', ''), ('UNSPOKEN', ''), ('UNSPOKEN', '')], [('UNSPOKEN', ''), ('UNSPOKEN', ''), ('UNSPOKEN', ''), ('UNSPOKEN', ''), ('UNSPOKEN', ''), ('UNSPOKEN', ''), ('UNSPOKEN', ''), ('UNSPOKEN', ''), ('UNSP

In [19]:
# Whole process text loop again, turn into tuple list

In [None]:
# Helper that creates JSON object for a training example at a certain index (annotated history, annotated completion, scenario)
def create_json_object(training_data_example, annotated_conversation, scenario, example_index):
    return { # or something like this
        "history": '\n'.join([f'{speaker}: {line}' for speaker, line in training_data_example]),
        "completion": '\n'.join([f'{speaker}: {line}' for speaker, line in annotated_conversation]),
        "scenario": scenario,
    }

In [None]:
# T-List to training examples, concatenate, then loop to write json objects with scenario and annotated training example chat history and completion target (for each example) TO FILE