### Imports

In [28]:
from openai import AzureOpenAI

from langchain import PromptTemplate
import tiktoken

import pandas as pd
import json
import os

In [29]:
# Books this notebook can process. The two lists are parallel: the same
# position in each refers to the same book.
book_titles = [
    "The Odyssey",
    "War and Peace",
    "Animal Farm",
    "Christmas Carol",
]
book_names = [
    "the-odyssey",
    "war-and-peace",
    "animal-farm",
    "a-christmas-carol",
]

# Position (in both lists) of the book handled by this run.
book_index = 2
book_title = book_titles[book_index]

### Helper Functions

In [30]:
def generate_message(role, content):
    """Build a chat message dictionary.

    Args:
        role: Value stored under the "role" key.
        content: Value stored under the "content" key.

    Returns:
        A new dict with exactly the keys "role" and "content".
    """
    message = dict(role=role, content=content)
    return message

def generate_prompt(prompt):
    """Wrap prompt text in a chat message that carries the "user" role.

    Args:
        prompt: Text to use as the message content.

    Returns:
        Whatever ``generate_message`` returns for role "user" and this content.
    """
    user_message = generate_message("user", prompt)
    return user_message
    
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Count the tokens obtained by encoding a string.

    Args:
        string: Text to tokenize.
        encoding_name: Name handed to ``tiktoken.get_encoding``.

    Returns:
        Length of the token sequence the encoding produces for ``string``.
    """
    tokens = tiktoken.get_encoding(encoding_name).encode(string)
    return len(tokens)

def generate_chunks(sentences_df, sentences_per_chunk, MAX_CHUNK_TOKEN_SIZE, ENCODING_NAME):
    """Split the sentences into consecutive chunks that respect a token budget.

    Each chunk starts with roughly ``sentences_per_chunk`` sentences, is then
    grown one sentence at a time while it is below the budget, and finally
    shrunk one sentence at a time while it is above the budget.

    Args:
        sentences_df: DataFrame with a 'sentence' column. Rows are addressed
            with ``.loc[start:end]`` using 0..len-1, so a default integer
            index is assumed.
        sentences_per_chunk: Initial number of sentences per chunk. May be
            fractional; it is truncated to a whole number (at least 1).
        MAX_CHUNK_TOKEN_SIZE: Token budget for one chunk.
        ENCODING_NAME: Encoding name forwarded to ``num_tokens_from_string``.

    Returns:
        A list of dicts ``{"content": str, "end_sid": int}`` where ``end_sid``
        is the index label of the last sentence included in the chunk. An
        empty frame yields an empty list.
    """
    total_sentences = len(sentences_df)
    last_index = total_sentences - 1

    # The estimate is usually a float. Using it directly made `start`/`end`
    # fractional, which silently skipped one sentence at every chunk boundary
    # (e.g. rows <= 12.7, then rows >= 13.7) and produced fractional end_sid.
    initial_size = max(1, int(sentences_per_chunk))

    def concatenate_sentences(first, last):
        # .loc slicing is label based and includes `last`.
        return sentences_df.loc[first:last, 'sentence'].str.cat()

    def token_count(text):
        return num_tokens_from_string(text, ENCODING_NAME)

    chunks = []
    start = 0
    while start < total_sentences:
        # Clamp so that end_sid never points past the last sentence.
        end = min(start + initial_size - 1, last_index)
        content = concatenate_sentences(start, end)

        # Grow while there is room in the budget and sentences are left.
        while end < last_index and token_count(content) < MAX_CHUNK_TOKEN_SIZE:
            end += 1
            content = concatenate_sentences(start, end)

        # Shrink back under the budget, but always keep at least one sentence:
        # a single oversized sentence is emitted alone instead of producing an
        # empty chunk that never advances `start` (infinite loop).
        while end > start and token_count(content) > MAX_CHUNK_TOKEN_SIZE:
            end -= 1
            content = concatenate_sentences(start, end)

        chunks.append({"content": content, "end_sid": end})
        start = end + 1

    return chunks

def calculate_sentences_per_chunk(sentences_df, ENCONDING_NAME, MAX_CHUNK_TOKEN_SIZE):
    """Estimate how many sentences fit into one chunk of the given token budget.

    The estimate is the budget divided by the average number of tokens per
    sentence, where the average is taken over the whole text.

    Args:
        sentences_df: DataFrame with a 'sentence' column.
        ENCONDING_NAME: Encoding name forwarded to ``num_tokens_from_string``.
        MAX_CHUNK_TOKEN_SIZE: Token budget for one chunk.

    Returns:
        A whole number of sentences, never less than 1.
    """
    # Count rows instead of using max(sid): the largest id is only equal to
    # the sentence count for gap-free 1-based ids, and it is 0 (division by
    # zero) for a single sentence with a 0-based id.
    total_sentences = len(sentences_df)
    if total_sentences == 0:
        return 1

    book = sentences_df['sentence'].astype(str).str.cat(sep=' ')
    total_tokens = num_tokens_from_string(book, ENCONDING_NAME)
    if total_tokens == 0:
        # Nothing consumes budget, so every sentence fits into one chunk.
        return total_sentences

    avg_tokens_per_sentence = total_tokens / total_sentences
    # Truncate to an integer: the value is used as a row count.
    return max(1, int(MAX_CHUNK_TOKEN_SIZE / avg_tokens_per_sentence))

def parse_character_names(character_names):
    """Render character names as an English enumeration.

    One name is returned unchanged, two names become "A and B", and longer
    sequences become "A, B and C".

    Args:
        character_names: Non-empty indexable sequence of name strings.

    Returns:
        The joined names.

    Raises:
        IndexError: If ``character_names`` is empty.
    """
    first_name = character_names[0]
    count = len(character_names)
    if count == 1:
        return first_name

    # Everything except the final name is comma separated; the final name is
    # attached with " and ".
    leading = ", ".join(character_names[position] for position in range(count - 1))
    return leading + " and " + character_names[count - 1]

def parse_character_json_format(character_names):
    """Build the per-character lines that describe the expected JSON reply.

    Every character contributes one line of the form
    ``"<name>": Summary for <name>``. Lines are separated by ",\\n" and every
    line after the first is prefixed with a tab.

    Args:
        character_names: Non-empty indexable sequence of name strings.

    Returns:
        The joined lines, without surrounding braces.

    Raises:
        IndexError: If ``character_names`` is empty.
    """
    # The previous loop tested `index < len(character_names)`, which is always
    # true inside range(1, len(...)); its `else` branch could never run. The
    # separator is therefore always ",\n", which is what join() expresses.
    first_name = character_names[0]
    lines = [f'"{first_name}": Summary for {first_name}']
    for index in range(1, len(character_names)):
        name = character_names[index]
        lines.append(f'\t"{name}": Summary for {name}')
    return ",\n".join(lines)



### Templates

In [31]:
# Prompt templates

# Request for per-character summaries of one excerpt. The doubled braces are
# format escapes: they render as single literal braces once the template is
# formatted.
_EXCERPT_PROMPT_TEXT = """I will provide you with a set of instructions, followed by some text.
    The former is labeled 'Instructions' and the latter 'Text'.
    
    Instructions:
    Briefly summarize the characters {parsed_character_names} based on Text.
    For example, you can write about relations to other characters, personality traits, appearance, or other things you might find interesting.
    Do not mention the source, i.e. that it is a text, book or summary anywhere.
    Return the summaries as an JSON object according to the following format:
    
    {{
        {parsed_character_json_format}
    }}
    
    Text:
    {text}"""

summarize_character_from_excerpt_template = PromptTemplate(
    template=_EXCERPT_PROMPT_TEXT,
    input_variables=["text", "parsed_character_names", "parsed_character_json_format"],
)

# Request to combine two earlier summaries of one character into a new one.
_SUMMARIES_PROMPT_TEXT = """I will provide you with a set of instructions, followed by a sequence of summaries.
    The summaries are labeled 'Summary #1' and 'Summary #2'.

    Instructions:
    Briefly summarize the character {character_name} based on Summary #1 and Summary #2.

    {summaries}
    """

summarize_character_from_summaries_template = PromptTemplate(
    template=_SUMMARIES_PROMPT_TEXT,
    input_variables=["character_name", "summaries"],
)

# Messages
#
# These few-shot messages are plain strings: they are NOT passed through
# PromptTemplate.format, so braces must be written as single literal braces.
# The doubled "{{ }}" escapes used previously were sent to the model verbatim,
# and the example reply had unquoted values and no comma between entries, so
# it demonstrated output that json.loads cannot parse.

# System message shared by every request.
system_message = generate_message(
    role="system",
    content="You are a book publisher that summarizes a character based on an excerpt from a literary book.")

# Example exchange 1: summarizing several characters from an excerpt.
user_message_example_1 = generate_message(
    role="user",
    content="""I will provide you with a set of instructions, followed by some text.
    The former is labeled 'Instructions' and the latter 'Text'.
    
    Instructions:
    Briefly summarize the characters (placeholder for character names) based on Text.
    For example, you can write about relations to other characters, personality traits, appearance, or other things you might find interesting.
    Do not mention the source, i.e. that it is a text, book or summary anywhere.
    Return the summaries as an JSON object according to the following format:
    
    {
        (placeholder for properties of JSON object)
    }
    
    Text:
    (some text)"""
)

# The example reply is a syntactically valid JSON object.
assistant_message_example_1 = generate_message(
    role="assistant",
    content="""{
        "(character #1)": "(character #1) is (other character's name)'s (relation). He is (personality traits). (description of appearance).",
        "(character #2)": "(character #2) is (other character's name)'s (relation). She is (personality traits). (description of appearance)."
    }"""
)

# Example exchange 2: combining two summaries of one character.
user_message_example_2 = generate_message(
    role="user",
    content="""I will provide you with a set of instructions, followed by a sequence of summaries.
    The summaries are labeled 'Summary #1' and 'Summary #2'.

    Instructions:
    Briefly summarize the character (character name) based on Summary #1 and Summary #2.

    Summary #1: (contents of summary #1, which contains a description of the character's relations, personality traits and appearance)

    Summary #2: (contents of summary #2, which contains a description of the character's relations, personality traits and appearance)
    """
)

assistant_message_example_2 = generate_message(
    role="assistant",
    content="(character) is (other character's name)'s (relation). He or she is (personality traits). (description of appearance)."
)

In [32]:
# names = pd.read_csv(f'data/characters/{book_names[book_index]}_characters.csv')['name'].tolist()[:2]

### Constants

In [33]:
# Tokenizer encoding and the number of characters covered by one request.
ENCODING_NAME = "cl100k_base"
CHARACTERS_TO_BE_SUMMARIZED_PER_CHUNK = 5

# Fixed part of every excerpt request: the system message, the first few-shot
# exchange, and the excerpt prompt with all placeholders left blank.
_blank_excerpt_prompt = summarize_character_from_excerpt_template.format(
    text="",
    parsed_character_names="",
    parsed_character_json_format="",
)
messages_and_template = (
    system_message,
    user_message_example_1,
    assistant_message_example_1,
    generate_prompt(_blank_excerpt_prompt),
)

# Total token budget for one request (prompt plus completion).
# NOTE(review): presumably the context window of the deployed model - confirm.
MAX_TOKEN_SIZE = 8192

# 600 output tokens are reserved per summarized character.
MAX_OUTPUT_TOKEN_SIZE = CHARACTERS_TO_BE_SUMMARIZED_PER_CHUNK * 600

# Tokens used by the fixed request part, measured on its JSON serialization.
MESSAGES_AND_TEMPLATE_TOKEN_SIZE = num_tokens_from_string(json.dumps(messages_and_template), ENCODING_NAME)

# What remains for the excerpt text itself.
MAX_CHUNK_TOKEN_SIZE = MAX_TOKEN_SIZE - (MAX_OUTPUT_TOKEN_SIZE + MESSAGES_AND_TEMPLATE_TOKEN_SIZE)


### Summarize

In [34]:
def summarize_character_from_excerpt(character_names, text):
    """Request summaries of the given characters based on one excerpt.

    The request consists of the system message, the first few-shot exchange,
    and the excerpt prompt filled with the character names, the expected JSON
    layout and the excerpt text.

    Args:
        character_names: Names forwarded to ``parse_character_names`` and
            ``parse_character_json_format``.
        text: Excerpt placed in the prompt's 'Text' section.

    Returns:
        The object returned by ``client.chat.completions.create``.
    """
    names_text = parse_character_names(character_names)
    json_format_text = parse_character_json_format(character_names)

    # The names and the JSON layout are part of the prompt; their token counts
    # are subtracted from the completion budget below.
    names_tokens = num_tokens_from_string(names_text, ENCODING_NAME)
    json_format_tokens = num_tokens_from_string(json_format_text, ENCODING_NAME)

    user_prompt = summarize_character_from_excerpt_template.format(
        parsed_character_json_format=json_format_text,
        parsed_character_names=names_text,
        text=text,
    )
    conversation = [
        system_message,
        user_message_example_1,
        assistant_message_example_1,
        generate_prompt(user_prompt),
    ]

    client = AzureOpenAI(
        azure_endpoint="https://prod-bb-openai-sweden.openai.azure.com/",
        api_key=os.getenv("API_KEY"),
        api_version="2024-02-15-preview",
    )

    request_options = {
        "model": "gpt-4",
        "messages": conversation,
        "temperature": 0.15,
        "max_tokens": MAX_OUTPUT_TOKEN_SIZE - (names_tokens + json_format_tokens),
        "top_p": 0.85,
        "frequency_penalty": 0,
        "presence_penalty": 0,
        "stop": None,
    }
    return client.chat.completions.create(**request_options)

def summarize_characters(character_names, chunks):
    """Summarize the given characters chunk by chunk and store the results.

    For every chunk, the characters whose name occurs in the chunk text
    (plain substring match) are summarized with one request. The reply is
    parsed as a JSON object mapping character name to summary.

    A failure while processing one chunk is appended to ``error_log.txt`` and
    the chunk is skipped; the remaining chunks are still processed.

    Args:
        character_names: Names to look for in each chunk.
        chunks: Iterable of dicts with the keys 'content' and 'end_sid'.

    Returns:
        None. The rows (end_sid, character_name, summary) are written to
        ``data/summaries/<book_title>/summaries_per_chunk.csv``, replacing any
        existing file.
    """
    summaries_per_chunk = []

    for chunk_index, chunk in enumerate(chunks):
        characters_to_be_summarized = [
            character_name
            for character_name in character_names
            if character_name in chunk['content']
        ]
        if not characters_to_be_summarized:
            # No character is mentioned here. Previously this fell through to
            # the request path, failed with an IndexError and was logged as
            # an error for every such chunk.
            continue

        try:
            completion = summarize_character_from_excerpt(characters_to_be_summarized, chunk['content'])

            summary_json = json.loads(completion.choices[0].message.content)
            for character_name, summary in summary_json.items():
                summaries_per_chunk.append({"end_sid": chunk['end_sid'], "character_name": character_name, "summary": summary})

        except Exception as e:
            # Deliberately best effort: record the failure and move on.
            error_message = f"An error occurred while summarizing chunk {chunk_index}: {e}"
            with open("error_log.txt", "a") as file:
                file.write(error_message + "\n")
            continue

    # Explicit columns keep the header intact even when no row was produced.
    summaries_per_chunk_df = pd.DataFrame(summaries_per_chunk, columns=["end_sid", "character_name", "summary"])
    summaries_per_chunk_path = f"data/summaries/{book_title}/summaries_per_chunk.csv"

    # Create the target directory so the results of all the requests above are
    # not lost to a missing folder. to_csv overwrites an existing file, so no
    # explicit removal is needed.
    os.makedirs(os.path.dirname(summaries_per_chunk_path), exist_ok=True)
    summaries_per_chunk_df.to_csv(summaries_per_chunk_path)

def summarize_character_from_summaries(character_name, summary_1, summary_2):
    """Request one combined summary of a character from two existing summaries.

    The request consists of the system message, the second few-shot exchange,
    and the summaries prompt filled with the character name and both
    summaries.

    Args:
        character_name: Name of the character the summaries describe.
        summary_1: First summary.
        summary_2: Second summary.

    Returns:
        The object returned by ``client.chat.completions.create``.
    """
    tokens_character_name = num_tokens_from_string(character_name, ENCODING_NAME)

    # The prompt and the few-shot example both refer to the inputs as
    # 'Summary #1' and 'Summary #2', so the summaries are labelled the same
    # way here; previously they were joined without any label. The f-string
    # also tolerates non-string values, which `+` did not.
    labelled_summaries = f"Summary #1: {summary_1}\n\nSummary #2: {summary_2}"

    messages = [
        system_message,
        user_message_example_2,
        assistant_message_example_2,
        generate_prompt(summarize_character_from_summaries_template.format(
            character_name=character_name,
            summaries=labelled_summaries
        ))
    ]

    client = AzureOpenAI(
        azure_endpoint="https://prod-bb-openai-sweden.openai.azure.com/",
        api_key=os.getenv("API_KEY"),
        api_version="2024-02-15-preview"
    )

    return client.chat.completions.create(
        model="gpt-4",
        messages=messages,
        temperature=0.1,
        # The character name is part of the prompt; its tokens are taken out
        # of the completion budget.
        max_tokens=MAX_OUTPUT_TOKEN_SIZE - tokens_character_name,
        top_p=0.95,
        frequency_penalty=0,
        presence_penalty=0,
        stop=None
    )

def hierarchically_merge(summaries_per_chunk_df, character_names):
    """Fold each character's per-chunk summaries into running merged summaries.

    For every character the first summary is kept as is. Each following
    summary is merged with the running result for that same character, and
    every intermediate result is recorded with the ``end_sid`` of the summary
    that was merged in. Rows are processed in the order they appear in
    ``summaries_per_chunk_df``.

    Args:
        summaries_per_chunk_df: DataFrame with the columns 'character_name',
            'end_sid' and 'summary'.
        character_names: Characters to process; characters without any row
            are skipped.

    Returns:
        None. The rows are appended to
        ``data/summaries/<book_title>/merged_summaries.csv``; the header is
        written only when the file does not exist yet. Nothing is written
        when there are no rows.
    """
    merged_summaries = []
    for character_name in character_names:
        character_summaries = summaries_per_chunk_df.loc[
            summaries_per_chunk_df['character_name'] == character_name
        ].to_dict("records")
        if not character_summaries:
            # The character has no per-chunk summary; indexing [0] would raise.
            continue

        merged_summaries.append(character_summaries[0])

        # Track the running summary per character. The previous code indexed
        # the list shared by all characters, so from the second character on
        # it merged against another character's summary.
        running_summary = character_summaries[0]['summary']
        for next_record in character_summaries[1:]:
            completion = summarize_character_from_summaries(character_name, running_summary, next_record['summary'])
            running_summary = completion.choices[0].message.content
            merged_summaries.append({"end_sid": next_record['end_sid'], "character_name": character_name, "summary": running_summary})

    if not merged_summaries:
        return

    merged_summaries_df = pd.DataFrame(merged_summaries)
    merged_summaries_path = f"data/summaries/{book_title}/merged_summaries.csv"
    # Make sure the target directory exists before appending.
    os.makedirs(os.path.dirname(merged_summaries_path), exist_ok=True)
    merged_summaries_exist = os.path.isfile(merged_summaries_path)
    merged_summaries_df.to_csv(merged_summaries_path, mode="a", header=not merged_summaries_exist)


### Run the Pipeline (chunk, summarize, merge)

In [35]:

# Pick the batch of characters handled by this run: a fixed window of the
# character list, expressed relative to the per-request character count.
batch_start = CHARACTERS_TO_BE_SUMMARIZED_PER_CHUNK + 5
batch_stop = CHARACTERS_TO_BE_SUMMARIZED_PER_CHUNK + 10

book_slug = book_names[book_index]
all_character_names = pd.read_csv(f"data/characters/{book_slug}_characters.csv")['name'].tolist()
character_names = all_character_names[batch_start:batch_stop]
sentences_df = pd.read_csv(f"data/sentences/{book_slug}_sentences.csv")

# Split the book into chunks that fit the token budget.
sentences_per_chunk = calculate_sentences_per_chunk(sentences_df, ENCODING_NAME, MAX_CHUNK_TOKEN_SIZE)
chunks = generate_chunks(sentences_df, sentences_per_chunk, MAX_CHUNK_TOKEN_SIZE, ENCODING_NAME)

# Summarize per chunk (written to CSV), then reload the result ordered by
# character and position so summaries are merged in reading order.
summarize_characters(character_names, chunks)
summaries_per_chunk_df = (
    pd.read_csv(f"data/summaries/{book_title}/summaries_per_chunk.csv")
    .sort_values(['character_name', 'end_sid'])
)

hierarchically_merge(summaries_per_chunk_df, character_names)



In [36]:
# Scratch check of the append pattern: rows are appended to test.csv and the
# header is written only when the file does not exist yet.
test_rows = [
    {"name": "Daniel", "surname": "Dahlberg"},
    {"name": "Axel", "surname": "Lokrantz"},
]
write_header = not os.path.isfile("test.csv")
pd.DataFrame(test_rows).to_csv("test.csv", mode="a", header=write_header)