### Imports

In [1]:
from openai import AzureOpenAI

from langchain import PromptTemplate
import tiktoken

import pandas as pd
import json
import os

In [2]:
# Books known to this notebook. The two lists are parallel: position i in
# book_titles is the display title of the book whose file-name slug sits at
# position i in book_names.
book_titles = [
    "The Odyssey",
    "War and Peace",
    "Animal Farm",
    "Christmas Carol",
]
book_names = [
    "the-odyssey",
    "war-and-peace",
    "animal-farm",
    "a-christmas-carol",
]

# Position (in both lists) of the book processed by this run.
book_index = 1
book_title = book_titles[book_index]

### Functions Processing

In [3]:
def generate_message(role, content):
    """
    Build a single chat message.

    Args:
        role (str): Who the message is attributed to, e.g. "system", "user" or "assistant".
        content (str): The text of the message.

    Returns:
        dict: A dictionary with the keys 'role' and 'content', in that order.
    """
    message = dict(role=role, content=content)
    return message

def generate_prompt(prompt):
    """
    Wrap a prompt string in a chat message attributed to the user.

    Args:
        prompt (str): The prompt text.

    Returns:
        dict: The message produced by generate_message with role "user".
    """
    user_message = generate_message("user", prompt)
    return user_message
    
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """
    Count the tokens a string encodes to under a named tiktoken encoding.

    Args:
        string (str): The text to tokenize.
        encoding_name (str): Name passed to tiktoken.get_encoding, e.g. "cl100k_base".

    Returns:
        int: Length of the token sequence produced for the string.
    """
    tokens = tiktoken.get_encoding(encoding_name).encode(string)
    return len(tokens)

def generate_chunks(sentences_df, sentences_per_chunk, MAX_CHUNK_TOKEN_SIZE, ENCODING_NAME):
    """
    Split a DataFrame of sentences into consecutive chunks that fit a token budget.

    Each chunk starts at roughly sentences_per_chunk sentences, is then grown one
    sentence at a time while it is under MAX_CHUNK_TOKEN_SIZE tokens, and finally
    shrunk one sentence at a time until it no longer exceeds the budget.

    The frame is addressed with label-based slicing from 0 to len(sentences_df) - 1,
    so it is expected to carry the default integer index (0, 1, 2, ...).

    Args:
        sentences_df (DataFrame): A DataFrame with a 'sentence' column.
        sentences_per_chunk (int | float): Initial guess for the number of sentences
            per chunk. Fractional values are truncated and values below 1 are
            treated as 1.
        MAX_CHUNK_TOKEN_SIZE (int): The maximum number of tokens allowed per chunk.
        ENCODING_NAME (str): The name of the encoding used for tokenization.

    Returns:
        list: A list of dictionaries, where each dictionary represents a chunk and contains:
              - 'content': The concatenated sentences within the chunk.
              - 'end_sid': The (integer) index of the last sentence within the chunk.
              A single sentence that on its own exceeds MAX_CHUNK_TOKEN_SIZE is
              emitted as a one-sentence chunk rather than being dropped.
    """
    total_sentences = len(sentences_df)
    last_index = total_sentences - 1

    # The estimate may arrive as a float. Using it unmodified as a slice bound
    # yields fractional 'end_sid' values and misaligned chunk boundaries, so
    # work with a whole number of sentences (at least one).
    initial_chunk_size = max(1, int(sentences_per_chunk))

    def concatenate_sentences(first, last):
        # .loc slicing is label based and includes both endpoints.
        return sentences_df.loc[first:last, 'sentence'].str.cat()

    def token_count(text):
        return num_tokens_from_string(text, ENCODING_NAME)

    chunks = []
    start = 0
    while start < total_sentences:
        # Never point past the last sentence, otherwise the final chunk would
        # report an 'end_sid' that does not exist.
        end = min(start + initial_chunk_size - 1, last_index)
        content = concatenate_sentences(start, end)

        # Grow while there is room in the budget and sentences are left.
        while end < last_index and token_count(content) < MAX_CHUNK_TOKEN_SIZE:
            end += 1
            content = concatenate_sentences(start, end)

        # Shrink back under the budget, but keep at least one sentence: shrinking
        # to an empty chunk would leave 'start' unchanged and loop forever.
        while end > start and token_count(content) > MAX_CHUNK_TOKEN_SIZE:
            end -= 1
            content = concatenate_sentences(start, end)

        chunks.append({"content": content, "end_sid": end})
        start = end + 1

    return chunks

def calculate_sentences_per_chunk(sentences_df, ENCONDING_NAME, MAX_CHUNK_TOKEN_SIZE):
    """
    Estimate how many sentences fit into one chunk of MAX_CHUNK_TOKEN_SIZE tokens.

    The estimate is the token budget divided by the average number of tokens per
    sentence, where the average is taken over the whole book.

    Args:
        sentences_df (DataFrame): A DataFrame with the columns 'sentence' and 'sid'.
        ENCONDING_NAME (str): The name of the encoding used for tokenization
            (the parameter name is spelled this way in the signature).
        MAX_CHUNK_TOKEN_SIZE (int): The maximum number of tokens allowed per chunk.

    Returns:
        float: The estimated number of sentences per chunk (not rounded).
    """
    # Tokenize the whole book in one go, sentences separated by single spaces.
    full_text = sentences_df['sentence'].astype(str).str.cat(sep=' ')
    token_total = num_tokens_from_string(full_text, ENCONDING_NAME)

    # The largest 'sid' value is used as the number of sentences.
    sentence_total = max(sentences_df['sid'])

    tokens_per_sentence = token_total / sentence_total
    return MAX_CHUNK_TOKEN_SIZE / tokens_per_sentence

def parse_character_names(character_names):
    """
    Render character names as an enumeration in running text.

    Names are separated by ", " except for the last two, which are joined by
    " and " (e.g. "A, B and C").

    Args:
        character_names (list): A non-empty list of character names.

    Returns:
        str: The names joined into a single phrase.
    """
    first = character_names[0]
    rest = character_names[1:]
    if not rest:
        return first

    *leading, final = rest
    return ", ".join([first, *leading]) + " and " + final

def parse_character_json_format(character_names):
    """
    Build the per-character lines that illustrate the expected JSON reply.

    Every name yields one line of the form '"<name>": Summary for <name>'. Lines
    are separated by a comma and a newline, and every line after the first is
    indented with a tab. The result is a format hint for a prompt, not valid JSON
    (the values are unquoted placeholders).

    Args:
        character_names (list): A non-empty list of character names.

    Returns:
        str: The placeholder lines, one per character name.

    Raises:
        IndexError: If character_names is empty.
    """
    if not character_names:
        # Empty input has always failed with IndexError; keep that, but say why.
        raise IndexError("character_names must contain at least one name")

    # The original loop chose between ",\n" and "\n" with a condition that was
    # always true, so every pair of lines is separated by ",\n" followed by a tab.
    entries = [f'"{name}": Summary for {name}' for name in character_names]
    return ",\n\t".join(entries)

### Prompt Templates

In [None]:
# Prompt asking for brief summaries of several characters based on one excerpt.
# Placeholders: {parsed_character_names} (the names as a phrase),
# {parsed_character_json_format} (one placeholder line per character) and
# {text} (the excerpt itself).
# NOTE(review): the doubled braces are presumably escapes that render as single
# literal braces once the template is formatted - confirm against PromptTemplate.
summarize_character_from_excerpt_template = PromptTemplate(
    input_variables=["text", "parsed_character_names", "parsed_character_json_format"],
    template="""I will provide you with a set of instructions, followed by some text.
    The former is labeled 'Instructions' and the latter 'Text'.
    
    Instructions:
    Briefly summarize the characters {parsed_character_names} based on Text.
    For example, you can write about relations to other characters, personality traits, appearance, or other things you might find interesting.
    Do not mention the source, i.e. that it is a text, book or summary anywhere.
    Return the summaries as an JSON object according to the following format:
    
    {{
        {parsed_character_json_format}
    }}
    
    Text:
    {text}"""
)

# Prompt asking for one summary of a single character based on two earlier
# summaries. Placeholders: {character_name} and {summaries}; the prompt text
# tells the model that the summaries are labeled 'Summary #1' and 'Summary #2'.
summarize_character_from_summaries_template = PromptTemplate(
    input_variables=["character_name", "summaries"],
    template="""I will provide you with a set of instructions, followed by a sequence of summaries.
    The summaries are labeled 'Summary #1' and 'Summary #2'.

    Instructions:
    Briefly summarize the character {character_name} based on Summary #1 and Summary #2.

    {summaries}
    """
)

# Messages

# System message shared by both kinds of request.
system_message = generate_message(
    role="system", 
    content="You are a book publisher that summarizes a character based on an excerpt from a literary book."
)

# Few-shot example 1 (user side): mirrors the excerpt prompt with placeholders.
# This string is sent as-is and never goes through template formatting, so the
# braces are written once; doubled braces would reach the model literally.
user_message_example_1 = generate_message(
    role="user",
    content="""I will provide you with a set of instructions, followed by some text.
    The former is labeled 'Instructions' and the latter 'Text'.
    
    Instructions:
    Briefly summarize the characters (placeholder for character names) based on Text.
    For example, you can write about relations to other characters, personality traits, appearance, or other things you might find interesting.
    Do not mention the source, i.e. that it is a text, book or summary anywhere.
    Return the summaries as an JSON object according to the following format:
    
    {
        (placeholder for properties of JSON object)
    }
    
    Text:
    (some text)"""
)

# Few-shot example 1 (assistant side): the reply to the real request is parsed
# with json.loads, so the example demonstrates a syntactically valid JSON object
# (single braces, quoted string values, comma between properties).
assistant_message_example_1 = generate_message(
    role="assistant",
    content="""{
        "(character #1)": "(character #1) is (other character's name)'s (relation). He is (personality traits). (description of appearance).",
        "(character #2)": "(character #2) is (other character's name)'s (relation). She is (personality traits). (description of appearance)."
    }"""
)

# Few-shot example 2 (user side): mirrors the prompt that merges two summaries.
user_message_example_2 = generate_message(
    role="user",
    content="""I will provide you with a set of instructions, followed by a sequence of summaries.
    The summaries are labeled 'Summary #1' and 'Summary #2'.

    Instructions:
    Briefly summarize the character (character name) based on Summary #1 and Summary #2.

    Summary #1: (contents of summary #1, which contains a description of the character's relations, personality traits and appearance)

    Summary #2: (contents of summary #2, which contains a description of the character's relations, personality traits and appearance)
    """
)

# Few-shot example 2 (assistant side): a plain-text merged summary.
assistant_message_example_2 = generate_message(
    role="assistant",
    content="(character) is (other character's name)'s (relation). He or she is (personality traits). (description of appearance)."
)

### Constants

In [5]:
# Model and tokenizer selection.
USE_TURBO = True
MODEL_NAME = "gpt-4" if not USE_TURBO else "gpt-4-turbo"
ENCODING_NAME = "cl100k_base"
CHARACTERS_TO_BE_SUMMARIZED_PER_CHUNK = 5

# The fixed part of every excerpt request: the system message, the first
# few-shot pair, and the excerpt prompt rendered with all placeholders empty.
messages_and_template = (
    system_message,
    user_message_example_1,
    assistant_message_example_1,
    generate_prompt(
        summarize_character_from_excerpt_template.format(
            text="",
            parsed_character_names="",
            parsed_character_json_format="",
        )
    ),
)

# Token cost of that fixed part, measured on its JSON serialization.
MESSAGES_AND_TEMPLATE_TOKEN_SIZE = num_tokens_from_string(
    json.dumps(messages_and_template), ENCODING_NAME
)

# Output budget: 600 tokens for each character summarized per chunk.
MAX_OUTPUT_TOKEN_SIZE = CHARACTERS_TO_BE_SUMMARIZED_PER_CHUNK * 600

# Total token budget: 8192, multiplied by four when the turbo model is used.
MAX_TOKEN_SIZE = 8192 * (4 if USE_TURBO else 1)

# What remains for the excerpt once output and fixed prompt are accounted for.
MAX_CHUNK_TOKEN_SIZE = MAX_TOKEN_SIZE - MAX_OUTPUT_TOKEN_SIZE - MESSAGES_AND_TEMPLATE_TOKEN_SIZE


### Summarize

In [6]:
def summarize_character_from_excerpt(character_names, text):
    """
    Request summaries of the given characters based on an excerpt.

    The request consists of the system message, the first few-shot example pair
    and the excerpt prompt filled in with the character names and the text.

    Args:
        character_names (list): A list of character names.
        text (str): The excerpt of text containing information about the characters.

    Returns:
        The chat completion object returned by the Azure OpenAI client; the reply
        text is available as ``.choices[0].message.content``.
    """
    names_phrase = parse_character_names(character_names)
    json_skeleton = parse_character_json_format(character_names)

    # The output budget is reduced by the token cost of the two rendered name strings.
    output_budget = (
        MAX_OUTPUT_TOKEN_SIZE
        - num_tokens_from_string(names_phrase, ENCODING_NAME)
        - num_tokens_from_string(json_skeleton, ENCODING_NAME)
    )

    excerpt_prompt = summarize_character_from_excerpt_template.format(
        text=text,
        parsed_character_names=names_phrase,
        parsed_character_json_format=json_skeleton,
    )
    conversation = [
        system_message,
        user_message_example_1,
        assistant_message_example_1,
        generate_prompt(excerpt_prompt),
    ]

    # A new client is created for every call; the key is read from the API_KEY
    # environment variable.
    client = AzureOpenAI(
        azure_endpoint="https://prod-bb-openai-sweden.openai.azure.com/",
        api_key=os.getenv("API_KEY"),
        api_version="2024-02-15-preview",
    )

    return client.chat.completions.create(
        model=MODEL_NAME,
        messages=conversation,
        max_tokens=output_budget,
        temperature=0.15,
        top_p=0.85,
        frequency_penalty=0,
        presence_penalty=0,
        stop=None,
    )

def summarize_characters(character_names, chunks):
    """
    Summarize the characters mentioned in each chunk and store the results as CSV.

    For every chunk, the characters whose name occurs in the chunk text are
    summarized with one request. The reply is parsed as a JSON object mapping
    character names to summaries. Chunks that mention none of the characters are
    skipped without a request. A chunk whose request or parsing fails is recorded
    in "error_log.txt" and skipped, so one bad chunk does not abort the run.

    The collected rows are written to
    "data/summaries/<book_title>/summaries_per_chunk.csv", replacing any existing
    file. The directory is created if it does not exist.

    Args:
        character_names (list): A list of character names.
        chunks (list): A list of dictionaries representing text chunks, each containing:
                       - 'content': The text content of the chunk.
                       - 'end_sid': The index of the last sentence within the chunk.

    Returns:
        None
    """
    summaries_per_chunk = []

    for chunk_index, chunk in enumerate(chunks):
        content = chunk['content']
        characters_to_be_summarized = [name for name in character_names if name in content]

        # Nothing to summarize: avoid a pointless request and a misleading
        # entry in the error log.
        if not characters_to_be_summarized:
            continue

        try:
            completion = summarize_character_from_excerpt(characters_to_be_summarized, content)
            summary_json = json.loads(completion.choices[0].message.content)
            for character_name, summary in summary_json.items():
                summaries_per_chunk.append({
                    "end_sid": chunk['end_sid'],
                    "character_name": character_name,
                    "summary": summary,
                })
        except Exception as e:
            # Deliberate best effort: record the failure and move on to the next chunk.
            error_message = f"An error occurred while summarizing chunk {chunk_index}: {e}"
            with open("error_log.txt", "a", encoding="utf-8") as file:
                file.write(error_message + "\n")

    # Explicit columns keep the CSV header intact even when no summary was
    # produced, so the file can still be read back.
    summaries_per_chunk_df = pd.DataFrame(
        summaries_per_chunk,
        columns=["end_sid", "character_name", "summary"],
    )
    summaries_per_chunk_path = f"data/summaries/{book_title}/summaries_per_chunk.csv"

    # Create the target directory up front; failing here would throw away the
    # results of every request made above.
    os.makedirs(os.path.dirname(summaries_per_chunk_path), exist_ok=True)

    # to_csv opens the file in write mode, which replaces an existing file.
    summaries_per_chunk_df.to_csv(summaries_per_chunk_path)

def summarize_character_from_summaries(character_name, summary_1, summary_2):
    """
    Request one summary of a character based on two existing summaries.

    The request consists of the system message, the second few-shot example pair
    and the merge prompt. The two summaries are inserted labeled as
    'Summary #1: ...' and 'Summary #2: ...', which is how the prompt text and the
    few-shot example refer to them.

    Args:
        character_name (str): The name of the character.
        summary_1 (str): The first summary of the character.
        summary_2 (str): The second summary of the character.

    Returns:
        The chat completion object returned by the Azure OpenAI client; the reply
        text is available as ``.choices[0].message.content``.
    """
    tokens_character_name = num_tokens_from_string(character_name, ENCODING_NAME)

    # Label the summaries the way the instructions announce them; without the
    # labels the prompt refers to sections that are not present.
    labeled_summaries = "Summary #1: " + summary_1 + "\n\n" + "Summary #2: " + summary_2

    messages = [
        system_message,
        user_message_example_2,
        assistant_message_example_2,
        generate_prompt(summarize_character_from_summaries_template.format(
            character_name=character_name,
            summaries=labeled_summaries
        ))
    ]

    # A new client is created for every call; the key is read from the API_KEY
    # environment variable.
    client = AzureOpenAI(
        azure_endpoint="https://prod-bb-openai-sweden.openai.azure.com/",
        api_key=os.getenv("API_KEY"),
        api_version="2024-02-15-preview"
    )

    return client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        temperature=0.1,
        max_tokens=MAX_OUTPUT_TOKEN_SIZE - tokens_character_name,
        top_p=0.95,
        frequency_penalty=0,
        presence_penalty=0,
        stop=None
    )

def hierarchically_merge(summaries_per_chunk_df, character_names):
    """
    Merge the per-chunk summaries of each character into running summaries.

    For every character, the first per-chunk summary is taken as it is. Each
    following per-chunk summary is merged with the most recent merged summary of
    the same character, so the row for a given 'end_sid' describes the character
    up to that sentence. Characters without any summary are skipped.

    Rows are processed in the order in which they appear in
    summaries_per_chunk_df, so the frame should already be sorted by 'end_sid'
    within each character.

    The rows are appended to "data/summaries/<book_title>/merged_summaries.csv".
    The header is written only when the file does not exist yet, and the
    directory is created if necessary.

    Args:
        summaries_per_chunk_df (DataFrame): A DataFrame with at least the columns
            'character_name', 'end_sid' and 'summary'.
        character_names (list): A list of character names.

    Returns:
        None
    """
    merged_summaries = []

    for character_name in character_names:
        is_character = summaries_per_chunk_df['character_name'] == character_name
        character_summaries = summaries_per_chunk_df.loc[is_character].to_dict("records")

        # A character may never have been summarized (not mentioned, or every
        # request for it failed); there is nothing to merge in that case.
        if not character_summaries:
            continue

        first_summary, *later_summaries = character_summaries
        merged_summaries.append(first_summary)

        # Track the latest merged text per character. Indexing the shared result
        # list by the per-character position would pick up rows that belong to
        # characters processed earlier.
        running_summary = first_summary['summary']
        for next_summary in later_summaries:
            completion = summarize_character_from_summaries(
                character_name, running_summary, next_summary['summary']
            )
            running_summary = completion.choices[0].message.content
            merged_summaries.append({
                "end_sid": next_summary['end_sid'],
                "character_name": character_name,
                "summary": running_summary,
            })

    # Appending an empty frame would only add a blank header line to the file.
    if not merged_summaries:
        return

    merged_summaries_df = pd.DataFrame(merged_summaries)
    merged_summaries_path = f"data/summaries/{book_title}/merged_summaries.csv"
    os.makedirs(os.path.dirname(merged_summaries_path), exist_ok=True)
    merged_summaries_exist = os.path.isfile(merged_summaries_path)
    merged_summaries_df.to_csv(merged_summaries_path, mode="a", header=not merged_summaries_exist, encoding='utf-8-sig')

### Extract Chunks 

In [7]:
# Inputs for the selected book: the first ten character names and all sentences.
character_names = pd.read_csv(f"data/characters/{book_names[book_index]}_characters.csv")['name'].tolist()[:10]
sentences_df = pd.read_csv(f"data/sentences/{book_names[book_index]}_sentences.csv")

# Split the book into chunks that fit the token budget.
sentences_per_chunk = calculate_sentences_per_chunk(sentences_df, ENCODING_NAME, MAX_CHUNK_TOKEN_SIZE)
chunks = generate_chunks(sentences_df, sentences_per_chunk, MAX_CHUNK_TOKEN_SIZE, ENCODING_NAME)

# Summarize per chunk (results are written to CSV), then read them back ordered
# by character and position in the book.
summarize_characters(character_names, chunks)
summaries_per_chunk_df = pd.read_csv(
    f"data/summaries/{book_title}/summaries_per_chunk.csv"
).sort_values(['character_name', 'end_sid'])

# Merge the per-chunk summaries of each character.
hierarchically_merge(summaries_per_chunk_df, character_names)