In [None]:
import re
from openai import OpenAI
import json
import os

# Prefer the key from the environment so a real secret never has to live in
# the notebook; the literal placeholder is kept as a fallback so the cell
# behaves as before when the variable is not set.
API_KEY = os.environ.get("OPENAI_API_KEY") or "sk-proj-"
client = OpenAI(api_key=API_KEY)

# Directory that receives the LLM-formatted chapter files.
formatted_directory = "/Users/dzianissheka/projects/dev/study/nlp_tools/format-book/_temp/formatted_chapters_mini/"

# exist_ok=True creates the directory only when missing, without the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(formatted_directory, exist_ok=True)


def get_stat_for_text(text):
    """Return simple size statistics for ``text``.

    Args:
        text: The string to measure.

    Returns:
        A dict with three integer entries:
        ``characters`` (length of the string), ``words`` (number of
        whitespace-separated tokens) and ``sentences`` (number of ``.``,
        ``!`` and ``?`` characters, so an ellipsis counts three times).
    """
    # One match per terminator character is the same number that splitting
    # on the terminators and subtracting one would give.
    terminator_count = len(re.findall(r'[.!?]', text))
    tokens = text.split()

    stats = {}
    stats['characters'] = len(text)
    stats['words'] = len(tokens)
    stats['sentences'] = terminator_count
    return stats


def split_text_into_sections(file_path):
    """Split a text file with numbered chapter headings into chapter records.

    A chapter is recognised as ``<digits>.`` followed by a title that runs to
    the end of the line; its body extends (lazily) up to the next newline
    that is immediately followed by ``<digits>.`` or to the end of the text.
    Note that a body line that itself starts with ``<digits>.`` therefore
    also ends the current chapter.

    Args:
        file_path: Path of the text file to read. The file is decoded as
            UTF-8.

    Returns:
        A list of dicts in file order, each with the keys
        ``chapter_number`` (str), ``title``, ``content`` (whitespace
        stripped), ``characters``, ``words`` and ``sentences`` (count of
        ``.``, ``!`` and ``?`` characters in the content).
    """
    # Decode explicitly: the platform default encoding is not guaranteed to
    # be UTF-8, and the chapter text contains non-ASCII letters.
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    # Regular expression to match sections
    pattern = r"(\d+)\.\s*([^\n]+)\n(.*?)(?=\n\d+\.|$)"

    sections = []
    for raw_number, raw_title, raw_content in re.findall(pattern, text, re.DOTALL):
        content = raw_content.strip()
        sections.append({
            "chapter_number": raw_number.strip(),
            "title": raw_title.strip(),
            "content": content,
            'characters': len(content),
            'words': len(content.split()),
            # One sentence per terminator character.
            'sentences': len(re.findall(r'[.!?]', content)),
        })

    return sections

def _make_part(text):
    """Build one part record (stripped content plus its size statistics)."""
    text = text.strip()
    return {
        "content": text,
        "characters": len(text),
        "words": len(text.split()),
        # One sentence per terminator character.
        "sentences": len(re.findall(r'[.!?]', text)),
    }


# Function to split a section by length, ensuring splits occur at sentence boundaries
def split_content_by_length(content, max_length):
    """Split ``content`` into parts of at most ``max_length`` characters.

    Splits happen only at sentence boundaries (whitespace that follows ``.``,
    ``!`` or ``?``). Sentences inside a part are joined with a single space,
    and that space is counted against ``max_length``. A single sentence that
    is longer than ``max_length`` is never cut; it becomes a part of its own
    and is the only case in which a part exceeds the limit.

    Args:
        content: The text to split.
        max_length: Maximum number of characters per part.

    Returns:
        A list of dicts with the keys ``content``, ``characters``, ``words``
        and ``sentences``; all statistics describe the stripped ``content``
        string. Empty or whitespace-only input yields an empty list.
    """
    # Split content into sentences
    sentences = re.split(r'(?<=[.!?])\s+', content)

    parts = []
    current_section = ""
    for sentence in sentences:
        sentence = sentence.strip()
        if not sentence:
            # Trailing whitespace after the last terminator produces an
            # empty piece; it carries no text.
            continue

        if not current_section:
            # A part always takes its first sentence, even an oversized one,
            # so no empty part is emitted in front of it.
            current_section = sentence
        elif len(current_section) + 1 + len(sentence) <= max_length:
            # The "+ 1" accounts for the joining space.
            current_section = f"{current_section} {sentence}"
        else:
            parts.append(_make_part(current_section))
            # Start a new section with the current sentence
            current_section = sentence

    if current_section:
        parts.append(_make_part(current_section))

    return parts


def format_text(text_content):
    """Ask the chat model to break ``text_content`` into paragraphs.

    Sends one chat-completion request (model ``gpt-4o-mini``, JSON response
    format) through the module-level ``client`` and extracts the ``result``
    field from the JSON reply.

    Args:
        text_content: The text to be formatted; it is embedded verbatim in
            the prompt.

    Returns:
        The value of the ``result`` field of the reply. When the reply has
        no ``result`` field the string "Could not find 'result' in
        response." is returned, and when the reply cannot be parsed as JSON
        the string "Invalid JSON response." is returned. Callers that write
        the return value to disk should be aware of these placeholders.
    """
    # Each literal ends with the separator the next one needs; without them
    # the sentences of the instruction run together.
    prompt = (
        "Format the following piece of text separating paragraphs with new line, "
        "without changing the meaning of the text. "
        "Return a JSON object "
        "with a single field named 'result'. The value of 'result' should contain the formatted text. "
        f"\n\nText: {text_content}"
    )

    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "You are a helpful assistant which response always in json format."},
            {
                "role": "user",
                "content": prompt
            }
        ],
        response_format={ "type": "json_object" }
    )

    # Parse the JSON response and extract the value of "result"
    try:
        response_json = json.loads(completion.choices[0].message.content)
    except (json.JSONDecodeError, TypeError):
        # TypeError covers a reply whose content is None rather than a
        # string (presumably possible, e.g. on a refusal; confirm against
        # the SDK's documentation).
        return "Invalid JSON response."

    if not isinstance(response_json, dict):
        # Valid JSON that is not an object has no 'result' field to read.
        return "Could not find 'result' in response."

    return response_json.get("result", "Could not find 'result' in response.")


# Function to generate a snake_case filename from the title
def generate_filename(title):
    """Turn ``title`` into a lowercase, ASCII-only snake_case file stem.

    Accented letters are reduced to their base letter (``ä`` -> ``a``)
    instead of being deleted, so titles in languages such as Finnish stay
    readable. Every remaining character that is not an ASCII letter, digit
    or whitespace is removed (letters without a decomposition, e.g. ``ø``,
    are still dropped), and each run of whitespace becomes one underscore.

    Args:
        title: The human-readable title.

    Returns:
        The snake_case name without extension. It can be empty when the
        title contains no ASCII letters or digits after normalization.
    """
    import unicodedata

    # NFKD separates a base letter from its combining marks; the marks are
    # then removed by the character filter below while the letter survives.
    decomposed = unicodedata.normalize('NFKD', title)
    filename = re.sub(r'[^a-zA-Z0-9\s]', '', decomposed)  # Remove special characters
    filename = re.sub(r'\s+', '_', filename.strip())  # Replace spaces with underscores
    return filename.lower()


# Example usage: format a one-line sample (this performs a real API request).
formatted_text = format_text("This is an example text that needs formatting.")
print(formatted_text)

# Example usage
# TODO: add support for docx / epub / pdf
# file_path = '/Users/dzianissheka/projects/dev/study/nlp_tools/texts/tarhapaiva/tarhapäivä-5-20.docx'  # Replace with your text file path
file_path = '/Users/dzianissheka/projects/dev/study/nlp_tools/format-book/texts/tarhapaiva/tarhapäivä-5-20.docx.txt'

sections = split_text_into_sections(file_path)
# The message previously contained a Cyrillic "с" in "chapters"; it is now
# plain ASCII so the output can be searched for.
print(f"Number of chapters: {len(sections)}\n")

# Displaying the sections
for section in sections:
    print(f"Chapter: {section['chapter_number']}. {section['title']}\n")
    print(f"Characters: {section['characters']}\nWords: {section['words']}\nSentences: {section['sentences']}\n")
    print("---------------------------------------------------\n")


# Example usage: split the first chapter into parts of at most max_length
# characters and show the statistics of every part.
max_length = 1000
if sections:
    section = sections[0]
    print(f"Chapter: {section['chapter_number']}. {section['title']}\n")
    print(f"Original section length: {len(section['content'])}\n")
    split_parts = split_content_by_length(section['content'], max_length)
    print(f"Number of split parts: {len(split_parts)}\n")
    for part in split_parts:
        print(f"Characters: {part['characters']}\nWords: {part['words']}\nSentences: {part['sentences']}\n")
        print("---------------------------------------------------\n")
else:
    # Without this guard an input file with no recognisable chapter heading
    # would fail with an IndexError on sections[0].
    print("No chapters found; skipping the split example.\n")

import re
import os
from pathlib import Path
from openai import OpenAI
import json

# Directory that receives one unformatted text file per chapter.
directory = "/Users/dzianissheka/projects/dev/study/nlp_tools/format-book/_temp/chapters/"

# exist_ok=True creates the directory only when missing, without the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(directory, exist_ok=True)

# Sample records in the same title/content shape as the parsed sections.
# NOTE(review): nothing in the visible code reads this list - confirm whether
# it is still needed.
items = [
    {"title": "Example Title 1", "content": "This is the content of the first item."},
    {"title": "Another Example", "content": "This is the content of the second item."},
    # Add more items as needed
]


# Write every parsed section to its own text file inside `directory`.
# The file is named after the chapter number and title; it holds the title,
# a blank line, and then the unformatted chapter content.

for section in sections:
    # Build "<number> <title>" and turn it into a snake_case file name
    filename = generate_filename(f"{section['chapter_number']} {section['title']}") + ".txt"
    file_path = Path(directory).joinpath(filename)

    # Title first, then the body, separated by an empty line
    with file_path.open("w", encoding="utf-8") as file:
        file.write("\n\n".join((section['title'], section['content'])))

print("Files have been created successfully.")


# Format every chapter with the chat model and write the result to
# `formatted_directory`, one file per chapter.
for section in sections:
    # Generate the filename
    filename = generate_filename(section['chapter_number'] + " " + section['title']) + ".txt"
    file_path = Path(formatted_directory) / filename

    section_stat = get_stat_for_text(section['content'])

    # The chapter is sent to the model in parts of at most 2000 characters,
    # split at sentence boundaries.
    split_parts = split_content_by_length(section['content'], 2000)
    # Collect the formatted parts and join them once instead of growing a
    # string with "+=" on every iteration.
    formatted_parts = []
    for part in split_parts:
        print(f"Characters: {part['characters']}\nWords: {part['words']}\nSentences: {part['sentences']}\n")
        formatted_text = format_text(part['content'])
        formatted_parts.append(formatted_text + "\n")
    result_section_text = "".join(formatted_parts)

    formatted_stat = get_stat_for_text(result_section_text)

    # Original-versus-formatted statistics, one row per metric.
    # NOTE(review): this table is built but not printed or stored anywhere in
    # the visible code - confirm whether it should be reported.
    data = [
        ["words", section_stat['words'], formatted_stat['words']],
        ["sentences", section_stat['sentences'], formatted_stat['sentences']],
        ["characters", section_stat['characters'], formatted_stat['characters']],
    ]

    # Create the file and write the content
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(f"{section['chapter_number']}. {section['title']}\n\n{result_section_text}")

print("Formatted files have been created successfully.")


This is an example text that needs formatting.
Number of сhapters: 15

Chapter: 6. Suudelma

Characters: 2327
Words: 310
Sentences: 40

---------------------------------------------------

Chapter: 7. Antti ja Paavo ovat Reposten mökillä

Characters: 1831
Words: 251
Sentences: 42

---------------------------------------------------

Chapter: 8. Ennillä on poikaystävä

Characters: 2701
Words: 364
Sentences: 64

---------------------------------------------------

Chapter: 9. Antin ja Paavon arki

Characters: 2928
Words: 411
Sentences: 54

---------------------------------------------------

Chapter: 10. Antti ja Paavo käyvät Pian luona

Characters: 3005
Words: 416
Sentences: 60

---------------------------------------------------

Chapter: 11. Jouluaatto

Characters: 2961
Words: 411
Sentences: 62

---------------------------------------------------

Chapter: 12. Hyvää uutta vuotta!

Characters: 1292
Words: 165
Sentences: 24

---------------------------------------------------

Chapter: 