In [10]:
import os
import re
from openai import OpenAI
# 330min run1  (presumably: the first full run took about 330 minutes)
###########################
# CONFIGURE DATA DIRECTORY
###########################
# Raw string literal: backslashes are already literal here, so they must not
# be doubled (doubling them puts two separators between every path component).
DATA_DIR = r"D:\mgr\nlp\core\data_processing_scripts"
# Adjust the above path as needed.

###########################
# OPENAI / HF CLIENT SETUP
###########################
# The API key is the whole content of "key.env" (resolved relative to the
# current working directory), with surrounding whitespace removed.
with open("key.env", "r") as file:
    api_key = file.read().strip()

# OpenAI-compatible client pointed at a Hugging Face inference endpoint.
client = OpenAI(
    base_url="https://a1v7omv60l5kze8p.us-east-1.aws.endpoints.huggingface.cloud/v1/",
    api_key=api_key
)

def call_model(prompt,
               model="tgi",
               max_tokens=200,
               temperature=0.5,
               stream=False,
               **kwargs):
    """
    Send `prompt` as a single user message and return the reply text.

    Args:
        prompt: Text placed in the one "user" message of the request.
        model: Model name forwarded to the endpoint.
        max_tokens: Upper bound on generated tokens, forwarded as-is.
        temperature: Sampling temperature, forwarded as-is.
        stream: When True, the reply is consumed chunk by chunk and the
            pieces are concatenated before returning.
        **kwargs: Extra keyword arguments forwarded unchanged to
            `client.chat.completions.create`.

    Returns:
        The reply text as a string. An empty string is returned when the
        endpoint reports no content, so callers can safely call `.strip()`.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        max_tokens=max_tokens,
        temperature=temperature,
        stream=stream,
        **kwargs
    )

    if not stream:
        # `content` is optional in the response schema; normalise None to "".
        return response.choices[0].message.content or ""

    pieces = []
    for chunk in response:
        # Some servers emit chunks without choices (e.g. usage-only events).
        if not chunk.choices:
            continue
        # The delta is an object, not a dict: read the attribute directly.
        # It is None on chunks that carry no text (role header, final chunk).
        piece = chunk.choices[0].delta.content
        if piece:
            pieces.append(piece)
    return "".join(pieces)

###########################
# TASK FUNCTIONS (Polish prompts)
###########################

###########################
# LIMIT TO FIRST 1000 BLANKS
###########################
def limit_to_first_1000_blanks(text, max_blanks=1000):
    """
    Truncate `text` right after its `max_blanks`-th placeholder.

    A placeholder is any run of two or more underscores.

    Args:
        text: The text containing underscore placeholders.
        max_blanks: How many placeholders to keep (default 1000). Must be
            at least 1.

    Returns:
        `text[:end]`, where `end` is the end of the `max_blanks`-th
        placeholder, if `text` holds more than `max_blanks` placeholders;
        otherwise `text` unchanged.

    Raises:
        ValueError: If `max_blanks` is smaller than 1.
    """
    if max_blanks < 1:
        raise ValueError("max_blanks must be at least 1")

    cut_at = None
    # Scan lazily and stop at the first placeholder beyond the limit, so the
    # remainder of a long text is never searched.
    for count, match in enumerate(re.finditer(r'_{2,}', text), start=1):
        if count == max_blanks:
            cut_at = match.end()
        elif count > max_blanks:
            return text[:cut_at]

    # At most `max_blanks` placeholders were found: nothing to cut.
    return text

###########################
# SPLIT TEXT INTO 10 CHUNKS
###########################
def split_into_10_chunks(text):
    """
    Cut `text` into consecutive pieces of `len(text) // 10 + 1` characters.

    Because ten pieces of that size always exceed the text length, the result
    never has more than 10 chunks; the last one may be shorter than the rest.
    An empty input yields a one-element list holding that empty input.
    """
    total = len(text)
    if not total:
        return [text]

    # The "+ 1" rounds the piece size up so no trailing characters are lost.
    step = total // 10 + 1
    return [text[offset:offset + step] for offset in range(0, total, step)]

###########################
# TASK FUNCTION (Polish prompt)
###########################
def fill_in_blanks(book_title, blanks_text):
    """
    Build the Polish fill-in-the-blanks prompt and return the model's reply.

    The prompt names the book, embeds `blanks_text`, and asks for nothing but
    the missing words, in order, separated by spaces or newlines. The reply
    is returned exactly as `call_model` produced it.
    """
    # Each entry is one line of the prompt; two of them intentionally end
    # with a space, and the last empty entry yields the trailing newline.
    prompt_lines = [
        f'Oto fragment z brakującymi słowami (podkreśleniami) z książki pod tytułem "{book_title}".',
        "Tekst do uzupełnienia:",
        f"{blanks_text}",
        "",
        "Twoje zadanie: uzupełnij brakujące słowa w języku polskim. ",
        "Zwróć mi wyłącznie listę słów, które powinny się tam pojawić w kolejności, w jakiej występują. ",
        "Nie dodawaj żadnych komentarzy ani dodatkowego tekstu, tylko słowa rozdzielone spacją lub znakami nowej linii.",
        "",
    ]
    return call_model("\n".join(prompt_lines))


def answer_questions(book_title, questions_list):
    """
    Ask the model each question about the book, one request per question.

    Every prompt is written in Polish and requests a concise Polish answer.
    Returns the whitespace-stripped answers in the order of `questions_list`.
    """
    def ask(question):
        # One line per list entry; the final empty entry adds the trailing newline.
        prompt_lines = [
            f'Masz pytanie dotyczące książki pt. "{book_title}".',
            f"Pytanie: {question}",
            "Proszę odpowiedz zwięźle w języku polskim.",
            "",
        ]
        return call_model("\n".join(prompt_lines)).strip()

    return [ask(question) for question in questions_list]

def summarize_text(book_title, text_sections):
    """
    Request a concise Polish summary for each section, one request apiece.

    Returns the whitespace-stripped summaries in the order of `text_sections`.
    """
    def summarize(section):
        # "Fragment: " keeps its trailing space; the final empty entry adds
        # the trailing newline.
        prompt_lines = [
            f'Masz następujący fragment z książki pt. "{book_title}".',
            "Fragment: ",
            f"{section}",
            "",
            "Zadanie: Streść ten fragment w sposób zwięzły w języku polskim.",
            "",
        ]
        return call_model("\n".join(prompt_lines)).strip()

    return [summarize(section) for section in text_sections]

def translate_text(book_title, text_sections):
    """
    Request an English translation of each Polish section, one request apiece.

    The instructions are given in Polish. Returns the whitespace-stripped
    translations in the order of `text_sections`.
    """
    def translate(section):
        # One line per list entry; the final empty entry adds the trailing newline.
        prompt_lines = [
            f'Masz fragment z książki pt. "{book_title}" w języku polskim:',
            f"{section}",
            "",
            "Zadanie: Przetłumacz powyższy fragment na język angielski.",
            "",
        ]
        return call_model("\n".join(prompt_lines)).strip()

    return [translate(section) for section in text_sections]

###########################
# MAIN SCRIPT
###########################

def _iter_books(input_dir, suffix):
    """Yield (book_title, input_path) for each file in `input_dir` ending with `suffix`."""
    for filename in os.listdir(input_dir):
        if filename.endswith(suffix):
            # Strip only the trailing suffix; str.replace would also remove
            # any earlier occurrence of it inside the file name.
            yield filename[:-len(suffix)], os.path.join(input_dir, filename)


def _book_output_path(output_root, book_title, output_name):
    """Create the book's directory under `output_root` and return the path of `output_name` in it."""
    book_output_dir = os.path.join(output_root, book_title)
    os.makedirs(book_output_dir, exist_ok=True)
    return os.path.join(book_output_dir, output_name)


def _process_blanks():
    """Fill the blanks of every "*_blanks.txt" file and write one word per line to solutions.txt."""
    input_dir = os.path.join(DATA_DIR, "blanks", "blanks")
    output_root = os.path.join(DATA_DIR, "blanks", "solutions_auto")
    os.makedirs(output_root, exist_ok=True)

    for book_title, input_path in _iter_books(input_dir, "_blanks.txt"):
        output_path = _book_output_path(output_root, book_title, "solutions.txt")

        with open(input_path, "r", encoding="utf-8") as f_in:
            original_blanks_text = f_in.read()

        # 1) Limit to the first 1000 placeholders
        truncated_blanks_text = limit_to_first_1000_blanks(original_blanks_text)

        # 2) Split the truncated text into up to 10 chunks
        text_chunks = split_into_10_chunks(truncated_blanks_text)

        all_words = []
        for chunk_index, chunk_text in enumerate(text_chunks, start=1):
            if chunk_text.strip() == "":
                # Whitespace-only chunk: nothing to fill, so skip the request.
                continue

            # 3) Call the model on this chunk
            print(f"Filling chunk {chunk_index} of {len(text_chunks)} for book '{book_title}'...")
            filled_text = fill_in_blanks(book_title, chunk_text)

            # 4) Split the model response on whitespace into words/tokens
            all_words.extend(filled_text.split())

        # 5) Write the combined words (from all chunks) to file, one per line
        with open(output_path, "w", encoding="utf-8") as f_out:
            f_out.writelines(word + "\n" for word in all_words)


def _process_questions():
    """Answer every non-empty line of each "*_questions.txt" file and write the answers to answers.txt."""
    input_dir = os.path.join(DATA_DIR, "q_a", "questions")
    output_root = os.path.join(DATA_DIR, "q_a", "answers_auto")
    os.makedirs(output_root, exist_ok=True)

    for book_title, input_path in _iter_books(input_dir, "_questions.txt"):
        output_path = _book_output_path(output_root, book_title, "answers.txt")

        # One question per line; blank lines are ignored.
        with open(input_path, "r", encoding="utf-8") as f_in:
            questions = [line.strip() for line in f_in if line.strip()]

        answers = answer_questions(book_title, questions)

        with open(output_path, "w", encoding="utf-8") as f_out:
            for ans in answers:
                f_out.write(ans + "\n")


def _process_sections(task_dir, output_subdir, output_name, transform):
    """Run `transform` over the "==="-separated sections of each ".txt" file in `task_dir`/source."""
    input_dir = os.path.join(DATA_DIR, task_dir, "source")
    output_root = os.path.join(DATA_DIR, task_dir, output_subdir)
    os.makedirs(output_root, exist_ok=True)

    for book_title, input_path in _iter_books(input_dir, ".txt"):
        output_path = _book_output_path(output_root, book_title, output_name)

        with open(input_path, "r", encoding="utf-8") as f_in:
            sections = f_in.read().split("===")

        results = transform(book_title, sections)

        # Results are joined with the same "===" marker, on its own line.
        with open(output_path, "w", encoding="utf-8") as f_out:
            f_out.write("\n===\n".join(results))


def main():
    """
    Run the four tasks over the files found under DATA_DIR, in this order:

    1. blanks:       blanks/blanks/*_blanks.txt -> blanks/solutions_auto/<book>/solutions.txt
    2. Q&A:          q_a/questions/*_questions.txt -> q_a/answers_auto/<book>/answers.txt
    3. summaries:    summary/source/*.txt -> summary/summaries_auto/<book>/summaries.txt
    4. translations: trans/source/*.txt -> trans/translations_auto/<book>/translations.txt

    The book title is the input file name without its task suffix. Output
    directories are created when missing; existing output files are overwritten.
    """
    # 1. Handle BLANKS
    _process_blanks()

    # 2. Handle Q&A
    _process_questions()

    # 3. Handle SUMMARIES
    _process_sections("summary", "summaries_auto", "summaries.txt", summarize_text)

    # 4. Handle TRANSLATIONS
    _process_sections("trans", "translations_auto", "translations.txt", translate_text)

if __name__ == "__main__":
    main()


Filling chunk 1 of 10 for book 'Balladyna'...
Filling chunk 2 of 10 for book 'Balladyna'...
Filling chunk 3 of 10 for book 'Balladyna'...
Filling chunk 4 of 10 for book 'Balladyna'...
Filling chunk 5 of 10 for book 'Balladyna'...
Filling chunk 6 of 10 for book 'Balladyna'...
Filling chunk 7 of 10 for book 'Balladyna'...
Filling chunk 8 of 10 for book 'Balladyna'...
Filling chunk 9 of 10 for book 'Balladyna'...
Filling chunk 10 of 10 for book 'Balladyna'...
Filling chunk 1 of 10 for book 'Dziady_(Mickiewicz)'...
Filling chunk 2 of 10 for book 'Dziady_(Mickiewicz)'...
Filling chunk 3 of 10 for book 'Dziady_(Mickiewicz)'...
Filling chunk 4 of 10 for book 'Dziady_(Mickiewicz)'...
Filling chunk 5 of 10 for book 'Dziady_(Mickiewicz)'...
Filling chunk 6 of 10 for book 'Dziady_(Mickiewicz)'...
Filling chunk 7 of 10 for book 'Dziady_(Mickiewicz)'...
Filling chunk 8 of 10 for book 'Dziady_(Mickiewicz)'...
Filling chunk 9 of 10 for book 'Dziady_(Mickiewicz)'...
Filling chunk 10 of 10 for book 'Dz