In [None]:
import re
from openai import OpenAI
import json

In [None]:
# Model identifier sent with every chat-completion request unless overridden.
_DEFAULT_MODEL = "lmstudio-community/Meta-Llama-3-8B-Instruct-GGUF/Meta-Llama-3-8B-Instruct-Q8_0.gguf"

# System prompt instructing the model how to paraphrase each paragraph.
# NOTE(review): the prompt asks for "not more than 50% difference", while the
# validator below accepts anything from 80% up to 2x the input word count.
_SYSTEM_PROMPT = "You are an expert novelist. Paraphrase the given text in your own words, refining the prose and making it a bit richer. Don't write additional story or meaning, keep the original meaning and form. The response should contain only one paragraph. Keep a similar length in your output to the original, not more than 50% difference, and shouldn't be shorter than the original text."

# Substrings that disqualify a response. Order matters: a marker that contains
# another one (e.g. "### Question:" contains "###") must come first, otherwise
# the more specific reason can never be reported.
_FORBIDDEN_MARKERS = (
    "### Question:",
    "###",
    "Paraphrased Text",
    "Extended Alternate Refinement",
    "Alternate Refinement",
    "[Note:",
)


def _find_failure_reason(paragraph, response):
    """Return a human-readable reason why `response` is rejected, or None if it is acceptable."""
    for marker in _FORBIDDEN_MARKERS:
        if marker in response:
            return f"Response contains '{marker}'"

    if len(response.split('\n\n')) > 1:
        return "Response contains multiple paragraphs"

    input_words = len(paragraph.split())
    response_words = len(response.split())
    if response_words < int(0.8 * input_words):
        return "Response is less than 80% the size of the input"
    if response_words > int(2 * input_words):
        return "Response is more than 2 times the length of the input"
    return None


def process_text_file(file_path, output_filename, client, model=_DEFAULT_MODEL, max_retries=5):
    """Paraphrase each paragraph of a text file and write the pairs as JSONL.

    The file is split on blank lines; paragraphs with fewer than 10
    whitespace-separated words are ignored. Each remaining paragraph is sent to
    the chat-completions endpoint of `client`. A response is rejected (and the
    request repeated) when it contains one of the forbidden markers, spans more
    than one paragraph, or falls outside the 80%..200% word-count window
    relative to the input. A paragraph whose responses are all rejected is
    skipped. Progress and a final summary are printed to stdout.

    Each output line is a JSON object whose "text" field holds the paraphrase
    as the human prompt and the original paragraph as the bot answer.

    Args:
        file_path: Path of the UTF-8 text file to read.
        output_filename: Path of the JSONL file to write (overwritten).
        client: Object exposing `chat.completions.create(...)` whose result
            provides `choices[0].message.content`.
        model: Model identifier passed to the endpoint.
        max_retries: Maximum number of requests made per paragraph before it
            is skipped.

    Returns:
        None.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    paragraphs = [p.strip() for p in text.split('\n\n') if len(p.strip().split()) >= 10]

    jsonl_data = []
    skipped_paragraphs = []
    total_retries = 0

    for i, paragraph in enumerate(paragraphs, start=1):
        failure_reason = None
        response = ""

        for _ in range(max_retries):
            completion = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": _SYSTEM_PROMPT},
                    {"role": "user", "content": paragraph},
                ],
                temperature=0.9,
            )
            # `content` may be None; treat that as an empty response so the
            # length check rejects it and the request is retried instead of
            # crashing on `.strip()`.
            response = (completion.choices[0].message.content or "").strip()

            failure_reason = _find_failure_reason(paragraph, response)
            if failure_reason is None:
                break

            total_retries += 1
            print(f"\n- Retrying for paragraph {i}: {paragraph} \n - Reason: {failure_reason} \n - Response: {response}")
        else:
            # Every attempt was rejected: record the last reason and move on.
            print(f"\n Skipping paragraph {i} after {max_retries} retries.")
            skipped_paragraphs.append((i, failure_reason))
            continue

        word_count = len(paragraph.split())
        print(f"\n---> Paragraph {i} ({word_count} words):")
        print(f"\nInput: {paragraph}")
        print(f"\nOutput: {response}")
        print()

        result = {
            "text": f"<human>: Rephrase the following text in the style of George MacDonald: {response}\n<bot>: {paragraph}",
            "metadata": {"source": "gutenberg"}
        }
        jsonl_data.append(json.dumps(result, ensure_ascii=False))

    with open(output_filename, 'w', encoding='utf-8') as output_file:
        output_file.write('\n'.join(jsonl_data))

    total_paragraphs = len(paragraphs)
    written_paragraphs = len(jsonl_data)
    skipped_paragraphs_count = len(skipped_paragraphs)

    print("Summary:")
    print(f"Total paragraphs: {total_paragraphs}")
    print(f"Written paragraphs: {written_paragraphs}")
    print(f"Skipped paragraphs: {skipped_paragraphs_count}")
    print(f"Total retries: {total_retries}")
    print("Skipped paragraph details:")
    for paragraph_number, reason in skipped_paragraphs:
        print(f"Paragraph {paragraph_number}: {reason}")

In [None]:
# Source text to paraphrase and the JSONL file that will receive the results.
file_path = 'G:\\texts\\sample1_B.txt'
output_file_path = 'G:\\texts\\train_data_book1_Drusniel_Meta-Llama-3-8B-Instruct-Q8_0.jsonl'

# OpenAI-compatible client pointed at the server listening on localhost:1234.
client = OpenAI(
    api_key="lm-studio",
    base_url="http://localhost:1234/v1",
)

# Run the whole pipeline: read, paraphrase paragraph by paragraph, write JSONL.
process_text_file(
    file_path=file_path,
    output_filename=output_file_path,
    client=client,
)