In [3]:
import csv
import json
import random
from openai import OpenAI
import re
from html import unescape

In [2]:
# Model identifier sent to the chat-completions endpoint when the caller does
# not supply one.
_DEFAULT_REJECTED_MODEL = "professorf/Meta-Llama-3-8B-Instruct-16f-gguf/llama-3-8b-instruct-16f.gguf"


def generate_rejected_response(client, prompt, temperature, seed, model=_DEFAULT_REJECTED_MODEL):
    """Ask the chat model for a one-paragraph paraphrase of ``prompt``.

    Args:
        client: Object exposing ``chat.completions.create(...)`` in the shape
            of the OpenAI Python client.
        prompt: Text to paraphrase; it is appended to the user instruction.
        temperature: Sampling temperature forwarded to the API.
        seed: Sampling seed forwarded to the API.
        model: Model identifier forwarded to the API. Defaults to the model
            this function has always used.

    Returns:
        The first choice's message content with surrounding whitespace
        removed, or an empty string when the reply carries no content.
    """
    completion = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "system",
                "content": "You are an expert novelist. Paraphrase the given text in your own words, refining the prose and making it a bit richer. Keep a similar length in your output to the original, not more than 50% difference, and shouldn't be shorter than the original text, only a bit more purple prose. Don't add additional text or notes to the output."
            },
            {
                "role": "user",
                "content": "Don't write additional story content or change the meaning, keep the original meaning and form. The response should contain only one paragraph. Rephrase the following text: " + prompt
            }
        ],
        temperature=temperature,
        seed=seed,
    )
    # The content field may be None (e.g. an empty or filtered reply); treat
    # that as an empty answer instead of failing on None.strip().
    content = completion.choices[0].message.content
    return (content or "").strip()

In [3]:
def _read_existing_output(output_file_path):
    """Summarise an existing output CSV so a run can resume without duplicates.

    Returns:
        ``(has_header, row_count, pair_counts)`` where ``pair_counts`` maps
        each ``(prompt, chosen)`` pair already in the file to the number of
        times it occurs. A missing file yields ``(False, 0, {})``; an empty
        file yields ``has_header == False`` so the header gets written.
    """
    pair_counts = {}
    row_count = 0
    has_header = False
    try:
        # newline='' lets the csv module handle line breaks inside quoted fields.
        with open(output_file_path, 'r', newline='', encoding='utf-8') as existing:
            reader = csv.reader(existing)
            # The first record is assumed to be the header row.
            has_header = next(reader, None) is not None
            for row in reader:
                if not row:
                    continue
                row_count += 1
                key = tuple(row[:2])
                pair_counts[key] = pair_counts.get(key, 0) + 1
    except FileNotFoundError:
        pass
    return has_header, row_count, pair_counts


def _rejection_failure(prompt, rejected):
    """Return the reason ``rejected`` is unusable, or None when it is acceptable."""
    prompt_words = len(prompt.split())
    rejected_words = len(rejected.split())
    if "###" in rejected:
        return "Response contains '###'"
    if len(rejected.split('\n\n')) > 1:
        return "Response contains multiple paragraphs"
    if rejected_words < int(0.85 * prompt_words):
        return "Response is less than 85% the size of the input"
    if rejected_words > int(2.5 * prompt_words):
        return "Response is more than 2.5 times the length of the input"
    return None


def process_jsonl_file(input_file_path, output_file_path, client, max_retries=5):
    """Build a prompt/chosen/rejected CSV from a JSONL file of ``text`` records.

    Each input line is a JSON object whose ``text`` field holds a human turn
    and a bot turn separated by ``'\\n<bot>: '``. The human turn (minus its
    instruction prefix) becomes ``prompt``, the bot turn becomes ``chosen``,
    and ``rejected`` is obtained from ``generate_rejected_response``.

    The output file is opened in append mode. Records whose
    ``(prompt, chosen)`` pair is already present in it are not generated
    again, so an interrupted run can be restarted with the same arguments
    without appending duplicate rows. Records that were skipped in an earlier
    run are not in the file and are therefore attempted again.

    Args:
        input_file_path: Path of the JSONL input file.
        output_file_path: Path of the CSV file to create or extend.
        client: Client object forwarded to ``generate_rejected_response``.
        max_retries: Number of generation attempts per record before the
            record is skipped.

    Raises:
        FileNotFoundError: If the input file does not exist.
        json.JSONDecodeError: If a non-blank input line is not valid JSON.
        KeyError: If a record has no ``text`` field.
    """
    total_lines = 0
    resumed_lines = 0
    total_retries = 0
    skipped_lines = []

    has_header, written_lines, already_written = _read_existing_output(output_file_path)

    # Both files are managed by the with-statement so the CSV is flushed and
    # closed even when the API call or JSON parsing raises part-way through.
    with open(input_file_path, 'r', encoding='utf-8') as input_file, \
         open(output_file_path, 'a', newline='', encoding='utf-8') as output_file:
        csv_writer = csv.writer(output_file)
        if not has_header:
            csv_writer.writerow(['prompt', 'chosen', 'rejected'])

        # line_number is the real 1-based position in the input file.
        for line_number, line in enumerate(input_file, start=1):
            if not line.strip():
                continue
            total_lines += 1

            json_data = json.loads(line)
            parts = json_data['text'].split('\n<bot>: ')
            if len(parts) < 2:
                print(f"Skipping line {line_number}: record has no bot reply")
                skipped_lines.append((line_number, "Record has no '<bot>' reply"))
                continue
            prompt = parts[0].replace('<human>: Rephrase the following text in the style of George MacDonald: ', '')
            chosen = parts[1]

            # Resume support: consume one matching row from the previous run
            # instead of generating (and appending) it a second time.
            key = (prompt, chosen)
            if already_written.get(key, 0) > 0:
                already_written[key] -= 1
                resumed_lines += 1
                continue

            rejected = None
            failure_reason = None
            for _ in range(max_retries):
                seed = random.randint(0, 2**32 - 1)
                temperature = round(random.uniform(0.7, 1.0), 1)

                candidate = generate_rejected_response(client, prompt, temperature, seed)
                failure_reason = _rejection_failure(prompt, candidate)
                if failure_reason is None:
                    rejected = candidate
                    break
                total_retries += 1

            if rejected is None:
                print(f"Skipping line {line_number} after {max_retries} retries: {prompt}")
                skipped_lines.append((line_number, failure_reason))
                continue

            csv_writer.writerow([prompt, chosen, rejected])
            written_lines += 1

            if written_lines % 100 == 0:
                # Periodic flush bounds how much work is lost on a hard kill.
                output_file.flush()
                print(f"Progress saved at line {line_number}")
                print(f"Written lines: {written_lines}")
                print(f"Skipped lines: {len(skipped_lines)}")
                print(f"Total retries: {total_retries}")
                print()

    print("Summary:")
    print(f"Total lines: {total_lines}")
    print(f"Already present lines: {resumed_lines}")
    print(f"Written lines: {written_lines}")
    print(f"Skipped lines: {len(skipped_lines)}")
    print(f"Total retries: {total_retries}")
    print("Skipped line details:")
    for line_number, reason in skipped_lines:
        print(f"Line {line_number}: {reason}")

In [4]:
# Source JSONL and the CSV that receives the prompt/chosen/rejected rows
input_file_path = 'G:\\texts\\train_data_book1_Drusniel_Meta-Llama-3-8B-Instruct-Q8_0.jsonl'
output_file_path = 'G:\\texts\\orpo_data_book1_Drusniel_Meta-Llama-3-8B-Instruct-Q8_0.csv'

# OpenAI-compatible client aimed at the server listening on localhost:1234
client = OpenAI(api_key="lm-studio", base_url="http://localhost:1234/v1")

# Generate a rejected response for every record and write the CSV
process_jsonl_file(
    input_file_path=input_file_path,
    output_file_path=output_file_path,
    client=client,
)

Progress saved at line 3600
Written lines: 3600
Skipped lines: 0
Total retries: 0

Progress saved at line 3700
Written lines: 3700
Skipped lines: 0
Total retries: 0

Progress saved at line 3800
Written lines: 3800
Skipped lines: 0
Total retries: 5

Progress saved at line 3900
Written lines: 3900
Skipped lines: 0
Total retries: 5

Skipping line 3940 after 5 retries: "With an enigmatic glint in his eye, Kelsier whispered, 'The secrets I hold would lose their allure if revealed to you.'"
Progress saved at line 4001
Written lines: 4000
Skipped lines: 1
Total retries: 10

Skipping line 4071 after 5 retries: "With a hint of resignation, Kelsier nodded in understanding," he spent twelve long months within those oppressive walls."
Progress saved at line 4102
Written lines: 4100
Skipped lines: 2
Total retries: 15

Progress saved at line 4202
Written lines: 4200
Skipped lines: 2
Total retries: 17

Progress saved at line 4302
Written lines: 4300
Skipped lines: 2
Total retries: 17

Progress saved 

In [24]:
# Single-pass translation table: typographic punctuation -> plain ASCII.
# Code points are written as escapes so the table cannot be silently altered
# by tools that rewrite "smart" punctuation inside source text.
_ASCII_PUNCTUATION = str.maketrans({
    '\u201c': '"',    # left double quotation mark
    '\u201d': '"',    # right double quotation mark
    '\u2018': "'",    # left single quotation mark
    '\u2019': "'",    # right single quotation mark / typographic apostrophe
    '\u2026': '...',  # horizontal ellipsis
    '\u2013': '-',    # en dash
    '\u2014': '-',    # em dash
})


def normalize_dialogue(dialogue):
    """Return ``dialogue`` with typographic punctuation replaced by ASCII.

    HTML entities are decoded first, so entity-encoded punctuation such as
    ``&ldquo;`` or ``&hellip;`` is normalised as well. Curly double quotes
    become ``"``, curly single quotes become ``'``, the ellipsis character
    becomes three dots, and en/em dashes become ``-``. Straight quotes are
    left as they are, so double and single quotes remain distinguishable.

    Args:
        dialogue: Text to normalise.

    Returns:
        The normalised text.
    """
    return unescape(dialogue).translate(_ASCII_PUNCTUATION)

def normalize_file(input_file_path, output_file_path):
    """Copy a CSV file, passing every field through ``normalize_dialogue``.

    All rows are processed, including the first one. The output file is
    created or overwritten. File-access problems are reported on stdout
    rather than raised.

    Args:
        input_file_path: Path of the CSV file to read.
        output_file_path: Path of the CSV file to write.
    """
    try:
        # newline='' on both handles is what the csv module expects; without
        # it on the reader, CR/LF inside quoted fields is silently rewritten.
        with open(input_file_path, 'r', encoding='utf-8', newline='') as infile, \
             open(output_file_path, 'w', encoding='utf-8', newline='') as outfile:

            writer = csv.writer(outfile)
            writer.writerows(
                [normalize_dialogue(field) for field in row]
                for row in csv.reader(infile)
            )

    except FileNotFoundError as exc:
        # Report the path that actually failed; it can be the output path
        # (e.g. a missing directory), not just the input.
        print(f"File not found: {exc.filename or input_file_path}")

    except OSError as exc:
        print(f"Error accessing file: {exc.filename or input_file_path}")

In [25]:
# Write a punctuation-normalised copy of the generated CSV to a new file
input_file_path = 'G:\\texts\\orpo_data_book1_Drusniel_Meta-Llama-3-8B-Instruct-Q8_0.csv'
output_file_path = 'G:\\texts\\orpo_train_data_normalized.csv'
normalize_file(
    input_file_path=input_file_path,
    output_file_path=output_file_path,
)

In [28]:
import csv
import json

def convert_to_json_format(input_file_path, output_file_path):
    """Convert a prompt/chosen/rejected CSV into a JSON list of records.

    Each CSV row becomes an object with the keys ``instruction`` (a fixed
    rephrasing request followed by the row's ``prompt``), ``input`` (always
    an empty string), ``chosen`` and ``rejected``. The list is written with
    two-space indentation. File-access problems are reported on stdout
    rather than raised.

    Args:
        input_file_path: Path of the CSV file; it must have ``prompt``,
            ``chosen`` and ``rejected`` columns.
        output_file_path: Path of the JSON file to create or overwrite.

    Raises:
        KeyError: If a row lacks one of the required columns. The output
            file is not touched in that case.
    """
    try:
        # Read everything before opening the output, so a failure while
        # parsing cannot leave a truncated JSON file behind. newline='' keeps
        # line breaks inside quoted fields intact.
        with open(input_file_path, 'r', encoding='utf-8', newline='') as infile:
            data = [
                {
                    "instruction": f"Rephrase the following text in the style of Drusniel: {row['prompt']}",
                    "input": "",
                    "chosen": row['chosen'],
                    "rejected": row['rejected'],
                }
                for row in csv.DictReader(infile)
            ]

        with open(output_file_path, 'w', encoding='utf-8') as outfile:
            json.dump(data, outfile, indent=2)

    except FileNotFoundError as exc:
        # Report the path that actually failed; it can be the output path.
        print(f"File not found: {exc.filename or input_file_path}")

    except OSError as exc:
        print(f"Error accessing file: {exc.filename or input_file_path}")

In [29]:
# Turn the normalised CSV into the JSON training file
input_file_path = 'G:\\texts\\orpo_train_data_normalized.csv'
output_file_path = 'G:\\texts\\orpo_train_data_normalized.json'
convert_to_json_format(
    input_file_path=input_file_path,
    output_file_path=output_file_path,
)