In [1]:
import json
import pandas as pd

# General-purpose function to load JSON data
def load_json(file_path):
    """Read and parse a JSON file, reporting problems instead of raising.

    Args:
        file_path: Path of the JSON file to open for reading.

    Returns:
        The parsed JSON content, or None when the file is missing, is not
        valid JSON, or any other error occurs. In each failure case a
        message is printed.
    """
    try:
        with open(file_path, 'r') as source:
            parsed = json.load(source)
    except FileNotFoundError:
        print("File not found. Please ensure the file path is correct.")
        return None
    except json.JSONDecodeError:
        print("File is not a valid JSON. Please check the file content.")
        return None
    except Exception as err:
        # Best-effort loader: report anything unexpected and signal failure with None.
        print(f"An error occurred: {err}")
        return None
    return parsed

# Function to count entries in JSON
def count_entries_in_json(data):
    """Return the number of top-level entries in loaded JSON data.

    Args:
        data: Parsed JSON content, or None.

    Returns:
        len(data), or 0 when data is None.
    """
    return 0 if data is None else len(data)

In [3]:
# Path of the JSON file to analyze (absolute, machine-specific path).
file_path = '/home/drusniel/llm_notebooks/analyzed_George_MacDonald.json'
# Load the JSON data. load_json prints a message and returns None when loading
# fails, so `data` may be None in the cells that follow.
data = load_json(file_path)

In [4]:
# Count the top-level entries in the loaded JSON data
# (count_entries_in_json returns 0 when `data` is None).
number_of_entries = count_entries_in_json(data)
print(f"Number of entries in the JSON file: {number_of_entries}")

Number of entries in the JSON file: 182


In [5]:
# Function to create a DataFrame from JSON data with tones and their occurrences
# Prompt: map these tones into the 10 most significant tones for writing: Alarming, Light-hearted... give me the result in the following format:     tone_mapping = {
#         "Alarming": "Dramatic", "Gruff": "Dramatic", "Heated": "Dramatic", "Indignant": "Dramatic",
#         "Suspenseful": "Dramatic", "Tense": "Dramatic", "Philosophical": "Thoughtful", "Pensive": "Thoughtful",
#         "Reflective": "Thoughtful", "Inquisitive": "Thoughtful", "Curious": "Thoughtful", "Skeptical": "Thoughtful",
#         "Light-hearted": "Positive", "Optimistic": "Positive", "Excited": "Positive", "Calm": "Calm",
#         "Casual": "Calm", "Solemn": "Calm", "Determined": "Determined", "Resolute": "Determined",
#         "Mysterious": "Mysterious", "Stealthy": "Mysterious", "Cautious": "Mysterious"
#     }

def tone_occurrences_dataframe(data):
    """Tabulate how often each tone appears across the entries.

    Entries whose 'tone' is missing or falsy are ignored. As a side effect,
    the distinct tones are printed as one comma-separated line, in order of
    first appearance.

    Args:
        data: Iterable of dict-like entries (each read via .get('tone')),
            or None.

    Returns:
        A DataFrame with columns 'Tone' and 'Occurrences'; empty when data
        is None.
    """
    columns = ['Tone', 'Occurrences']
    if data is None:
        return pd.DataFrame(columns=columns)

    occurrences = {}
    for record in data:
        label = record.get('tone', None)
        if not label:
            continue
        if label in occurrences:
            occurrences[label] += 1
        else:
            occurrences[label] = 1

    # One row per distinct tone, in first-seen order
    tones_df = pd.DataFrame(list(occurrences.items()), columns=columns)

    # Echo the distinct tones as a single comma-separated line
    print("Comma-separated tones:", ", ".join(occurrences.keys()))

    return tones_df

# Build the tone-frequency table from the loaded entries (the call also prints
# the distinct tones as a comma-separated line), then print the table.
df_tones = tone_occurrences_dataframe(data)
print(df_tones)

Comma-separated tones: serene, charming, mysterious, intriguing, reflective, dreary, listless, suggestive, anticipative, curious, adventurous, tense, panicked, resolute, hopeful, concerned, evasive, imaginative, skeptical, insistent, dismissive, defensive, sarcastic, facetious, inquiring, assertive, challenging, indignant, accusatory, sorrowful, reproachful, adamant, disbelieving, frustrated, tender, conciliatory, forgiving, loving, affronted, offended, reassuring, irritated, apologetic, confident, endearing, thoughtful, calculative, friendly, accommodating, perplexed, unattended, unexpected, distressed, surprised, resolved, disappointed, gloomy, brightening, picturesque, disenchanted, reluctant, startled, anxious, realistic, unperturbed, frightened, terrified, alarmed, confused, exasperated, horrified, desperate, exhausted, dismayed, hopeless, disoriented, descriptive, inquisitive, pleading, cautious, reassured, playful, nervous, grateful, affectionate, action, relieved, conflicted, n

In [7]:
# Function to rewrite JSON data with grouped tones and add an incremental ID
def rewrite_json_with_grouped_tones(data, output_file_path):
    """Group each entry's tone into a broader category, add IDs, and save.

    Every entry gets its 'tone' replaced by the mapped group (tones without
    a mapping are kept as they are) and an incremental 'id' starting at 1.
    The entries of `data` are updated in place before being written.

    The tone lookup is case-insensitive: the mapping keys are capitalized
    ("Serene") while the tones found in the data are lowercase ("serene"),
    so an exact-case lookup would never match and no grouping would happen.

    Args:
        data: List of dict entries, or None.
        output_file_path: Path of the JSON file to write.

    Returns:
        A status message: success, or "An error occurred: ..." when there
        is no data or writing fails.
    """
    tone_mapping = {
        "Serene": "Calm", "Charming": "Positive", "Mysterious": "Mysterious",
        "Intriguing": "Thoughtful", "Reflective": "Thoughtful", "Dreary": "Gloomy",
        "Listless": "Gloomy", "Suggestive": "Thoughtful", "Anticipative": "Positive",
        "Curious": "Thoughtful", "Adventurous": "Excited", "Tense": "Dramatic",
        "Panicked": "Dramatic", "Resolute": "Determined", "Hopeful": "Positive",
        "Concerned": "Thoughtful", "Evasive": "Defensive", "Imaginative": "Creative",
        "Skeptical": "Thoughtful", "Insistent": "Assertive", "Dismissive": "Defensive",
        "Defensive": "Defensive", "Sarcastic": "Humorous", "Facetious": "Humorous",
        "Inquiring": "Thoughtful", "Assertive": "Assertive", "Challenging": "Dramatic",
        "Indignant": "Dramatic", "Accusatory": "Dramatic", "Sorrowful": "Gloomy",
        "Reproachful": "Gloomy", "Adamant": "Determined", "Disbelieving": "Skeptical",
        "Frustrated": "Dramatic", "Tender": "Affectionate", "Conciliatory": "Calm",
        "Forgiving": "Calm", "Loving": "Affectionate", "Affronted": "Defensive",
        "Offended": "Defensive", "Reassuring": "Calm", "Irritated": "Dramatic",
        "Apologetic": "Calm", "Confident": "Assertive", "Endearing": "Affectionate",
        "Thoughtful": "Thoughtful", "Calculative": "Strategic", "Friendly": "Positive",
        "Accommodating": "Calm", "Perplexed": "Confused", "Unattended": "Negative",
        "Unexpected": "Surprised", "Distressed": "Dramatic", "Surprised": "Surprised",
        "Resolved": "Determined", "Disappointed": "Gloomy", "Gloomy": "Gloomy",
        "Brightening": "Positive", "Picturesque": "Creative", "Disenchanted": "Gloomy",
        "Reluctant": "Defensive", "Startled": "Surprised", "Anxious": "Nervous",
        "Realistic": "Thoughtful", "Unperturbed": "Calm", "Frightened": "Dramatic",
        "Terrified": "Dramatic", "Alarmed": "Dramatic", "Confused": "Confused",
        "Exasperated": "Dramatic", "Horrified": "Dramatic", "Desperate": "Dramatic",
        "Exhausted": "Gloomy", "Dismayed": "Gloomy", "Hopeless": "Gloomy",
        "Disoriented": "Confused", "Descriptive": "Creative", "Inquisitive": "Thoughtful",
        "Pleading": "Dramatic", "Cautious": "Defensive", "Reassured": "Calm",
        "Playful": "Humorous", "Nervous": "Nervous", "Grateful": "Positive",
        "Affectionate": "Affectionate", "Action": "Excited", "Relieved": "Positive",
        "Conflicted": "Dramatic", "Noble": "Positive", "Contemplative": "Thoughtful",
        "Dramatic": "Dramatic", "Ethical": "Thoughtful", "Consoling": "Calm",
        "Focused": "Determined", "Intrigued": "Thoughtful", "Animated": "Excited",
        "Questioning": "Thoughtful", "Amused": "Humorous", "Conspiratorial": "Mysterious",
        "Observant": "Thoughtful", "Strategic": "Strategic", "Sneaky": "Mysterious",
        "Stealthy": "Mysterious", "Commanding": "Assertive", "Humorous": "Humorous",
        "Anticipatory": "Positive", "Awestruck": "Excited", "Political": "Dramatic",
        "Formal": "Formal", "Joyful": "Positive", "Excited": "Excited", "Vivid": "Creative",
        "Intimate": "Affectionate", "Wistful": "Gloomy", "Melancholic": "Gloomy",
        "Painful": "Gloomy", "Mystical": "Mysterious", "Inviting": "Positive",
        "Introspective": "Thoughtful", "Philosophical": "Thoughtful"
    }

    # load_json signals failure with None; report it in this function's
    # message style instead of crashing in enumerate().
    if data is None:
        return "An error occurred: no data to rewrite."

    # Lowercase-keyed view of the mapping for case-insensitive lookups.
    normalized_mapping = {tone.lower(): group for tone, group in tone_mapping.items()}

    for index, entry in enumerate(data, start=1):
        original_tone = entry.get('tone')
        if isinstance(original_tone, str):
            # Unmapped tones fall back to the original value, unchanged.
            entry['tone'] = normalized_mapping.get(original_tone.strip().lower(), original_tone)
        else:
            # Missing (None) or non-string tones are stored back as they were read.
            entry['tone'] = original_tone
        entry['id'] = index

    try:
        with open(output_file_path, 'w') as file:
            json.dump(data, file, indent=4)
        return "JSON data has been rewritten successfully."
    except Exception as e:
        return f"An error occurred: {str(e)}"

# File paths. Note: `file_path` is not passed to the call below; the rewrite
# works on the `data` object that is already loaded in memory.
file_path = '/home/drusniel/llm_notebooks/analyzed_George_MacDonald.json'
output_file_path = '/home/drusniel/llm_notebooks/modified_data_George_MacDonald.json'

# Rewrite the entries with grouped tones and incremental IDs and save them.
# The function updates the entries of `data` in place before writing and
# returns a status message rather than raising on write errors.
result = rewrite_json_with_grouped_tones(data, output_file_path)
print(result)

JSON data has been rewritten successfully.


In [12]:
def generate_prompts_for_rewriting(input_file_path, output_file_path):
    """Write one rewriting prompt per JSON entry to a text file.

    For every entry that has a 'text' field, a prompt asking for a rewrite
    (tagged with the entry's 1-based position as 'id') is written; entries
    without 'text' produce a "No text found" line instead.

    The input is parsed and all prompts are built before the output file is
    opened. Opening the output first would truncate an existing prompt file
    even when the input turns out to be invalid, and a failure part-way
    through would leave a partial file behind.

    Args:
        input_file_path: Path of the JSON file holding the entries.
        output_file_path: Path of the text file that receives the prompts.

    Returns:
        A status message describing success or the kind of failure.
    """
    try:
        with open(input_file_path, 'r') as json_file:
            data = json.load(json_file)

        prompts = []
        for index, entry in enumerate(data, start=1):
            if 'text' in entry:
                text = entry['text']
                prompts.append(
                    f"Rewrite the following text in JSON format with your own words, "
                    f"that is clearly recognized as GPT4 generated text by the descriptive wording and tone "
                    f"and return the result in a JSON format with a field called 'id' that has number {index} "
                    f"and a field called 'gptText' that has the text you rewrote, the text is:\n{text}\n\n"
                )
            else:
                prompts.append(f"No text found for entry {index}.\n\n")

        # Only now touch the output file, once everything to write is known.
        with open(output_file_path, 'w') as text_file:
            text_file.writelines(prompts)
        return "Prompts have been written successfully to the text file."
    except FileNotFoundError:
        return "The file was not found."
    except json.JSONDecodeError:
        return "The file is not a valid JSON file."
    except Exception as e:
        return f"An error occurred: {str(e)}"

In [13]:
# File paths: the JSON file to read entries from, and the text file that
# receives one prompt per entry.
file_path = '/home/drusniel/llm_notebooks/analyzed_George_MacDonald.json'
output_file_path = '/home/drusniel/llm_notebooks/instructions_George_MacDonald.txt'

# Generate prompts for rewriting the text; the function returns a status
# message (success or a description of the failure), which is printed.
result = generate_prompts_for_rewriting(file_path, output_file_path)
print(result)

Prompts have been written successfully to the text file.


In [14]:
import re

def escape_special_chars(text):
    """Escape backslashes and replace typographic punctuation with ASCII.

    Curly double quotes become an escaped straight quote (backslash + "),
    the curly apostrophe becomes ', and the em dash becomes -.

    Both forms of each character are handled: the literal six-character
    escape text (a backslash followed by u201c, etc.) and the real Unicode
    character. Strings that come out of json.load contain the real
    characters, so matching only the literal escape text leaves them
    untouched.

    Args:
        text: The string to process.

    Returns:
        The processed string.
    """
    # Escape backslashes first; the substitutions below add their own.
    text = text.replace('\\', '\\\\')

    substitutions = (
        # Literal escape text (backslash + uXXXX), as matched originally.
        ('\\u201c', '\\"'),
        ('\\u201d', '\\"'),
        ('\\u2019', "'"),
        ('\\u2014', "-"),
        # The actual Unicode characters.
        ('\u201c', '\\"'),
        ('\u201d', '\\"'),
        ('\u2019', "'"),
        ('\u2014', "-"),
    )
    for target, replacement in substitutions:
        text = text.replace(target, replacement)
    return text

def generate_train_data(gpt_file, modified_file, output_file):
    """Pair rewritten texts with their originals and write JSONL training data.

    Entries from gpt_file are matched to entries from modified_file by 'id'.
    For each match, one JSON line is written containing a
    "<human>: ... <bot>: ..." text and a fixed metadata dictionary. Entries
    from gpt_file with no matching id are skipped.

    Args:
        gpt_file: JSON file whose entries provide 'id' and 'gptText'.
        modified_file: JSON file whose entries provide 'id', 'author',
            'tone', 'type' and 'text'.
        output_file: Path of the JSONL file to write.
    """
    with open(gpt_file, 'r') as gpt_handle:
        gpt_data = json.load(gpt_handle)

    with open(modified_file, 'r') as modified_handle:
        modified_data = json.load(modified_handle)

    # Index the original entries by id for direct lookup
    originals_by_id = {}
    for record in modified_data:
        originals_by_id[record['id']] = record

    with open(output_file, 'w') as sink:
        for rewritten in gpt_data:
            gpt_id = rewritten['id']
            gpt_text = rewritten['gptText']

            if gpt_id not in originals_by_id:
                continue

            original = originals_by_id[gpt_id]
            # 'tone' and 'type' are read but not used in the output; the
            # lookups still raise KeyError when an entry lacks them.
            author, _tone, _text_type, original_text = (
                original[field] for field in ('author', 'tone', 'type', 'text')
            )

            # Escape special characters and Unicode in both texts
            gpt_text = escape_special_chars(gpt_text)
            original_text = escape_special_chars(original_text)

            line = {
                "text": f"<human>:Rephrase the following text with in the style of {author}: {gpt_text}\\n<bot>: {original_text}",
                "metadata": {"source": "gutenberg"},
            }
            sink.write(json.dumps(line) + '\n')

    print(f"Train data generated successfully. Output file: {output_file}")
    
# Inputs: the file whose entries carry 'id' and 'gptText', and the
# grouped-tone file written by an earlier cell. Output: the JSONL training file.
gpt_file = '/home/drusniel/llm_notebooks/gptStyle_George_MacDonald.json'
modified_file = '/home/drusniel/llm_notebooks/modified_data_George_MacDonald.json'
output_file = '/home/drusniel/llm_notebooks/train_data_George_MacDonald.jsonl'

# Match entries by id and write one training example per matched pair.
generate_train_data(gpt_file, modified_file, output_file)

Train data generated successfully. Output file: /home/drusniel/llm_notebooks/train_data_George_MacDonald.jsonl


In [15]:
import json

def convert_jsonl_to_json(input_file, output_file):
    """Convert "<human>: ... <bot>: ..." JSONL records into a JSON list.

    Each input line is a JSON object with a 'text' field. The text is split
    at the first '<bot>:' marker: the part before it (with '<human>:'
    removed) becomes 'instruction', the part after it becomes 'output', and
    'input' is always an empty string. Only the first marker is used as the
    separator, so an answer that itself contains '<bot>:' is kept whole.
    Blank lines in the input are skipped.

    Args:
        input_file: Path of the JSONL file to read.
        output_file: Path of the JSON file to write.
    """
    json_data = []

    with open(input_file, 'r') as file:
        for line in file:
            # A blank line is not a JSON document; skip it.
            if not line.strip():
                continue

            record = json.loads(line)
            text = record['text']

            # Split once, at the first marker; output is '' when it is absent.
            prompt_part, _, answer_part = text.partition('<bot>:')
            instruction = prompt_part.strip().replace('<human>:', '').strip()
            output = answer_part.strip()

            json_data.append({
                'instruction': instruction,
                'input': '',
                'output': output
            })

    # Write the JSON data to the output file
    with open(output_file, 'w') as file:
        json.dump(json_data, file, indent=2)

    print(f"Conversion completed. Output file: {output_file}")

# Specify the paths: the JSONL training file to read, and the JSON file that
# receives the instruction/input/output records.
input_file = '/home/drusniel/llm_notebooks/train_data_George_MacDonald.jsonl'
output_file = '/home/drusniel/llm_notebooks/train_data_llama_factory_George_MacDonald.json'

# Call the function to convert the file (it prints a completion message).
convert_jsonl_to_json(input_file, output_file)

Conversion completed. Output file: /home/drusniel/llm_notebooks/train_data_llama_factory_George_MacDonald.json
