In [None]:
import json
import os
import sys
import warnings
from typing import Dict, List, Optional

src_path = os.path.abspath(os.path.join(os.getcwd(), '..'))
if src_path not in sys.path:
    sys.path.append(src_path)

from utils.helper import load_env
from haystack import Pipeline
from haystack.components.builders import PromptBuilder
from haystack_integrations.components.generators.ollama import OllamaGenerator
from haystack_integrations.components.generators.anthropic import AnthropicGenerator
from haystack.utils import Secret

In [None]:
# Silence every Python warning for the rest of the kernel session so the
# notebook output stays readable. NOTE: this filter is global, so it also
# hides deprecation warnings raised by the imported libraries.
warnings.filterwarnings('ignore')
# Project helper imported from utils.helper; presumably loads variables from a
# .env file into the process environment — TODO confirm against its source.
load_env()
# Value of the ANTHROPIC_API_KEY environment variable, or None when it is unset.
# NOTE(review): this name is not referenced by the other visible cells — the
# generator resolves the same variable itself via Secret.from_env_var.
ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY")

In [3]:
def read_classified_posts(author_name: str, input_dir: str = "classified_posts") -> List[Dict]:
    """
    Load an author's classified posts from their JSON file.

    The file is looked up at
    ``<cwd>/../../data/<input_dir>/<author_name>_classified.json``, i.e. the
    project root is taken to be two directory levels above the current
    working directory.

    Args:
        author_name (str): Author whose ``<author_name>_classified.json`` file is read.
        input_dir (str): Sub-directory of ``data`` that holds the file.

    Returns:
        List[Dict]: The parsed JSON content of the file (expected to be a
        list of post dictionaries).

    Raises:
        FileNotFoundError: If the JSON file does not exist.
    """
    root_dir = os.path.abspath(os.path.join(os.getcwd(), '..', '..'))
    posts_path = os.path.join(root_dir, 'data', input_dir, f"{author_name}_classified.json")

    # Happy path first: parse and hand back the file as soon as it is found.
    if os.path.exists(posts_path):
        with open(posts_path, 'r', encoding='utf-8') as handle:
            return json.load(handle)

    raise FileNotFoundError(f"The file {posts_path} does not exist.")

In [18]:
# Jinja-style prompt template rendered by the PromptBuilder created at the end
# of this cell. It uses a single template variable, `posts`: an iterable whose
# items expose `title` and `content` (see the `{% for post in posts %}` loop).
# The text asks the model to analyse the posts along 15 style dimensions and
# to answer with a numbered list of writing instructions.
prompt_template = """
Analyze the following blog posts by a single author. Based on these posts, create a comprehensive set of instructions for writing a new blog post that authentically mimics the author's unique style. Focus on the following aspects:

1. Tone and Voice:
   - Overall mood (e.g., formal, casual, humorous, serious)
   - Level of personal disclosure
   - Use of rhetorical devices (e.g., irony, sarcasm, metaphors)

2. Writing Style:
   - Sentence complexity (e.g., short and punchy, long and elaborate)
   - Paragraph structure and length
   - Use of literary devices (e.g., alliteration, analogies, personification)
   - Balance between description and action

3. Content Structure:
   - Typical post length
   - Introduction style (e.g., anecdotal, question-based, declarative)
   - Body organization (e.g., chronological, problem-solution, compare-contrast)
   - Conclusion style (e.g., call-to-action, summary, open-ended question)

4. Language and Vocabulary:
   - Level of technicality or jargon
   - Use of industry-specific terms
   - Frequency of adjectives and adverbs
   - Preferred word choices for common concepts

5. Engagement Techniques:
   - Use of questions to engage readers
   - Inclusion of personal anecdotes or examples
   - References to pop culture, current events, or other external sources
   - Calls-to-action and their placement

6. Formatting and Visual Elements:
   - Use of headings and subheadings
   - Frequency and style of bullet points or numbered lists
   - Incorporation of quotes or block quotes
   - Use of bold, italic, or other text formatting

7. Emoji and Special Characters:
   - Frequency and context of emoji usage
   - Any recurring special characters or symbols

8. Unique Quirks:
   - Signature phrases or expressions
   - Recurring themes or topics
   - Distinctive ways of opening or closing posts

9. Argumentative Style:
   - Approach to presenting opinions (e.g., direct, subtle, balanced)
   - Use of evidence or sources to support arguments
   - Handling of counterarguments

10. Reader Interaction:
    - Addressing the reader directly
    - Encouraging comments or discussions
    - Referencing previous posts or promising future content

11. Link and Reference Patterns:
   - Frequency and placement of external links
   - Types of resources commonly referenced (e.g., GitHub, documentation, blog posts)
   - Style of attributions and citations
   - Use of internal cross-references to other posts

12. Problem-Solving Approach:
    - How technical challenges are introduced
    - Step-by-step problem breakdown patterns
    - Ways of presenting alternative solutions
    - Methods for explaining trade-offs
    - Patterns in discussing implementation details

13. Knowledge Assumptions:
   - Level of technical background assumed
   - How basic concepts are explained vs. advanced topics
   - Use of analogies to explain complex topics
   - Ways of making technical content accessible

14. Anti-Patterns to Avoid:
   - Writing styles that the author never uses
   - Topics or approaches that would feel out of character
   - Technical concepts typically avoided or delegated
   - Tonal elements that would break authenticity

15. Context Adaptations:
   - How the writing style varies between technical tutorials vs. project updates
   - Tone differences between success stories and challenge discussions
   - Style adjustments for different audience technical levels
   - Language patterns for different types of technical content

Blog posts:
{% for post in posts %}
Title: {{post.title}}
Content: {{post.content}}

{% endfor %}

Based on these posts, provide a detailed set of instructions for writing a new blog post that convincingly mimics this author's style. The instructions should be comprehensive enough that a skilled writer could use them to create a post that seems authentically written by the original author.

Your response should be in the following format:

Writing Instructions for [Author Name]:
1. [First instruction]
2. [Second instruction]
3. [Third instruction]
...

Ensure your instructions cover all the aspects mentioned above, providing specific examples from the author's writing where relevant. The goal is to capture not just the superficial elements of the author's style, but also the underlying thought processes and approaches to content creation.
"""

# Component that renders the template above. NOTE(review): the author's name
# is not a template variable, so the "[Author Name]" placeholder in the
# requested output format can only be filled from the posts themselves.
prompt_builder = PromptBuilder(template=prompt_template)

In [None]:
# Alternative local backend, kept for reference: an Ollama-served "qwen2.5"
# model. To use it, uncomment this block and comment out the Anthropic
# generator below.
# generator = OllamaGenerator(model="qwen2.5",
#                             url = "http://localhost:11434",
#                             generation_kwargs={
#                             "num_predict": 1000,
#                             "temperature": 0.7,
#                             })

# Hosted backend actually used by the pipeline. The API key is resolved from
# the ANTHROPIC_API_KEY environment variable through Secret.from_env_var
# instead of being passed around as a plain string.
generator = AnthropicGenerator(
    api_key=Secret.from_env_var("ANTHROPIC_API_KEY"),
    model="claude-3-5-sonnet-20241022",
    generation_kwargs={
        "max_tokens": 8192,  # Max possible in Claude 3.5 sonnet
        "temperature": 0.7   # Optional: adjust creativity vs consistency
    }
)

# Two-step pipeline: "prompt_builder" renders the prompt, "llm" generates the
# reply. Downstream code reads the result under the "llm" component name.
pipeline = Pipeline()
pipeline.add_component(instance=prompt_builder, name="prompt_builder")
pipeline.add_component(instance=generator, name="llm")
# Connecting by component name only leaves the socket matching to Haystack —
# presumably the builder's prompt output feeds the generator's prompt input;
# confirm against the installed Haystack version.
pipeline.connect("prompt_builder", "llm")

In [21]:
def save_instructions_to_disk(author_name: str, instruction_type: str, instructions: str):
    """
    Write generated instructions to the project's ``data/author_instructions`` folder.

    The target file is
    ``<cwd>/../../data/author_instructions/<author_name>_<instruction_type>_instructions.txt``.
    The directory is created when missing and an existing file is overwritten.
    A confirmation line with the final path is printed afterwards.

    Args:
        author_name (str): Author the instructions belong to.
        instruction_type (str): Kind of instructions; part of the file name.
        instructions (str): Text written to the file.
    """
    target_dir = os.path.join(
        os.path.abspath(os.path.join(os.getcwd(), '..', '..')),
        'data',
        'author_instructions',
    )
    os.makedirs(target_dir, exist_ok=True)

    target_file = os.path.join(target_dir, f"{author_name}_{instruction_type}_instructions.txt")
    with open(target_file, 'w', encoding='utf-8') as out:
        out.write(instructions)

    print(f"{instruction_type.capitalize()} instructions saved to {target_file}")


def load_instructions_from_disk(author_name: str, instruction_type: str = "writing") -> Optional[str]:
    """
    Load previously saved instructions for an author, if any exist.

    Files are looked up in ``<cwd>/../../data/author_instructions``. The name
    written by ``save_instructions_to_disk``
    (``<author_name>_<instruction_type>_instructions.txt``) is tried first;
    the older ``<author_name>_instructions.txt`` name is kept as a fallback so
    existing files remain loadable.

    Args:
        author_name (str): Author whose instructions are requested.
        instruction_type (str): Kind of instructions, matching the value used
            when saving. Defaults to "writing".

    Returns:
        Optional[str]: The file content, or None when no matching file exists.
    """
    project_root = os.path.abspath(os.path.join(os.getcwd(), '..', '..'))
    instructions_dir = os.path.join(project_root, 'data', 'author_instructions')

    candidate_paths = (
        # Name produced by save_instructions_to_disk.
        os.path.join(instructions_dir, f"{author_name}_{instruction_type}_instructions.txt"),
        # Legacy name without the instruction type.
        os.path.join(instructions_dir, f"{author_name}_instructions.txt"),
    )

    for file_path in candidate_paths:
        if os.path.exists(file_path):
            with open(file_path, 'r', encoding='utf-8') as f:
                return f.read()

    return None

def generate_author_writing_prompt(author_name: str, instruction_type: str = "writing") -> str:
    """
    Generate style-mimicking writing instructions for an author and save them.

    Reads the author's classified posts, keeps only those marked as
    non-confidential, runs them through the module-level ``pipeline`` and
    stores the first reply via ``save_instructions_to_disk``.

    Args:
        author_name (str): Author whose classified posts are analysed.
        instruction_type (str): Label used in log messages and in the name of
            the saved file. Defaults to "writing".

    Returns:
        str: The generated instructions. When the author has no
        non-confidential posts, a "No non-confidential posts found ..."
        message is returned instead and nothing is saved.
    """
    print(f"Generating {instruction_type} writing prompt for author: {author_name}")

    # Keep a post only when its 'confidential' field compares equal to False;
    # posts without the field (None) are therefore left out as well.
    shareable_posts = []
    for entry in read_classified_posts(author_name):
        if entry.get('confidential') == False:
            shareable_posts.append(entry)

    if len(shareable_posts) == 0:
        return f"No non-confidential posts found for {author_name}"

    pipeline_output = pipeline.run(data={"posts": shareable_posts})
    generated_text = pipeline_output["llm"]["replies"][0]

    print(f"{instruction_type.capitalize()} writing instructions generated for {author_name}")
    save_instructions_to_disk(author_name, instruction_type, generated_text)
    return generated_text

In [None]:
# Driver cell: generate and display the instructions for a single author.
# `author_name` selects the <author_name>_classified.json input file and is
# also part of the name of the saved instructions file.
author_name = "author1"
instruction_type = "writing"
# Runs the LLM pipeline, so this call goes out to the configured generator.
instructions = generate_author_writing_prompt(author_name, instruction_type)
print(instructions)