In [None]:
import os
import json
import re
import sys
from typing import List, Dict
from pathlib import Path
import warnings

# Put the directory one level above the current working directory on the
# import path (presumably so the `utils.helper` import that follows resolves
# when the notebook is run from its own folder).
src_path = os.path.abspath(os.path.join(os.getcwd(), '..'))
if sys.path.count(src_path) == 0:
    # Only append once so re-running the cell does not grow sys.path.
    sys.path.append(src_path)

from utils.helper import load_env
from haystack import Pipeline
from haystack.components.builders import PromptBuilder
from haystack_integrations.components.generators.anthropic import AnthropicGenerator
# from haystack_integrations.components.generators.ollama import OllamaGenerator
from haystack.utils import Secret

In [None]:
# Suppress every warning for the rest of the session so library noise does
# not clutter the notebook output.
warnings.filterwarnings('ignore')

# Project helper; presumably loads variables from a .env file into the
# process environment (TODO confirm in utils.helper).
load_env()

# None when the variable is not set.
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY")

In [None]:
# Jinja-style prompt template for the style-transfer step.
#
# Variables referenced by the template:
#   instructions   - the author's writing instructions (plain text)
#   example_posts  - iterable of posts; each must expose `title` and `content`
#   draft_title    - title of the draft to rewrite
#   draft_content  - body of the draft to rewrite
#
# The model is asked to answer with a "TITLE:" section followed by a
# "CONTENT:" section; apply_author_style() parses the reply using exactly
# those markers, so keep the two in sync when editing the format below.
apply_instructions_template = """
You are an AI writing assistant tasked with applying the following writing instructions to a given blog post draft. Your goal is to modify the draft to match the style and characteristics described in the instructions.

Writing Instructions:
{{instructions}}

Here are some of the author's most successful posts for reference:
{% for post in example_posts %}
Example Post {{loop.index}}:
Title: {{post.title}}
Content: {{post.content}}

{% endfor %}

When applying the writing instructions to the draft below, pay special attention to:
1. How the example posts implement the writing instructions in practice
2. Common patterns in structure, tone, and style across the example posts
3. Specific techniques the author uses to engage readers
4. Ways the author introduces and explains technical concepts
5. Characteristic phrases, transitions, and writing devices

Original Blog Post Draft:
Title: {{draft_title}}
Content: {{draft_content}}

Please rewrite the draft to match the author's style, using both the writing instructions and example posts as guides. Maintain the technical accuracy and core message while adapting the presentation to match the author's voice.

Output the rewritten post in this format:
TITLE: (the revised title)

CONTENT:
(the revised content)
"""

# Renders `apply_instructions_template` into the prompt sent to the LLM.
apply_instructions_prompt_builder = PromptBuilder(template=apply_instructions_template)

# Local alternative (not active): an OllamaGenerator with model "qwen2.5" at
# http://localhost:11434, generation_kwargs num_predict=1000 and
# temperature=0.7. Re-enable the commented OllamaGenerator import to use it.

# The model id can be overridden with the ANTHROPIC_MODEL environment
# variable; when it is unset or empty the original snapshot id is used.
# NOTE(review): dated Claude snapshots get retired by Anthropic over time -
# confirm the default below is still served before relying on it.
generator = AnthropicGenerator(
    api_key=Secret.from_env_var("ANTHROPIC_API_KEY"),
    model=os.getenv("ANTHROPIC_MODEL") or "claude-3-sonnet-20240229",
    generation_kwargs={
        "max_tokens": 4096,
        "temperature": 0.7,
    }
)

# Two-step pipeline: prompt builder -> LLM. The component names are relied on
# elsewhere in the notebook ("llm" is used to look up the replies).
apply_instructions_pipeline = Pipeline()
apply_instructions_pipeline.add_component(instance=apply_instructions_prompt_builder, name="apply_instructions_prompt_builder")
apply_instructions_pipeline.add_component(instance=generator, name="llm")
apply_instructions_pipeline.connect("apply_instructions_prompt_builder", "llm")


In [4]:
def read_classified_posts(author_name: str, input_dir: str = "classified_posts") -> List[Dict]:
    """
    Load an author's classified posts from ``<root>/data/<input_dir>``.

    ``<root>`` is the directory two levels above the current working
    directory, and the file read is ``<author_name>_classified.json``.

    Args:
        author_name (str): Author whose classified posts should be loaded.
        input_dir (str): Sub-directory of ``data`` that holds the JSON file.

    Returns:
        List[Dict]: The decoded JSON document, returned as-is.

    Raises:
        FileNotFoundError: If the expected JSON file does not exist.
    """
    two_levels_up = os.path.abspath(os.path.join(os.getcwd(), '..', '..'))
    json_filename = os.path.join(
        two_levels_up, 'data', input_dir, f"{author_name}_classified.json"
    )

    if os.path.exists(json_filename):
        with open(json_filename, 'r', encoding='utf-8') as handle:
            return json.load(handle)

    raise FileNotFoundError(f"The file {json_filename} does not exist.")


def read_top_posts(author_name: str, n: int = 20) -> List[Dict]:
    """Return the ``n`` most engaging non-confidential posts for an author.

    Posts are loaded with ``read_classified_posts`` and kept only when their
    ``confidential`` field equals ``False``; posts without that field are
    therefore excluded. The remaining posts are ranked by

        like_count + 2 * comment_count + 0.1 * views

    in descending order (ties keep their original file order).

    Args:
        author_name (str): Author whose classified posts are ranked.
        n (int): Maximum number of posts to return. Zero or a negative value
            yields an empty list.

    Returns:
        List[Dict]: Up to ``n`` post dictionaries, best first.

    Raises:
        FileNotFoundError: Propagated from ``read_classified_posts`` when the
            author's classified file is missing.
    """
    def _metric(post: Dict, key: str):
        # `.get(key, 0)` still returns None for an explicit JSON null, which
        # would make the arithmetic below raise TypeError; `or 0` covers both
        # the missing-key and the null case.
        return post.get(key) or 0

    def _engagement(post: Dict):
        return (
            _metric(post, 'like_count')
            + _metric(post, 'comment_count') * 2
            + _metric(post, 'views') * 0.1
        )

    posts = read_classified_posts(author_name)

    # Deliberately `== False` (not a truthiness test): a missing or null flag
    # must not be treated as "safe to use".
    non_confidential_posts = [p for p in posts if p.get('confidential') == False]  # noqa: E712

    sorted_posts = sorted(non_confidential_posts, key=_engagement, reverse=True)

    # Clamp so that a negative n cannot turn into an "all but the last" slice.
    return sorted_posts[:max(n, 0)]


def save_post_instructions(author_name: str, instruction_type: str, post_title: str, instructions: str):
    """Write ``instructions`` to a text file named after the author and post.

    The file is created under ``<root>/data/post_instructions`` (``<root>`` is
    two levels above the current working directory; the folder is created if
    needed) and is named
    ``<author_name>_<instruction_type>_<safe title>_instructions.txt``.
    An existing file with the same name is overwritten.

    Args:
        author_name (str): Author identifier used as the filename prefix.
        instruction_type (str): Instruction category used in the filename.
        post_title (str): Title to derive the filename-safe middle part from.
        instructions (str): Text written to the file (UTF-8).
    """
    root_dir = os.path.abspath(os.path.join(os.getcwd(), '..', '..'))
    instructions_dir = os.path.join(root_dir, 'data', 'post_instructions')
    os.makedirs(instructions_dir, exist_ok=True)

    # Keep only letters, digits, spaces, hyphens and underscores, drop
    # trailing whitespace, then turn the remaining spaces into underscores.
    allowed_extras = (' ', '-', '_')
    kept_chars = [ch for ch in post_title if ch.isalnum() or ch in allowed_extras]
    safe_title = "".join(kept_chars).rstrip().replace(' ', '_')

    file_path = os.path.join(
        instructions_dir,
        f"{author_name}_{instruction_type}_{safe_title}_instructions.txt",
    )
    with open(file_path, 'w', encoding='utf-8') as out_file:
        out_file.write(instructions)

    print(f"Post instructions saved to {file_path}")


def load_instructions_from_disk(author_name: str, instruction_type: str) -> str:
    """Read an author's stored instructions of the given type.

    The file looked up is
    ``<root>/data/author_instructions/<author_name>_<instruction_type>_instructions.txt``
    where ``<root>`` is two levels above the current working directory.

    Args:
        author_name (str): Author whose instructions are requested.
        instruction_type (str): Instruction category (part of the filename).

    Returns:
        str: The full text of the instructions file.

    Raises:
        FileNotFoundError: If no such instructions file exists.
    """
    root_dir = os.path.abspath(os.path.join(os.getcwd(), '..', '..'))
    instructions_file = f"{author_name}_{instruction_type}_instructions.txt"
    file_path = os.path.join(root_dir, 'data', 'author_instructions', instructions_file)

    if not os.path.exists(file_path):
        raise FileNotFoundError(f"No {instruction_type} instructions found for author: {author_name}")

    with open(file_path, 'r', encoding='utf-8') as source:
        return source.read()


def _parse_rewritten_post(response_text: str, fallback_title: str, fallback_content: str) -> Dict[str, str]:
    """Split an LLM reply into its TITLE: and CONTENT: sections.

    A section that cannot be found is replaced by the corresponding fallback
    value, and a notice is printed so the substitution is visible.
    """
    try:
        title_match = re.search(r"TITLE:\s*(.*?)(?:\n\n|\n?CONTENT:)", response_text, re.DOTALL)
        content_match = re.search(r"CONTENT:\s*(.*?)$", response_text, re.DOTALL)
    except Exception as e:
        # e.g. the reply is not a string; keep the best-effort behaviour.
        print(f"Error parsing LLM response: {e}")
        return {"title": fallback_title, "content": fallback_content}

    missing = [
        label
        for label, match in (("TITLE", title_match), ("CONTENT", content_match))
        if match is None
    ]
    if missing:
        # Without this notice the caller would silently get the original
        # draft back and could mistake it for the rewritten post.
        print(
            f"LLM response is missing the {' and '.join(missing)} section; "
            "falling back to the draft value."
        )

    return {
        "title": title_match.group(1).strip() if title_match else fallback_title,
        "content": content_match.group(1).strip() if content_match else fallback_content,
    }


def apply_author_style(author_name: str, instruction_type: str, draft_title: str, draft_content: str, num_examples: int = 10) -> Dict[str, str]:
    """Rewrite a draft post in an author's style using the LLM pipeline.

    Loads the author's stored instructions and their top ``num_examples``
    posts, runs ``apply_instructions_pipeline`` on them together with the
    draft, parses the first reply for its TITLE:/CONTENT: sections, saves the
    result via ``save_post_instructions`` and returns it.

    Args:
        author_name (str): Author whose style should be applied.
        instruction_type (str): Which stored instructions to load.
        draft_title (str): Title of the draft; also the fallback title.
        draft_content (str): Body of the draft; also the fallback content.
        num_examples (int): Number of top posts passed as style examples.

    Returns:
        Dict[str, str]: ``{"title": ..., "content": ...}`` with the rewritten
        post, or the draft values for any section missing from the reply.

    Raises:
        FileNotFoundError: Propagated when the instructions file or the
            author's classified posts file does not exist.
    """
    instructions = load_instructions_from_disk(author_name, instruction_type)
    example_posts = read_top_posts(author_name, num_examples)

    result = apply_instructions_pipeline.run(data={
        "instructions": instructions,
        "example_posts": example_posts,
        "draft_title": draft_title,
        "draft_content": draft_content
    })
    response_text = result["llm"]["replies"][0]

    rewritten = _parse_rewritten_post(response_text, draft_title, draft_content)

    # Note: despite the name of the function called here, what gets persisted
    # is the rewritten post body, stored under the rewritten title.
    save_post_instructions(author_name, instruction_type, rewritten["title"], rewritten["content"])

    return rewritten

In [None]:
# Sample draft (title + body) used as input for the apply_author_style()
# call that follows. The body keeps its leading and trailing newline.
draft_title = "Building an AI-Powered Writing Style Mimic: A Technical Journey"
draft_content = """
Ever wondered how to programmatically capture and replicate someone's writing style? Here's how we built a tool that does exactly that, using WordPress.com API, LLMs, and Python.

The process works in four main steps:

1. Data Collection
   - Fetch author's posts via WordPress.com API
   - Collect engagement metrics (views, likes, comments)
   - Select top 50 posts based on engagement

2. Content Classification
   - Use local LLM to identify and filter confidential content
   - Ensure we only learn from publicly shareable writing
   - Store classified posts for further processing

3. Style Analysis
   - Feed non-confidential posts to Claude 3.5
   - Generate comprehensive writing style instructions
   - Capture patterns in tone, structure, and technical depth

4. Style Application
   - Take new draft content as input
   - Apply author's style instructions
   - Generate content that matches the author's voice

The entire pipeline is built in Python, using Haystack for LLM orchestration and modern API integrations. The result? A tool that can take any draft and rewrite it in the style of specific authors while maintaining the original message.

Want to try it yourself? The code is available on GitHub.
"""

# Rewrite the sample draft in the style of "author1", using that author's
# "writing" instructions and up to 20 example posts, then show the result.
updated_post = apply_author_style(
    "author1",
    "writing",
    draft_title,
    draft_content,
    num_examples=20,
)

print(f"Updated Title: {updated_post['title']}")
print("\nUpdated Content:", updated_post['content'], sep="\n")