In [34]:
import os
import json
import sys
from typing import List, Dict
import warnings

# Make the parent of the working directory importable so that the
# `utils.helper` import below can be resolved.
src_path = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
already_on_path = src_path in sys.path
if not already_on_path:
    sys.path.append(src_path)

from utils.helper import load_env

In [None]:
# Silence every Python warning from this point on so the notebook output
# stays readable. Comment this line out while debugging: it also hides
# deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

In [36]:
def read_posts_from_json(author_name: str, output_dir: str = "author_posts") -> List[Dict]:
    """
    Load an author's posts from ``<project_root>/data/<output_dir>/<author_name>.json``.

    The project root is resolved as two directory levels above the current
    working directory.

    Args:
        author_name (str): Base name (without ``.json``) of the file to load.
        output_dir (str): Sub-directory of ``data`` that holds the file.

    Returns:
        List[Dict]: The decoded JSON content of the file.

    Raises:
        FileNotFoundError: If the resolved path does not exist.
    """
    two_levels_up = os.path.join(os.getcwd(), '..', '..')
    json_filename = os.path.join(
        os.path.abspath(two_levels_up), 'data', output_dir, f"{author_name}.json"
    )

    # Happy path first: read and decode the file when it is there.
    if os.path.exists(json_filename):
        with open(json_filename, 'r', encoding='utf-8') as source:
            return json.load(source)

    raise FileNotFoundError(f"The file {json_filename} does not exist.")

def save_posts_to_json(posts: List[Dict], author_name: str, output_dir: str = "author_posts") -> None:
    """
    Save posts to ``<project_root>/data/<output_dir>/<author_name>.json``.

    The project root is resolved as two directory levels above the current
    working directory. The data is first written to a temporary sibling file
    and then moved over the target with ``os.replace``, so an interrupted or
    failed write (kernel interrupt, non-serializable value) never leaves a
    truncated JSON file in place of the existing data.

    Args:
        posts (List[Dict]): The list of post dictionaries to save.
        author_name (str): The name of the author (file base name).
        output_dir (str): The name of the output directory under ``data``.

    Raises:
        TypeError: If ``posts`` contains a value that is not JSON serializable.
        OSError: If the file cannot be written.
    """
    project_root = os.path.abspath(os.path.join(os.getcwd(), '..', '..'))
    data_dir = os.path.join(project_root, 'data', output_dir)
    # Create the directory if needed, as save_classified_posts does.
    os.makedirs(data_dir, exist_ok=True)
    json_filename = os.path.join(data_dir, f"{author_name}.json")

    # Same directory as the target so os.replace stays on one filesystem.
    tmp_filename = f"{json_filename}.tmp"
    try:
        with open(tmp_filename, 'w', encoding='utf-8') as f:
            json.dump(posts, f, ensure_ascii=False, indent=2)
        os.replace(tmp_filename, json_filename)
    except BaseException:
        # Also covers KeyboardInterrupt: drop the partial temp file and
        # leave the previous version of the target untouched.
        if os.path.exists(tmp_filename):
            os.remove(tmp_filename)
        raise

def save_classified_posts(posts: List[Dict], author_name: str, output_dir: str = "classified_posts"):
    """
    Write the classified, published subset of ``posts`` to its own JSON file.

    The file is ``<project_root>/data/<output_dir>/<author_name>_classified.json``,
    where the project root is two directory levels above the current working
    directory. The target directory is created when missing.

    Args:
        posts (List[Dict]): All post dictionaries for the author.
        author_name (str): The name of the author (file base name).
        output_dir (str): The name of the output directory under ``data``.
    """
    def _classified_and_published(entry: Dict) -> bool:
        # Keep posts that carry a 'confidential' key and are published.
        return 'confidential' in entry and entry.get('status') == 'publish'

    target_dir = os.path.join(
        os.path.abspath(os.path.join(os.getcwd(), '..', '..')), 'data', output_dir
    )
    os.makedirs(target_dir, exist_ok=True)

    selected = list(filter(_classified_and_published, posts))

    destination = os.path.join(target_dir, f"{author_name}_classified.json")
    with open(destination, 'w', encoding='utf-8') as sink:
        json.dump(selected, sink, ensure_ascii=False, indent=2)

def get_processed_post_ids(posts: List[Dict]) -> set:
    """
    Collect the IDs of posts that already carry a classification.

    A post counts as classified when it has a 'confidential' key,
    whatever the value stored under it.

    Args:
        posts (List[Dict]): The list of post dictionaries.

    Returns:
        set: The 'post_ID' values of all classified posts.
    """
    classified_ids = set()
    for entry in posts:
        if 'confidential' in entry:
            classified_ids.add(entry['post_ID'])
    return classified_ids


In [37]:
from haystack import Pipeline
from haystack.components.builders import PromptBuilder
from haystack_integrations.components.generators.ollama import OllamaGenerator
from haystack.utils import Secret

In [38]:
from pydantic import BaseModel

# Shape of the JSON object the LLM is asked to return for a single post.
# Its JSON schema is rendered into the prompt and OutputValidator validates
# replies against it.
# NOTE(review): documented with comments rather than a class docstring on
# purpose; pydantic appears to copy a model's docstring into the generated
# schema, which would change the prompt text. Confirm before adding one.
class ConfidentialityClassification(BaseModel):
    is_confidential: bool  # stored on the post as post['confidential']
    reason: str  # stored on the post as post['confidentiality_reason']

In [39]:
import pydantic
from pydantic import ValidationError
from typing import Optional
from colorama import Fore
from haystack import component

@component
class OutputValidator:
    """
    Haystack component that validates the first LLM reply against a Pydantic model.

    A reply is valid when it is a JSON object that the model accepts. Valid
    replies are emitted on ``valid_replies``; anything else is emitted on
    ``invalid_replies`` together with an ``error_message`` so the pipeline can
    loop back and ask the LLM to correct itself.
    """

    def __init__(self, pydantic_model: pydantic.BaseModel):
        """
        Args:
            pydantic_model: The Pydantic model class replies are validated
                against (its ``parse_obj`` classmethod is used).
        """
        self.pydantic_model = pydantic_model
        # Number of times run() has been called; only used in log messages.
        self.iteration_counter = 0

    @component.output_types(valid_replies=List[str], invalid_replies=Optional[List[str]], error_message=Optional[str])
    def run(self, replies: List[str]):
        """
        Validate ``replies[0]``.

        Args:
            replies: Replies produced by the LLM; only the first is checked.

        Returns:
            ``{"valid_replies": [reply]}`` when the reply parses and validates,
            otherwise ``{"invalid_replies": replies, "error_message": str}``.
        """
        self.iteration_counter += 1
        # Guard against an empty reply list so the log below cannot IndexError.
        raw_reply = replies[0] if replies else None

        try:
            if raw_reply is None:
                raise ValueError("The LLM returned no reply.")
            output_dict = json.loads(raw_reply)
            # Valid JSON that is not an object (list, number, null, ...) can
            # never match the model; report it as a normal validation failure
            # instead of crashing on the membership test below.
            if not isinstance(output_dict, dict):
                raise ValueError(
                    f"Expected a JSON object (dict) but got {type(output_dict).__name__}."
                )
            # My local model sometimes returns a 'properties' wrapper. But the content adheres to the schema.
            if 'properties' in output_dict:
                output_dict = output_dict['properties']
            self.pydantic_model.parse_obj(output_dict)
            # Fore.RESET restores the default colour so later notebook output
            # is not tinted by this message.
            print(
                Fore.GREEN
                + f"OutputValidator at Iteration {self.iteration_counter}: Valid JSON from LLM - No need for looping: {raw_reply}"
                + Fore.RESET
            )
            return {"valid_replies": [raw_reply]}

        except (ValueError, TypeError, ValidationError) as e:
            # ValueError covers json.JSONDecodeError; TypeError covers
            # non-string replies passed to json.loads.
            print(
                Fore.RED
                + f"OutputValidator at Iteration {self.iteration_counter}: Invalid JSON from LLM - Let's try again.\n"
                f"Output from LLM:\n {raw_reply} \n"
                f"Error from OutputValidator: {e}"
                + Fore.RESET
            )
            return {"invalid_replies": replies, "error_message": str(e)}

In [40]:
# Original prompt template, kept for reference.
# NOTE: the next cell reassigns both `prompt_template` and `prompt_builder`,
# so when the notebook is run top to bottom this version is NOT the one the
# pipeline uses.
# Template variables: title, content, schema, and the optional
# invalid_replies / error_message pair that adds the retry section.
prompt_template = """
Analyze the following blog post and determine if it contains any confidential or sensitive information that should not be shared online. Create a complete, valid JSON object with your classification and reasoning.

Blog post:
Title: {{title}}
Content: {{content}}

Consider the following types of sensitive information:
1. Personal identifiable information (PII)
2. Financial data
3. Trade secrets or proprietary information
4. Unreleased product details
5. Internal company strategies or plans
6. Confidential client information
7. Sensitive health information
8. Login credentials or access codes

Follow this JSON schema, but only return the actual instances without any additional schema definition:
{{schema}}

Make sure your response is a complete, valid JSON object (dict) and not a list. Ensure all opening brackets, braces, and quotes have matching closing ones.

{% if invalid_replies and error_message %}
  You already created the following output in a previous attempt: {{invalid_replies}}
  However, this doesn't comply with the format requirements from above and triggered this Python exception: {{error_message}}
  Correct the output and try again. Just return the corrected output without any extra explanations.
{% endif %}
"""

prompt_builder = PromptBuilder(template=prompt_template)

In [41]:
# Latest prompt template. It replaces the `prompt_template` and
# `prompt_builder` defined in the previous cell, so this is the version that
# gets added to the pipeline below.
# Template variables: title, content, schema, and the optional
# invalid_replies / error_message pair. The retry section at the bottom is
# only rendered when both of those are set.
prompt_template = """
Analyze the following blog post and determine if it contains any confidential or sensitive information that should not be shared online. Create a complete, valid JSON object with your classification and reasoning.

Blog post:
Title: {{title}}
Content: {{content}}

Guidelines for classification:
1. Confidential information includes:
   - Personal identifiable information (PII) such as full names, addresses, phone numbers, or email addresses of individuals who are not public figures
   - Financial data such as account numbers, credit card details, or specific salary information
   - Trade secrets or proprietary information about unreleased products or services
   - Internal company strategies or plans not meant for public disclosure
   - Confidential client information or details about private business relationships
   - Sensitive health information about specific individuals
   - Login credentials, access codes, or security vulnerabilities

2. Non-confidential information includes:
   - General knowledge or publicly available information
   - Personal opinions or experiences that don't reveal sensitive details
   - Information about released products or services
   - General industry trends or publicly known business strategies
   - Public figures' names or publicly known information about them
   - General health advice or information not tied to specific individuals
   - Technical information that is already in the public domain

3. When in doubt, lean towards classifying the post as non-confidential unless there is clear sensitive information.

4. Consider the context of the blog post. Information that might be sensitive in one context could be non-confidential in another.

Follow this JSON schema for your response:
{{schema}}

Make sure your response is a complete, valid JSON object (dict) and not a list. Ensure all opening brackets, braces, and quotes have matching closing ones.

{% if invalid_replies and error_message %}
  You already created the following output in a previous attempt: {{invalid_replies}}
  However, this doesn't comply with the format requirements from above and triggered this Python exception: {{error_message}}
  Correct the output and try again. Just return the corrected output without any extra explanations.
{% endif %}
"""

prompt_builder = PromptBuilder(template=prompt_template)

In [None]:
# Validator that checks each LLM reply against ConfidentialityClassification
output_validator = OutputValidator(pydantic_model=ConfidentialityClassification)

# Generator backed by the Ollama server on localhost, using the qwen2.5 model
generator = OllamaGenerator(
    model="qwen2.5",
    url="http://localhost:11434",
    generation_kwargs={"num_predict": 100, "temperature": 0.7},
)

pipeline = Pipeline(max_loops_allowed=3)

# Register the three stages under the names used by the connections below
pipeline.add_component(name="prompt_builder", instance=prompt_builder)
pipeline.add_component(name="llm", instance=generator)
pipeline.add_component(name="output_validator", instance=output_validator)

# Forward path: prompt -> LLM -> validator
pipeline.connect("prompt_builder", "llm")
pipeline.connect("llm", "output_validator")

# Feedback path: a rejected reply and its error go back into the prompt
pipeline.connect("output_validator.invalid_replies", "prompt_builder.invalid_replies")
pipeline.connect("output_validator.error_message", "prompt_builder.error_message")

pipeline.show()

In [53]:
def post_score(post):
    """
    Compute a weighted engagement score for one post.

    The score is ``like_count * 1 + views * 0.1 + comment_count * 2``;
    a missing counter is treated as 0.

    Args:
        post: Post dictionary; 'like_count', 'views' and 'comment_count'
            are read when present.

    Returns:
        The weighted sum of the three counters.
    """
    like_weight, view_weight, comment_weight = 1, 0.1, 2

    likes_term = post.get('like_count', 0) * like_weight
    views_term = post.get('views', 0) * view_weight
    comments_term = post.get('comment_count', 0) * comment_weight

    return likes_term + views_term + comments_term

def get_top_posts(posts: List[Dict], n: int = 50) -> List[Dict]:
    """
    Select the N most engaging posts that are still eligible for processing.

    A post is eligible when its status is 'publish' and its 'confidential'
    value is either absent/None or equal to False. Eligible posts are ranked
    by ``post_score``, highest first.

    Args:
        posts (List[Dict]): List of post dictionaries.
        n (int): Number of top posts to return. Default is 50.

    Returns:
        List[Dict]: At most N eligible posts in descending score order.
    """
    def _is_eligible(entry: Dict) -> bool:
        is_published = entry.get('status') == 'publish'
        if not is_published:
            return False
        flag = entry.get('confidential')
        return flag is None or flag == False

    ranked = [entry for entry in posts if _is_eligible(entry)]
    # Stable sort, highest engagement first.
    ranked.sort(key=post_score, reverse=True)

    return ranked[:n]

def process_author_posts(author_name: str):
    """
    Classify the top posts of one author and persist the results.

    Loads the author's posts, picks the top 50 eligible ones, and runs each
    post that has no classification yet through the pipeline. The verdict is
    stored on the post as 'confidential' / 'confidentiality_reason'. When no
    valid reply is produced, the post is marked 'cannot_classify'. Both
    output files are rewritten after every processed post, so progress
    survives an interrupted run.

    Args:
        author_name (str): The name of the author whose posts are processed.

    Returns:
        List[Dict]: All posts of the author, including the updated ones.
    """
    print(f"\nProcessing posts for author: {author_name}")
    posts = read_posts_from_json(author_name)
    processed_ids = get_processed_post_ids(posts)
    top_posts = get_top_posts(posts, n=50)

    # The schema is identical for every post, so render it once.
    schema = ConfidentialityClassification.schema_json()

    for i, post in enumerate(top_posts, start=1):
        if post['post_ID'] in processed_ids:
            print(f"Skipping already processed post {i}/{len(top_posts)}: {post['title']}")
            continue

        print(f"Processing post {i}/{len(top_posts)}: {post['title']}")

        input_dict = {
            "title": post["title"],
            "content": post["content"],
            "schema": schema
        }

        result = pipeline.run(data=input_dict)

        # Use .get() so a result without 'valid_replies' reaches the failure
        # branch below instead of raising KeyError.
        # NOTE(review): depending on the Haystack version, exceeding the loop
        # limit may raise from pipeline.run() instead of returning; confirm.
        valid_replies = result.get("output_validator", {}).get("valid_replies")

        if valid_replies:
            classification = json.loads(valid_replies[0])
            # My local model sometimes returns a 'properties' wrapper. But the content adheres to the schema.
            if 'properties' in classification:
                classification = classification['properties']
            print(f"Confidentiality classification: {classification}")
            post['confidential'] = classification['is_confidential']
            post['confidentiality_reason'] = classification['reason']
        else:
            print("Failed to generate valid classification after maximum iterations.")
            post['confidential'] = 'cannot_classify'
            post['confidentiality_reason'] = 'Classification failed'

        # Save the updated posts after each classification
        save_posts_to_json(posts, author_name)
        save_classified_posts(posts, author_name)

    print(f"Processing completed for author: {author_name}")
    return posts

In [45]:
from IPython.display import display, HTML
# Function to create HTML for a group of posts
def create_group_html(title, color, urls):
    """
    Build the HTML fragment for one group of post links.

    All interpolated values are HTML-escaped. The URLs come from data files,
    and an unescaped quote, ampersand or angle bracket would break out of the
    single-quoted ``href`` attribute or inject markup into the notebook output.

    Args:
        title: Heading text for the group.
        color: CSS colour used for the heading.
        urls: Iterable of URLs to list; empty or None renders a placeholder.

    Returns:
        str: An ``<h3>`` heading followed by a ``<ul>`` of links, or by a
        "No posts in this category." paragraph when there are no URLs.
    """
    # Local import: the html module is not imported at file level.
    from html import escape

    parts = [f"<h3 style='color: {escape(str(color))};'>{escape(str(title))}</h3>"]
    if urls:
        parts.append("<ul>")
        for url in urls:
            # escape() also encodes both quote characters by default.
            safe_url = escape(str(url))
            parts.append(f"<li><a href='{safe_url}'>{safe_url}</a></li>")
        parts.append("</ul>")
    else:
        parts.append("<p>No posts in this category.</p>")
    return "".join(parts)

def display_results(all_posts: List[Dict]):
    """
    Render the classified posts grouped by verdict and print a summary.

    Posts without a 'confidential' key are ignored. The others are grouped by
    that value: equal to True, equal to False, or anything else
    (unclassified). Each group is shown as a list of post URLs.

    Args:
        all_posts (List[Dict]): Post dictionaries to summarise.
    """
    confidential_posts, non_confidential_posts, unclassified_posts = [], [], []

    for post in all_posts:
        if 'confidential' not in post:
            continue

        url = post.get('URL', 'No URL available')
        verdict = post['confidential']
        # Equality (not identity) on purpose, matching the stored values.
        if verdict == True:
            bucket = confidential_posts
        elif verdict == False:
            bucket = non_confidential_posts
        else:
            bucket = unclassified_posts
        bucket.append(url)

    sections = (
        ("Confidential Posts", "red", confidential_posts),
        ("Non-Confidential Posts", "green", non_confidential_posts),
        ("Unclassified Posts", "orange", unclassified_posts),
    )

    result_html = "".join(
        create_group_html(label, colour, bucket) for label, colour, bucket in sections
    )
    display(HTML(result_html))

    print("\nSummary:")
    for label, _colour, bucket in sections:
        print(f"{label}: {len(bucket)}")

In [None]:
# Authors to process: author1 ... author5
authors = [f"author{index}" for index in range(1, 6)]

# Process each author in turn and pool every returned post for the summary
all_processed_posts = []
for author in authors:
    processed_posts = process_author_posts(author)
    all_processed_posts += processed_posts

display_results(all_processed_posts)