In [1]:
import json
import os
import sys
import warnings

import numpy as np
import torch

# Add the project root to the Python path
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..'))
sys.path.append(project_root)

from src.utils.helper import load_env
from collections import Counter
from sentence_transformers import SentenceTransformer, util
from bs4 import BeautifulSoup

In [2]:
warnings.filterwarnings('ignore')
load_env()
ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY")
CLIENT_SECRET = os.getenv("WPCOM_CLIENT_SECRET")
ACCESS_TOKEN = os.getenv("WPCOM_ACCESS_TOKEN")

Found .env file at: /Users/firatgelbal/code/misc/a8c-data-blog/.env
load_dotenv() result: True


In [3]:
# Define WordPress.com API constants
AUTHORIZATION_BASE_URL = 'https://public-api.wordpress.com/oauth2/authorize'
TOKEN_URL = 'https://public-api.wordpress.com/oauth2/token'
CLIENT_ID = <CLIENT_ID>
REDIRECT_URI = <REDIRECT_URI>

site_url = "data.blog"

In [4]:
# Helper functions for data handling
def extract_names(metadata_dict):
    """
    Extracts the names of tags or categories from the metadata dictionary
    returned by the WordPress.com API.

    :param metadata_dict: Dictionary containing tag or category metadata
    :return: List of tag or category names
    """
    return list(metadata_dict.keys())

def load_posts(file_name='posts.json'):
    file_path = os.path.join('..', '..', 'data', file_name)
    with open(file_path, 'r') as f:
        return json.load(f)

def save_posts(posts, file_name='updated_posts.json'):
    file_path = os.path.join('..', '..', 'data', file_name)
    with open(file_path, 'w') as f:
        json.dump(posts, f, indent=2)

def get_sentence_transformer(model_name='all-MiniLM-L6-v2', cache_dir='./model_cache'):
   os.makedirs(cache_dir, exist_ok=True)
   cache_path = os.path.join(cache_dir, model_name.replace('/', '_'))

   if os.path.exists(cache_path):
       print(f"Loading model from cache: {cache_path}")
       model = SentenceTransformer(cache_path)
   else:
       print(f"Downloading and caching model: {model_name}")
       model = SentenceTransformer(model_name)
       model.save(cache_path)

   return model

In [5]:
posts = load_posts('validated_posts_with_llm_taxonomy.json')

In [7]:
# Load the embedding model
model_name = 'dunzhang/stella_en_1.5B_v5'
model = get_sentence_transformer(model_name)

Loading model from cache: ./model_cache/dunzhang_stella_en_1.5B_v5


Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 10.13it/s]


In [8]:
# Generate embeddings for each post
post_embeddings = []
for post in posts:
    content = post['content']
    # Remove HTML tags to get the text content
    soup = BeautifulSoup(content, 'html.parser')
    text_content = soup.get_text()
    # Generate embedding for the full text content
    embedding = model.encode(text_content, convert_to_tensor=True)
    post_embeddings.append(embedding)

# Stack embeddings into a tensor
embeddings_tensor = torch.stack(post_embeddings)

# Compute cosine similarity matrix
cosine_scores = util.pytorch_cos_sim(embeddings_tensor, embeddings_tensor)

# Convert cosine_scores to a NumPy array for easier handling
cosine_scores = cosine_scores.cpu().numpy()

# Exclude self-similarity by setting diagonal to -1
np.fill_diagonal(cosine_scores, -1)

We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


In [12]:
# Set a high similarity threshold to ensure meaningful links
similarity_threshold = 0.75  # Adjust as needed

# Prepare a dictionary to map post IDs to their index in the posts list
post_id_to_index = {post['ID']: idx for idx, post in enumerate(posts)}

# Initialize an empty set to keep track of unique pairs (to avoid duplicates)
similar_post_pairs = set()

# Iterate over each post to find similar posts
for idx, source_post in enumerate(posts):
    scores = cosine_scores[idx]
    # Get indices of posts above the similarity threshold
    similar_indices = np.where(scores > similarity_threshold)[0]
    for target_idx in similar_indices:
        target_post = posts[target_idx]
        # Skip if there is already an existing link from source to target
        source_content = source_post['content']
        target_url = target_post['URL']
        # Parse the HTML content to find existing links
        soup = BeautifulSoup(source_content, 'html.parser')
        existing_links = [a['href'] for a in soup.find_all('a', href=True)]
        if target_url in existing_links:
            continue  # Skip if link already exists
        # Create a unique pair identifier (e.g., tuple of sorted post IDs)
        pair = tuple(sorted((source_post['ID'], target_post['ID'])))
        if pair not in similar_post_pairs:
            similar_post_pairs.add(pair)

# Convert the set to a list of tuples for further processing
similar_post_pairs = list(similar_post_pairs)

# Prepare the list of pairs to pass to the LLM
# Each pair will include the source post and the target post
pairs_to_process = []
for pair in similar_post_pairs:
    source_id, target_id = pair
    source_post = posts[post_id_to_index[source_id]]
    target_post = posts[post_id_to_index[target_id]]
    pairs_to_process.append({
        'source_post': source_post,
        'target_post': target_post
    })


In [18]:
for pair in pairs_to_process:
    print(pair['source_post']['title'])
    print(pair['target_post']['title'])
    print('-'*100)


Women of Datamattic: Madison Swain-Bowden
Women of Datamattic | Menaka Sankaralingam
----------------------------------------------------------------------------------------------------
How&#8217;s Your New Year&#8217;s Resolution Going?
Do New Year&#8217;s Resolutions Work? Data Suggests They Do!
----------------------------------------------------------------------------------------------------
How Communication Density Fuels Automattic
Analysis of A Beautiful Storm: Internal Communication at Automattic
----------------------------------------------------------------------------------------------------
Analysis of A Beautiful Storm: Internal Communication at Automattic
Hack Project: Tackling FOMO on the P2 Land
----------------------------------------------------------------------------------------------------
Introducing pipe, The Automattic Machine Learning Pipeline
Building Thousands of Reproducible ML Models with pipe, the Automattic Machine Learning Pipeline
--------------------

In [19]:
from haystack import Pipeline
from haystack.components.builders import PromptBuilder
from haystack_integrations.components.generators.anthropic import AnthropicGenerator
from haystack.utils import Secret

In [33]:
prompt_template = """
You are an assistant that helps improve blog posts by adding relevant internal links. Given the content of two blog posts, insert a hyperlink from the first post to the second post in a way that is natural and enhances the reader's experience.

Instructions:
- Analyze the content of both posts.
- Find a suitable place in the first post to insert a link to the second post.
- Use an anchor text that is relevant to the second post's content.
- Repeat the exercise for the second post as well in order to generate link from second post to first post.
- Do not alter the original meaning of the posts.
- Ensure that you do not create any grammatical errors or awkward sentences.
- The posts content may contain HTML tags.

Source Post Title: {{source_post_title}}
Source Post Content: {{source_post_content}}
Source Post URL: {{source_post_url}}

Target Post Title: {{target_post_title}}
Target Post Content: {{target_post_content}}
Target Post URL: {{target_post_url}}

Please provide the modified post content with the new link inserted.
Desired format:
<a href="[URL of target post]">[Context for the link]</a>
    Explanation: [Your explanation here]
"""

prompt_builder = PromptBuilder(template=prompt_template)

In [34]:
generator = AnthropicGenerator(Secret.from_env_var("ANTHROPIC_API_KEY"), model="claude-3-5-sonnet-20240620")

pipeline = Pipeline()
pipeline.add_component(instance=prompt_builder, name="prompt_builder")
pipeline.add_component(instance=generator, name="llm")

pipeline.connect("prompt_builder", "llm")


<haystack.core.pipeline.pipeline.Pipeline object at 0x37d2036b0>
🚅 Components
  - prompt_builder: PromptBuilder
  - llm: AnthropicGenerator
🛤️ Connections
  - prompt_builder.prompt -> llm.prompt (str)

In [31]:
# Function to process a single pair
def process_pair(pair):
    source_post = pair['source_post']
    target_post = pair['target_post']

    result = pipeline.run({
        "prompt_builder": {
            "source_post_title": source_post['title'],
            "source_post_content": source_post['content'],
            "source_post_url": source_post['URL'],
            "target_post_title": target_post['title'],
            "target_post_content": target_post['content'],
            "target_post_url": target_post['URL']
        }
    })

    return result["llm"]["replies"][0]

In [35]:
# Process all pairs and store responses
llm_responses = []
for pair in pairs_to_process[:2]:
    response = process_pair(pair)
    print(pair['source_post']['title'])
    print(pair['target_post']['title'])
    print(response)
    print('-'*100)
    llm_responses.append({
        'source_post_id': pair['source_post']['ID'],
        'target_post_id': pair['target_post']['ID'],
        'llm_response': response
    })

Women of Datamattic: Madison Swain-Bowden
Women of Datamattic | Menaka Sankaralingam
Here are the modified post contents with new internal links inserted:

For the source post (Madison Swain-Bowden):

<p>My time at Automattic has been stellar—I've had the opportunity to interact with many wonderful folks, and to grow in areas I wasn't expecting. Openverse has been a tremendous learning experience for me on the infrastructure side. I've appreciated our team's emphasis on ensuring that the community can get involved, participate in discussions, and help shape the project. A number of contributors from the community have provided significant contributions and improvements! Being able to do all our work in the open has been a huge asset. <a href="http://data.blog/2023/03/22/women-of-datamattic-menaka-sankaralingam/">Like other Women of Datamattic, I've found Automattic to be very supportive of work-life balance and personal growth</a>.</p>

Explanation: This link connects Madison's positiv

In [36]:
for pair in pairs_to_process[2:]:
    response = process_pair(pair)
    print(pair['source_post']['title'])
    print(pair['target_post']['title'])
    print(response)
    print('-'*100)
    llm_responses.append({
        'source_post_id': pair['source_post']['ID'],
        'target_post_id': pair['target_post']['ID'],
        'llm_response': response,
        'source_post_url': pair['source_post']['URL'],
        'source_post_title': pair['source_post']['title'],
        'target_post_url': pair['target_post']['URL'],
        'target_post_title': pair['target_post']['title']
    })

# Save the LLM responses to a file
with open('llm_link_suggestions.json', 'w') as f:
    json.dump(llm_responses, f, indent=2)
print(f"Processed {len(llm_responses)} pairs and saved results to llm_link_suggestions.json")

How Communication Density Fuels Automattic
Analysis of A Beautiful Storm: Internal Communication at Automattic
Here's the modified content for the source post with a new link inserted:

<p>My colleague on the Data Science team, Boris, previously <a href="https://data.blog/2017/04/12/welcoming-new-colleagues-a-data-based-story/">wrote</a> about the GM for data.blog and showed (with data, of course) how important the GM is in developing and maintaining social ties.</p>

<p>During a series of analyses I did for the HR team about how Automatticians communicate with one another, I found out that<strong> the GM is also really good for our communication</strong>! <a href="http://data.blog/2018/02/01/analysis-of-a-beautiful-storm-internal-communication-at-automattic/">This in-depth analysis of our internal communication network</a> provides further insights into how information flows within our distributed company.</p>

Explanation: I've inserted a link to the target post in a relevant section