In [133]:
import json
import sys
import os
import requests
import warnings

# Add the project root to the Python path
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..'))
sys.path.append(project_root)

from src.utils.helper import load_env
from collections import Counter
from sentence_transformers import SentenceTransformer, util
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [94]:
# Load environment variables and API keys
warnings.filterwarnings('ignore')
load_env()
ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY")
CLIENT_SECRET = os.getenv("WPCOM_CLIENT_SECRET")
ACCESS_TOKEN = os.getenv("WPCOM_ACCESS_TOKEN")

Found .env file at: /Users/firatgelbal/code/misc/a8c-data-blog/.env
load_dotenv() result: True


In [62]:
# Define WordPress.com API constants
AUTHORIZATION_BASE_URL = 'https://public-api.wordpress.com/oauth2/authorize'
TOKEN_URL = 'https://public-api.wordpress.com/oauth2/token
CLIENT_ID = <CLIENT_ID>
REDIRECT_URI = <REDIRECT_URI>

site_url = "data.blog"

In [4]:
def get_posts(site_url, access_token, page_size=100, max_posts=1000):
    api_url = f"https://public-api.wordpress.com/rest/v1.1/sites/{site_url}/posts"

    headers = {
        "Authorization": f"Bearer {access_token}"
    }

    params = {
        "fields": "ID,title,content,categories,tags,URL,excerpt",
        "number": page_size,
        "page": 1
    }

    all_posts = []
    total_posts_checked = 0

    while True:
        try:
            response = requests.get(api_url, headers=headers, params=params)
            response.raise_for_status()

            data = response.json()
            posts = data.get('posts', [])

            if not posts:
                break  # No more posts to process

            for post in posts:
                total_posts_checked += 1
                all_posts.append({
                    'ID': post['ID'],
                    'title': post['title'],
                    'URL': post['URL'],
                    'content': post['content'],
                    'tags': post['tags'],
                    'categories': post['categories'],
                })

            if total_posts_checked >= max_posts:
                break  # Reached the maximum number of posts to check

            params['page'] += 1  # Move to the next page

        except requests.exceptions.RequestException as e:
            print(f"An error occurred: {e}")
            break

    return all_posts

In [5]:
# Helper functions for data handling
def extract_names(metadata_dict):
    """
    Extracts the names of tags or categories from the metadata dictionary
    returned by the WordPress.com API.

    :param metadata_dict: Dictionary containing tag or category metadata
    :return: List of tag or category names
    """
    return list(metadata_dict.keys())

def load_posts(file_name='posts.json'):
    file_path = os.path.join('..', '..', 'data', file_name)
    with open(file_path, 'r') as f:
        return json.load(f)

def save_posts(posts, file_name='updated_posts.json'):
    file_path = os.path.join('..', '..', 'data', file_name)
    with open(file_path, 'w') as f:
        json.dump(posts, f, indent=2)

def get_sentence_transformer(model_name='all-MiniLM-L6-v2', cache_dir='./model_cache'):
   os.makedirs(cache_dir, exist_ok=True)
   cache_path = os.path.join(cache_dir, model_name.replace('/', '_'))

   if os.path.exists(cache_path):
       print(f"Loading model from cache: {cache_path}")
       model = SentenceTransformer(cache_path)
   else:
       print(f"Downloading and caching model: {model_name}")
       model = SentenceTransformer(model_name)
       model.save(cache_path)

   return model

In [63]:
posts = get_posts(site_url, ACCESS_TOKEN)

In [64]:
existing_categories = []
existing_tags = []
for post in posts:
    for category in post['categories']:
        if category not in existing_categories:
            existing_categories.append(category)
    for tag in post['tags']:
        if tag not in existing_tags:
            existing_tags.append(tag)

In [9]:
def find_similar_tags(tags, threshold=0.7):
    if not tags:
        return {}

    # Create TF-IDF vectors for tags
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(tags)

    # Compute cosine similarity
    cosine_similarities = cosine_similarity(tfidf_matrix, tfidf_matrix)

    similar_tags = {}
    for i, tag in enumerate(tags):
        similar = [(tags[j], cosine_similarities[i][j])
                   for j in range(len(tags))
                   if i != j and cosine_similarities[i][j] > threshold]
        if similar:
            similar_tags[tag] = similar

    return similar_tags

In [10]:
similar_tags = find_similar_tags(existing_tags)
similar_tags

{'Data Science': [('science', 0.7915145702180151)],
 'data analytics': [('analytics', 0.8251239666050941)],
 'machine learning': [('machine learning models', 0.7840102790432525)],
 'analytics': [('data analytics', 0.8251239666050941)],
 'a/b testing': [('testing bias', 0.7071067811865476)],
 'marketing': [('marketing science', 0.7482020483410375)],
 'marketing science': [('marketing', 0.7482020483410375)],
 'bias': [('testing bias', 0.7071067811865476)],
 'gender bias': [('gender', 0.7424392663387555)],
 'data ethics': [('ethics', 0.8395851060963966)],
 'ethics': [('data ethics', 0.8395851060963966)],
 'testing bias': [('a/b testing', 0.7071067811865476),
  ('bias', 0.7071067811865476)],
 'algorithms': [('learning algorithms', 0.7664706291023498)],
 'analysis': [('data analysis', 0.7915145702180151)],
 'Deep Learning': [('deep learning architecture', 0.7936814833423467)],
 'gender': [('gender bias', 0.7424392663387555)],
 'science': [('Data Science', 0.7915145702180151)],
 'descriptive

In [12]:
#model = SentenceTransformer('all-MiniLM-L6-v2')
# Cache the model for future re-use
model = get_sentence_transformer()

Loading model from cache: ./model_cache/all-MiniLM-L6-v2


In [13]:
def suggest_tags_and_categories(post_content, existing_tags, existing_categories):
    # Encode post content
    post_embedding = model.encode(post_content, convert_to_tensor=True)

    # Encode existing tags and categories
    tag_embeddings = model.encode(existing_tags, convert_to_tensor=True)
    category_embeddings = model.encode(existing_categories, convert_to_tensor=True)

    # Compute similarities
    tag_similarities = util.pytorch_cos_sim(post_embedding, tag_embeddings)[0]
    category_similarities = util.pytorch_cos_sim(post_embedding, category_embeddings)[0]

    # Get top 3 tags and top category
    suggested_tags = [existing_tags[i] for i in tag_similarities.argsort(descending=True)[:3]]
    suggested_category = existing_categories[category_similarities.argmax().item()]

    return suggested_tags, suggested_category

In [32]:
def analyze_posts_and_collect_stats(posts, existing_tags, existing_categories, filter_tags=None, serialize=False):
    if filter_tags is None:
        filter_tags = ['Automattic', 'code', 'data', 'Google', 'charts']

    suggested_tag_stats = Counter()
    suggested_category_stats = Counter()
    posts_analyzed = 0
    posts_with_new_suggestions = set()  # Use a set to count unique posts

    # Serialize existing tags and categories
    original_taxonomy = []

    for post in posts:
        posts_analyzed += 1
        print(f"\nAnalyzing post: {post['title']}")

        # Extract current tags and categories
        current_tags = extract_names(post['tags'])
        current_category = extract_names(post['categories'])[0] if post['categories'] else None

        # Serialize original taxonomy
        original_taxonomy.append({
            'ID': post['ID'],
            'title': post['title'],
            'URL': post['URL'],
            'tags': current_tags,
            'category': current_category
        })

        # Suggest tags and categories
        suggested_tags, suggested_category = suggest_tags_and_categories(
            post['content'], existing_tags, existing_categories
        )

        # Filter out specific tags
        suggested_tags = [tag for tag in suggested_tags if tag not in filter_tags]

        # Add suggested tags and category to the post object
        post['suggested_tags'] = suggested_tags
        post['suggested_category'] = suggested_category

        # Determine if post needs update
        new_tag_suggestions = set(suggested_tags) - set(current_tags)
        category_changed = suggested_category != current_category
        post['needs_update'] = bool(new_tag_suggestions or category_changed)

        # Collect stats for suggested tags
        if new_tag_suggestions:
            posts_with_new_suggestions.add(post['ID'])
            for tag in new_tag_suggestions:
                suggested_tag_stats[tag] += 1

        # Collect stats for suggested category
        if category_changed:
            posts_with_new_suggestions.add(post['ID'])
            suggested_category_stats[suggested_category] += 1

        print(f"Current tags: {current_tags}")
        print(f"Suggested tags: {suggested_tags}")
        print(f"Current category: {current_category}")
        print(f"Suggested category: {suggested_category}")
        print(f"Needs update: {post['needs_update']}")

    if serialize:
        # Serialize original taxonomy
        with open('original_taxonomy.json', 'w') as f:
            json.dump(original_taxonomy, f, indent=2)

    return {
        'suggested_tag_stats': suggested_tag_stats,
        'suggested_category_stats': suggested_category_stats,
        'posts_analyzed': posts_analyzed,
        'posts_with_new_suggestions': len(posts_with_new_suggestions)
    }

def print_taxonomy_stats(stats):
    print("\n--- Taxonomy Suggestion Statistics ---")
    print(f"Total posts analyzed: {stats['posts_analyzed']}")
    print(f"Unique posts with new suggestions: {stats['posts_with_new_suggestions']}")

    print("\nTop 10 Suggested Tags:")
    for tag, count in stats['suggested_tag_stats'].most_common(10):
        print(f"  {tag}: {count} posts")

    print("\nSuggested Categories:")
    for category, count in stats['suggested_category_stats'].most_common():
        print(f"  {category}: {count} posts")

def suggest_taxonomy_improvements(stats, existing_tags, existing_categories, tag_threshold=5, max_tags=50, max_categories=10):
    print("\n--- Suggested Taxonomy Improvements ---")

    # Suggest new tags to add
    new_tags = [tag for tag, count in stats['suggested_tag_stats'].items() if count >= tag_threshold and tag not in existing_tags]
    if new_tags:
        print(f"\nSuggested new tags to add (appeared in at least {tag_threshold} posts):")
        for tag in new_tags[:max_tags - len(existing_tags)]:
            print(f"  {tag}: {stats['suggested_tag_stats'][tag]} posts")

    # Suggest tags to remove (not suggested for any posts)
    unused_tags = set(existing_tags) - set(stats['suggested_tag_stats'].keys())
    if unused_tags:
        print("\nTags that were never suggested (consider removing):")
        for tag in unused_tags:
            print(f"  {tag}")

    # Suggest category changes
    if len(stats['suggested_category_stats']) > len(existing_categories):
        print("\nSuggested category changes:")
        for category, count in stats['suggested_category_stats'].most_common(max_categories):
            if category not in existing_categories:
                print(f"  Consider adding: {category} (suggested for {count} posts)")

    # Overall recommendations
    print("\nOverall recommendations:")
    if len(existing_tags) + len(new_tags) > max_tags:
        print(f"  - Consider limiting the total number of tags to {max_tags}")
    if len(stats['suggested_category_stats']) > max_categories:
        print(f"  - Consider limiting the total number of categories to {max_categories}")

In [15]:
filter_tags = ['Automattic', 'code', 'data', 'Google', 'charts']
stats = analyze_posts_and_collect_stats(posts, existing_tags, existing_categories, filter_tags)
print_taxonomy_stats(stats)


Analyzing post: Data Talks &amp; Conferences Recommended by Automatticians
Current tags: ['Data careers', 'Data Science', 'distributed work', 'remote work']
Suggested tags: ['conference', 'Data careers']
Current category: Data at Automattic
Suggested category: Data at Automattic
Needs update: True

Analyzing post: What your neurodivergent colleagues wish you knew
Current tags: ['distributed work', 'remote work']
Suggested tags: ['diversity', 'demographic targeting', 'imposter syndrome']
Current category: Interview
Suggested category: Network Science
Needs update: True

Analyzing post: From Support to Data Science and Analytics: My Journey at Automattic
Current tags: ['data analytics', 'Data careers', 'Data Engineering', 'Data Science', 'tech']
Suggested tags: ['Data careers', 'data scientist', 'support']
Current category: Data at Automattic
Suggested category: Data Engineering
Needs update: True

Analyzing post: Hack Project: Creating a tool to translate customer feedback into product

In [16]:
suggest_taxonomy_improvements(stats, existing_tags, existing_categories)


--- Suggested Taxonomy Improvements ---

Tags that were never suggested (consider removing):
  Amazon
  science of work
  science
  Top Level Domains
  bots
  Git
  Magenta
  gender
  accountability
  Automattic
  charts
  LTSM
  chess
  New Year's resolutions
  gender bias
  code reviews
  Donald Trump
  net neutrality
  math
  learning algorithms
  causal inference
  fake news
  critical review
  influencer marketing
  hand-drawn graphics
  pipe
  work
  plugin repository
  Go
  models
  retention
  continuous integration
  descriptive data analysis
  Thorsten Dietzsch
  ethics
  marketing
  Honor
  P2
  Albert-László Barabási
  Hans Rosling
  Conda
  CircleCI
  Siraj Raval
  W.E.B Du Boise
  network analysis
  Harry Potter
  big data
  marketing science
  Kevin Ferguson
  Strava
  functional programming languages
  plot
  security
  MySQL
  databases
  Reddit
  software architecture
  pop lyrics
  twitter
  Spotify
  uplift modelling
  feature engineering
  Hive
  Google
  Java
  c

In [39]:
def analyze_taxonomy(posts):
    category_counter = Counter()
    tag_counter = Counter()

    for post in posts:
        category_counter.update([post['suggested_category']])
        tag_counter.update(post['suggested_tags'])

    print("Category usage:")
    for category, count in category_counter.most_common():
        print(f"  {category}: {count}")

    print("\nTag usage:")
    for tag, count in tag_counter.most_common():
        print(f"  {tag}: {count}")

    unused_categories = set(CATEGORIES) - set(category_counter.keys())
    unused_tags = set(TAGS) - set(tag_counter.keys())

    print("\nUnused categories:", unused_categories)
    print("Unused tags:", unused_tags)

In [None]:
save_posts(posts, 'original_posts.json')

In [42]:
# Updated taxonomy
CATEGORIES = [
    'Data Engineering', 'Data Science', 'Data Visualization', 'Distributed Work',
    'Experimentation and A/B Testing', 'Network Science', 'Search and Information Retrieval',
    'Machine Learning and AI'
]

TAGS = [
    'Data Science', 'Data Engineering', 'Data Visualization', 'Machine Learning',
    'Artificial Intelligence', 'Distributed Work', 'Remote Work', 'Data Analytics',
    'Tech Industry', 'Communication', 'Productivity', 'Automattic', 'WordPress',
    'Semantic Search', 'Diversity and Inclusion', 'Work-Life Balance', 'Career and Professional Development',
    'Books and Reading', 'Learning and Education', 'Data Discoverability', 'Automation',
    'Software Engineering', 'A/B Testing', 'Experimentation', 'Marketing Analytics',
    'Python', 'Data Speaker Series', 'Scientific Communication', 'Bias in AI', 'Data Ethics',
    'Best Practices', 'Causal Inference', 'Technology Trends',
    'Conferences and Events', 'Transparency in Data', 'Data Retention', 'Network Analysis',
    'Open Source', 'Time Series Analysis', 'Deep Learning', 'Surveys and Research Methods',
    'Data Products', 'Elasticsearch', 'Natural Language Processing', 'Meetup',
    'Information Retrieval', 'Social Media Analytics', 'Feature Engineering', 'Missing Data Handling',
    'Pipe', 'API'
]

In [135]:
# Load the model
new_model = get_sentence_transformer(model_name='dunzhang/stella_en_1.5B_v5')

# Encode categories and tags once
category_embeddings = new_model.encode(CATEGORIES, convert_to_tensor=True)
tag_embeddings = new_model.encode(TAGS, convert_to_tensor=True)

Downloading and caching model: dunzhang/stella_en_1.5B_v5


Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


In [137]:
updated_posts = []
for post in posts:
    # Suggest tags and categories
    suggested_tags, suggested_category = suggest_tags_and_categories(
        post['content'], TAGS, CATEGORIES
    )
    updated_posts.append({
            'ID': post['ID'],
            'title': post['title'],
            'URL': post['URL'],
            'categories': extract_names(post['categories']),
            'tags': extract_names(post['tags']),
            'suggested_category': post['suggested_category'],
            'suggested_tags': post['suggested_tags'],
    })

In [35]:
def assign_category_and_tags(post_content, num_tags=5):
    # Encode the post content
    post_embedding = model.encode(post_content, convert_to_tensor=True)

    # Find the most similar category
    category_similarities = util.pytorch_cos_sim(post_embedding, category_embeddings)[0]
    best_category_idx = category_similarities.argmax().item()
    assigned_category = CATEGORIES[best_category_idx]

    # Find the most similar tags
    tag_similarities = util.pytorch_cos_sim(post_embedding, tag_embeddings)[0]
    best_tag_idxs = tag_similarities.argsort(descending=True)[:num_tags]
    assigned_tags = [TAGS[idx] for idx in best_tag_idxs]

    return assigned_category, assigned_tags

def process_posts(posts):
    for post in posts:
        content = post['title'] + " " + post['content']  # Combine title and content for better context
        category, tags = assign_category_and_tags(content)
        post['suggested_category'] = category
        post['suggested_tags'] = tags
    return posts

In [65]:
updated_posts = process_posts(posts)

In [66]:
for post in updated_posts:
    print(f"Post ID: {post['ID']}")
    print(f"Title: {post['title']}")
    print(f"Suggested Category: {post['suggested_category']}")
    print(f"Suggested Tags: {', '.join(post['suggested_tags'])}")
    print()

Post ID: 3919
Title: Data Talks &amp; Conferences Recommended by Automatticians
Suggested Category: Data Science
Suggested Tags: Conferences and Events, Automattic, Data Science, Automation, Information Retrieval

Post ID: 3862
Title: What your neurodivergent colleagues wish you knew
Suggested Category: Network Science
Suggested Tags: Diversity and Inclusion, Automattic, Bias in AI, Artificial Intelligence, Automation

Post ID: 3856
Title: From Support to Data Science and Analytics: My Journey at Automattic
Suggested Category: Data Engineering
Suggested Tags: Data Analytics, Data Engineering, Feature Engineering, Data Visualization, Automation

Post ID: 3841
Title: Hack Project: Creating a tool to translate customer feedback into product insights
Suggested Category: Distributed Work
Suggested Tags: Distributed Work, Remote Work, Marketing Analytics, Social Media Analytics, Automation

Post ID: 3804
Title: Hack Project: Tackling FOMO on the P2 Land
Suggested Category: Network Science
Su

In [67]:
analyze_taxonomy(updated_posts)
save_posts(updated_posts)

Category usage:
  Data Science: 22
  Search and Information Retrieval: 16
  Data Visualization: 15
  Machine Learning and AI: 12
  Distributed Work: 11
  Network Science: 10
  Data Engineering: 7
  Experimentation and A/B Testing: 4

Tag usage:
  Data Ethics: 30
  Data Science: 28
  Automation: 26
  Social Media Analytics: 26
  Information Retrieval: 24
  Automattic: 22
  Data Retention: 21
  Data Visualization: 18
  Elasticsearch: 17
  Data Discoverability: 16
  Semantic Search: 15
  Data Analytics: 14
  Data Engineering: 14
  Transparency in Data: 14
  Distributed Work: 13
  WordPress: 12
  Artificial Intelligence: 11
  Bias in AI: 10
  Marketing Analytics: 10
  Deep Learning: 10
  Machine Learning: 10
  Feature Engineering: 9
  Technology Trends: 9
  Meetup: 8
  Remote Work: 7
  Surveys and Research Methods: 7
  Diversity and Inclusion: 6
  Work-Life Balance: 6
  Network Analysis: 6
  Data Speaker Series: 6
  Conferences and Events: 5
  Productivity: 5
  Experimentation: 5
  Softwar