In [14]:
import os
import json
from tqdm import tqdm

import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min

In [2]:
# Load the article titles and their embedding matrix. The cells below index
# `labels` and `embeddings` with the same integers, so row i of the matrix is
# presumably the vector for labels[i] — TODO confirm against the export script.
# JSON is read as UTF-8 explicitly (matching how this notebook writes its JSON
# outputs) so non-ASCII titles do not depend on the platform's default encoding.
with open("../../data/wikipedia_pageviews/articles.json", encoding="utf-8") as f:
    labels = json.load(f)
# The matrix is cast to half precision right after loading (presumably to keep
# it small in memory).
embeddings = np.load("../../data/wikipedia_pageviews/embeddings.npy").astype(np.float16)

In [3]:
# Sanity check: (number of rows, embedding dimension) of the loaded matrix.
embeddings.shape

(607692, 1536)

In [54]:
# Load the hand-curated seed topics; the context manager closes the file
# instead of leaving the handle to the garbage collector.
with open("../../data/input/topics_manual.json", encoding="utf-8") as f:
    known_positive_labels = json.load(f)

# Build a title -> first row position table once. `setdefault` keeps the first
# occurrence of a duplicated title, which is what `labels.index` would return,
# while avoiding two linear scans of `labels` per seed topic.
label_to_index = {}
for position, title in enumerate(labels):
    label_to_index.setdefault(title, position)

# Seeds with no matching article title are dropped; the two lists stay aligned.
known_positive_labels = [label for label in known_positive_labels if label in label_to_index]
known_positive_indices = [label_to_index[label] for label in known_positive_labels]

In [8]:
# Partition the embedding rows into 16 clusters. `random_state` is pinned so
# the k-means++ seeding is repeatable between runs.
kmeans = KMeans(init="k-means++", n_clusters=16, random_state=42)
kmeans.fit(embeddings)

In [36]:
# Centroids learned by the fit, one row per cluster
cluster_centers = kmeans.cluster_centers_

# Cluster id the fit assigned to each row of `embeddings`
labels_predicted = kmeans.labels_

In [46]:
# For every cluster that contains at least one seed topic, print the seeds that
# fell into it, then the 10 articles nearest to the centroid. Those nearest
# titles are accumulated in `list_of_closest` for manual review.
list_of_closest = []

for cluster_id in range(kmeans.n_clusters):
    # Positions (into the seed lists) of the seeds assigned to this cluster
    seed_positions = np.flatnonzero(labels_predicted[known_positive_indices] == cluster_id)

    # Clusters without any seed topic are not reported at all
    if seed_positions.size == 0:
        continue

    print("Cluster", cluster_id)

    for position in seed_positions:
        print("    ", known_positive_labels[position])

    # Rows of the full dataset that belong to this cluster
    member_indices = np.flatnonzero(labels_predicted == cluster_id)

    # Euclidean distance of every member to the centroid
    offsets = embeddings[member_indices] - cluster_centers[cluster_id]
    distances = np.linalg.norm(offsets, axis=1)

    # Map the 10 smallest distances back to dataset row indices
    nearest_members = member_indices[np.argsort(distances)[:10]]

    print(" - Closest to cluster center:")
    nearest_titles = [labels[row] for row in nearest_members]
    for title in nearest_titles:
        print("    ", title)
    list_of_closest.extend(nearest_titles)


Cluster 0
     Fashion
     Pencil
     Reptile
     Tundra
     Clay
     Frog
     Yarn
     Pinniped
     Fly
     Cucumber
     Sunglasses
     Chimney
     Toaster
     Carrot
     Dove
     Turkey
     Cheese
     Archipelago
     Ant
     Beetle
     Cactus
     Door
     Hammer
     Hurricane
     Leather
     Rice
     Hedgehog
     Skunk
     Reef
     Mug
     Flower
     Tulip
     Fossil
     Forest
     Glass
     Koala
     Mushroom
     Rabbit
     Skiing
     Squirrel
     Train
     Turtle
     Fence
     Fjord
     Shark
     Rainforest
     Xylophone
     Steppe
     Waterfall
     Starfish
     Mars
     Sand
     Fungi
     Glacier
     Moon
     Roof
     Silver
     Swimming
     Tiger
     Wolf
     Wheat
     Duck
     Crayon
     Dam
     Snail
     Vase
     Milky Way
     Zoo
     Tomato
     Painting
     Pork
     Bronze
     Marble
     Quill
     Shoe
     Volcano
     Whale
     Cotton
     Lion
     Beef
     Hippopotamus
     Bamboo
     Falcon
     

## Non-clustering approach

This turned out not to be very useful: looking up the nearest neighbours of each seed topic returns overly specific articles. For example:

**Brazil -->**
- German Brazilians
- States of Brazil
- Brazil, Indiana
- Brazil of Hope
- State of Brazil
- Brazil (disambiguation)
- Brazil nut
- Brazilians
- Brazil Union
- Brazil Current

In [20]:
from scipy.spatial import KDTree
import numpy as np

def find_positive_by_distance(embeddings, known_positive_indices, tolerance):
    """Collect every row within ``tolerance`` of at least one known-positive row.

    Distances are the KD-tree's default (Euclidean) metric.

    NOTE: a later cell in this notebook defines another function with the same
    name; whichever cell was executed last is the one that gets called.

    Args:
        embeddings: 2-D array-like, one vector per row.
        known_positive_indices: iterable of row indices used as query points.
        tolerance: search radius around each query point.

    Returns:
        Sorted 1-D integer array of the unique matching row indices. For a
        non-negative ``tolerance`` each query row matches itself (distance 0).
        The array is integer-typed even when empty, so it can always be used
        to index ``embeddings``.
    """
    tree = KDTree(embeddings)  # Built once, queried once per seed
    positive_indices = set()

    for pos_idx in tqdm(known_positive_indices):
        # All rows inside the ball of radius `tolerance` around this seed
        positive_indices.update(tree.query_ball_point(embeddings[pos_idx], r=tolerance))

    # Sorting makes the result deterministic; the explicit dtype keeps an empty
    # result from degrading to a float64 array.
    return np.array(sorted(positive_indices), dtype=np.intp)

def find_positive_by_k_nearest(embeddings, known_positive_indices, k):
    """Collect the ``k`` nearest neighbours of every known-positive row.

    Distances are the KD-tree's default (Euclidean) metric.

    NOTE: a later cell in this notebook defines another function with the same
    name; whichever cell was executed last is the one that gets called.

    Args:
        embeddings: 2-D array-like, one vector per row.
        known_positive_indices: iterable of row indices used as query points.
        k: number of neighbours to keep per query point (the query row itself
            is never counted as its own neighbour).

    Returns:
        Sorted 1-D integer array of the unique neighbour row indices. A seed
        can still appear in the result as the neighbour of a different seed.
    """
    n_points = len(embeddings)
    tree = KDTree(embeddings)  # Built once, queried once per seed
    positive_indices = set()

    for pos_idx in tqdm(known_positive_indices):
        # Ask for one extra neighbour because the seed itself is among the hits
        _, nearest_indices = tree.query(embeddings[pos_idx], k=k + 1)
        # With k + 1 == 1 the query returns a scalar rather than an array
        nearest_indices = np.atleast_1d(nearest_indices)

        # KDTree pads missing neighbours with the out-of-range index `n_points`
        # when fewer than k + 1 rows exist; drop that padding. Exclude the seed
        # by identity rather than by position, since a duplicate vector can
        # rank ahead of it at distance 0.
        neighbours = [
            int(i) for i in nearest_indices if i < n_points and i != pos_idx
        ]
        positive_indices.update(neighbours[:k])

    # Sorted and integer-typed so the result is deterministic and indexable
    return np.array(sorted(positive_indices), dtype=np.intp)


In [31]:
import numpy as np

def _row_norms(embeddings):
    """Return the Euclidean norm of each row, with zero norms replaced by 1."""
    # Accumulate in at least float32 so half-precision inputs neither overflow
    # nor lose most of their precision while summing squares.
    acc_dtype = np.result_type(embeddings.dtype, np.float32)
    norms = np.sqrt(np.einsum("ij,ij->i", embeddings, embeddings, dtype=acc_dtype))
    # An all-zero row has no direction; dividing by 1 leaves its similarity at 0
    norms[norms == 0] = 1
    return norms


def find_positive_by_distance(embeddings, known_positive_indices, tolerance):
    """Collect rows whose cosine similarity to any seed exceeds ``1 - tolerance``.

    This redefines the KD-tree based function of the same name from an earlier
    cell. Dot products are divided by the row norms, so the threshold really is
    a cosine similarity even when the vectors are not unit length.

    Args:
        embeddings: 2-D array, one vector per row.
        known_positive_indices: iterable of seed row indices.
        tolerance: a row is kept when its cosine similarity to a seed is
            strictly greater than ``1 - tolerance``.

    Returns:
        Sorted 1-D integer array of the unique matching row indices (integer
        typed even when empty).
    """
    embeddings = np.asarray(embeddings)
    norms = _row_norms(embeddings)
    positive_indices = set()

    for pos_idx in tqdm(known_positive_indices):
        # Cosine similarity of every row to this seed
        dots = np.dot(embeddings, embeddings[pos_idx])
        cosine_similarities = dots / (norms * norms[pos_idx])

        close_indices = np.flatnonzero(cosine_similarities > (1 - tolerance))
        positive_indices.update(close_indices.tolist())

    return np.array(sorted(positive_indices), dtype=np.intp)


def find_positive_by_k_nearest(embeddings, known_positive_indices, k, verbose=True):
    """Collect the ``k`` rows most cosine-similar to each seed.

    This redefines the KD-tree based function of the same name from an earlier
    cell.

    Args:
        embeddings: 2-D array, one vector per row.
        known_positive_indices: iterable of seed row indices.
        k: number of neighbours to keep per seed; the seed row itself is
            excluded explicitly instead of assuming it ranks first.
        verbose: when true (the default, matching the previous behaviour),
            print the mean similarity and the neighbour titles for each seed.
            The titles are read from the notebook-level ``labels`` list.

    Returns:
        Sorted 1-D integer array of the unique neighbour row indices (integer
        typed even when empty).
    """
    embeddings = np.asarray(embeddings)
    norms = _row_norms(embeddings)
    positive_indices = set()

    for pos_idx in tqdm(known_positive_indices):
        dots = np.dot(embeddings, embeddings[pos_idx])
        cosine_similarities = dots / (norms * norms[pos_idx])

        # Ascending ranking with the seed removed wherever it landed
        ranked = np.argsort(cosine_similarities)
        ranked = ranked[ranked != pos_idx]
        # `ranked[-0:]` would return everything, so k == 0 is handled apart
        nearest_indices = ranked[-k:] if k > 0 else ranked[:0]

        if verbose:
            if nearest_indices.size:
                print("Average cosine similarity:", np.mean(cosine_similarities[nearest_indices]))
            for idx in nearest_indices:
                print(labels[idx])

        positive_indices.update(nearest_indices.tolist())

    return np.array(sorted(positive_indices), dtype=np.intp)


In [34]:
# Expand the seed set with the threshold search. With tolerance 0.7 the
# similarity-based definition keeps rows scoring above 1 - 0.7 = 0.3 against
# at least one seed.
positive_indices = find_positive_by_distance(embeddings, known_positive_indices, 0.7)

In [26]:
# Number of rows selected by the threshold search.
# NOTE(review): this cell's execution count (26) is lower than that of the call
# that produces `positive_indices` (34), so the recorded value may be stale.
len(positive_indices)

867

In [33]:
# Expand the seed set with the 10 nearest rows of every seed.
positive_indices_nearest = find_positive_by_k_nearest(embeddings, known_positive_indices, k=10)

In [24]:
# Number of rows selected by the k-nearest search. Needs
# `positive_indices_nearest` from the previous cell; the recorded NameError
# shows this cell was last run in a kernel where that variable did not exist.
len(positive_indices_nearest)

NameError: name 'positive_indices_nearest' is not defined

## Active Learning

In [47]:
# Dump the titles collected around the cluster centres so they can be reviewed
# by hand. The negative list in the next cell appears to be this output with a
# few acceptable titles removed — TODO confirm.
print(list_of_closest)

['Ring', 'Cabbit', 'Camel', 'Sand cat', 'Crab', 'Meal', 'Bushpig', 'Mink', 'Cameltoe', 'Bassaricyon', 'Don Republic', 'Resolution 181', 'Proclamation 4483', 'Forward Party (United States)', 'American Solidarity Party', 'Congress Poland', 'Homeland Solidarity Party', 'Unitary Democratic Coalition', 'SS United States', 'Proclamation 10043', 'Michelle Sol', 'Anne Aly', 'Michelle DaRosa', 'Rachel Allen', 'Maria Reynolds', 'Monica Dolan', 'Jill Harris', 'Anne DeMarinis', 'Anne Haney', 'Dana Davis', 'CODESYS', 'DreamWeb', 'DreamHost', 'GTK', 'RTL', 'SpaceWire', 'StarCraft', 'LandSpace', 'IDE', '.NET', 'Rajat Nagpal', 'Rajakumarudu', 'Chandala', 'Bakshish Singh', 'Rajitha', 'Darshan Kumar', 'Bombay Jayashri', 'Adoor Prakash', 'Samikssha Batnagar', 'Samarth Ramdas', 'Nightcrawler (film)', 'Who (film)', 'Anything (film)', 'A Man (film)', 'Only (film)', 'Some (film)', 'Exists (film)', 'Show Me (film)', 'Species (film)', 'Anonymous (film)', 'Guildford', 'Ugland House', 'Devonshire House', 'Englis

In [55]:
known_negative_labels = ['Cabbit', 'Bushpig', 'Bassaricyon', 'Don Republic', 'Resolution 181', 'Proclamation 4483', 'Forward Party (United States)', 'American Solidarity Party', 'Congress Poland', 'Homeland Solidarity Party', 'Unitary Democratic Coalition', 'SS United States', 'Proclamation 10043', 'Michelle Sol', 'Anne Aly', 'Michelle DaRosa', 'Rachel Allen', 'Maria Reynolds', 'Monica Dolan', 'Jill Harris', 'Anne DeMarinis', 'Anne Haney', 'Dana Davis', 'CODESYS', 'DreamWeb', 'DreamHost', 'GTK', 'RTL', 'SpaceWire', 'StarCraft', 'LandSpace', 'IDE', '.NET', 'Rajat Nagpal', 'Rajakumarudu', 'Chandala', 'Bakshish Singh', 'Rajitha', 'Darshan Kumar', 'Bombay Jayashri', 'Adoor Prakash', 'Samikssha Batnagar', 'Samarth Ramdas', 'Nightcrawler (film)', 'Who (film)', 'Anything (film)', 'A Man (film)', 'Only (film)', 'Some (film)', 'Exists (film)', 'Show Me (film)', 'Species (film)', 'Anonymous (film)', 'Guildford', 'Ugland House', 'Devonshire House', 'English country house', 'Lord British', 'Denham, Buckinghamshire', 'Blackheath, London', 'Blockley', 'Guildford Castle', 'Aldbourne', 'Who (album)', 'Just (song)', 'G. 
(album)', 'So (album)', 'Let It Be (album)', 'Our Songs (album)', "Buddy's Song (album)", '- (album)', 'Something Wild (album)', 'Come (album)', 'Matt DeCaro', 'Dave Rich', 'Mark Lenard', 'Ronny Graham', 'Matt Jay', 'Dan Markel', 'Dave Jerden', 'Brian Haner', 'Jimmy DeGrasso', 'Danny Leiner', 'Lippisch P.13a', 'Rec 2', 'PH', 'Trans effect', 'Beta-M', 'Entropy', 'Alpha Beta', 'Theta', 'ANOM', 'Body', 'Hope Town', 'Mount Berlin', 'Belmar, New Jersey', 'American Town', 'Cum Town', 'My Hometown', 'River Point', 'New Castle, Delaware', 'Alice Town', 'Flagstaff, Maine', 'Bell XF-109', 'Quest Aircraft', 'L42A1', 'LTV A-7 Corsair II', 'Titan II GLV', 'RQ-180', 'Lockheed 10E', 'British Aerospace 125', 'General Dynamics X-62 VISTA', 'AT-6B Wolverine', 'Pope Dionysius', 'Pseudo-Dionysius', 'Orarion', 'Antiquities', 'AD 5', 'Late antiquity', 'The Greek', 'Dionysius', 'AD 7', 'Heraclius', 'Nightcrawler', 'The Other Two', 'A Touch of Dead', 'The Beast Inside', 'Nightcrawler (comics)', 'Wrong Creatures', 'Alive II', 'Dream Weaver', 'Dark Paradise', 'A Dark Place', 'Arena football', '2024 Leagues Cup', 'World Club Challenge', 'Super League XXIX', 'Arena Football League', '2022 AFF Championship', '2023 Leagues Cup', "FIFA Women's Club World Cup", 'FIFA Club World Cup', '2024 GB3 Championship', 'Vagina', 'Porn', 'Penis', 'Breast', 'Ass']
# Resolve every negative title to its row in `labels`. A title -> first
# position table is built once (`setdefault` keeps the first occurrence, like
# `labels.index`) instead of scanning the whole list once per title.
# A title missing from `labels` raises KeyError here (previously ValueError).
label_to_index = {}
for position, title in enumerate(labels):
    label_to_index.setdefault(title, position)
known_negative_indices = [label_to_index[label] for label in known_negative_labels]

In [79]:
# Custom query strategy: mix of random sampling among confident predictions
# and uncertainty sampling
def hybrid_query_strategy(classifier, X_pool, confident_fraction=1.0, top_n=10000,
                          n_confident=10, n_uncertain=5):
    """Pick pool rows to label next.

    With probability ``confident_fraction`` a random batch is drawn from the
    rows the classifier scores highest for the positive class; otherwise the
    rows whose positive-class probability is closest to 0.5 are returned.

    The default ``confident_fraction=1.0`` reproduces what the previous
    condition (``np.random.rand() > 0.0``) did in practice: always take the
    confident branch. Pass e.g. ``0.6`` for the 60/40 mix the original comments
    described.

    Args:
        classifier: fitted estimator exposing ``predict_proba``; column 1 is
            taken as the positive-class probability.
        X_pool: array of unlabeled candidates, indexable by an integer array.
        confident_fraction: probability of using the confident branch.
        top_n: size of the high-confidence shortlist to sample from.
        n_confident: batch size of the confident branch.
        n_uncertain: batch size of the uncertainty branch.

    Returns:
        ``(query_idx, X_pool[query_idx])`` where ``query_idx`` is a 1-D integer
        array of distinct positions into ``X_pool``.
    """
    # Positive-class probability for every candidate in the pool
    positive_proba = classifier.predict_proba(X_pool)[:, 1]

    if np.random.rand() < confident_fraction:
        # Shortlist the most confident candidates, then sample from them
        top_confident_indices = np.argsort(-positive_proba)[:top_n]
        batch_size = min(n_confident, len(top_confident_indices))
        # replace=False: never ask for the same row twice within one batch
        query_idx = np.random.choice(top_confident_indices, size=batch_size, replace=False)
    else:
        # Distance from the decision boundary; small means uncertain
        uncertainty_scores = np.abs(positive_proba - 0.5)
        # argsort (not argmin, which returns a single scalar) gives the
        # n_uncertain most uncertain rows
        query_idx = np.argsort(uncertainty_scores)[:n_uncertain]

    return query_idx, X_pool[query_idx]

In [80]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from modAL.models import ActiveLearner
from modAL.uncertainty import uncertainty_sampling

# Inputs prepared by earlier cells:
#   embeddings             - 2-D array, one embedding per article
#   labels                 - list of article titles, indexed like `embeddings`
#   known_positive_indices - rows hand-picked as good topics (class 1)
#   known_negative_indices - rows hand-picked as bad topics (class 0)

# Initial training set: positives first, then negatives, with matching targets
labeled_indices = list(known_positive_indices) + list(known_negative_indices)
X_initial = embeddings[labeled_indices]
# np.concatenate is available in every NumPy release (np.concat only in >= 2.0)
y_initial = np.concatenate([np.ones(len(known_positive_indices)), np.zeros(len(known_negative_indices))])

# Pool of rows without a label yet, in ascending order. setdiff1d avoids a
# Python-level list membership test for every row of the dataset.
unlabeled_indices = np.setdiff1d(np.arange(len(labels)), labeled_indices)
X_unlabeled = embeddings[unlabeled_indices]  # Embeddings of unknown topics

# Create an active learner using the hybrid query strategy
learner = ActiveLearner(
    estimator=LogisticRegression(solver='lbfgs'),
    query_strategy=hybrid_query_strategy,
    X_training=X_initial, y_training=y_initial
)

# Number of query rounds to run
n_queries = 20

# Active learning loop: query a batch, label it by hand, teach, shrink the pool
for round_number in range(1, n_queries + 1):
    print(f"Query {round_number}/{n_queries}")

    # Positions (into the current pool) chosen by the query strategy
    query_index, _ = learner.query(X_unlabeled)
    # Collapse repeats so the same article is never asked about or taught twice
    query_index = np.unique(query_index)

    # Ask the user to label each queried article: 'y' -> 1, anything else -> 0
    manual_labels = []
    for idx in query_index:
        answer = input(f"Is '{labels[unlabeled_indices[idx]]}' a good topic? (y/n): ")
        manual_labels.append(1 if answer.strip().lower() == 'y' else 0)

    # Teach the active learner the newly labeled instances
    learner.teach(X_unlabeled[query_index], np.array(manual_labels))

    # Remove the newly labeled rows from the pool; both arrays stay aligned
    X_unlabeled = np.delete(X_unlabeled, query_index, axis=0)
    unlabeled_indices = np.delete(unlabeled_indices, query_index)

# The learner is now trained on the initial data plus the hand-labeled batches;
# classify every article, including the ones it was trained on.
predicted_labels = learner.predict(embeddings)

# Optional: save the final model and predictions
# np.save('predicted_labels.npy', predicted_labels)
# joblib.dump(learner, 'active_learner_model.pkl')



Query 1/20
Query 2/20
Query 3/20
Query 4/20
Query 5/20
Query 6/20
Query 7/20
Query 8/20
Query 9/20
Query 10/20
Query 11/20
Query 12/20
Query 13/20
Query 14/20
Query 15/20
Query 16/20
Query 17/20
Query 18/20
Query 19/20
Query 20/20


In [84]:
# Score every article with the trained learner. This covers the whole dataset,
# so rows that were used for training are ranked as well.
positive_column = 1  # predict_proba column taken as the "positive" class
probs = learner.predict_proba(embeddings)[:, positive_column]

# Row indices ordered from highest to lowest positive score
sorted_indices = np.negative(probs).argsort()

In [88]:
# Print the top N suggestions by model confidence and save their titles.
# NOTE: a later cell rebinds `probs` to a 2-D array, so this cell has to run
# before that one (or after re-running the scoring cell).
top_n = 50_000  # Adjust this to how many suggestions you want to keep
# Slicing clamps to the dataset size, so top_n may exceed the number of rows
top_indices = sorted_indices[:top_n]

print("Top suggestions (most confident predictions):")
for idx in top_indices:
    print(f"{probs[idx]:.4f} - {labels[idx]}")

# Explicit UTF-8, consistent with the other JSON files this notebook writes
with open("../../data/wikipedia_pageviews/filtered_with_classifier.json", "w", encoding="utf-8") as f:
    json.dump([labels[i] for i in top_indices], f)

Top suggestions (most confident predictions):
0.9768 - Egg
0.9744 - Refrigerator
0.9735 - Water
0.9718 - Bicycle
0.9716 - Skateboard
0.9709 - Laptop
0.9698 - Guitar
0.9684 - Piano
0.9682 - Television
0.9676 - Vegetable
0.9668 - Ice cream
0.9659 - Microwave
0.9655 - Cars
0.9651 - Fish
0.9649 - Bird
0.9649 - Shark
0.9643 - IPod
0.9640 - Ice
0.9635 - Roof
0.9634 - IPhone
0.9631 - Penguin
0.9628 - Fruit
0.9627 - Trampoline
0.9624 - Truck
0.9624 - Kitchen
0.9622 - Coffee
0.9617 - Yogurt
0.9615 - Steak
0.9609 - Bread
0.9608 - Insect
0.9604 - Beach
0.9602 - Tomato
0.9601 - Elevator
0.9597 - Glass
0.9592 - Pattress
0.9591 - Cooking
0.9589 - Walking
0.9589 - Elephant
0.9588 - Grass
0.9586 - Computer
0.9585 - Paper
0.9585 - Toaster
0.9585 - Continent
0.9584 - Turkey
0.9583 - Copper
0.9581 - Toilet
0.9580 - Horse
0.9580 - Sushi
0.9580 - Lamp
0.9579 - Food
0.9574 - Garden
0.9574 - Toothbrush
0.9570 - Parrot
0.9569 - Oven
0.9567 - Milk
0.9566 - Flower
0.9564 - Ocean
0.9561 - Snow
0.9559 - Desert
0.

## ChatGPT

In [89]:
# Load the per-article log-probabilities and their titles. Later cells index
# both with the same integers, so logprobs[i] presumably belongs to
# logprobs_labels[i] — TODO confirm against the script that wrote these files.
logprobs = np.load("../../data/wikipedia_pageviews/logprobs.npy")
# Read as UTF-8 explicitly so non-ASCII titles do not depend on the platform's
# default encoding.
with open("../../data/wikipedia_pageviews/logprobs-articles.json", encoding="utf-8") as f:
    logprobs_labels = json.load(f)

In [90]:
# Sanity check: (number of scored articles, number of log-prob columns).
logprobs.shape

(50000, 2)

In [91]:
# Turn each row of log-probabilities into probabilities that sum to 1.
# Subtracting the row maximum before exponentiating leaves the normalised
# result unchanged mathematically but prevents very negative log-probs from
# underflowing to 0 in every column (which would produce 0/0 = NaN).
# NOTE: this rebinds `probs`, which earlier held the classifier's 1-D scores.
shifted_logprobs = logprobs - np.max(logprobs, axis=1, keepdims=True)
probs = np.exp(shifted_logprobs)
probs = probs / np.sum(probs, axis=1, keepdims=True)

In [103]:
# Rank the scored articles by the probability in column 0, highest first
# (column 0 is presumably the "accept" outcome — TODO confirm).
first_column = probs[:, 0]
indices_sorted = np.negative(first_column).argsort()

# Show the 11,000 best-ranked titles with their probability
top_ranked = indices_sorted[:11_000]
for score, row in zip(first_column[top_ranked], top_ranked):
    print(f"{score:.4f} {logprobs_labels[row]}")

1.0000 Ice cream
1.0000 Sunglasses
1.0000 Pizza
1.0000 Cereal
1.0000 Automobiles
1.0000 Breakfast cereal
1.0000 Bicycle
1.0000 Water bottle
1.0000 Burrito
1.0000 Popcorn
1.0000 Baseball
1.0000 Cars
1.0000 Swimming pool
1.0000 Tomatoes
1.0000 Chocolate chip cookie
1.0000 Toaster
1.0000 Macaroni and cheese
1.0000 Basketball
1.0000 Tennis
1.0000 Fireworks
1.0000 Badminton
1.0000 Corn on the cob
1.0000 Volleyball
1.0000 Amusement park
1.0000 Pancake
1.0000 Peanut butter
1.0000 Solar energy
1.0000 Skateboarding
1.0000 Smartphones
1.0000 Soda
1.0000 Ketchup
1.0000 Chewing gum
1.0000 Guitar
1.0000 Bubble gum
1.0000 Watermelon
1.0000 Coffee
1.0000 Christmas tree
1.0000 Bagel
1.0000 Treadmill
1.0000 Potato
1.0000 Lego
1.0000 Hamburger
1.0000 Smartphone
1.0000 Bread
1.0000 Skateboard
1.0000 Refrigerator
1.0000 Broccoli
1.0000 Cycling
1.0000 Ukulele
1.0000 Chocolate
1.0000 Teddy bear
1.0000 Domestic cat
1.0000 Television set
1.0000 Avocado
1.0000 Chocolate cake
1.0000 Mobile phones
1.0000 Mario K

In [104]:
# Final topic list: the 11,000 best-ranked titles plus the hand-curated seeds.
# The seed file is opened with a context manager so the handle is closed.
with open("../../data/input/topics_manual.json", encoding="utf-8") as f:
    manual_topics = json.load(f)

llm_topics = [logprobs_labels[i] for i in indices_sorted[:11_000]]

# Deduplicate, then sort: iteration order of a set of strings changes between
# interpreter runs, which would reshuffle the saved files on every execution.
final_topics = sorted(set(llm_topics) | set(manual_topics))

In [105]:
# Number of unique topics after merging the ranked titles with the seed list.
len(final_topics)

11556

In [106]:
# Persist the final topic list twice: once next to the other pageview
# artefacts and once as the project-level topics file.
for output_path in (
    "../../data/wikipedia_pageviews/filtered_with_llm.json",
    "../../data/topics.json",
):
    with open(output_path, "w", encoding="utf-8") as out_file:
        json.dump(final_topics, out_file)