In [1]:
from collections import Counter, defaultdict

import pandas as pd
from bertopic import BERTopic
from tqdm import tqdm

# Registers tqdm's progress-aware pandas helpers (e.g. `progress_apply`, per the
# tqdm docs). No cell in this notebook calls them.
tqdm.pandas()

In [2]:
# utility functions
def split_tags(tags_str):
    """Split a comma-separated tag string into a list of normalised tags.

    Each tag is stripped of surrounding whitespace and lower-cased. Empty
    fragments (for example from ``"a,,b"`` or a trailing comma) are dropped.

    Args:
        tags_str: Comma-separated tags. Anything that is not a ``str`` (such as
            ``None`` or the ``NaN`` pandas uses for a missing CSV cell) is
            treated as "no tags".

    Returns:
        list[str]: The cleaned tags in their original order, or ``[]`` when
        there are none.
    """
    # NaN is truthy, so a plain truthiness check would let it reach `.split`
    # and fail with AttributeError.
    if not isinstance(tags_str, str):
        return []

    cleaned = (fragment.strip().lower() for fragment in tags_str.split(","))
    return [tag for tag in cleaned if tag]

def count_hashtag_occurrences(hashtags):
    """Tally how many times each tag appears across a collection of tag lists.

    Args:
        hashtags: Iterable of iterables of tags (one inner iterable per post).

    Returns:
        collections.Counter: Mapping of tag -> number of occurrences.
    """
    occurrences = Counter()
    for tag_list in hashtags:
        for tag in tag_list:
            occurrences[tag] += 1
    return occurrences

In [3]:
# Step 1: Preprocess the Data

# Load only the two tag columns and replace missing cells with empty strings so
# every value can be split as text.
df = pd.read_csv("../data/full_dataset.csv", usecols=["root_tags", "tags"]).assign(
    root_tags=lambda frame: frame["root_tags"].fillna(""),
    tags=lambda frame: frame["tags"].fillna(""),
)

# Stack both tag columns into one Series (root_tags first, then tags) with a
# fresh 0..n-1 index, and turn each comma-separated string into a list of
# cleaned tags.
tags_series = pd.concat([df["root_tags"], df["tags"]], ignore_index=True).apply(split_tags)
tags_series

0          [sadie sink, sadiesinkedit, femaledaily, daily...
1                                                         []
2                                                         []
3          [alchemy of souls, alchemy of souls: light and...
4          [manila mikey, manjiro sano, tokyo manji gang,...
                                 ...                        
3529079                                              [mp100]
3529080    [the way his expression changes😂, doctor who, ...
3529081    [we did a thing, harringrove, teacher!steve, k...
3529082         [what the hell am i gonna do when it’s done]
3529083              [other people’s art, quilting, turtles]
Length: 3529084, dtype: object

In [4]:
# Count the occurrences of each tag
# `explode` emits one row per tag. Rows whose list was empty become NaN, which
# `value_counts` leaves out by default (pandas-documented behaviour). The result
# is ordered from most to least frequent, as the output below shows.
tags_counts = tags_series.explode().value_counts()
tags_counts

art                                                             110696
my art                                                           77000
fanart                                                           53664
*                                                                32919
mine                                                             32794
                                                                 ...  
also don't go planning on stealing any baby larvitars /lh /j         1
my body is a canvas                                                  1
the lights!                                                          1
op the lights                                                        1
what the hell am i gonna do when it’s done                           1
Name: count, Length: 2282228, dtype: int64

In [5]:
# Keep only the most frequent tags for topic modelling.
# `tags_counts` is already ordered by descending count, so `head` selects the
# top of the ranking. Lower TOP_N_TAGS for a quicker experimental run.
TOP_N_TAGS = 200_000
top_tags = tags_counts.head(TOP_N_TAGS).index
top_tags

Index(['art', 'my art', 'fanart', '*', 'mine', 'digital art', 'pokemon',
       'illustration', 'artists on tumblr', 'video',
       ...
       'suggestive .', 'i loved it so much', 'anidalaedit',
       'i would pledge the allegiance if this was the flag',
       'johnny moodboard', 'max deacon',
       'and you know what it's been lots of fun', 'فالح الشبلي',
       'sapphire and ruby', 'ifrit fanart'],
      dtype='object', length=200000)

In [6]:
# Step 2: Apply BERTopic for Topic Extraction

# BERTopic embeds each tag, reduces the embeddings' dimensionality and clusters
# them (see the log below). The hierarchy over the resulting topics is built in Step 3.
# NOTE(review): no random seed is set anywhere in this notebook, so topic IDs are
# likely to differ between runs. Confirm, and pin a seed if the IDs quoted later
# (e.g. topic 329) need to be reproducible.
topic_model = BERTopic(verbose=True, calculate_probabilities=False)
topics, _ = topic_model.fit_transform(top_tags)

2024-10-25 14:58:51,669 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/6250 [00:00<?, ?it/s]

2024-10-25 15:00:31,603 - BERTopic - Embedding - Completed ✓
2024-10-25 15:00:31,604 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.
2024-10-25 15:01:36,434 - BERTopic - Dimensionality - Completed ✓
2024-10-25 15:01:36,445 - BERTopic - Cluster - Start clustering the reduced embeddings
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers

In [7]:
# Show topics information
# The first output row is topic -1, which holds roughly a quarter of the tags.
# NOTE(review): per the BERTopic docs, -1 is the outlier bucket. Confirm before
# interpreting its keywords as a real topic.
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,50377,-1_de_he_bruce_la,"[de, he, bruce, la, her, art, they, james, his...","[tell me where to put it down etc, these are s..."
1,0,1918,0_fanart_fanfic_fandom_fanfiction,"[fanart, fanfic, fandom, fanfiction, fan, fant...","[(fanart), fanart for fanfic, fanart of fanfic]"
2,1,1368,1_user_userlgbtq_userjessie_userana,"[user, userlgbtq, userjessie, userana, userdee...","[userdee, userlgbtq, user: heteronegative]"
3,2,916,2_reader_femreader_eren_insert,"[reader, femreader, eren, insert, kirishima, h...","[kanafinwë x reader, i.n x reader, f/o x reader]"
4,3,844,3_queue_queued_queues_queueing,"[queue, queued, queues, queueing, queuing, que...","[queue queue queue, [ 🕰️ ; queue ], queue-e-d]"
...,...,...,...,...,...
3996,3995,10,3995_rapebait_r4pebait_r4p3bait_j4ilbait,"[rapebait, r4pebait, r4p3bait, j4ilbait, gamed...","[rapebait, gamebaidoithuong, gamebaionline]"
3997,3996,10,3996_both_danerys_devote_karna,"[both, danerys, devote, karna, toward, counts,...","[i love them both, personally very into the se..."
3998,3997,10,3997_literate_literaure_literatiedit_literatigifs,"[literate, literaure, literatiedit, literatigi...","[literatigifs, a literate passion, because bei..."
3999,3998,10,3998_edgeworth_edgworth_newsworth_networth,"[edgeworth, edgworth, newsworth, networth, lan...","[miles edgworth, miles edgeworth fanart, miles..."


In [8]:
# Step 3: Generate a hierarchical structure for topics
# Build the topic hierarchy and cast its three ID columns to int in one pass.
hierarchical_topics = topic_model.hierarchical_topics(top_tags).astype(
    {"Parent_ID": int, "Child_Left_ID": int, "Child_Right_ID": int}
)

100%|██████████| 3999/3999 [00:29<00:00, 136.24it/s]


In [9]:
# This takes too long to visualize for this many topics. 
# topic_model.visualize_topics()

In [10]:
# Number of rows in the hierarchy table (each row pairs one parent with a left
# and a right child).
len(hierarchical_topics)

3999

In [11]:
# This takes too long to run and is very big to show in a notebook
# topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

In [12]:
# Map every parent node, keyed by (id, name), to the (id, name) tuples of its children.
parent_to_children = defaultdict(list)

# Each hierarchy row describes one parent together with its left and right child.
merge_rows = zip(
    hierarchical_topics["Parent_ID"],
    hierarchical_topics["Parent_Name"],
    hierarchical_topics["Child_Left_ID"],
    hierarchical_topics["Child_Left_Name"],
    hierarchical_topics["Child_Right_ID"],
    hierarchical_topics["Child_Right_Name"],
)
for parent_id, parent_name, left_id, left_name, right_id, right_name in merge_rows:
    # Left child first, then right, under the same parent key.
    parent_to_children[(parent_id, parent_name)].extend(
        [(left_id, left_name), (right_id, right_name)]
    )

# Not referenced by any later cell in this notebook.
taxonomy_with_details = []

# Define the function to build taxonomy details with Current ID, Parent ID, and Taxonomy List
def build_taxonomy_details(parent, path, paths):
    """Append taxonomy records for ``parent`` and all of its descendants to ``paths``.

    The hierarchy is read from the module-level ``parent_to_children`` mapping,
    whose keys and child entries are ``(id, name)`` tuples.

    Two kinds of records are produced, both with the keys ``Current_ID``,
    ``Parent_ID`` and ``Taxonomy_List``:

    * For every child reached through a parent, ``Taxonomy_List`` is the chain
      of IDs from ``path`` down to and including that parent.
    * For every node with no entry in ``parent_to_children`` (a leaf),
      ``Taxonomy_List`` additionally ends with the leaf's own ID. A leaf reached
      through a parent therefore gets both records.

    The walk uses an explicit stack instead of recursion, so a very deep
    hierarchy cannot raise ``RecursionError``. Records are appended in the same
    depth-first, left-to-right order a recursive walk produces.

    Args:
        parent: ``(id, name)`` tuple of the node to start from.
        path: List of ancestor IDs above ``parent`` (``[]`` for a root). It is
            not modified.
        paths: List that receives the record dicts. It is mutated in place.

    Returns:
        None.
    """
    # Each entry: (node, IDs above the node, whether the node was reached as a child).
    pending = [(parent, path, False)]
    while pending:
        node, ancestors, reached_as_child = pending.pop()
        node_id, _ = node

        if reached_as_child:
            # `ancestors` ends with the parent that led to this node.
            paths.append({
                "Current_ID": node_id,
                "Parent_ID": ancestors[-1],
                "Taxonomy_List": ancestors
            })

        current_path = ancestors + [node_id]  # Keep IDs as integers for clarity

        if node not in parent_to_children:
            # For leaves, store the last ID, its parent, and the full taxonomy
            paths.append({
                "Current_ID": node_id,
                "Parent_ID": current_path[-2] if len(current_path) > 1 else None,
                "Taxonomy_List": current_path
            })
        else:
            # Push in reverse so the first child is popped, and fully explored, first.
            pending.extend(
                (child, current_path, True)
                for child in reversed(parent_to_children[node])
            )

# Collect taxonomy details in the required format.
# Start only from true roots, i.e. keys that are never listed as anyone's child.
# Walking from every parent would re-traverse each subtree once per ancestor and
# only add shorter, partial paths that the de-duplication below throws away.
all_children = {child for children in parent_to_children.values() for child in children}
root_nodes = [node for node in parent_to_children if node not in all_children]

taxonomy_details = []
for root in root_nodes:
    build_taxonomy_details(root, [], taxonomy_details)

# Convert the list of dictionaries into a DataFrame. Naming the columns keeps
# the frame well-formed even when no records were collected.
taxonomy_details_df = pd.DataFrame(
    taxonomy_details, columns=["Current_ID", "Parent_ID", "Taxonomy_List"]
)
taxonomy_details_df['Path_Length'] = taxonomy_details_df['Taxonomy_List'].apply(len)

# For every Current_ID keep the record with the longest path: sort by path length
# (descending, stable so ties keep their collection order), drop later duplicates,
# then remove the helper column.
longest_paths_df = (
    taxonomy_details_df
    .sort_values(by='Path_Length', ascending=False, kind='mergesort')
    .drop_duplicates(subset=['Current_ID'])
    .drop(columns=['Path_Length'])
)

In [13]:
# Index by Current_ID so a topic's row can be fetched with `.loc[topic_id]`.
# The guarded rebind (instead of `inplace=True`) keeps the cell re-runnable: a
# second `set_index('Current_ID')` would raise KeyError because the column has
# already been moved into the index.
if longest_paths_df.index.name != 'Current_ID':
    longest_paths_df = longest_paths_df.set_index('Current_ID')
longest_paths_df

Unnamed: 0_level_0,Parent_ID,Taxonomy_List
Current_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
3533,6183,"[7998, 7997, 7996, 7995, 7994, 7993, 7991, 798..."
3711,6183,"[7998, 7997, 7996, 7995, 7994, 7993, 7991, 798..."
3538,6184,"[7998, 7997, 7996, 7995, 7994, 7993, 7991, 798..."
6183,6184,"[7998, 7997, 7996, 7995, 7994, 7993, 7991, 798..."
3539,6185,"[7998, 7997, 7996, 7995, 7994, 7993, 7991, 798..."
...,...,...
4365,5050,"[7998, 5050]"
4320,7997,"[7998, 7997]"
7996,7997,"[7998, 7997]"
7997,7998,[7998]


In [14]:
# Step 4: Assign Topics to a tag

# Given a tag, `find_topics` returns the topics most similar to it. As the
# output below shows, the result is a pair: (topic IDs, similarity scores),
# ordered from best to worst match.
# NOTE(review): "my art" is not a new tag. It is the second most frequent tag in
# `tags_counts`, so this is a sanity check rather than an unseen-tag assignment.

# Assign a tag to a topic
top_topics_for_tag = topic_model.find_topics("my art", top_n=5)
top_topics_for_tag

([329, 179, 3131, 1070, 2095],
 [0.84817666, 0.75887907, 0.68016624, 0.675676, 0.63073105])

In [15]:
# Better visualization of the topics
#
# `top_topics_for_tag` arrives as the (topic_ids, scores) tuple returned by
# `find_topics` and is rebound to a DataFrame here. The guard makes the cell
# safe to re-run: without it, a second run would try to unpack the DataFrame
# itself and fail.
if not isinstance(top_topics_for_tag, pd.DataFrame):
    top_topics_for_tag = pd.DataFrame(
        list(zip(*top_topics_for_tag)),
        columns=["topic_id", "score"],
    )

# Label every candidate topic with its representative words joined by "|".
top_topics_for_tag['topic_name'] = top_topics_for_tag['topic_id'].apply(
    lambda topic_id: '|'.join(term[0] for term in topic_model.get_topic(topic_id))
)
top_topics_for_tag

Unnamed: 0,topic_id,score,topic_name
0,329,0.848177,artblock|myart|blocked|uwu|dumps|unfunniness|r...
1,179,0.758879,installation|recreation|art|bumfuzzled|imperce...
2,3131,0.680166,paiis|heretics|eru|contacts|portal|cake|artwor...
3,1070,0.675676,awesome|excellent|amazing|bathroom|dope|artwor...
4,2095,0.630731,others|peoples|ppls|artwork|peeps|other|folks|...


In [16]:
# Take the best match, i.e. the first row of the candidates table.
# `.iloc[0]` selects by position, so it does not depend on the row labels
# (plain `[0]` is a label lookup that only works with a default 0..n-1 index).
top_topic_id = top_topics_for_tag["topic_id"].iloc[0]
top_topic_id

329

In [17]:
# Representative words of the best-matching topic, as (word, score) pairs.
topic_model.get_topic(top_topic_id)

[('artblock', 0.01529035317909584),
 ('myart', 0.011396070880145973),
 ('blocked', 0.010860996488531482),
 ('uwu', 0.010400846599092371),
 ('dumps', 0.008893992573017068),
 ('unfunniness', 0.008893992573017068),
 ('rebooping', 0.008893992573017068),
 ('footages', 0.008893992573017068),
 ('ordeal', 0.008893992573017068),
 ('envious', 0.008893992573017068)]

In [18]:
# Look up the matched topic's parent node and its recorded taxonomy path.
longest_paths_df.loc[top_topic_id]

Parent_ID                                                     4674
Taxonomy_List    [7998, 7997, 7996, 7995, 7994, 7993, 7991, 798...
Name: 329, dtype: object

In [19]:
# Show the hierarchy row whose parent is the matched topic's parent, i.e. the
# merge that joins the matched topic with its sibling.
matched_parent_id = longest_paths_df.loc[top_topic_id].Parent_ID
hierarchical_topics[hierarchical_topics['Parent_ID'] == matched_parent_id]

Unnamed: 0,Parent_ID,Parent_Name,Topics,Child_Left_ID,Child_Left_Name,Child_Right_ID,Child_Right_Name,Distance
674,4674,myart_artblock_blocked_uwu_rebooping,"[329, 3965]",329,artblock_myart_blocked_uwu_dumps,3965,myart_notmyart_mydbart_myartwork_myartvld,0.915537


In [21]:
# Summary of the matched topic, transposed so each field reads as its own row.
topic_model.get_topic_info(top_topic_id).T

Unnamed: 0,0
Topic,329
Count,77
Name,329_artblock_myart_blocked_uwu
Representation,"[artblock, myart, blocked, uwu, dumps, unfunni..."
Representative_Docs,[sorry for the spam dear people but it's the o...


In [24]:
# Same call as the earlier `get_topic(top_topic_id)` cell. The output is identical.
topic_model.get_topic(top_topic_id)

[('artblock', 0.01529035317909584),
 ('myart', 0.011396070880145973),
 ('blocked', 0.010860996488531482),
 ('uwu', 0.010400846599092371),
 ('dumps', 0.008893992573017068),
 ('unfunniness', 0.008893992573017068),
 ('rebooping', 0.008893992573017068),
 ('footages', 0.008893992573017068),
 ('ordeal', 0.008893992573017068),
 ('envious', 0.008893992573017068)]

As a recap of the steps we took in this notebook:
1. We preprocessed the data by extracting tags and counting their occurrences.
2. We applied BERTopic to cluster the 200,000 most frequent tags into topics.
3. We generated a hierarchical structure for the topics.
4. We assigned a tag ("my art") to its closest topic and located that topic in the hierarchy.

Considerable further work is needed before the topics and their hierarchy are well understood.
A human in the loop is needed to validate the topics and their hierarchy, and to adjust them where necessary.

