In [None]:
from bertopic import BERTopic
import pandas as pd
import os
import json
import httpcore
setattr(httpcore, 'SyncHTTPTransport', 'AsyncHTTPProxy')
from googletrans import Translator, LANGUAGES
from tqdm import tqdm
import numpy as np
from tqdm import tqdm
tqdm.pandas()

In [120]:
# Single googletrans client shared by every translate_to_english call.
translator = Translator()

# Directory containing the raw feed JSON dumps that are parsed below.
path = "./feed_data/full_feed_download"
# os.listdir returns entries in arbitrary order. Sorting makes the load order
# reproducible, which matters because the later de-duplication on feed_uri
# keeps whichever copy of a feed was read first.
files = sorted(os.listdir(path))

def get_feed_data(x):
    """Flatten one raw feed entry into a flat row dict.

    `x` must provide the keys 'uri', 'cid' and 'value' (a KeyError is raised
    otherwise). The 'createdAt', 'description' and 'displayName' entries of
    x['value'] are optional and fall back to an empty string when absent.
    """
    row = {'feed_uri': x['uri'], 'creator_cid': x['cid']}
    record = x['value']
    # (output column, key inside the record) -- listed in output column order
    optional_fields = (
        ('feed_createdAt', 'createdAt'),
        ('feed_description', 'description'),
        ('feed_displayname', 'displayName'),
    )
    for column, key in optional_fields:
        row[column] = record.get(key, "")
    return row

# Flatten every feed record of every dump file into one list of row dicts.
all_feeds = []
for file in tqdm(files):
    # Decode explicitly as UTF-8 instead of the platform's locale encoding:
    # the descriptions are multilingual and would otherwise be garbled (or
    # fail to decode) on systems whose default encoding is not UTF-8.
    with open(f"{path}/{file}", encoding="utf-8") as f:
        data = json.load(f)
    # Top-level keys are stored as the creator DID of the feeds listed under them.
    for user, user_feeds in data.items():
        for feed in user_feeds:
            feed_data = get_feed_data(feed)
            feed_data['creator_did'] = user
            all_feeds.append(feed_data)

scraped_feeds = pd.DataFrame(all_feeds)
# A feed_uri may occur more than once across the files; keep the first occurrence only.
scraped_feeds = scraped_feeds.drop_duplicates(subset="feed_uri").reset_index(drop=True)


# Load the Feed Likers
def get_likers(liker_dict):
    """Expand a single-feed likers mapping into one row dict per liker.

    Only the first key of `liker_dict` is read: it is used as the feed URI and
    its value is iterated as a sequence of liker records. Each record is
    indexed positionally (0..3). An empty mapping raises IndexError.
    """
    feed_uri = list(liker_dict)[0]
    # Output keys are emitted exactly as downstream files expect them
    # (including the existing 'liker_dsplayname' spelling).
    return [
        {
            'feed_uri': feed_uri,
            'liker_did': entry[0],
            'liker_dsplayname': entry[1],
            'liker_description': entry[2],
            'liker_createdAt': entry[3],
        }
        for entry in liker_dict[feed_uri]
    ]

# Directory containing the per-feed liker JSON dumps.
path = "./feed_data/feed_likes"
# os.listdir order is arbitrary; sort so feed_likers.csv is written in the
# same row order on every run.
files = sorted(os.listdir(path))

# Row dicts collected across all files before building the frame once.
liker_rows = []

for file in tqdm(files):
    # Explicit UTF-8: liker display names and descriptions are free text, so
    # decoding must not depend on the platform's locale encoding.
    with open(f"{path}/{file}", encoding="utf-8") as f:
        data = json.load(f)
    # get_likers reads only the first key of the mapping it is given, so each
    # feed is passed as its own single-entry dict.
    for feed in data:
        liker_rows.extend(get_likers({feed: data[feed]}))

scraped_likers = pd.DataFrame(liker_rows)
scraped_likers.to_csv("./feed_likers.csv", index = False)


# Function to translate text to English
def translate_to_english(text):
    """Translate `text` to English, returning the input unchanged on failure.

    Parameters
    ----------
    text : str or missing value
        Text to translate. Non-string values (None, NaN) and blank strings
        are returned as-is without contacting the translation service.

    Returns
    -------
    The translated string, or `text` itself when there is nothing to
    translate or the translation call raises.
    """
    # Feed descriptions default to "" when absent, and pandas passes NaN for
    # missing cells; neither can be translated, so skip the request entirely.
    if not isinstance(text, str) or not text.strip():
        return text
    try:
        return translator.translate(text, dest='en').text
    except Exception:
        # Deliberately best-effort: one failed request must not abort a long
        # batch run, so the untranslated text is kept instead.
        return text

tqdm.pandas()
# Maps each original description to its translation for the rows handled below.
description_dict = {}
# A freshly built scraped_feeds has no translation column yet; create it empty
# (object dtype, all missing) so the "still untranslated" filter below also
# works on a clean kernel instead of raising KeyError.
if "description_en" not in scraped_feeds.columns:
    scraped_feeds["description_en"] = pd.Series(index=scraped_feeds.index, dtype="object")
# Explicit copy: the new column is assigned on an independent frame rather
# than on a slice of scraped_feeds (avoids SettingWithCopyWarning).
remaining_descriptions = scraped_feeds[scraped_feeds["description_en"].isna()].copy()
remaining_descriptions["description_en"] = remaining_descriptions["feed_description"].progress_apply(translate_to_english)
description_dict.update(remaining_descriptions.set_index("feed_description")["description_en"].to_dict())
# Write the translations back by index label so already-translated rows stay untouched.
scraped_feeds.loc[remaining_descriptions.index, "description_en"] = remaining_descriptions["description_en"]
scraped_feeds.to_csv("./feeds_en.csv", index = False)

100%|██████████| 44/44 [00:00<00:00, 67.18it/s]
100%|██████████| 18357/18357 [00:03<00:00, 5405.85it/s]


In [7]:
likers = pd.read_csv("./feed_likers.csv")
# Number of liker rows per feed, keyed by feed URI.
likers_per_feed = likers.groupby("feed_uri").size().to_dict()
df = pd.read_csv("./feeds_en.csv")

# Normalise the translated descriptions before topic modelling. The steps run
# in this order on purpose: links and tags are removed while their punctuation
# is still present, then everything outside [a-zA-Z0-9 whitespace] is dropped.
df["description_en"] = (
    df["description_en"]
    .str.replace(r'http\S+', '', regex=True)          # links
    .str.replace(r'<.*?>', '', regex=True)            # html tags
    .str.replace(r'\n', ' ', regex=True)              # newlines become spaces
    .str.replace(r'\b\d+\b', '', regex=True)          # tokens that are entirely numbers
    .str.replace(r'[^a-zA-Z0-9\s]', '', regex=True)   # non-latin / punctuation characters
    .str.strip()                                      # surrounding whitespace
)
# Remove "blueskyfeedscom"
#df["description_en"] = df["description_en"].str.replace(r'blueskyfeedscom', '', regex=False)
# Drop rows whose cleaned description is missing or at most one character long
# (NaN lengths compare False and are dropped too). The .copy() makes df an
# independent frame so columns added to it later are not written into a slice.
df = df[df["description_en"].str.len() > 1].copy()
text = df["description_en"]
# NOTE(review): no random seed is fixed here, so topic ids and sizes
# presumably vary between runs -- confirm whether reproducibility is required.
topic_model = BERTopic(verbose=True)
# BERTopic documents its input as a list of strings, so pass a plain list
# rather than a pandas Series carrying a filtered (non-contiguous) index.
topics, probs = topic_model.fit_transform(text.tolist())

print(f"Number of topics: {len(set(topics))}")
print(f"Percentage unclustered: {100 * np.mean([x == -1 for x in topics]):.2f}%")
print(f"Number of descriptions: {len(text)}")

df["Topic"] = topics
# Feeds without any recorded liker get 0.
df["number_of_likers"] = df["feed_uri"].map(likers_per_feed).fillna(0).astype("int64")

# Topic -1 is the unclustered/outlier bucket. Exclude it BEFORE taking the top
# 10 so both rankings always have the same length; taking the top 11 first and
# dropping -1 afterwards yields 10 or 11 rows depending on where -1 ranks.
clustered = df[df["Topic"] != -1]
top_topics = clustered.groupby("Topic").size().sort_values(ascending=False).head(10)
top_topics_num_likers = (
    clustered.groupby("Topic")["number_of_likers"].sum().sort_values(ascending=False).head(10)
)

topic_info = topic_model.get_topic_info()

# The Series must carry a name to become a column in the merge; the inner join
# keeps only the selected topics and attaches their topic_info columns.
top_topics_num_likers = topic_info.merge(
    top_topics_num_likers.rename('number_of_likers'),
    left_on='Topic', right_index=True, how='inner',
)
top_topics = topic_info.merge(
    top_topics.rename('topic_count'),
    left_on='Topic', right_index=True, how='inner',
)


def truncate_to_length(text, max_length):
    """Add words to get as close to max_length as possible without exceeding it"""
    words = text.split(', ')
    result = []
    current_text = ""
    i = 0
    
    while i < len(words):
        # Try adding the next word
        test_text = current_text + (", " if current_text else "") + words[i]
        
        if len(test_text) <= max_length:
            # Word fits, add it
            current_text = test_text
            result.append(words[i])
            i += 1
        else:
            # Word doesn't fit, try next word if it's shorter
            next_word_idx = i + 1
            while next_word_idx < len(words):
                test_text = current_text + (", " if current_text else "") + words[next_word_idx]
                if len(test_text) <= max_length:
                    # Found a shorter word that fits
                    current_text = test_text
                    result.append(words[next_word_idx])
                    i = next_word_idx + 1
                    break
                next_word_idx += 1
            if next_word_idx >= len(words):
                # No more words fit
                break
                
    return current_text
# Opening lines of the two-panel LaTeX table, one output line per entry.
table_header = [
    r"\begin{table}[!ht]",
    r"\begin{adjustwidth}{-1in}{0in}",
    r"\centering",
    r"\begin{tabular}{@{\extracolsep{5pt}} cccccc}",
    r"\toprule",
    r"\multicolumn{3}{c}{Top Feeds by Number of Feeds} & \multicolumn{3}{c}{Top Feeds by Number of Likes} \\",
    r"\cmidrule(r){1-3}\cmidrule(l){4-6}",
    r"Topic & Representation & Count & Topic & Representation & Likes \\",
    r"\cmidrule(r){1-1}\cmidrule(lr){2-2}\cmidrule(lr){3-3}\cmidrule(l){4-4}\cmidrule(lr){5-5}\cmidrule(l){6-6}",
]
print("\n".join(table_header))

# The merge left this frame in topic_info order; rank it by likes for the right panel.
top_topics_num_likers = top_topics_num_likers.sort_values(by='number_of_likers', ascending=False)

# Pair the i-th row of each ranking; zip stops at the shorter of the two frames.
for (_, row1), (_, row2) in zip(top_topics.iterrows(), top_topics_num_likers.iterrows()):
    cells = []
    for row, metric in ((row1, 'topic_count'), (row2, 'number_of_likers')):
        # First five representation words, underscores shown as spaces,
        # shortened to fit a 35-character column.
        representation = truncate_to_length(", ".join(row['Representation'][:5]).replace('_', ' '), 35)
        cells.extend([f"{row['Topic']}", representation, "{:,}".format(int(row[metric]))])
    print(" & ".join(cells) + " \\\\")

# Closing lines of the table.
table_footer = [
    r"\bottomrule",
    r"\end{tabular}",
    r"\caption{Top Topics by Number of Feeds and Number of Likes}",
    r"\label{tab:topics}",
    r"\end{adjustwidth}",
    r"\end{table}",
]
print("\n".join(table_footer))


# Persist the feeds together with their Topic and number_of_likers columns.
df.to_csv("./feeds_with_topics.csv", index = False)

Batches: 100%|██████████| 1121/1121 [02:35<00:00,  7.21it/s]
2024-10-30 16:54:53,461 - BERTopic - Transformed documents to Embeddings
2024-10-30 16:55:27,023 - BERTopic - Reduced dimensionality


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

2024-10-30 16:55:30,175 - BERTopic - Clustered reduced embeddings


Number of topics: 463
Percentage unclustered: 41.67%
Number of descriptions: 35871
\begin{table}[!ht]
\begin{adjustwidth}{-1in}{0in}
\centering
\begin{tabular}{@{\extracolsep{5pt}} cccccc}
\toprule
\multicolumn{3}{c}{Top Feeds by Number of Feeds} & \multicolumn{3}{c}{Top Feeds by Number of Likes} \\
\cmidrule(r){1-3}\cmidrule(l){4-6}
Topic & Representation & Count & Topic & Representation & Likes \\
\cmidrule(r){1-1}\cmidrule(lr){2-2}\cmidrule(lr){3-3}\cmidrule(l){4-4}\cmidrule(lr){5-5}\cmidrule(l){6-6}
0 & art, my, artwork, artists, all & 612 & 17 & furry, fursuit, furrylist, furries & 9,690 \\
1 & music, songs, audio, song, jazz & 394 & 346 & furry, bskyprobablyawebsite, across & 6,222 \\
2 & games, game, gaming, board & 390 & 331 & follower, follow, followers, back & 5,308 \\
3 & oshikapu, shobamyu, kawaii, jay & 389 & 2 & games, game, gaming, board & 4,260 \\
5 & tracking, malifaux, pom, falcom & 338 & 29 & books, book, reading, read, readers & 3,514 \\
6 & tracking, matsuura, naka

In [13]:
import pandas as pd

# Reload the topic-annotated feeds from disk with an explicit CSV dialect:
# double-quote quoting, backslash escapes, '\n' line endings, blank lines skipped.
df = pd.read_csv(
    "./feeds_with_topics.csv",
    quotechar='"',
    escapechar='\\',
    skip_blank_lines=True,
    lineterminator='\n',
)

topic_column = df["Topic"]
# Distinct topic labels, the -1 (unclustered) label included
number_of_topics = topic_column.nunique()
# Rows assigned to the -1 bucket
number_of_unclustered = int((topic_column == -1).sum())
percentage_unclustered = number_of_unclustered / len(df)

print(f"Number of topics: {number_of_topics}")
print(f"Number of unclustered: {number_of_unclustered}")
print(f"Percentage unclustered: {100*percentage_unclustered:.2f}%")


Number of topics: 463
Number of unclustered: 14946
Percentage unclustered: 41.67%
