In [3]:
import pandas as pd

# Work on a reproducible random subset of the tweets.
SAMPLE_SIZE = 20000
SEED = 42

data = pd.read_parquet('data/sample_Plastique.parquet')
# Cap the sample at the number of available rows: DataFrame.sample raises a
# ValueError when asked for more rows than exist (sampling without replacement).
data = data.sample(min(SAMPLE_SIZE, len(data)), random_state=SEED)
ids = list(data['tweet_id'])
docs = list(data['text'])

In [4]:
from langchain_community.embeddings import HuggingFaceEmbeddings
import os

# Embedding model used to vectorize the documents.
# A smaller model is recommended when starting out.
model_name = "OrdalieTech/Solon-embeddings-large-0.1"

embedding_model = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs={"device": "cpu"},  # use "cuda" instead if a GPU is available
    # encode_kwargs={"show_progress_bar": True},  # uncomment to show embedding progress
    multi_process=False,  # set to True if you have multiprocessing
)

In [5]:
from bunkatopics import Bunka

# Build the Bunka instance around the embedding model chosen above;
# `language` can be set to any language you prefer.
bunka = Bunka(
    embedding_model=embedding_model,
    language='french',
)

# Fit Bunka on the documents, passing the tweet ids as document ids.
bunka.fit(docs=docs, ids=ids)

2024-02-15 14:35:50 - Bunka - INFO - Embedding documents... (can take varying amounts of time depending on their size)
2024-02-15 14:51:15 - Bunka - INFO - Reducing the dimensions of embeddings...
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.
2024-02-15 14:51:28 - Bunka - INFO - Extracting meaningful terms from documents...
100%|██████████| 20000/20000 [02:43<00:00, 122.40it/s]


### Compute topics

In [10]:
from sklearn.cluster import KMeans
# Clustering configuration. The same cluster count is passed to both KMeans and
# `get_topics`, so it is defined once to keep the two values in sync.
N_CLUSTERS = 20
MIN_TOPIC_SIZE = 100  # topics whose 'size' is below this are discarded

clustering_method = KMeans(n_clusters=N_CLUSTERS, random_state=42)

# name_length: number of terms used to describe each topic
df_topics = bunka.get_topics(
    n_clusters=N_CLUSTERS,
    name_length=5,
    min_count_terms=5,
    custom_clustering_model=clustering_method,
)

# Keep only the topics that are large enough.
df_topics = df_topics[df_topics['size'] >= MIN_TOPIC_SIZE]

# Restrict bunka.topics to the surviving topics. The ids are collected into a
# set once, instead of rebuilding a list for every topic that is checked.
kept_topic_ids = set(df_topics['topic_id'])
new_topics = [topic for topic in bunka.topics if topic.topic_id in kept_topic_ids]

bunka.topics = new_topics

2024-02-15 15:01:32 - Bunka - INFO - Computing the topics


In [13]:
# Ids of the topics currently held in bunka.topics (i.e. after the size filter).
topic_ids_filtered = [topic.topic_id for topic in bunka.topics]
# Bare expression so the cell actually displays the list as its output.
topic_ids_filtered

['bt-0',
 'bt-1',
 'bt-10',
 'bt-11',
 'bt-12',
 'bt-13',
 'bt-16',
 'bt-18',
 'bt-19',
 'bt-2',
 'bt-3',
 'bt-4',
 'bt-5',
 'bt-6',
 'bt-7',
 'bt-8',
 'bt-9']

In [14]:
# Remove the documents whose topic is not among the retained topics.
# A set gives constant-time membership checks while scanning every document.
allowed_topic_ids = set(topic_ids_filtered)
new_docs = [doc for doc in bunka.docs if doc.topic_id in allowed_topic_ids]

bunka.docs = new_docs

### Save the content to be read by the Web App

In [15]:
import json
def write_json(records, path):
    """Serialize `records` as JSON into the file at `path`.

    The file is opened with an explicit UTF-8 encoding so the result does not
    depend on the platform's default encoding.
    """
    with open(path, "w", encoding="utf-8") as json_file:
        json.dump(records, json_file)


# Save Topics
file_path = 'web/public/bunka_topics.json'
topics_json = [topic.model_dump() for topic in bunka.topics]
write_json(topics_json, file_path)

# Save Docs
file_path = 'web/public/bunka_docs.json'
files_json = [doc.model_dump() for doc in bunka.docs]
write_json(files_json, file_path)