In [3]:
from bunkatopics.datamodel import Term, Document, TopicRanking, BourdieuDimension
from pydantic import BaseModel, Field
import typing as t
import jsonlines

DOC_ID = str
TERM_ID = str
TOPIC_ID = str

class Document(BaseModel):
    """Pydantic model describing one document record.

    This class is declared under the same name as the ``Document`` imported
    from ``bunkatopics.datamodel`` at the top of the cell, so from here on the
    name ``Document`` refers to this local definition (it is the class that
    ``read_documents_from_jsonl`` instantiates). Only ``doc_id`` and
    ``content`` are required; every other field carries a default.
    """

    doc_id: DOC_ID
    content: str
    size: t.Optional[float] = None
    # presumably 2D map coordinates — TODO confirm against bunkatopics
    x: t.Optional[float] = None
    y: t.Optional[float] = None
    topic_id: t.Optional[TOPIC_ID] = None
    topic_ranking: t.Optional[TopicRanking] = None  # Make topic_ranking optional
    term_id: t.Optional[t.List[TERM_ID]] = None
    # repr=False keeps the embedding vector out of the model's repr output
    embedding: t.Optional[t.List[float]] = Field(None, repr=False)
    # defaults to an empty list when the record has no such entry
    bourdieu_dimensions: t.List[BourdieuDimension] = []


# Define a function to read documents from a JSONL file
def read_documents_from_jsonl(file_path):
    """Load every record of a JSONL file as a ``Document``.

    Args:
        file_path: Path of the JSONL file to read.

    Returns:
        list: One ``Document`` per record, in the order the reader yields them.
    """
    with jsonlines.open(file_path, mode="r") as jsonl_reader:
        # Each record is expanded into keyword arguments of the model.
        return [Document(**record) for record in jsonl_reader]

def read_terms_from_jsonl(file_path):
    """Load every record of a JSONL file as a ``Term``.

    Args:
        file_path: Path of the JSONL file to read.

    Returns:
        list: One ``Term`` per record, in the order the reader yields them.
    """
    with jsonlines.open(file_path, mode="r") as jsonl_reader:
        # Each record is expanded into keyword arguments of ``Term``.
        parsed_terms = [Term(**fields) for fields in jsonl_reader]
    return parsed_terms

In [6]:
from bunkatopics import Bunka
from langchain_community.embeddings import HuggingFaceEmbeddings


# Embedding model handed to Bunka. NOTE(review): the checkpoint name says
# "large"; a smaller model would presumably be quicker to iterate with.
model_name = "OrdalieTech/Solon-embeddings-large-0.1"
embedding_model = HuggingFaceEmbeddings(model_name=model_name,# checkpoint selected above
                                        model_kwargs={"device": "cuda"}, # runs on the GPU; switch to "cpu" if no CUDA device is available
                                        encode_kwargs={"show_progress_bar": True}, # Show the progress of embeddings
                                        multi_process=False)  # set to True to enable multiprocessing

bunka = Bunka(embedding_model=embedding_model, language='french') # language passed to Bunka; NOTE(review): confirm the accepted values in bunkatopics

In [7]:
# Load the preprocessed documents and terms from their JSONL files.
documents = read_documents_from_jsonl("../data/data_preprocessed/bunka_data/bunka_docs.jsonl")
terms = read_terms_from_jsonl("../data/data_preprocessed/bunka_data/bunka_terms.jsonl")

# Attach the loaded objects directly to the Bunka instance's attributes.
# NOTE(review): presumably this stands in for a fitting step — confirm that
# get_topics needs no other state on the instance.
bunka.docs = documents
bunka.terms = terms

In [11]:
from sklearn.cluster import KMeans

# Fixed seed: without random_state the KMeans initialisation is random, so the
# clusters (and therefore the topics) would change on every Restart & Run All.
clustering_method = KMeans(n_clusters=50, random_state=42)

# Compute the topics on the main Bunka instance; the result is kept in
# df_topics, which the following cell filters on its 'size' and 'topic_id'
# columns.
df_topics = bunka.get_topics(
    # NOTE(review): differs from the 50 clusters of the custom KMeans above;
    # presumably ignored when custom_clustering_model is supplied — confirm.
    n_clusters=25,
    name_length=5,
    min_count_terms=20,
    top_terms_overall=1000000,
    max_doc_per_topic=2000,
    min_docs_per_cluster=1000,
    ranking_terms=15,
    ngrams=[1, 2],
    custom_clustering_model=clustering_method,
)

[32m2024-01-31 15:29:34 - [94mBunka[0m - INFO - [1mComputing the topics[0m


In [14]:
# filter topics

# Keep only the rows whose 'size' is at least 20
# (presumably the number of documents in the topic — confirm).
df_topics = df_topics[df_topics['size']>=20]

# Filter with the new topics
# Build the id lookup once: the membership test below runs for every topic,
# and rebuilding a list from the column on each iteration is wasted work.
kept_topic_ids = set(df_topics['topic_id'])

new_topics = [topic for topic in bunka.topics if topic.topic_id in kept_topic_ids]

# Replace the instance's topics with the filtered selection.
bunka.topics = new_topics


In [15]:
# Build the topic visualization and show it as the cell output.
fig = bunka.visualize_topics(
    width=1000,
    height=1000,
    colorscale='Portland',
    density=True,
    label_size_ratio=120,
    convex_hull=True,
    show_text=False,
)
fig

[32m2024-01-31 15:39:35 - [94mBunka[0m - INFO - [1mCreating the Bunka Map[0m


In [17]:
import copy
# Deep copy of the Bunka instance: the later cells call get_topics and
# visualize_topics on bunka_bis instead of on bunka itself.
bunka_bis = copy.deepcopy(bunka)

In [22]:
# Fixed seed: without random_state the KMeans initialisation is random, so the
# clusters (and therefore the topics) would change on every Restart & Run All.
clustering_method = KMeans(n_clusters=10, random_state=42)

# Second topic computation, run on the copied instance with a coarser
# clustering and longer topic names (name_length=20).
df_topics_10 = bunka_bis.get_topics(
    # NOTE(review): differs from the 10 clusters of the custom KMeans above;
    # presumably ignored when custom_clustering_model is supplied — confirm.
    n_clusters=25,
    name_length=20,
    min_count_terms=20,
    top_terms_overall=1000000,
    max_doc_per_topic=2000,
    min_docs_per_cluster=1000,
    ranking_terms=15,
    ngrams=[1, 2],
    custom_clustering_model=clustering_method,
)

[32m2024-01-31 15:44:05 - [94mBunka[0m - INFO - [1mComputing the topics[0m


In [21]:
# Build the topic visualization for the copied instance and show it as the
# cell output.
fig_10 = bunka_bis.visualize_topics(
    width=1000,
    height=1000,
    colorscale='Portland',
    density=True,
    label_size_ratio=120,
    convex_hull=True,
    show_text=False,
)
fig_10

[32m2024-01-31 15:42:46 - [94mBunka[0m - INFO - [1mCreating the Bunka Map[0m
