In [1]:
import jsonlines
from bunkatopics.datamodel import Document, Term

from bunkatopics.datamodel import TopicRanking, BourdieuDimension, Term
from pydantic import BaseModel, Field
import typing as t


DOC_ID = str
TERM_ID = str
TOPIC_ID = str


class Document(BaseModel):
    """Pydantic model describing one document record.

    NOTE(review): this definition rebinds the name ``Document`` that is also
    imported from ``bunkatopics.datamodel`` earlier in this cell, so this
    local class is the one ``read_documents_from_jsonl`` instantiates.

    Only ``doc_id`` and ``content`` are required; every other field has a
    default value.
    """

    doc_id: DOC_ID
    content: str
    size: t.Optional[float] = None
    # presumably 2D map coordinates of the document — TODO confirm
    x: t.Optional[float] = None
    y: t.Optional[float] = None
    topic_id: t.Optional[TOPIC_ID] = None
    topic_ranking: t.Optional[TopicRanking] = None  # Optional with a None default, so records without a ranking still validate
    term_id: t.Optional[t.List[TERM_ID]] = None
    embedding: t.Optional[t.List[float]] = Field(None, repr=False)  # repr=False keeps the vector out of the model's repr
    bourdieu_dimensions: t.List[BourdieuDimension] = []



# Define a function to read documents from a JSONL file
def read_documents_from_jsonl(file_path):
    """Load every record of a JSONL file as a ``Document``.

    Args:
        file_path: Path of the JSONL file to read.

    Returns:
        list: One ``Document`` per record, in file order.
    """
    with jsonlines.open(file_path, mode="r") as jsonl_reader:
        # Each record is a mapping whose keys are passed as Document fields.
        return [Document(**record) for record in jsonl_reader]

def read_terms_from_jsonl(file_path):
    """Load every record of a JSONL file as a ``Term``.

    Args:
        file_path: Path of the JSONL file to read.

    Returns:
        list: One ``Term`` per record, in file order.
    """
    with jsonlines.open(file_path, mode="r") as jsonl_reader:
        # Materialise the list before the reader is closed.
        loaded_terms = list(Term(**record) for record in jsonl_reader)
    return loaded_terms

In [2]:
# Load the documents and the terms from their JSONL files under exports/.
documents = read_documents_from_jsonl("exports/bunka_docs_lemonde.jsonl")
terms = read_terms_from_jsonl("exports/bunka_terms_lemonde.jsonl")

In [3]:
from bunkatopics import Bunka
from langchain_community.embeddings import HuggingFaceEmbeddings
import os

model_name = "OrdalieTech/Solon-embeddings-large-0.1"
embedding_model = HuggingFaceEmbeddings(model_name=model_name,# NOTE(review): this is a "large" checkpoint; a smaller model may be quicker for a first run
                                        model_kwargs={"device": "cuda"}, # Runs on GPU; presumably "cpu" works when no GPU is available — TODO confirm
                                        encode_kwargs={"show_progress_bar": True}, # Show a progress bar while computing embeddings
                                        multi_process=False)  # Set to True to enable multiprocessing

bunka = Bunka(embedding_model=embedding_model, language='french') # Bunka instance configured with the embedding model above and language 'french'

In [4]:
# Attach the loaded documents and terms directly to the Bunka instance
# (presumably instead of recomputing them with a fit step — TODO confirm).
bunka.docs = documents
bunka.terms = terms

In [5]:
from sklearn.cluster import KMeans

# Number of clusters, shared by the KMeans model and the get_topics call so
# the two values cannot drift apart.
N_CLUSTERS = 25
# Topics whose 'size' is below this threshold are discarded after clustering.
MIN_TOPIC_SIZE = 2000

# Fixed random_state keeps the clustering reproducible across runs.
clustering_method = KMeans(n_clusters=N_CLUSTERS, random_state=42)

# Compute the topics with the custom clustering model; the result is a
# DataFrame with (at least) 'topic_id' and 'size' columns, used below.
df_topics_25 = bunka.get_topics(n_clusters=N_CLUSTERS,
                             name_length=20,
                             min_count_terms = 20,
                             top_terms_overall = 1000000,
                             max_doc_per_topic = 2000,
                             min_docs_per_cluster = 1000,
                             ranking_terms = 15,
                             ngrams = [1,2],
                             custom_clustering_model = clustering_method
                             )

# Keep only the topics that are large enough.
df_topics_25 = df_topics_25[df_topics_25['size'] >= MIN_TOPIC_SIZE]

# Filter bunka.topics down to the surviving topic ids. The ids are collected
# once into a set, rather than rebuilding a list on every loop iteration, so
# each membership test is O(1).
kept_topic_ids = set(df_topics_25['topic_id'])
new_topics = [topic for topic in bunka.topics if topic.topic_id in kept_topic_ids]

bunka.topics = new_topics


2024-02-16 12:57:47 - Bunka - INFO - Computing the topics


In [10]:
import pandas as pd
# Dump every topic into a table, without the 'top_doc_content' column, and
# write the table to CSV.
df_topics = pd.DataFrame(
    [topic_model.model_dump() for topic_model in bunka.topics]
).drop(columns='top_doc_content')
df_topics.to_csv('exports/topics.csv')

In [11]:
#bunka.manually_clean_topics()
# Export the top-documents-per-topic table exposed by the Bunka instance.
df_docs_topics = bunka.df_top_docs_per_topic_
df_docs_topics.to_csv('exports/df_topics_top_docs.csv')

In [None]:
# import pandas as pd
# df_embedding = pd.DataFrame([x.model_dump() for x in documents])
# df_embedding = df_embedding[['doc_id', 'embedding']].copy()
# df_embedding.to_csv('exports/df_embeddings.csv')

Unnamed: 0,doc_id,embedding
0,f9d2e3e9-81b5-4072-a,"[-0.03271692246198654, 0.06378008425235748, 0...."
1,7c5d0504-0854-4f2d-8,"[0.026198111474514008, 0.0498882457613945, -0...."
2,ac0dd882-cc8a-4156-8,"[-0.016506027430295944, 0.07320893555879593, 0..."
3,37f918ed-ae3e-42b8-b,"[0.012672492302954197, 0.02790186181664467, -0..."
4,2e442e12-bb4a-4db2-a,"[0.09116579592227936, 0.05506209284067154, -0...."
...,...,...
99959,cc52f8c7-7b17-4886-b,"[-0.03648990020155907, 0.029919540509581566, 0..."
99960,7cedd74a-2b27-47b4-a,"[-0.007133149076253176, -0.019600139930844307,..."
99961,08f1e250-1fd3-4d6f-8,"[0.005206968169659376, 0.019187677651643753, 0..."
99962,02086a5e-6e97-4e86-b,"[0.029431240633130074, 0.04790781810879707, 0...."


In [None]:
import pandas as pd
# Build a doc_id -> topic table and attach each topic's name.
df_docs = pd.DataFrame([doc.model_dump() for doc in bunka.docs])
df_docs_filtered = df_docs[['doc_id', 'topic_id']].copy()
df_topics = pd.DataFrame(
    [topic_model.model_dump() for topic_model in bunka.topics]
)[['topic_id', 'name']]
# Default (inner) join: documents whose topic_id is absent from bunka.topics
# do not appear in the result.
df_final_topics = df_docs_filtered.merge(df_topics, on='topic_id')
df_final_topics.to_csv('exports/df_topics.csv')

In [None]:

from langchain.llms import OpenAI
from dotenv import load_dotenv
# Load variables from a .env file into the process environment.
load_dotenv()

# The API key is read from the OPEN_AI_KEY variable; .get() yields None when
# it is unset. Relies on `os` having been imported in an earlier cell.
llm = OpenAI(openai_api_key = os.environ.get('OPEN_AI_KEY'))
# Ask Bunka for cleaned topic names produced with the LLM
# (presumably a DataFrame with 'topic_id' and 'topic_name' columns — TODO confirm).
df_topics_clean = bunka.get_clean_topic_name(llm=llm)

In [None]:
# Rename 'topic_name' to 'topic_name_gpt' (presumably to mark the names as LLM-generated).
df_topics_clean = df_topics_clean.rename(columns={'topic_name':'topic_name_gpt'})

In [None]:
# Maximum number of ' | '-separated parts kept in each topic name.
MAX_NAME_TERMS = 8

for topic in bunka.topics:
    # Only string names can be split; anything else (e.g. None) is left
    # untouched. This explicit check replaces a bare `except:` whose handler
    # was a no-op and which hid every other error.
    if isinstance(topic.name, str):
        topic.name = ' | '.join(topic.name.split(' | ')[:MAX_NAME_TERMS])


# Build the topic map figure and display it as the cell output.
fig = bunka.visualize_topics(width=1200, height=1200, colorscale='Portland', density = True,label_size_ratio = 120, convex_hull = True, show_text=False)
fig

2024-01-29 14:54:47 - Bunka - INFO - Creating the Bunka Map


In [None]:
import pandas as pd
# Add the 'topic_name_gpt' column to the filtered topics table (default inner join on 'topic_id').
# NOTE(review): this rebinds df_final_topics, which an earlier cell uses for the doc_id -> topic table.
df_final_topics = pd.merge(df_topics_25, df_topics_clean[['topic_id', 'topic_name_gpt']], on = 'topic_id')

In [None]:
# NOTE(review): 'exports/topics.csv' is also written by an earlier cell
# (from df_topics); this call overwrites that file.
df_final_topics.to_csv('exports/topics.csv')
bunka.df_top_docs_per_topic_.to_csv('exports/top_documents_per_topic.csv')


import plotly.offline as offline

# Save the Plotly figure to an HTML file without opening it in a browser
offline.plot(fig, filename='exports/map.html', auto_open=False)

'exports/map.html'