In [1]:
#!pip -q install git+https://github.com/charlesdedampierre/BunkaTopics.git@dev --upgrade
# yes | pip uninstall bunkatopics


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [4]:
from bunkatopics.datamodel import Term, Document
import jsonlines

# Define a function to read documents from a JSONL file
def read_documents_from_jsonl(file_path):
    """Load every record of a JSONL file as a ``Document``.

    Args:
        file_path: Path of the JSONL file to read.

    Returns:
        A list with one ``Document`` per record, in file order. Each record
        is unpacked as keyword arguments to the ``Document`` constructor.
    """
    with jsonlines.open(file_path, mode="r") as reader:
        return [Document(**record) for record in reader]

def read_terms_from_jsonl(file_path):
    """Load every record of a JSONL file as a ``Term``.

    Args:
        file_path: Path of the JSONL file to read.

    Returns:
        A list with one ``Term`` per record, in file order. Each record is
        unpacked as keyword arguments to the ``Term`` constructor.
    """
    with jsonlines.open(file_path, mode="r") as reader:
        return [Term(**record) for record in reader]

In [5]:
from bunkatopics import Bunka
from langchain_community.embeddings import HuggingFaceEmbeddings


import torch  # only needed here to find out whether a CUDA device is available

# Large French embedding model; swap in a smaller one for quick experiments.
model_name = "OrdalieTech/Solon-embeddings-large-0.1"

# Use the GPU when one is present, otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA instead of failing here.
device = "cuda" if torch.cuda.is_available() else "cpu"

embedding_model = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs={"device": device},
    encode_kwargs={"show_progress_bar": True},  # show the progress of embeddings
    multi_process=False,  # set to True if you want multiprocessing
)

# The corpus handled in this notebook is in French.
bunka = Bunka(embedding_model=embedding_model, language='french')

In [6]:
# Read the documents and terms that were previously serialized to JSONL.
documents = read_documents_from_jsonl("../data/data_preprocessed/bunka_data/bunka_docs.jsonl")
terms = read_terms_from_jsonl("../data/data_preprocessed/bunka_data/bunka_terms.jsonl")

# Attach them directly to the Bunka instance.
# NOTE(review): presumably this replaces a fitting step so that nothing has to
# be recomputed from raw text - confirm against the BunkaTopics API.
bunka.docs = documents
bunka.terms = terms

In [7]:
from sklearn.cluster import KMeans

# K-means with a fixed random_state so the clustering is repeatable.
# NOTE(review): the estimator is built for 50 clusters while n_clusters=25 is
# passed to get_topics below. Presumably custom_clustering_model takes
# precedence and n_clusters is ignored - TODO confirm and align the two values.
clustering_method = KMeans(n_clusters=50, random_state=42)
df_topics = bunka.get_topics(n_clusters=25, 
                             name_length=5, 
                             min_count_terms = 20, 
                             top_terms_overall = 1000000,
                             max_doc_per_topic = 2000,
                             min_docs_per_cluster = 1000,
                             ranking_terms = 15,
                             ngrams = [1,2],
                             custom_clustering_model = clustering_method
                             ) #

[32m2024-01-31 16:17:04 - [94mBunka[0m - INFO - [1mComputing the topics[0m


In [10]:
# Keep only the topics whose `size` is at least 40.
df_topics = df_topics[df_topics['size']>=40]

# Restrict bunka.topics to the topics that survived the filter. The ids are
# collected into a set once, so each membership test is a hash lookup instead
# of rebuilding and scanning a list for every topic.
kept_topic_ids = set(df_topics['topic_id'])
new_topics = [topic for topic in bunka.topics if topic.topic_id in kept_topic_ids]

bunka.topics = new_topics

# Draw the map with the remaining topics; the bare `fig` displays it in the notebook.
fig = bunka.visualize_topics(width=1000, height=1000, colorscale='Portland', density = True,label_size_ratio = 120, convex_hull = True, show_text=False)
fig

[32m2024-01-31 16:17:45 - [94mBunka[0m - INFO - [1mCreating the Bunka Map[0m


In [11]:
import copy
# Work on an independent deep copy so that computing topics again with a
# different clustering does not modify the `bunka` instance used above.
bunka_bis = copy.deepcopy(bunka)

In [12]:
# Coarser model: 10 K-means clusters with the same seed, computed on the copy.
# Topic names are longer here (name_length=20 instead of 5).
# NOTE(review): the estimator is built for 10 clusters while n_clusters=25 is
# passed to get_topics below. Presumably custom_clustering_model takes
# precedence and n_clusters is ignored - TODO confirm and align the two values.
clustering_method = KMeans(n_clusters=10, random_state=42)
df_topics_10 = bunka_bis.get_topics(n_clusters=25, 
                             name_length=20, 
                             min_count_terms = 20, 
                             top_terms_overall = 1000000,
                             max_doc_per_topic = 2000,
                             min_docs_per_cluster = 1000,
                             ranking_terms = 15,
                             ngrams = [1,2],
                             custom_clustering_model = clustering_method
                             ) #

[32m2024-01-31 16:18:09 - [94mBunka[0m - INFO - [1mComputing the topics[0m


In [13]:
# Draw the map for the 10-cluster model with the same display settings as the
# first map; the bare `fig_10` displays the figure in the notebook.
fig_10 = bunka_bis.visualize_topics(width=1000, height=1000, colorscale='Portland', density = True,label_size_ratio = 120, convex_hull = True, show_text=False)
fig_10

[32m2024-01-31 16:18:19 - [94mBunka[0m - INFO - [1mCreating the Bunka Map[0m


### Comparison with other categories

In [80]:
def wrap_by_word(string, n_words):
    """Return ``string`` with an HTML ``<br>`` appended after every ``n_words`` words.

    Words are the whitespace-separated tokens of ``string``. Every chunk,
    including the last one, is followed by ``<br>``.

    Args:
        string: Text to wrap.
        n_words: Number of words per line.

    Returns:
        The wrapped text. On failure (for example ``string`` is not a str, or
        ``n_words`` is 0) the error is printed and an empty string is
        returned, so the function never raises.
    """
    # Bound before the try block: previously a failing `string.split()` left the
    # result undefined and the final `return` raised UnboundLocalError.
    wrapped = ""
    try:
        words = string.split()
        wrapped = "".join(
            " ".join(words[start : start + n_words]) + "<br>"
            for start in range(0, len(words), n_words)
        )
    except Exception as e:
        print(e)
    return wrapped

In [83]:
import pandas as pd
import plotly.express as px

# One row per document of the 10-cluster model, joined to the row of its topic.
df_docs = (
    pd.DataFrame([doc.model_dump() for doc in bunka_bis.docs])[['doc_id', 'topic_id']]
    .copy()
    .merge(df_topics_10, on='topic_id')
)

# Category columns of the sample file; `file_id` is renamed so it can be joined on `doc_id`.
df_sample = (
    pd.read_csv('../data/data_preprocessed/merged_sample_data.csv', index_col=[0])[
        ['file_id', 'category_name', 'full_category_name']
    ]
    .copy()
    .rename(columns={'file_id': 'doc_id'})
)

df_cross = df_docs.merge(df_sample, on='doc_id')

# Short label: the first 10 ' | '-separated parts of the topic name joined with
# ' - ', then wrapped every 7 whitespace-separated tokens.
df_cross['topic_short'] = df_cross['topic_name'].apply(
    lambda name: wrap_by_word(' - '.join(name.split(' | ')[:10]), 7)
)
df_cross.head(5)

Unnamed: 0,doc_id,topic_id,topic_name,size,percent,category_name,full_category_name,topic_short
0,bpt6k12477166,bt-6,tableaux | objets | collection | vente | dessi...,1916,7.48,Littérature,Littérature_Littérature italienne et roumaine,tableaux - objets - collection - vente<br>- de...
1,bpt6k3043014n,bt-6,tableaux | objets | collection | vente | dessi...,1916,7.48,Généralités,Généralités_Collections générales,tableaux - objets - collection - vente<br>- de...
2,bpt6k1240327f,bt-6,tableaux | objets | collection | vente | dessi...,1916,7.48,Littérature,Littérature_Littérature italienne et roumaine,tableaux - objets - collection - vente<br>- de...
3,bpt6k1246358v,bt-6,tableaux | objets | collection | vente | dessi...,1916,7.48,Littérature,Littérature_Littérature italienne et roumaine,tableaux - objets - collection - vente<br>- de...
4,bpt6k58248311,bt-6,tableaux | objets | collection | vente | dessi...,1916,7.48,Généralités,Généralités_Bibliographie,tableaux - objets - collection - vente<br>- de...


In [86]:
# Count documents per (topic, category) pair, then express each count as a
# share of its topic's total.
df_category = (
    df_cross.groupby(['topic_short', 'category_name'])['doc_id']
    .count()
    .reset_index()
    .assign(
        norm_doc_id=lambda counts: counts['doc_id']
        / counts.groupby('topic_short')['doc_id'].transform('sum')
    )
)

# Bubble chart: one marker per (category, topic) pair, sized by the share above.
fig = px.scatter(
    df_category,
    x='category_name',
    y='topic_short',
    size='norm_doc_id',
    title='Normalized doc_id by topic_short',
    template='plotly_white',
    height=900,
    width=1000,
)

# Both axis titles are blanked.
fig.update_xaxes(title_text='').update_yaxes(title_text='')
fig.show()


In [67]:
# Count documents per (topic, sub-category) pair.
df_sub_category = df_cross.groupby(['topic_short', 'full_category_name'])['doc_id'].count().reset_index()
# Express each count as a share of its sub-category's total, so the shares of
# one `full_category_name` (one row of the chart) sum to 1.
df_sub_category['norm_doc_id'] = df_sub_category.groupby('full_category_name')['doc_id'].transform(lambda x: x / x.sum())

# Bubble chart: one marker per (topic, sub-category) pair, sized by the share above.
# The title names the column the counts are normalized by; the previous title
# still said "topic_short", copied from the category chart.
fig = px.scatter(df_sub_category,
    x='topic_short',
    y='full_category_name',
    size = 'norm_doc_id',
    title='Normalized doc_id by full_category_name',
    template='plotly_white',
    height = 3000,
    width = 1000
)
# Both axis titles are blanked.
fig.update_xaxes(title_text='')
fig.update_yaxes(title_text='')
fig.show()

