In [1]:
#!pip -q install git+https://github.com/charlesdedampierre/BunkaTopics.git@dev --upgrade


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
from bunkatopics import Bunka
from langchain_community.embeddings import HuggingFaceEmbeddings
import os

# Hugging Face identifier of the embedding model to load.
model_name = "OrdalieTech/Solon-embeddings-large-0.1"

# One keyword argument per line; each comment describes the value actually set.
embedding_model = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs={"device": "cuda"},  # GPU is requested here; use "cpu" if you have no GPU
    encode_kwargs={"show_progress_bar": True},  # show the progress of embeddings
    multi_process=False,  # set to True if you have multiprocessing
)

In [24]:
import pandas as pd
import tiktoken


# Tokenizer used by num_tokens_from_string to count the tokens of each title.
encoding = tiktoken.get_encoding("cl100k_base")

def num_tokens_from_string(string) -> int:
    """Count the tokens the module-level ``encoding`` produces for ``string``."""
    return len(encoding.encode(string))


# Token-count bounds for keeping a title (both bounds are exclusive).
MIN_TITLE_TOKENS = 3
MAX_TITLE_TOKENS = 100

# Load the sample, keep rows that have a title, count the tokens of each title,
# keep titles whose token count lies strictly between the two bounds, restrict
# to the columns used downstream, then de-duplicate on the title text (first
# occurrence wins). Building the frame in a single chain avoids assigning a new
# column on a filtered slice, which can emit pandas' SettingWithCopyWarning.
data = (
    pd.read_csv('../data/data_preprocessed/merged_sample_data.csv', index_col=[0])
    .dropna(subset=['title'])
    .assign(token_count=lambda df: df['title'].apply(num_tokens_from_string))
    .loc[
        lambda df: df['token_count'].gt(MIN_TITLE_TOKENS)  # remove titles that are too short
        & df['token_count'].lt(MAX_TITLE_TOKENS),  # remove titles that are too long
        ['file_id', 'title', 'token_count'],
    ]
    .drop_duplicates('title', keep='first')
)

# Plain Python list of the remaining titles; this is what gets passed to bunka.fit.
docs = list(data['title'])

In [26]:
# Build the Bunka model on top of the embedding model configured earlier.
# The language is a free choice; French is used for this dataset.
bunka = Bunka(
    language='french',
    embedding_model=embedding_model,
)

# Fit on the title list; per the log output this embeds the documents,
# reduces the embedding dimensions and extracts terms from the documents.
bunka.fit(docs)

[32m2024-01-31 15:38:12 - [94mBunka[0m - INFO - [1mEmbedding documents... (can take varying amounts of time depending on their size)[0m


Batches:   0%|          | 0/801 [00:00<?, ?it/s]

[32m2024-01-31 15:40:27 - [94mBunka[0m - INFO - [1mReducing the dimensions of embeddings...[0m
[32m2024-01-31 15:41:04 - [94mBunka[0m - INFO - [1mExtracting meaningful terms from documents...[0m
100%|██████████| 25604/25604 [04:01<00:00, 106.00it/s]


In [27]:
# Save the embeddings and the terms in Bunka Objects

import jsonlines  # You may need to install this library using pip

def _dump_jsonl(items, path):
    """Write each item's ``.dict()`` as one JSON line to ``path``.

    Args:
        items: Iterable of objects exposing a ``.dict()`` method.
        path: Destination ``.jsonl`` file; it is overwritten if it exists.

    The parent directory is created first so that a missing output folder
    does not make the open call fail.
    """
    import os  # local import keeps this cell runnable on its own

    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    with jsonlines.open(path, mode="w") as writer:
        for item in items:
            writer.write(item.dict())


# Dump the Bunka documents and terms into JSONL files (one object per line).
_dump_jsonl(bunka.docs, "../data/data_preprocessed/bunka_data/bunka_docs.jsonl")
_dump_jsonl(bunka.terms, "../data/data_preprocessed/bunka_data/bunka_terms.jsonl")