In [10]:
import os

import pandas as pd
import torch
from datasets import Dataset, load_dataset


# DataFrame.to_csv does not create missing parent folders, so make sure the
# export directory exists before anything is written to it.
os.makedirs('results/training-datasets', exist_ok=True)

# Load the train split of the DPO pairs dataset and convert it to pandas.
dataset = load_dataset("mlabonne/chatml_dpo_pairs")['train']
df = dataset.to_pandas()
# Keep the row index as an explicit document identifier column.
df['doc_id'] = df.index

df.to_csv('results/training-datasets/all_data.csv')

# Seeded 2000-row sample; the size is capped at len(df) so that a dataset
# with fewer rows does not make DataFrame.sample raise.
df_sample = df.sample(n=min(2000, len(df)), random_state=42)
df_sample.to_csv('results/training-datasets/sample.csv')

# Parallel lists used by the topic-modeling cells: string ids plus the
# 'chosen' and 'rejected' text columns, all in the same row order.
ids = list(df['doc_id'].astype(str))
docs_accepted = list(df['chosen'])
docs_rejected = list(df['rejected'])

In [None]:

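# Run topic modeling on the chosen answers and on the rejected answers, then
# find the topics they do not have in common and create 4 datasets
# (NOTE: only three are listed below; the fourth is not described here):
# - the original one
# - one in favor of ChatGPT
# - one in favor of Llama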


from bunkatopics import Bunka
from langchain_community.embeddings import HuggingFaceEmbeddings
import os
import pandas as pd
from sklearn.cluster import KMeans

# Seeded KMeans estimator (30 clusters) that the topic-extraction cells pass
# to get_topics() as `custom_clustering_model`.
clustering_method = KMeans(
    random_state=42,
    n_clusters=30,
)

In [None]:
# Other candidate models, kept for reference:
# models = ["sentence-transformers/distiluse-base-multilingual-cased-v2", "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"]
model_name = "WhereIsAI/UAE-Large-V1"

embedding_model = HuggingFaceEmbeddings(
    model_name=model_name,
    # Use the GPU when torch can see one and fall back to the CPU otherwise,
    # so the notebook still runs (more slowly) on a machine without CUDA.
    model_kwargs={"device": "cuda" if torch.cuda.is_available() else "cpu"},
    # encode_kwargs={"show_progress_bar": True},  # uncomment to show embedding progress
    multi_process=False,  # set to True to enable multi-process encoding
)

# Bunka instance that is later fitted on the accepted answers.
bunka = Bunka(embedding_model=embedding_model, language='english')


In [None]:

# Imported here (and not only where it is first called) because the
# rejected-topics cell also relies on this `offline` name.
import plotly.offline as offline

# Fit Bunka on the accepted ("chosen") answers, keyed by their document ids.
bunka.fit(ids=ids, docs=docs_accepted)

# Extract the topics with the shared KMeans estimator.
# NOTE(review): n_clusters=25 differs from the 30 clusters configured on
# `clustering_method`; presumably the custom model takes precedence - confirm.
df_topics = bunka.get_topics(
    n_clusters=25,
    name_length=10,
    min_count_terms=20,
    top_terms_overall=1000000,
    max_doc_per_topic=2000,
    min_docs_per_cluster=1000,
    ranking_terms=15,
    ngrams=[1, 2],
    custom_clustering_model=clustering_method,
)
df_topics.to_csv('results/df_topics_accepted.csv')

# Build the topic map for the accepted answers.
fig = bunka.visualize_topics(
    width=1200,
    height=1200,
    colorscale='Portland',
    density=True,
    label_size_ratio=120,
    convex_hull=True,
    show_text=True,
)

# Write the Plotly figure to an HTML file without opening a browser.
offline.plot(fig, auto_open=False, filename='results/map_accepted_topics.html')

In [None]:
# Second Bunka instance, sharing the same embedding model, for the rejected answers.
bunka_bis = Bunka(language='english', embedding_model=embedding_model)
bunka_bis.fit(ids=ids, docs=docs_rejected)

# Same topic-extraction settings as the accepted-answers run; the same
# `clustering_method` estimator object is passed in again.
# NOTE(review): n_clusters=25 differs from the 30 clusters configured on
# `clustering_method`; presumably the custom model takes precedence - confirm.
df_topics_bis = bunka_bis.get_topics(
    n_clusters=25,
    name_length=10,
    min_count_terms=20,
    top_terms_overall=1000000,
    max_doc_per_topic=2000,
    min_docs_per_cluster=1000,
    ranking_terms=15,
    ngrams=[1, 2],
    custom_clustering_model=clustering_method,
)
df_topics_bis.to_csv('results/df_topics_rejected.csv')

# Build the topic map for the rejected answers (rebinds `fig`).
fig = bunka_bis.visualize_topics(
    width=1200,
    height=1200,
    colorscale='Portland',
    density=True,
    label_size_ratio=120,
    convex_hull=True,
    show_text=True,
)

# Write the Plotly figure to an HTML file without opening a browser.
# `offline` is the plotly.offline module imported in the accepted-topics cell.
offline.plot(fig, auto_open=False, filename='results/map_rejected_topics.html')
