In [2]:
from bunkatopics import BunkaTopics
import pandas as pd


# Read the IMDB table (first CSV column used as the index) and keep a seeded
# 1000-row sample so the demo stays quick and repeatable.
data = pd.read_csv("data/imdb.csv", index_col=[0]).sample(1000, random_state=42)

# Build the topic model. Keyword arguments are grouped by what they configure.
model = BunkaTopics(
    data,  # input DataFrame
    # --- columns and language ---
    text_var="description",  # column holding the text
    index_var="imdb",  # column holding the document index (mandatory)
    language="en",  # "en" for English, "fr" for French
    # --- document embeddings ---
    docs_embeddings=True,  # compute document embeddings
    embeddings_model="distiluse-base-multilingual-cased-v1",  # embeddings model to use
    multiprocessing=True,  # compute embeddings with multiprocessing
    reduction=2,  # NOTE(review): presumably the dimension of the reduced (UMAP) space - confirm
    # --- term extraction ---
    extract_terms=True,  # extract terms from the text
    terms_embeddings=False,  # do not compute term embeddings
    sample_size_terms=len(data),  # sample size for term extraction, set to every sampled row
    terms_limit=10000,  # number of top terms to output
    terms_ngrams=(1, 2),  # n-grams to extract
    terms_ncs=True,  # extract noun chunks
    terms_ents=True,  # extract entities
    terms_include_pos=["NOUN", "PROPN", "ADJ"],  # parts of speech to include
    terms_include_types=["PERSON", "ORG"],  # entity types to include
)

2022-10-23 14:53:28,536 - Extracting Terms...
100%|████████████████████████████████████████████████████████| 995/995 [00:06<00:00, 149.97it/s]
2022-10-23 14:53:35,241 - Extracting Docs Embeddings...
100%|█████████████████████████████████████████████████████████| 995/995 [00:26<00:00, 38.26it/s]


UMAP(random_state=42, verbose=True)
Sun Oct 23 14:54:01 2022 Construct fuzzy simplicial set
Sun Oct 23 14:54:02 2022 Finding Nearest Neighbors
Sun Oct 23 14:54:02 2022 Finished Nearest Neighbor Search
Sun Oct 23 14:54:02 2022 Construct embedding


Epochs completed:   0%|            0/500 [00:00]

Sun Oct 23 14:54:03 2022 Finished embedding


In [3]:
# Cluster the documents into topics and describe each topic by its most specific terms.
topics = model.get_clusters(
    clusterer='kmeans',
    topic_number=30,  # number of topics
    top_terms_included=100,  # specific terms are computed from the top n terms
    top_terms=3,  # number of most specific terms used to describe each topic
    term_type="lemma",  # either "lemma" or "text"
    ngrams=[1, 1],  # n-grams used for the topic representation
)


In [4]:
import plotly

# Visualize the clusters. It is advised to describe topics with only a few
# terms (top_terms of about 5 at most) so the figure does not get overloaded.
fig = model.visualize_clusters(
    scatter_size="avg_vote",
    search=None,
    width=1200,
    height=1200,
    fit_clusters=False,
    density_plot=True,
)

# Write the figure to an HTML file; the returned file name is the cell output.
plotly.offline.plot(fig)

UMAP(random_state=42, verbose=True)
Sun Oct 23 14:54:03 2022 Construct fuzzy simplicial set
Sun Oct 23 14:54:04 2022 Finding Nearest Neighbors
Sun Oct 23 14:54:04 2022 Finished Nearest Neighbor Search
Sun Oct 23 14:54:04 2022 Construct embedding


Epochs completed:   0%|            0/500 [00:00]

Sun Oct 23 14:54:05 2022 Finished embedding


'temp-plot.html'

#### Get Folding

In [5]:
# Names of the two categories used for the folding.
cat_1_name = "negative"
cat_2_name = "positive"

# Seed terms for each category.
negative_terms = ['hate', 'violence', 'pain', 'negative']
positive_terms = ['good', 'positive', "love"]

# One row per term; the scalar category name is broadcast to every row.
cat_1 = pd.DataFrame({'category': cat_1_name, 'term': negative_terms})
cat_2 = pd.DataFrame({'category': cat_2_name, 'term': positive_terms})

# Stack both categories into one table with a fresh 0..n-1 index.
dictionnary = pd.concat([cat_1, cat_2], ignore_index=True)

# Run the folding with the category/term table.
df_folding, met = model.get_folding(dictionnary)

  0%|          | 0/995 [00:00<?, ?it/s]

  0%|          | 0/995 [00:00<?, ?it/s]


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



In [None]:
import numpy as np  # not used by the active code in this cell; kept in case other cells rely on `np`
import plotly
import plotly.express as px

# Show how many documents carry each label. Printed explicitly because a bare
# expression in the middle of a cell is never displayed.
print(df_folding.label.value_counts())

# Work on a copy so df_folding itself is left untouched.
test = df_folding.copy()

# Number of leading rows whose scores are kept: 5% of the rows.
keep_rows = len(test) // 20

for cat in set(test.label):
    # Zero the `cat` score on every row after the first 5%.
    # A single .iloc assignment is used instead of the chained form
    # `test[cat][keep_rows:] = 0`, which raises SettingWithCopyWarning and does
    # not modify `test` at all when pandas Copy-on-Write is active.
    # NOTE(review): the cut is positional, so this is only the "top 5%" of
    # `cat` if df_folding is already ordered by that score - confirm.
    test.iloc[keep_rows:, test.columns.get_loc(cat)] = 0

# Folding dimension used to colour the points.
fold_dim = 'positive'

# Scatter of columns 0 and 1 of the folding table, coloured by the `fold_dim` score.
fig = px.scatter(
    test,
    x=0,
    y=1,
    color=fold_dim,
    color_continuous_scale=px.colors.sequential.Blues,
    hover_data=["description", fold_dim],
    width=1000,
    height=1000,
)

# Write the figure to 'file.html'.
plotly.offline.plot(fig, filename='file.html')

#### Get centroid documents

In [None]:
# Retrieve the centroid documents (top_elements=5).
res = model.get_centroid_documents(top_elements=5)

# Each 'centroid_docs' value is a single string of entries separated by ' || ';
# split the first row's value and show its third entry as the cell output.
first_row_docs = res['centroid_docs'].iloc[0].split(' || ')
first_row_docs[2]