In [7]:
from bunkatopics import BunkaTopics
import pandas as pd


# Load the IMDB table (first CSV column becomes the index) and keep a
# 3000-row random sample; the fixed seed makes the draw repeatable.
data = pd.read_csv("data/imdb.csv", index_col=[0]).sample(3000, random_state=42)


# Build the topic model on the sampled frame. Keyword arguments are grouped by
# purpose; the per-argument notes restate the notebook's own annotations.
model = BunkaTopics(
    data,  # DataFrame holding the documents
    # --- input columns / language ---
    text_var="description",  # text column
    index_var="imdb",  # index column (mandatory)
    language="en",  # choose between English "en" and French "fr"
    # --- embeddings ---
    docs_embeddings=True,  # extract document embeddings?
    terms_embeddings=False,  # extract term embeddings?
    embeddings_model="distiluse-base-multilingual-cased-v1",  # embeddings model to use
    multiprocessing=True,  # multiprocessing of embeddings
    # --- term extraction ---
    extract_terms=True,  # extract terms?
    sample_size_terms=len(data),  # uses every sampled row
    terms_limit=10000,  # top terms to output
    terms_ngrams=(1, 2),  # n-grams to extract
    terms_ncs=True,  # extract noun chunks
    terms_ents=True,  # extract entities
    terms_include_pos=["NOUN", "PROPN", "ADJ"],  # part-of-speech tags to include
    terms_include_types=["PERSON", "ORG"],  # entity types to include
    # --- projection ---
    # NOTE(review): presumably the number of dimensions of the reduced
    # embedding -- TODO confirm against the BunkaTopics signature.
    reduction=2,
)

# Extract the topics


2022-10-21 09:51:16,627 - Extracting Terms...
100%|██████████████████████████████████████████████████████████| 2990/2990 [00:20<00:00, 143.51it/s]
2022-10-21 09:51:37,655 - Extracting Docs Embeddings...
100%|███████████████████████████████████████████████████████████| 2990/2990 [00:44<00:00, 67.11it/s]


UMAP(random_state=42, verbose=True)
Fri Oct 21 09:52:23 2022 Construct fuzzy simplicial set
Fri Oct 21 09:52:25 2022 Finding Nearest Neighbors
Fri Oct 21 09:52:25 2022 Finished Nearest Neighbor Search
Fri Oct 21 09:52:25 2022 Construct embedding


Epochs completed:   0%|            0/500 [00:00]

Fri Oct 21 09:52:28 2022 Finished embedding


In [2]:
#model.docs_embeddings = pd.read_csv('test/docs_embeddings.csv', index_col = [0])
#model.docs_embeddings.columns = model.docs_embeddings.columns.astype(int)

In [8]:
# Cluster the documents into topics and describe each topic by its most
# specific terms.
topics = model.get_clusters(
    clusterer="kmeans",
    topic_number=30,  # number of topics
    top_terms_included=100,  # compute the specific terms from the top n terms
    top_terms=3,  # most specific terms used to describe each topic
    term_type="lemma",  # use "lemma" or "text"
    ngrams=[1, 1],  # n-grams for topic representation
)


In [9]:
# Visualize the clusters. It is advised to use fewer than 5 terms per topic
# (the top_terms setting) so the labels do not overcrowd the figure.
cluster_map_options = {
    "scatter_size": "votes",  # presumably sizes markers by the "votes" column -- confirm
    "search": None,
    "width": 1200,
    "height": 1200,
    "fit_clusters": False,
    "density_plot": True,
}
fig = model.visualize_clusters(**cluster_map_options)

UMAP(random_state=42, verbose=True)
Fri Oct 21 09:52:29 2022 Construct fuzzy simplicial set
Fri Oct 21 09:52:31 2022 Finding Nearest Neighbors
Fri Oct 21 09:52:31 2022 Finished Nearest Neighbor Search
Fri Oct 21 09:52:31 2022 Construct embedding


Epochs completed:   0%|            0/500 [00:00]

Fri Oct 21 09:52:34 2022 Finished embedding


In [11]:
import plotly

# Export the figure as a standalone HTML file. The call is the cell's last
# expression, so the path it returns is shown as the cell output.
# NOTE(review): the destination is an absolute path on the author's machine;
# point it at a project-relative location before sharing or re-running elsewhere.
figure_html_path = '/Users/charlesdedampierre/Desktop/bunka/research/src/bunka_research/figures/imdb.html'
plotly.offline.plot(fig, filename=figure_html_path)

'/Users/charlesdedampierre/Desktop/bunka/research/src/bunka_research/figures/imdb.html'

In [6]:
# NOTE(review): scratch cell that fails as written -- its recorded output is
# "NameError: name 'df_centroid' is not defined". Neither `df_centroid` nor
# `self` is defined anywhere in the visible notebook, so this looks like a
# fragment pasted from a method body; confirm what it should operate on, or
# remove the cell.
# Intent of the expression: relabel the integer columns
# 0 .. len(self.reduction) - 1 with their string form ("0", "1", ...).
df_centroid = df_centroid.rename(
            columns={x: f"{x}" for x in range(len(self.reduction))}
        )

NameError: name 'df_centroid' is not defined

In [None]:
# Preview docs_embeddings with column labels 0-4 replaced by "hi_0" .. "hi_4".
# The result is only displayed, never assigned back (assuming docs_embeddings
# is a pandas DataFrame, rename() returns a new frame).
hi_labels = {x: f"hi_{x}" for x in range(5)}
model.docs_embeddings.rename(columns=hi_labels)

In [None]:
# Request the centroid documents with top_elements=5, then display the third
# piece of the first row's ' || '-separated "centroid_docs" string.
res = model.get_centroid_documents(top_elements=5)
first_centroid_docs = res['centroid_docs'].iloc[0]
first_centroid_docs.split(' || ')[2]

In [None]:
#model.df_fig.columns

In [None]:
# One row per cluster: the mean dim_1 / dim_2 position of its documents.
centroid_source = model.df_fig[["dim_1", "dim_2", "cluster_name_number"]]
centroids_emb = centroid_source.groupby("cluster_name_number").mean().reset_index()
# After reset_index the grouping key is the first column; relabel it as the
# centroid name while keeping the two coordinate labels.
centroids_emb.columns = ["centroid_name", "dim_1", "dim_2"]

In [None]:
# Stack the per-cluster centroid rows underneath the document rows, then
# display the combined frame.
frames_to_stack = [model.df_fig, centroids_emb]
df_fig_centroids = pd.concat(frames_to_stack)
df_fig_centroids

In [None]:
import plotly.express as px

In [None]:
# Plain scatter of the 2-D document coordinates, with marker size driven by
# the "votes" column of the same frame.
scatter_frame = model.df_fig
fig = px.scatter(
    scatter_frame,
    x="dim_1",
    y="dim_2",
    size=scatter_frame['votes'],
    width=1000,
    height=1000,
)

In [None]:
# Visualize the clusters with density_plot on and fit_clusters off.
# It is advised to use fewer than 5 terms per topic (the top_terms setting)
# so the labels do not overcrowd the figure.
fig = model.visualize_clusters(
    search=None,
    width=1000,
    height=1000,
    fit_clusters=False,
    density_plot=True,
)
fig.show()

In [None]:
# Cluster map variant: both fit_clusters and density_plot switched off.
fig = model.visualize_clusters(
    search=None,
    width=1000,
    height=1000,
    fit_clusters=False,
    density_plot=False,
)
fig.show()

In [None]:
# Cluster map variant: both fit_clusters and density_plot switched on.
fig = model.visualize_clusters(
    search=None,
    width=1000,
    height=1000,
    fit_clusters=True,
    density_plot=True,
)
fig.show()

In [None]:
# Cluster map with fit_clusters off and density_plot on.
# NOTE(review): these arguments match an earlier visualize_clusters cell in
# this notebook exactly -- consider dropping one of the two.
fig = model.visualize_clusters(
    search=None,
    width=1000,
    height=1000,
    fit_clusters=False,
    density_plot=True,
)
fig.show()