In [2]:
from bunkatopics import BunkaTopics
import pandas as pd
pd.options.mode.chained_assignment = None

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


# Size of the random subset used for this demo run, and the seed that makes the
# draw reproducible across kernel restarts.
SAMPLE_SIZE = 1000
RANDOM_STATE = 42

# Load the IMDB table; the first CSV column becomes the DataFrame index.
data = pd.read_csv("data/imdb.csv", index_col=[0])

# Keep a fixed-seed random subset. DataFrame.sample raises ValueError when asked
# for more rows than exist (sampling without replacement), so the request is
# capped at the table size: smaller files are used in full instead of crashing.
data = data.sample(min(SAMPLE_SIZE, len(data)), random_state=RANDOM_STATE)


# Keyword options for the BunkaTopics constructor, kept in their original order.
bunka_options = {
    "text_var": "description",  # text column
    "index_var": "imdb",  # index column (mandatory)
    "extract_terms": True,  # extract terms?
    "terms_embeddings": False,  # extract terms embeddings?
    "docs_embeddings": True,  # extract docs embeddings?
    "embeddings_model": "distiluse-base-multilingual-cased-v1",  # embeddings model to use
    "multiprocessing": True,  # multiprocessing of embeddings
    "language": "en",  # choose between English "en" and French "fr"
    "sample_size_terms": len(data),  # set to the number of rows in `data`
    "terms_limit": 10000,  # top terms to output
    "terms_ents": True,  # extract entities
    "terms_ngrams": (1, 2),  # n-grams to extract
    "terms_ncs": True,  # extract noun chunks
    "terms_include_pos": ["NOUN", "PROPN", "ADJ"],  # part-of-speech tags to include
    "terms_include_types": ["PERSON", "ORG"],  # entity types to include
    # NOTE(review): presumably the dimensionality of the reduced embedding - confirm
    "reduction": 2,
}

# `data` is passed positionally as the input DataFrame; everything else by keyword.
model = BunkaTopics(data, **bunka_options)

2022-12-05 14:02:03,935 - Extracting Terms...
2022-12-05 14:02:04,235 - loaded 'en_core_web_sm' spaCy language pipeline
100%|██████████████████████████████████████████████████████████████████████████| 995/995 [00:07<00:00, 130.85it/s]
2022-12-05 14:02:11,975 - Extracting Docs Embeddings...
100%|███████████████████████████████████████████████████████████████████████████| 995/995 [00:34<00:00, 28.90it/s]


UMAP(random_state=42, verbose=True)
Mon Dec  5 14:02:47 2022 Construct fuzzy simplicial set


OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


Mon Dec  5 14:02:47 2022 Finding Nearest Neighbors
Mon Dec  5 14:02:48 2022 Finished Nearest Neighbor Search
Mon Dec  5 14:02:49 2022 Construct embedding


Epochs completed:   0%|            0/500 [00:00]

Mon Dec  5 14:02:50 2022 Finished embedding


#### Get the top topics of the dataset

In [8]:
# Settings for the topic extraction, passed to get_clusters as keywords.
cluster_options = dict(
    topic_number=20,  # number of topics
    top_terms_included=1000,  # compute the specific terms from the top n terms
    top_terms=3,  # most specific terms used to describe the topics
    term_type="lemma",  # use "lemma" or "text"
    ngrams=[2, 2],  # n-grams for topic representation
    clusterer="kmeans",
)
topics = model.get_clusters(**cluster_options)


In [13]:
import plotly

# Visualize the clusters. It is advised to describe topics with fewer than 5
# terms (the top_terms setting) to avoid overloading the figure.
fig = model.visualize_clusters(
    scatter_size="avg_vote",
    search=None,
    width=1200,
    height=1200,
    fit_clusters=False,
    density_plot=True,
)

# Export the figure through plotly's offline plotter; the call is the last
# expression of the cell, so its return value is what the cell displays.
plotly.offline.plot(fig)

UMAP(random_state=42, verbose=True)
Mon Dec  5 13:35:21 2022 Construct fuzzy simplicial set
Mon Dec  5 13:35:22 2022 Finding Nearest Neighbors
Mon Dec  5 13:35:22 2022 Finished Nearest Neighbor Search
Mon Dec  5 13:35:22 2022 Construct embedding


Epochs completed: 100%| ██████████████████████████████████████████████████████████████████████████ 500/500 [00:01]


Mon Dec  5 13:35:23 2022 Finished embedding


'temp-plot.html'

#### Extract the top documents of each cluster

In [9]:
# Request top_n=2 documents per cluster using the 'terms_based' selection mode.
terms_based_request = {"top_n": 2, "top_type": "terms_based"}
top_documents = model.get_specific_documents_per_cluster(**terms_based_request)

In [10]:
# Text column of the model's data, with the index moved back into a regular
# column so it is available as the join key.
doc_texts = model.data[[model.text_var]].reset_index()

# Attach each selected document's text by joining on the model's index column.
df_top_documents = pd.merge(top_documents, doc_texts, on=model.index_var)

In [11]:
# Request top_n=10 documents per cluster using the 'pop_based' selection mode,
# with 'avg_vote' as the pop_var column.
pop_based_request = {"top_n": 10, "top_type": "pop_based", "pop_var": "avg_vote"}
top_documents_pop = model.get_specific_documents_per_cluster(**pop_based_request)

# Same join as for the terms-based selection: bring in the document text via
# the model's index column.
doc_texts_for_pop = model.data[[model.text_var]].reset_index()
df_top_documents_var = pd.merge(top_documents_pop, doc_texts_for_pop, on=model.index_var)

#### Get Folding

In [12]:
# Names of the two categories used for the folding.
cat_1_name = "negative"
cat_2_name = "positive"

# Seed terms for each category.
negative_terms = ['hate', 'violence', 'pain', 'negative']
positive_terms = ['good', 'positive', "love"]

# One row per term; the scalar category name is repeated on every row.
cat_1 = pd.DataFrame({'category': cat_1_name, 'term': negative_terms})
cat_2 = pd.DataFrame({'category': cat_2_name, 'term': positive_terms})

# Stack both term tables into a single one with a fresh 0..n-1 index.
dictionnary = pd.concat([cat_1, cat_2], ignore_index=True)

# NOTE(review): `met` presumably holds evaluation metrics for the folding - confirm.
df_folding, met = model.get_folding(dictionnary)

  0%|          | 0/995 [00:00<?, ?it/s]

  0%|          | 0/995 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [14]:
# Show the folding result; as the cell's last bare expression it is rendered
# as the cell output.
df_folding

Unnamed: 0,negative,positive,label,description,0,1
tt0205177,0.010818,0.989182,positive,"Two inseparable, hormonal, English BFF teen bo...",17.148676,9.993646
tt2316801,0.051308,0.948692,positive,An unexpected romance blooms after the the you...,18.261145,9.824251
tt0465234,0.055398,0.944602,positive,Benjamin Gates must follow a clue left in,19.186310,6.952009
tt1846589,0.021137,0.978863,positive,An untested American submarine captain teams w...,17.330006,6.018647
tt0824758,0.195865,0.804135,positive,A historical drama that illustrates Russian au...,15.752570,8.627524
...,...,...,...,...,...,...
tt3286052,0.051308,0.948692,positive,Two girls must battle a mysterious evil force ...,18.292288,9.831059
tt0476958,0.064698,0.935302,positive,Three women are stalked by a killer with a gru...,18.814739,10.211515
tt1527788,0.043843,0.956157,positive,A quiet pawnshop keeper with a violent past ta...,18.458761,9.095775
tt0289043,0.021137,0.978863,positive,"Four weeks after a mysterious, incurable virus...",17.441000,7.976808


In [15]:
import numpy as np

# Show how many documents received each label. This is printed explicitly
# because a bare expression that is not the last line of a cell is discarded.
print(df_folding.label.value_counts())

# Work on a copy so the folding result itself is left untouched.
test = df_folding.copy()

# Number of leading rows whose scores are kept: 5% of the table.
keep_rows = len(test) // 20

for cat in test["label"].unique():
    # Keep the `cat` score only for the first 5% of rows (in current row order)
    # and zero it everywhere else.
    # Done as one positional .iloc write instead of `test[cat][...] = 0`:
    # chained assignment is not guaranteed to write through to `test`, and
    # never does under pandas Copy-on-Write.
    # NOTE(review): rows are not sorted by score here, so this is the leading
    # 5% by position, not the 5% highest values - sort first if that was meant.
    test.iloc[keep_rows:, test.columns.get_loc(cat)] = 0



import plotly
import plotly.express as px

# Column of `test` whose values drive the marker colour.
fold_dim = 'positive'

# Scatter settings: columns 0 and 1 of `test` are used as the x / y positions,
# and the description plus the folding score are shown on hover.
scatter_options = dict(
    x=0,
    y=1,
    color=fold_dim,
    color_continuous_scale=px.colors.sequential.Blues,
    hover_data=["description", fold_dim],
    width=1000,
    height=1000,
)
fig = px.scatter(test, **scatter_options)

# Export the figure to 'file.html' through plotly's offline plotter.
plotly.offline.plot(fig, filename='file.html')

'file.html'

#### Get centroid documents

In [16]:
# Ask the model for the centroid documents, with top_elements=5.
res = model.get_centroid_documents(top_elements=5)

# The first row's 'centroid_docs' value is one string with entries separated
# by ' || '; split it and display the third entry.
first_row_docs = res['centroid_docs'].iloc[0].split(' || ')
first_row_docs[2]

'A New York philosophy grad student turns into a vampire after getting bitten by one, and then tries to come to terms with her new lifestyle and frequent craving for human blood.'