In [12]:
import pandas as pd
from bunka_topics.basic_class import BasicSemantics

In [14]:
# Load the IMDB dump, draw a reproducible 1000-row sample, then keep only the
# id and description columns and discard rows where either one is missing.
df = (
    pd.read_csv("data/imdb.csv", index_col=[0])
    .sample(1000, random_state=42)
    .loc[:, ["imdb", "description"]]
    .dropna()
)


In [None]:
# Wrap the sampled frame in a BasicSemantics model: documents are keyed by the
# "imdb" column and their text is read from "description".
model = BasicSemantics(df, index_var="imdb", text_var="description")

# Fit with term extraction, term embeddings and document embeddings all
# switched on (English text).
model.fit(
    extract_terms=True,
    terms_embeddings=True,
    docs_embeddings=True,
    sample_size_terms=10000,
    terms_limit=3000,
    terms_ents=True,
    language="en",
)

Extracting Terms...


  0%|                                                                           | 0/995 [00:00<?, ?it/s]

Extracting Terms...


100%|████████████████████████████████████████████████████████████| 10000/10000 [00:25<00:00, 391.76it/s]


Extracting Docs Embeddings...


100%|█████████████████████████████████████████████████████████████| 12531/12531 [02:52<00:00, 72.66it/s]


UMAP(n_components=5, verbose=True)
Fri May 20 09:43:01 2022 Construct fuzzy simplicial set
Fri May 20 09:43:01 2022 Finding Nearest Neighbors
Fri May 20 09:43:01 2022 Building RP forest with 11 trees
Fri May 20 09:43:01 2022 NN descent for 14 iterations
	 1  /  14
	 2  /  14
	 3  /  14
	 4  /  14
	 5  /  14
	 6  /  14
	Stopping threshold met -- exiting after 6 iterations
Fri May 20 09:43:07 2022 Finished Nearest Neighbor Search
Fri May 20 09:43:09 2022 Construct embedding


Epochs completed:   0%|            0/200 [00:00]

Fri May 20 09:43:18 2022 Finished embedding
Extracting Terms Embeddings...


100%|██████████████████████████████████████████████████████████████| 3000/3000 [00:25<00:00, 119.83it/s]


UMAP(n_components=5, verbose=True)
Fri May 20 09:43:44 2022 Construct fuzzy simplicial set
Fri May 20 09:43:48 2022 Finding Nearest Neighbors
Fri May 20 09:43:49 2022 Finished Nearest Neighbor Search
Fri May 20 09:43:50 2022 Construct embedding


Epochs completed:   0%|            0/500 [00:00]

Fri May 20 09:43:53 2022 Finished embedding


In [5]:
import pandas as pd
import plotly.express as px
import numpy as np
import warnings
import umap
from sklearn.cluster import KMeans
import logging

from topic_modeling.specificity import specificity
from topic_modeling.basic_class import BasicSemantics
from topic_modeling.centroids import find_centroids
from topic_modeling.extract_terms import extract_terms, extract_terms_df

In [6]:
import bamboolib

In [7]:
#top_terms = model.terms.head(300).copy()

#df_terms_indexed = model.df_terms_indexed.reset_index().copy()
#df_terms_indexed = df_terms_indexed.explode('text')
#df_terms_indexed = df_terms_indexed[df_terms_indexed['text'].isin(top_terms.index)]

#model.terms = top_terms
#model.df_terms_indexed = df_terms_indexed

In [8]:
#df_index_extented = model.df_terms_indexed.reset_index().copy()
#df_index_extented = df_index_extented.explode('text').reset_index(drop=True)
#df_index_extented = df_index_extented.set_index(model.index_var)

In [58]:
def get_clusters(topic_number=20, top_terms=10, term_type='lemma', top_terms_included=100, ngrams=(1, 2), random_state=42):
    """Cluster the documents with KMeans and name every cluster from its terms.

    This function works on the module-level ``model`` and mutates it:

    * ``model.data["cluster"]`` receives the KMeans label of each document
      (as a string),
    * ``model.topics`` receives one row per cluster with its name and size,
    * ``model.df_topics_names`` receives one row per document with the name
      and size of its cluster, indexed by ``model.index_var``.

    Parameters
    ----------
    topic_number : int
        Number of clusters requested from KMeans.
    top_terms : int
        Forwarded to ``specificity`` as ``top_n``; presumably the number of
        terms kept per cluster name (confirm against ``specificity``).
    term_type : str
        Column of the term table used to build the cluster names
        (e.g. ``'lemma'`` or ``'text'``).
    top_terms_included : int
        Only the first ``top_terms_included`` rows of ``model.terms`` (after
        the n-gram filter) are allowed to take part in the naming.
    ngrams : iterable of int
        Accepted values of the ``ngrams`` column of ``model.terms``.
    random_state : int or None
        Seed handed to KMeans so that repeated calls give the same clusters.
        Pass ``None`` to get the previous unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        ``model.topics``: columns ``cluster``, ``cluster_name`` and
        ``topic_size``, sorted by decreasing size.
    """
    # Seeded clustering: without a seed the labels (and therefore the topic
    # names) change from one run of the notebook to the next.
    kmeans = KMeans(n_clusters=topic_number, random_state=random_state)
    model.data["cluster"] = kmeans.fit(model.docs_embeddings).labels_.astype(str)

    # One row per (document, term) pair.
    df_index_extented = (
        model.df_terms_indexed.reset_index().explode('text').reset_index(drop=True)
    )

    # Restrict the candidate vocabulary to the first `top_terms_included`
    # terms whose n-gram length is allowed.
    kept_terms = (
        model.terms[model.terms['ngrams'].isin(list(ngrams))]
        .reset_index()
        .head(top_terms_included)
    )
    df_index_extented = pd.merge(df_index_extented, kept_terms, on='text')
    df_index_extented = df_index_extented.set_index(model.index_var)

    # Get the Topics Names: attach each document's cluster to its terms, then
    # let `specificity` pick the terms used to label every cluster.
    df_clusters = pd.merge(
        model.data[["cluster"]],
        df_index_extented,
        left_index=True,
        right_index=True,
    )

    _, _, edge = specificity(
        df_clusters, X="cluster", Y=term_type, Z=None, top_n=top_terms
    )

    topics = (
        edge.groupby("cluster")[term_type]
        .apply(lambda terms: " | ".join(terms))
        .reset_index()
        .rename(columns={term_type: "cluster_name"})
    )

    # Get the Topics Size: number of documents per cluster.
    topic_size = (
        model.data[["cluster"]]
        .reset_index()
        .groupby("cluster")[model.index_var]
        .count()
        .reset_index()
    )
    topic_size.columns = ["cluster", "topic_size"]

    topics = pd.merge(topics, topic_size, on="cluster")
    topics = topics.sort_values("topic_size", ascending=False)
    model.topics = topics.reset_index(drop=True)

    # Per-document view. This is an inner join, so documents whose cluster
    # received no name from `specificity` are absent from the result.
    model.df_topics_names = pd.merge(
        model.data[["cluster"]].reset_index(), topics, on="cluster"
    )

    model.df_topics_names["cluster_name_number"] = (
        model.df_topics_names["cluster"]
        + " - "
        + model.df_topics_names["cluster_name"]
    )

    model.df_topics_names = model.df_topics_names.set_index(model.index_var)

    return model.topics

In [61]:
# Ten clusters, each named from 10 terms taken in their "text" form; only the
# first 200 unigram/bigram terms are eligible for naming.
res = get_clusters(
    topic_number=10,
    top_terms=10,
    term_type="text",
    top_terms_included=200,
    ngrams=[1, 2],
)

In [52]:
def wrap_by_word(string, n_words):
    """Return ``string`` with an HTML ``<br>`` appended after every ``n_words`` words.

    The text is split on whitespace, regrouped in chunks of ``n_words`` words
    joined by single spaces, and every chunk (including the last one) is
    terminated by ``<br>``.

    Parameters
    ----------
    string : str
        Text to wrap. Anything that is not a ``str`` (``None``, NaN, bytes,
        ...) produces an empty string instead of raising.
    n_words : int
        Number of words per line. Zero or a non-integer value produces an
        empty string; a negative value yields no chunk and also gives ``""``.

    Returns
    -------
    str
        The wrapped text, or ``""`` when nothing could be wrapped.
    """
    # Missing values in a text column arrive as float NaN / None: there is
    # nothing to wrap, so return an empty label rather than failing.
    if not isinstance(string, str):
        return ""

    words = string.split()
    try:
        starts = range(0, len(words), n_words)
    except (TypeError, ValueError):
        # A step of zero or a non-integer step cannot define a chunk size.
        return ""

    return "".join(
        " ".join(words[start : start + n_words]) + "<br>" for start in starts
    )

In [77]:
# Keyword used to filter the documents; `str.contains` interprets it as a
# regular expression and the match is case-insensitive.
search = 'commercial'

# Selecting the column through `model.text_var` yields a Series with that
# name, so after `reset_index()` the text column is called `model.text_var`.
df_search = model.data[model.text_var].reset_index()

# Look the column up by the same name it was selected with (instead of a
# hard-coded 'text'), and treat missing texts as non-matching so the boolean
# mask never contains NaN.
df_search = df_search[
    df_search[model.text_var].str.contains(search, case=False, na=False)
]
df_search = df_search.set_index(model.index_var)

In [62]:



"""
Project the document embeddings to 2D and assemble the plotting frame.
Each point keeps its wrapped text for the hover box and its named cluster.
"""

# Attach the topic name/size of every document to its embedding (the join is
# made on the index of both frames).
res = pd.merge(
    model.docs_embeddings, model.df_topics_names, left_index=True, right_index=True
)

# Drop the "cluster" column coming from the topic table before joining
# model.data, which brings its own "cluster" column along with the text.
res = res.drop(columns="cluster").merge(
    model.data, left_index=True, right_index=True
)

# The 2D layout is recomputed on every run from the embedding columns 0..4.
model.embeddings_2d = umap.UMAP(n_components=2, verbose=True).fit_transform(
    res[list(range(5))]
)

res = res.assign(
    dim_1=model.embeddings_2d[:, 0],
    dim_2=model.embeddings_2d[:, 1],
)

# Break the text every 10 words so the hover box stays readable.
res[model.text_var] = res[model.text_var].apply(lambda text: wrap_by_word(text, 10))

# Human-readable labels: "<cluster> - <name>" and "<cluster>| <size>".
res["cluster_label"] = res["cluster"].astype(object) + " - " + res["cluster_name"]
res["cluster_size"] = res["cluster"].astype(str) + "| " + res["topic_size"].astype(str)

model.df_fig = res.reset_index(drop=True)

UMAP( verbose=True)
Fri May 20 13:30:05 2022 Construct fuzzy simplicial set
Fri May 20 13:30:05 2022 Finding Nearest Neighbors
Fri May 20 13:30:05 2022 Building RP forest with 11 trees
Fri May 20 13:30:05 2022 NN descent for 14 iterations
	 1  /  14
	 2  /  14
	 3  /  14
	Stopping threshold met -- exiting after 3 iterations
Fri May 20 13:30:05 2022 Finished Nearest Neighbor Search
Fri May 20 13:30:05 2022 Construct embedding


Epochs completed:   0%|            0/200 [00:00]

Fri May 20 13:30:19 2022 Finished embedding


In [82]:
import plotly

# Work on a copy so the frame stored on the model is left untouched.
df_fig = model.df_fig.copy()

# Mean 2D position of every named cluster; these points carry the label text.
centroids_emb = (
    df_fig.groupby('cluster_name_number')[['dim_1', 'dim_2']]
    .mean()
    .reset_index()
    .rename(columns={'cluster_name_number': 'centroid_name'})
)

# Stack the centroids under the documents. Document rows get a blank label,
# centroid rows get the "centroids" colour group.
df_fig_centroids = pd.concat([df_fig, centroids_emb]).fillna(
    {'centroid_name': " ", 'cluster_size': "centroids"}
)

fig = px.scatter(
    df_fig_centroids,
    x="dim_1",
    y="dim_2",
    color="cluster_size",
    text='centroid_name',
    hover_data=[model.text_var],
    width=2000,
    height=2000,
)

# Optional tweaks, currently disabled:
#fig.update_layout(showlegend=False)
#fig.show()

# Write the interactive figure to a standalone HTML file.
plotly.offline.plot(fig, filename='test_topic.html')


'test_topic.html'