In [1]:
from bunkatopics import BunkaTopics



In [2]:
import pandas as pd
pd.options.mode.chained_assignment = None

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


data = pd.read_csv("data/imdb.csv", index_col=[0])
data = data.sample(5000, random_state=42)

In [3]:
from dotenv import load_dotenv
load_dotenv()

True

In [67]:
model = BunkaTopics(
    data,  # dataFrame
    text_var="description",  # Text Columns
    index_var="imdb",  # Index Column (Mandatory)
    extract_terms=True,  # extract Terms ?
    terms_embeddings=False,  # extract terms Embeddings?
    docs_embeddings=True,  # extract Docs Embeddings?
    embeddings_model="distiluse-base-multilingual-cased-v1",  # Chose an embeddings Model
    multiprocessing=True,  # Multiprocessing of Embeddings
    language="en",  # Chose between English "en" and French "fr"
    sample_size_terms=len(data),
    terms_limit=10000,  # Top Terms to Output
    terms_ents=True,  # Extract entities
    terms_ngrams=(1, 2),  # Chose Ngrams to extract
    terms_ncs=True,  # Extract Noun Chunks
    terms_include_pos=["NOUN", "PROPN", "ADJ"],  # Include Part-of-Speech
    terms_include_types=["PERSON", "ORG"],
    reduction = 2
)  # Include Entity Types


2023-01-02 20:06:38,994 - Extracting Terms...
100%|███████████████████████████████████████████████████████████| 4982/4982 [00:16<00:00, 306.63it/s]
2023-01-02 20:06:55,562 - Extracting Docs Embeddings...
100%|████████████████████████████████████████████████████████████| 4982/4982 [01:08<00:00, 72.28it/s]


UMAP(random_state=42, verbose=True)
Mon Jan  2 20:08:05 2023 Construct fuzzy simplicial set
Mon Jan  2 20:08:05 2023 Finding Nearest Neighbors
Mon Jan  2 20:08:05 2023 Building RP forest with 9 trees
Mon Jan  2 20:08:05 2023 NN descent for 12 iterations
	 1  /  12
	 2  /  12
	 3  /  12
	 4  /  12
	 5  /  12
	 6  /  12
	 7  /  12
	Stopping threshold met -- exiting after 7 iterations
Mon Jan  2 20:08:05 2023 Finished Nearest Neighbor Search
Mon Jan  2 20:08:05 2023 Construct embedding


Epochs completed:   0%|            0/500 [00:00]

Mon Jan  2 20:08:10 2023 Finished embedding


In [88]:
model.docs_embeddings

Unnamed: 0,0,1
tt0205177,12.482412,0.559843
tt2316801,13.944825,0.769886
tt0465234,10.527050,3.093578
tt1846589,11.541634,4.480061
tt0824758,11.076320,-0.206151
...,...,...
tt1477834,14.545440,3.703584
tt1544608,11.337491,0.824299
tt0053285,14.728961,3.275138
tt0053472,10.349391,2.399004


In [94]:
#model.data_clusters.sort_values(0)

In [5]:
import openai
import os

openai.api_key  = os.getenv("OPENAI_API_KEY")

In [68]:
topics = model.get_clusters(
    #topic_number=20,  # Number of Topics
    topic_number = 8, 
    top_terms_included=10000,  # Compute the specific terms from the top n terms
    top_terms=3,  # Most specific Terms to describe the topics
    term_type="lemma",  # Use "lemma" of "text"
    ngrams=[1, 2]
)  # N-grams for Topic Representation
    

In [70]:
from sklearn.cluster import KMeans

def hierarchical_clusters(df, x = 0, y = 1, shapes = (8, 4, 2)):
    
    
    kmeans_0 = KMeans(n_clusters=shapes[0], random_state = 42)
    df['level_0'] = kmeans_0.fit(df[[x, y]].values).labels_
    #df['level_0'] =kmeans_0.labels_
    
    
    list_1 =[]
    for cluster in set(df['level_0']):
        top = df[df['level_0']==cluster]


        if len(top) < shapes[1]:
            n_cluster = len(top)
        else:
            n_cluster =  shapes[1]
            
        kmeans_1 = KMeans(n_clusters=n_cluster, random_state = 42)
        top['level_1'] = kmeans_1.fit(top[[x, y]]).labels_
        list_1.append(top)

    df_1 = pd.concat([x for x in list_1])

    list_2 = []
    for cluster in set(df_1['level_1']):
        top = df_1[df_1['level_1']==cluster]

        if len(top) < shapes[1]:
            n_cluster = len(top)
        else:
            n_cluster =  shapes[1]
            
        kmeans_2 =  KMeans(n_clusters=n_cluster, random_state = 42)
        kmeans_2.fit(top[[x, y]])
        top['level_2'] = kmeans_2.labels_
        list_2.append(top)

    df_2 = pd.concat([x for x in list_2])

    #final = df_2.sort_values(['level_0', 'level_1', 'level_2'], ascending=(True, True, True)).reset_index(drop=True)
    
    return df_2

In [71]:
df_embeddings = model.docs_embeddings.copy()

df_nested = hierarchical_clusters(df_embeddings, x = 0, y = 1, shapes = (8, 4, 2))
df_embeddings = df_embeddings.reset_index()
df_embeddings = df_embeddings.rename(columns = {'index':'imdb'})

In [72]:
kmeans_0 = KMeans(n_clusters=8, random_state = 42)
df_embeddings['level_0'] = kmeans_0.fit(df_embeddings[[0, 1]].values).labels_

In [81]:
df_embeddings[df_embeddings['imdb']=='tt0443496']

Unnamed: 0,imdb,0,1,level_0
4787,tt0443496,11.263092,0.8165,0


In [85]:
test = model.docs_embeddings.copy()
test = test.reset_index()
test[test['index']=='tt2171867']

Unnamed: 0,index,0,1
1516,tt2171867,10.157432,2.614612


In [39]:
def specificity(df: pd.DataFrame, X: str, Y: str, Z: str, top_n: int = 50):

    if Z is None:
        Z = "count_values"
        df[Z] = 1
        group = df.groupby([X, Y]).agg(count_values=(Z, "sum")).reset_index()
        cont = group.pivot(index=X, columns=Y, values=Z).fillna(0).copy()

    else:
        group = df.groupby([X, Y])[Z].sum().reset_index()
        cont = group.pivot(index=X, columns=Y, values=Z).fillna(0).copy()

    tx = df[X].value_counts()
    ty = df[Y].value_counts()

    cont = cont.astype(int)

    tx_df = pd.DataFrame(tx)
    tx_df.columns = ["c"]
    ty_df = pd.DataFrame(ty)
    ty_df.columns = ["c"]

    # Valeurs totales observées
    n = group[Z].sum()

    # Produit matriciel. On utilise pd.T pour pivoter une des deux séries.
    indep = tx_df.dot(ty_df.T) / n

    cont = cont.reindex(indep.columns, axis=1)
    cont = cont.reindex(indep.index, axis=0)

    # Contingency Matrix
    ecart = (cont - indep) ** 2 / indep
    chi2 = ecart.sum(axis=1)
    chi2 = chi2.sort_values(ascending=False)
    spec = ecart * np.sign(cont - indep)

    # Edge Table of X, Y, specificity measure
    spec[X] = spec.index
    edge = pd.melt(spec, id_vars=[X])
    edge.columns = [X, Y, "spec"]
    edge = edge.sort_values(by=[X, "spec"], ascending=[True, False]).reset_index(
        drop=True
    )
    edge = edge[edge.spec > 0]
    edge = edge.groupby([X]).head(top_n)
    edge = edge.reset_index(drop=True)

    return spec, chi2, edge

In [8]:
def get_openai_topic(str):
    
    prompt = f'Summarize the following terms in 1 short topic or theme related to movies in no more than 4 words:\n{str}'
    #prompt = f'Summarize the following terms in 1 short topic of maximum 2 words:\n {str}'
        
    api_res = openai.Completion.create(
        model='text-davinci-003',
        prompt=prompt,
        max_tokens=250,
        temperature=0
    )
    
    res = api_res['choices'][0]['text']
    
    return res
    

def clean_openai_topic(str):
    
    res = str.replace('\n\n', ' ')
    res = res.replace('.', '')
    res = res.replace('|', '')
    res = res.strip()
    res = res.title()
    
    return res
topics['openai_topic'] = topics['cluster_name'].apply(lambda x : get_openai_topic(x))
topics['openai_topic_clean'] = topics['openai_topic'].apply(lambda x: clean_openai_topic(x))
topics

2023-01-02 19:01:45,475 - message='Request to OpenAI API' method=post path=https://api.openai.com/v1/completions
2023-01-02 19:01:47,216 - message='OpenAI API response' path=https://api.openai.com/v1/completions processing_ms=1165 request_id=7c119484e34c1997265d4f704f1c57fd response_code=200
2023-01-02 19:01:47,222 - message='Request to OpenAI API' method=post path=https://api.openai.com/v1/completions
2023-01-02 19:01:48,061 - message='OpenAI API response' path=https://api.openai.com/v1/completions processing_ms=638 request_id=574c153d16eed29880eaeaa38c967aa7 response_code=200
2023-01-02 19:01:48,064 - message='Request to OpenAI API' method=post path=https://api.openai.com/v1/completions
2023-01-02 19:01:48,939 - message='OpenAI API response' path=https://api.openai.com/v1/completions processing_ms=669 request_id=d74b899879b95e1f343755d41702db85 response_code=200
2023-01-02 19:01:48,941 - message='Request to OpenAI API' method=post path=https://api.openai.com/v1/completions
2023-01-02

Unnamed: 0,cluster,cluster_name,topic_size,percent,0,1,openai_topic,openai_topic_clean
0,8,police | cop | detective,709,14.2,10.643823,2.661458,\n\nPolice/Cop/Detective.,Police/Cop/Detective
1,5,family | mother | father,644,12.9,13.746353,1.256708,\n\nFamily Dynamics.,Family Dynamics
2,4,stranger | man | people,553,11.1,12.455687,2.600999,\n\nStrangers becoming people.,Strangers Becoming People
3,1,school | high | high school,546,11.0,11.851448,0.993126,\n\nHigh school movies.,High School Movies
4,9,wife | husband | woman,501,10.1,13.982466,-0.254639,| man\n\nGender in Movies.,Man Gender In Movies
5,6,love | school | teacher,491,9.9,12.593341,-0.529047,\n\nLove at School.,Love At School
6,2,alien | earth | planet,401,8.0,13.32707,4.210618,\n\nAlien invasion of Earth.,Alien Invasion Of Earth
7,0,war | world war | war ii,399,8.0,11.474276,4.515352,\n\nWorld War II Movies,World War Ii Movies
8,3,player | baseball | star,389,7.8,10.224411,0.019004,\n\nBaseball movie stars.,Baseball Movie Stars
9,7,king | princess | vampire,349,7.0,14.293118,3.227856,\n\nFantasy Adventure Films,Fantasy Adventure Films


In [9]:
# Learning to Prompt

In [10]:
# compute the optimal number of clusters

len_data = 2000

min_doc_cluster = 100

max_cluster = round(len_data/min_doc_cluster, 0)
max_cluster

10/2

5.0

In [11]:
def get_alt_topic(str):
    
    res = str.split(' | ')[:2]
    res = [x.capitalize() for x in res]
    res = ' & '.join(res)
    
    return res

topics['alternative_name'] = topics['cluster_name'].apply(lambda x : get_alt_topic(x))

In [12]:
from scipy.spatial import ConvexHull
from scipy import interpolate

def get_convex_hull_coord_all(df, interpolate_curve = True):

    points = df[[0, 1]].values
    hull = ConvexHull(points)
    x_hull = np.append(points[hull.vertices,0],
                       points[hull.vertices,0][0])
    y_hull = np.append(points[hull.vertices,1],
                       points[hull.vertices,1][0])
    
    
    if interpolate_curve:
        # interpolate
        dist = np.sqrt((x_hull[:-1] - x_hull[1:])**2 + (y_hull[:-1] - y_hull[1:])**2)
        dist_along = np.concatenate(([0], dist.cumsum()))
        spline, u = interpolate.splprep([x_hull, y_hull], 
                                        u=dist_along, s=0, per=1)
        interp_d = np.linspace(dist_along[0], dist_along[-1], 50)
        interp_x, interp_y = interpolate.splev(interp_d, spline)
    else:
        interp_x = x_hull
        interp_y = y_hull

    return interp_x, interp_y

def get_convex_hull_coord(df, i, col = 'cluster',  interpolate_curve = True):

    points = df[df[col] == i][[0, 1]].values
    hull = ConvexHull(points)
    x_hull = np.append(points[hull.vertices,0],
                       points[hull.vertices,0][0])
    y_hull = np.append(points[hull.vertices,1],
                       points[hull.vertices,1][0])
    
    
    if interpolate_curve:
        # interpolate
        dist = np.sqrt((x_hull[:-1] - x_hull[1:])**2 + (y_hull[:-1] - y_hull[1:])**2)
        dist_along = np.concatenate(([0], dist.cumsum()))
        spline, u = interpolate.splprep([x_hull, y_hull], 
                                        u=dist_along, s=0, per=1)
        interp_d = np.linspace(dist_along[0], dist_along[-1], 50)
        interp_x, interp_y = interpolate.splev(interp_d, spline)
    else:
        interp_x = x_hull
        interp_y = y_hull

    return interp_x, interp_y

In [13]:
import numpy as np

import plotly.graph_objects as go

In [14]:
width = 1000
height = 1000

In [15]:
x = model.docs_embeddings[0].values
y = model.docs_embeddings[1].values


model.data['avg_vote_bin'] = pd.qcut(model.data['avg_vote'], q = 3, labels = np.arange(3)).astype(int)
weights = model.data['avg_vote_bin'].values



interp_x, interp_y = get_convex_hull_coord_all(model.data_clusters[[0, 1]], interpolate_curve = False)

fig_density = go.Figure(
        go.Histogram2dContour(x=x, y=y, z = weights, colorscale="delta", histfunc = 'avg')
    )



# Add points with information
fig_density.add_trace(
    go.Scatter(
        x=interp_x,
        y=interp_y,
        mode="lines"
))






fig_density.update_layout(
        #font_size=25,
        width=1000,
        height=1000,
    
    margin=dict(
            t=0,
            b=0,
            r=0,
            l=0,
        ),)

for i in model.data_clusters['cluster'].unique():
    # get the convex hull
    
    interp_x, interp_y = get_convex_hull_coord(model.data_clusters, i, interpolate_curve = False)

    #plt.fill(interp_x, interp_y, color = 'blue', alpha=0.1)
    #plt.plot(interp_x, interp_y, 'black', alpha=0.4)

    # Add points with information
    fig_density.add_trace(
        go.Scatter(
            x=interp_x,
            y=interp_y,
            mode="lines"
    ))

    



# Add annotations for each row
for i, t in enumerate(text):
    
    fig_density.add_annotation(
            x=x[i],
            y=y[i],
            text=t,
            showarrow=True,
            arrowhead=1,
            font=dict(
                family="Courier New, monospace", size=1000 / 75, color="#ffffff"
            ),
            bordercolor="#c7c7c7",
            borderwidth=1000 / 1000,
            borderpad=1000 / 500,
            bgcolor="#ff7f0e",
            opacity=1,
            arrowcolor="#ff7f0e",
        )
    
    
        
    
fig_density.update_layout(showlegend=False)
fig_density.update_yaxes(showticklabels=False)
model.topics
x = model.docs_embeddings[0].values
y = model.docs_embeddings[1].values


model.data['avg_vote_bin'] = pd.qcut(model.data['avg_vote'], q = 3, labels = np.arange(3)).astype(int)
weights = model.data['avg_vote_bin'].values





fig_density.update_layout(
        #font_size=25,
        width=1000,
        height=1000,
    
    margin=dict(
            t=0,
            b=0,
            r=0,
            l=0,
        ),)

for i in model.data_clusters['cluster'].unique():
    # get the convex hull
    
    interp_x, interp_y = get_convex_hull_coord(model.data_clusters, i, interpolate_curve = False)

    #plt.fill(interp_x, interp_y, color = 'blue', alpha=0.1)
    #plt.plot(interp_x, interp_y, 'black', alpha=0.4)

    # Add points with information
    fig_density.add_trace(
        go.Scatter(
            x=interp_x,
            y=interp_y,
            mode="lines"
    ))

    
# Extract the data to plot
x = model.topics[0]
y = model.topics[1]
text = model.topics['alternative_name']
#text = model.topics['openai_topic_clean']


# Add annotations for each row
for i, t in enumerate(text):
    
    fig_density.add_annotation(
            x=x[i],
            y=y[i],
            text=t,
            showarrow=True,
            arrowhead=1,
            font=dict(
                family="Courier New, monospace", size=1000 / 75, color="#ffffff"
            ),
            bordercolor="#c7c7c7",
            borderwidth=1000 / 1000,
            borderpad=1000 / 500,
            bgcolor="#ff7f0e",
            opacity=1,
            arrowcolor="#ff7f0e",
        )
    
    
        
    
fig_density.update_layout(showlegend=False)
fig_density.update_yaxes(showticklabels=False)

NameError: name 'text' is not defined

In [None]:
## Hierachical Topics

In [None]:
# compute the right amptn of clusters

In [None]:

test = model.data_clusters.reset_index()

test[test['imdb']=='tt1091229']

In [None]:
df_terms_indexed = model.df_terms_indexed.reset_index()
df_terms_indexed = df_terms_indexed.explode('text').reset_index(drop=True)

df_terms_indexed = pd.merge(df_terms_indexed, model.terms.reset_index(), on = 'text')
df_terms_indexed = df_terms_indexed[df_terms_indexed['ngrams']==1].reset_index(drop=True)

In [None]:
df_merge_terms = pd.merge(topics_nested, df_terms_indexed, on = 'imdb')


In [None]:
new_df = df_merge_terms[['level_0', 'lemma']].drop_duplicates()

_, _, toop_1 = specificity(new_df, "level_0", "lemma", None, top_n=3)
toop_1 = toop_1.groupby('level_0')['lemma'].apply(lambda x : ' | '.join(x)).reset_index()

In [None]:
#df_merge_terms = df_merge_terms.rename(columns = {'text':'lemma'})

In [None]:
levels = (0, 1, 2)
top_n = 3




final = df_merge_terms[['level_0', 'level_1', 'level_2', 'lemma']]



In [None]:
toop_1

In [None]:
final

In [None]:


_, _, toop_1 = specificity(final, f"level_{levels[0]}", "lemma", None, top_n=top_n)
toop_1 = toop_1.rename(columns={"lemma": f"lemma_{levels[0]}"})
toop_1 = (
    toop_1.groupby([f"level_{levels[0]}"])[f"lemma_{levels[0]}"]
    .apply(lambda x: " | ".join(x))
    .reset_index()
)


new_df = final[['level_0', 'level_1', 'lemma']].drop_duplicates()

list_toop_2 = []
for i in set(new_df['level_0']):
    new_df_filtered = new_df[new_df['level_0']==i]
    _, _, toop_2 = specificity(new_df_filtered, f"level_1", "lemma", None, top_n=top_n)
    toop_2['level_0'] = i
    list_toop_2.append(toop_2)
    
df_toop_2 = pd.concat([x for x in list_toop_2])
df_toop_2 = df_toop_2.groupby(['level_0', 'level_1'])['lemma'].apply(lambda x : ' | '.join(x)).reset_index()
df_toop_2