In [1]:
import pandas as pd
import numpy as np
import umap
import hdbscan
from sklearn.feature_extraction.text import CountVectorizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the sample frame from a local pickle.
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
data = pd.read_pickle("sample_data_no_images.pkl")

In [3]:
# Drop the first 11 and the last 10 characters of every question.
# NOTE(review): presumably a fixed prompt prefix/suffix stored with each question — confirm the widths.
# Not idempotent: the column is overwritten, so re-running this cell trims the text again.
data["questions"] = [q[11:-10] for q in data["questions"].tolist()]

In [4]:
# Preview the frame after the questions column has been trimmed.
data

Unnamed: 0,questions,answers,image_classes,hidden_states
0,what ethnicity of cuisine does this restaurant...,A Chinese Cuisine restaurant.,"[Traffic sign, Stop sign]","[17.110271453857422, 13.300082206726074, 13.05..."
1,what does the license plate read?,A vehicle with a license plate that reads VE 8...,"[Person, Human body, Human leg, Human hair, Ma...","[13.568666458129883, 12.785804748535156, 13.19..."
2,what numbers are on the tail of the helicopter?,AAU.,"[Toy, Vehicle, Helicopter, Aircraft]","[12.305842399597168, 13.563484191894531, 14.84..."
3,what is written on the button with a blue outl...,A Kern EMB 220-1 is on the table.,"[Vehicle registration plate, Saucer, Plate, Pl...","[12.50804328918457, 15.923273086547852, 13.584..."
4,what is the 2nd letter of the word on the cont...,'A',"[Drum, Drink]","[13.862640380859375, 12.851885795593262, 14.51..."
...,...,...,...,...
315,what three letters are above the protein plus?,Arealkalita.,"[Drink, Food]","[12.743967056274414, 19.589651107788086, 14.17..."
316,what is the product shown?,A fruit shoot fruit shoot.,[Poster],"[13.869406700134277, 21.49329948425293, 13.056..."
317,what are the first two words on that piece of ...,'please note',"[Furniture, Cabinetry, Bed, Chest of drawers, ...","[11.29504108428955, 19.662620544433594, 13.571..."
318,what flavor is on the cup?,ABYSS,"[Beer, Dessert, Drink, Dairy, Mug, Coffee cup,...","[14.276371955871582, 15.000089645385742, 12.89..."


# The Topic Model

### Reduce the Dimensionality of the Embeddings With UMAP

In [5]:
# Stack the per-row hidden-state vectors into one array for UMAP.
# NOTE(review): assumes every row's vector has the same length — confirm.
hidden_state_matrix = np.array(data["hidden_states"].tolist())

# Project to two components; random_state fixes the seed so the layout is repeatable.
embeddings_2d = umap.UMAP(
    n_neighbors=3, n_components=2, min_dist=0.0, metric="cosine", random_state=42
).fit_transform(hidden_state_matrix)

embeddings_2d

  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


array([[10.11412   , 12.656948  ],
       [ 3.2380507 ,  0.9377883 ],
       [ 8.680018  ,  1.325887  ],
       [ 3.8704824 ,  7.656681  ],
       [ 3.6234593 ,  2.5589933 ],
       [ 4.5349526 ,  7.323949  ],
       [ 7.073277  ,  8.189024  ],
       [ 9.75236   , 14.577998  ],
       [ 4.6440935 ,  8.443387  ],
       [ 2.313878  ,  2.99107   ],
       [ 9.820884  , 10.765703  ],
       [10.363189  , 10.464721  ],
       [ 4.6144934 ,  7.4769917 ],
       [ 9.465852  ,  8.94971   ],
       [19.01484   , -6.232723  ],
       [ 8.758738  , 15.606736  ],
       [ 8.312315  ,  1.6160725 ],
       [ 5.82069   , 10.039809  ],
       [ 6.698754  ,  5.362098  ],
       [11.911112  ,  4.890356  ],
       [ 9.765072  ,  2.4393837 ],
       [ 5.1628437 ,  5.8300967 ],
       [ 5.7801933 , 10.511646  ],
       [ 3.9304714 ,  3.1438665 ],
       [ 6.4751306 ,  8.6955185 ],
       [19.010944  , -6.23797   ],
       [-7.8849945 ,  3.3081334 ],
       [ 7.952765  ,  1.2429444 ],
       [10.744633  ,

### Cluster With HDBSCAN

In [6]:
# Build the clusterer first, then fit it in place on the 2-D UMAP coordinates.
clusters = hdbscan.HDBSCAN(min_cluster_size=8, prediction_data=True)
clusters.fit(embeddings_2d)

clusters

In [7]:
# List the distinct cluster labels assigned to the 2-D points.
np.unique(clusters.labels_)

array([-1,  0,  1,  2,  3,  4,  5,  6,  7])

### Augment the Dataset
Append the x and y coordinates from the UMAP dimensionality reduction to each row of the dataset, along with the cluster label assigned to it.

In [8]:
# Each row of the transposed embedding is one coordinate axis: first x, then y.
data["x"], data["y"] = embeddings_2d.T
# Cluster label assigned to each row by the fitted clusterer.
data["clusters"] = clusters.labels_

data

Unnamed: 0,questions,answers,image_classes,hidden_states,x,y,clusters
0,what ethnicity of cuisine does this restaurant...,A Chinese Cuisine restaurant.,"[Traffic sign, Stop sign]","[17.110271453857422, 13.300082206726074, 13.05...",10.114120,12.656948,4
1,what does the license plate read?,A vehicle with a license plate that reads VE 8...,"[Person, Human body, Human leg, Human hair, Ma...","[13.568666458129883, 12.785804748535156, 13.19...",3.238051,0.937788,3
2,what numbers are on the tail of the helicopter?,AAU.,"[Toy, Vehicle, Helicopter, Aircraft]","[12.305842399597168, 13.563484191894531, 14.84...",8.680018,1.325887,1
3,what is written on the button with a blue outl...,A Kern EMB 220-1 is on the table.,"[Vehicle registration plate, Saucer, Plate, Pl...","[12.50804328918457, 15.923273086547852, 13.584...",3.870482,7.656681,7
4,what is the 2nd letter of the word on the cont...,'A',"[Drum, Drink]","[13.862640380859375, 12.851885795593262, 14.51...",3.623459,2.558993,3
...,...,...,...,...,...,...,...
315,what three letters are above the protein plus?,Arealkalita.,"[Drink, Food]","[12.743967056274414, 19.589651107788086, 14.17...",8.984244,15.321684,2
316,what is the product shown?,A fruit shoot fruit shoot.,[Poster],"[13.869406700134277, 21.49329948425293, 13.056...",9.875735,15.359409,2
317,what are the first two words on that piece of ...,'please note',"[Furniture, Cabinetry, Bed, Chest of drawers, ...","[11.29504108428955, 19.662620544433594, 13.571...",2.351391,2.375961,3
318,what flavor is on the cup?,ABYSS,"[Beer, Dessert, Drink, Dairy, Mug, Coffee cup,...","[14.276371955871582, 15.000089645385742, 12.89...",8.175888,1.179951,1


## Find Frequent Words in Clusters


In [9]:
# Number of top-scoring words to keep per cluster; passed as `n` to extract_top_n_words_per_topic.
n_top_words = 5

def generate_c_tf_idf(docs_per_topic, clustered_embeddings_2d, col):
    """Compute class-based TF-IDF scores, treating each topic's text as one document.

    Args:
        docs_per_topic: DataFrame with one row per topic; ``docs_per_topic[col]``
            holds that topic's concatenated text.
        clustered_embeddings_2d: Sized collection with one entry per original
            (un-grouped) sample. Only its ``len()`` is used, as the IDF numerator.
        col: Name of the text column in ``docs_per_topic``.

    Returns:
        A ``(tf_idf, count)`` tuple: ``tf_idf`` is an array with one row per
        vocabulary term and one column per topic (in ``docs_per_topic`` row
        order); ``count`` is the fitted ``CountVectorizer``.
    """
    documents = docs_per_topic[col].values
    count = CountVectorizer(
        ngram_range=(1, 1),
        stop_words="english"
    ).fit(documents)

    t = count.transform(documents).toarray()  # raw counts, one row per topic
    w = t.sum(axis=1)  # total counted terms per topic
    # A topic whose text has no counted term gives w == 0. Keep its term
    # frequencies at 0 rather than computing 0/0, which would put NaN into the
    # scores and make those NaN entries rank as the "top" words downstream.
    tf = np.divide(t.T, w, out=np.zeros(t.T.shape, dtype=float), where=w != 0)
    sum_t = t.sum(axis=0)  # count of each term across all topics
    idf = np.log(np.divide(len(clustered_embeddings_2d), sum_t)).reshape(-1, 1)
    tf_idf = np.multiply(tf, idf)

    return tf_idf, count


def extract_top_n_words_per_topic(grouped, c_tf_idf, count: "CountVectorizer", n=20):
    """Return the ``n`` highest-scoring words for every topic.

    Args:
        grouped: DataFrame with one row per topic and a ``"clusters"`` column.
            Row ``i`` must describe the same topic as column ``i`` of ``c_tf_idf``.
        c_tf_idf: 2-D array with one row per vocabulary term and one column per topic.
        count: Fitted vectorizer; only ``get_feature_names_out()`` is called.
        n: Maximum number of words to return per topic. Values <= 0 yield empty lists.

    Returns:
        dict mapping each cluster label to a list of ``(word, score)`` tuples,
        ordered from highest to lowest score.
    """
    words = count.get_feature_names_out()
    # Take labels in row order so label i always matches score column i;
    # sorting them separately would mislabel topics if `grouped` is not sorted.
    labels = list(grouped["clusters"])
    scores_by_topic = c_tf_idf.T
    # Reverse the ascending argsort first, then cut. Slicing with `-n:` would
    # return every word when n == 0.
    indices = scores_by_topic.argsort()[:, ::-1][:, :max(n, 0)]
    top_n_words = {
        label: [(words[j], scores_by_topic[i][j]) for j in indices[i]]
        for i, label in enumerate(labels)
    }
    return top_n_words

def print_to_words(top_words):
    """Print each topic's top words as text tables, two topics side by side.

    Topics appear in ascending id order. If the id ``-1`` is present it is moved
    to the end and titled "Outliers (-1)". Within a table, rows stop at the
    shorter of the two word lists.

    Args:
        top_words: dict mapping a cluster id to a list of ``(word, score)``
            pairs; each score must provide ``.round()`` (e.g. a numpy scalar).
    """
    c_ids = sorted(top_words)
    if -1 in top_words:
        # Move the outlier id to the end by value, not by assuming it sorts first.
        c_ids.remove(-1)
        c_ids.append(-1)

    for start in range(0, len(c_ids), 2):
        current_ids = c_ids[start:start + 2]
        rule = "-" * (33 * len(current_ids) + 2)

        print(rule)
        for i, c_id in enumerate(current_ids):
            idx = "|  " if i == 0 else ""
            end = "\n" if i == len(current_ids) - 1 else ""
            msg = f"Topic #{c_id}" if c_id != -1 else "Outliers (-1)"
            print(idx, msg.ljust(29), "| ", end=end)
        print(rule)

        # zip pairs the k-th best word of each topic in this table into one row.
        rows = zip(*(top_words[c_id] for c_id in current_ids))
        for rank, word_scores in enumerate(rows, start=1):
            for j, (word, score) in enumerate(word_scores):
                # Only the first cell of a row carries the rank number.
                idx = str(rank).ljust(3) if j == 0 else ""
                print(idx, word.ljust(20), str(score.round(5)).ljust(8), "|", end=" ")
            print()
        print()
        

### Class-Based Term-Frequency Inverse-Document-Frequency (in answers)
Class-based TF-IDF treats all the answers in a cluster as a single document, so each word's score reflects how frequent it is within that cluster relative to the other clusters.

In [10]:
# Join every answer in a cluster into one space-separated document per cluster.
grouped_answers = (
    data
    .groupby(["clusters"], as_index=False)
    .agg({"answers": " ".join})
)
# `data` is passed only for its length (the number of un-grouped samples).
c_tf_idf_ans, count_ans = generate_c_tf_idf(
    docs_per_topic=grouped_answers,
    clustered_embeddings_2d=data,
    col="answers",
)

grouped_answers

Unnamed: 0,clusters,answers
0,-1,A poster for Parachute Jump Steeplechase Park....
1,0,A person is holding a phone that says Run Reco...
2,1,AAU. A dead end sign. A red stop sign is on a ...
3,2,A book titled Take Your Girlie To The Movies. ...
4,3,A vehicle with a license plate that reads VE 8...
5,4,A Chinese Cuisine restaurant. A web corner is ...
6,5,'a' A man sits at a table with a book titled '...
7,6,A person is holding a bottle of irit A person ...
8,7,A Kern EMB 220-1 is on the table. A gallon of ...


#### Extract top n Words From Each Topic (in answers)

In [11]:
# Rank the answer vocabulary per cluster and keep the n_top_words best entries.
top_words_in_answers = extract_top_n_words_per_topic(
    grouped=grouped_answers,
    c_tf_idf=c_tf_idf_ans,
    count=count_ans,
    n=n_top_words,
)

print_to_words(top_words=top_words_in_answers)

--------------------------------------------------------------------
|   Topic #0                      |  Topic #1                      | 
--------------------------------------------------------------------
1   raja                 0.33355  |  boat                 0.13681  | 
2   app                  0.22237  |  sign                 0.11741  | 
3   menu                 0.22237  |  stop                 0.11532  | 
4   open                 0.22237  |  plane                0.11532  | 

--------------------------------------------------------------------
|   Topic #2                      |  Topic #3                      | 
--------------------------------------------------------------------
1   bottle               0.2089   |  box                  0.44633  | 
2   titled               0.11242  |  person               0.09894  | 
3   fruit                0.09951  |  wearing              0.08962  | 
4   arealkalita          0.09951  |  shirt                0.08609  | 

----------------------

### Class-Based Term-Frequency Inverse-Document-Frequency (in questions)
Class-based TF-IDF treats all the questions in a cluster as a single document, so each word's score reflects how frequent it is within that cluster relative to the other clusters.

In [12]:
# Join every question in a cluster into one space-separated document per cluster.
grouped_questions = (
    data
    .groupby(["clusters"], as_index=False)
    .agg({"questions": " ".join})
)
# `data` is passed only for its length (the number of un-grouped samples).
c_tf_idf_ques, count_ques = generate_c_tf_idf(
    docs_per_topic=grouped_questions,
    clustered_embeddings_2d=data,
    col="questions",
)

grouped_questions

Unnamed: 0,clusters,questions
0,-1,what activity is being performed here? what is...
1,0,what is the total distance in kilometers? what...
2,1,what numbers are on the tail of the helicopter...
3,2,it is a romantic movie? what is lite? who pres...
4,3,what does the license plate read? what is the ...
5,4,what ethnicity of cuisine does this restaurant...
6,5,what are the letters at the bottom right of th...
7,6,what brand name is written in black on counter...
8,7,what is written on the button with a blue outl...


#### Extract top n Words From Each Topic (in questions)

In [13]:
# Rank the question vocabulary per cluster and keep the n_top_words best entries.
top_words_in_questions = extract_top_n_words_per_topic(
    grouped=grouped_questions,
    c_tf_idf=c_tf_idf_ques,
    count=count_ques,
    n=n_top_words,
)
print_to_words(top_words=top_words_in_questions)

--------------------------------------------------------------------
|   Topic #0                      |  Topic #1                      | 
--------------------------------------------------------------------
1   app                  0.6919   |  plane                0.18566  | 
2   shown                0.3759   |  number               0.16674  | 
3   need                 0.3036   |  sign                 0.15942  | 
4   account              0.3036   |  boat                 0.1565   | 

--------------------------------------------------------------------
|   Topic #2                      |  Topic #3                      | 
--------------------------------------------------------------------
1   left                 0.27346  |  letter               0.1392   | 
2   brand                0.13494  |  number               0.11745  | 
3   product              0.13065  |  kind                 0.0984   | 
4   food                 0.12378  |  guy                  0.08811  | 

----------------------

## Combine all the Data

In [14]:
# Save the augmented frame (including the x, y and clusters columns) as CSV, without the index.
data.to_csv("embeds.csv", index=False)

In [15]:
def convert_topics_to_df(topics):
    """Flatten a topic -> [(word, score), ...] mapping into a long-format DataFrame.

    Args:
        topics: dict mapping a cluster id to an iterable of ``(word, score)`` pairs.

    Returns:
        DataFrame with columns ``"clusters"``, ``"word"`` and ``"score"``: one
        row per pair, in the mapping's iteration order.
    """
    # One (cluster id, word, score) record per pair, topic by topic.
    rows = [
        (topic_id, word, value)
        for topic_id, vals in topics.items()
        for word, value in vals
    ]

    return pd.DataFrame({
        "clusters": [row[0] for row in rows],
        "word": [row[1] for row in rows],
        "score": [row[2] for row in rows],
    })

# Write one long-format CSV per topic table: answers first, then questions.
for topics, csv_path in (
    (top_words_in_answers, "answer_topics.csv"),
    (top_words_in_questions, "question_topics.csv"),
):
    convert_topics_to_df(topics).to_csv(csv_path, index=False)