In [1]:
from pathlib import Path

import hdbscan
import numpy as np
import pandas as pd
import umap
from sklearn.feature_extraction.text import CountVectorizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the sample records (questions, answers, image classes, hidden states) from a local pickle.
# NOTE(review): unpickling can execute arbitrary code - only load this file if it comes from a trusted source.
data = pd.read_pickle("sample_data_no_images.pkl")

In [3]:
# Drop the first 11 and the last 10 characters of every question string.
# NOTE(review): presumably this strips fixed prompt-template text around the question - confirm against the raw data.
# The column is overwritten in place, so re-running this cell without reloading `data` trims the text a second time.
data["questions"] = [q[11:-10] for q in data["questions"].tolist()]

In [4]:
# Display the frame to sanity-check the trimmed questions alongside the other columns.
data

Unnamed: 0,questions,answers,image_classes,hidden_states
0,what ethnicity of cuisine does this restaurant...,A Chinese Cuisine restaurant.,"[Traffic sign, Stop sign]","[17.110271453857422, 13.300082206726074, 13.05..."
1,what does the license plate read?,A vehicle with a license plate that reads VE 8...,"[Person, Human body, Human leg, Human hair, Ma...","[13.568666458129883, 12.785804748535156, 13.19..."
2,what numbers are on the tail of the helicopter?,AAU.,"[Toy, Vehicle, Helicopter, Aircraft]","[12.305842399597168, 13.563484191894531, 14.84..."
3,what is written on the button with a blue outl...,A Kern EMB 220-1 is on the table.,"[Vehicle registration plate, Saucer, Plate, Pl...","[12.50804328918457, 15.923273086547852, 13.584..."
4,what is the 2nd letter of the word on the cont...,'A',"[Drum, Drink]","[13.862640380859375, 12.851885795593262, 14.51..."
...,...,...,...,...
315,what three letters are above the protein plus?,Arealkalita.,"[Drink, Food]","[12.743967056274414, 19.589651107788086, 14.17..."
316,what is the product shown?,A fruit shoot fruit shoot.,[Poster],"[13.869406700134277, 21.49329948425293, 13.056..."
317,what are the first two words on that piece of ...,'please note',"[Furniture, Cabinetry, Bed, Chest of drawers, ...","[11.29504108428955, 19.662620544433594, 13.571..."
318,what flavor is on the cup?,ABYSS,"[Beer, Dessert, Drink, Dairy, Mug, Coffee cup,...","[14.276371955871582, 15.000089645385742, 12.89..."


# The Topic Model

### Reduce the Dimensionality of the Embeddings With UMAP

In [5]:
# Stack the per-row hidden-state lists into one 2-D matrix (one row per sample).
hidden_state_matrix = np.array(data["hidden_states"].tolist())

# Configure the UMAP projection down to two components using cosine distance.
# random_state is fixed so the projection is repeatable from run to run.
reducer = umap.UMAP(
    n_neighbors=8,
    n_components=2,
    min_dist=0.0,
    metric="cosine",
    random_state=42,
)
embeddings_2d = reducer.fit_transform(hidden_state_matrix)

embeddings_2d

  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


array([[ 4.082209 ,  4.946981 ],
       [ 8.95047  ,  5.619771 ],
       [10.515555 ,  3.9264753],
       [ 6.9284763,  3.6484177],
       [ 9.588885 ,  5.623309 ],
       [ 6.400137 ,  3.8952918],
       [ 6.9367533,  6.115734 ],
       [ 2.5288975,  5.3121047],
       [ 6.4655914,  4.9769816],
       [ 9.794556 ,  3.8213296],
       [ 4.0610447,  4.6157794],
       [ 4.426619 ,  4.44627  ],
       [ 5.4905386,  4.2346487],
       [ 6.2526045,  5.5308595],
       [ 2.4888084,  4.2407546],
       [ 2.9905143,  5.9687266],
       [ 7.261942 ,  3.340099 ],
       [ 6.6431913,  6.4957643],
       [ 7.634403 ,  6.4207063],
       [ 6.3226943,  3.2846642],
       [ 9.11071  ,  7.362857 ],
       [ 8.441862 ,  5.7301154],
       [ 7.0352025,  6.590607 ],
       [ 9.537308 ,  5.252404 ],
       [ 6.831423 ,  5.954632 ],
       [ 2.5279613,  4.249075 ],
       [ 3.7460144,  2.9390404],
       [ 9.507171 ,  3.5718508],
       [ 3.918123 ,  3.7946842],
       [ 8.728104 ,  3.7314043],
       [ 2

### Cluster With HDBSCAN

In [6]:
# Build the HDBSCAN estimator first, then fit it on the 2-D UMAP coordinates.
# `clusters` is the fitted estimator itself; the per-row labels live in `clusters.labels_`.
clusterer = hdbscan.HDBSCAN(min_cluster_size=8, prediction_data=True)
clusters = clusterer.fit(embeddings_2d)

clusters

In [7]:
# List the distinct cluster labels produced by the fit; the label -1 is reported later as "Outliers (-1)".
np.unique(clusters.labels_)

array([-1,  0,  1,  2,  3,  4,  5,  6,  7])

### Augment the Dataset
Append the x and y coordinates from the UMAP dimensionality reduction to each row of the dataset, and assign each row its HDBSCAN cluster label.

In [8]:
# Attach the two UMAP coordinates and the HDBSCAN label to every row of `data` (in place).
new_columns = (
    ("x", embeddings_2d[:, 0]),
    ("y", embeddings_2d[:, 1]),
    ("clusters", clusters.labels_),
)
for column_name, column_values in new_columns:
    data[column_name] = column_values

data

Unnamed: 0,questions,answers,image_classes,hidden_states,x,y,clusters
0,what ethnicity of cuisine does this restaurant...,A Chinese Cuisine restaurant.,"[Traffic sign, Stop sign]","[17.110271453857422, 13.300082206726074, 13.05...",4.082209,4.946981,6
1,what does the license plate read?,A vehicle with a license plate that reads VE 8...,"[Person, Human body, Human leg, Human hair, Ma...","[13.568666458129883, 12.785804748535156, 13.19...",8.950470,5.619771,5
2,what numbers are on the tail of the helicopter?,AAU.,"[Toy, Vehicle, Helicopter, Aircraft]","[12.305842399597168, 13.563484191894531, 14.84...",10.515555,3.926475,0
3,what is written on the button with a blue outl...,A Kern EMB 220-1 is on the table.,"[Vehicle registration plate, Saucer, Plate, Pl...","[12.50804328918457, 15.923273086547852, 13.584...",6.928476,3.648418,2
4,what is the 2nd letter of the word on the cont...,'A',"[Drum, Drink]","[13.862640380859375, 12.851885795593262, 14.51...",9.588885,5.623309,5
...,...,...,...,...,...,...,...
315,what three letters are above the protein plus?,Arealkalita.,"[Drink, Food]","[12.743967056274414, 19.589651107788086, 14.17...",2.674927,6.195192,4
316,what is the product shown?,A fruit shoot fruit shoot.,[Poster],"[13.869406700134277, 21.49329948425293, 13.056...",3.190143,4.627464,-1
317,what are the first two words on that piece of ...,'please note',"[Furniture, Cabinetry, Bed, Chest of drawers, ...","[11.29504108428955, 19.662620544433594, 13.571...",9.301596,4.187019,0
318,what flavor is on the cup?,ABYSS,"[Beer, Dessert, Drink, Dairy, Mug, Coffee cup,...","[14.276371955871582, 15.000089645385742, 12.89...",10.172656,3.484006,0


## Find Frequent Words in Clusters


In [9]:
# Number of highest-scoring words to keep per cluster; passed as `n` to extract_top_n_words_per_topic.
n_top_words = 10

def generate_c_tf_idf(docs_per_topic, clustered_embeddings_2d, col):
    """Compute class-based TF-IDF scores for one text column.

    Every row of ``docs_per_topic`` is treated as a single document. Words are
    counted with an English stop-word list and unigrams only, then scored as
    ``tf * idf`` where ``tf`` is the word's share of its document's counted
    words and ``idf = log(m / total count of the word over all documents)``.

    Parameters
    ----------
    docs_per_topic : pandas.DataFrame
        Frame whose ``col`` column holds one text document per row.
    clustered_embeddings_2d : sized
        Only ``len()`` of this argument is used; it supplies ``m`` in the IDF term.
    col : str
        Name of the text column to vectorize.

    Returns
    -------
    tf_idf : numpy.ndarray
        Scores with one row per vocabulary word and one column per document
        (same column order as the rows of ``docs_per_topic``).
    count : CountVectorizer
        The fitted vectorizer, needed later to map row indices back to words.
    """
    documents = docs_per_topic[col].values
    count = CountVectorizer(
        ngram_range=(1, 1),
        stop_words="english"
    ).fit(documents)

    # Rows are documents, columns are vocabulary words.
    term_counts = count.transform(documents).toarray()
    words_per_doc = term_counts.sum(axis=1)

    # A document made up only of stop words has zero counted words. Dividing by
    # zero would turn its whole column into NaN, so divide by 1 instead: all of
    # its term counts are 0 anyway, giving a clean score of 0.
    safe_words_per_doc = np.where(words_per_doc == 0, 1, words_per_doc)
    tf = np.divide(term_counts.T, safe_words_per_doc)

    # Every vocabulary word occurs at least once in the fitted documents, so
    # these totals are never zero.
    word_totals = term_counts.sum(axis=0)
    idf = np.log(np.divide(len(clustered_embeddings_2d), word_totals)).reshape(-1, 1)
    tf_idf = np.multiply(tf, idf)

    return tf_idf, count


def extract_top_n_words_per_topic(grouped, c_tf_idf, count: "CountVectorizer", n=20):
    """Return the ``n`` highest-scoring words for every cluster.

    Parameters
    ----------
    grouped : pandas.DataFrame
        Frame with a ``"clusters"`` column; row ``i`` must describe the same
        document as column ``i`` of ``c_tf_idf``.
    c_tf_idf : numpy.ndarray
        Score matrix with one row per vocabulary word and one column per cluster.
    count : CountVectorizer
        Fitted vectorizer; only ``get_feature_names_out()`` is used.
    n : int, default 20
        Number of words to keep per cluster (expected to be positive).

    Returns
    -------
    dict
        Maps each cluster label to a list of ``(word, score)`` tuples ordered
        from highest to lowest score.
    """
    words = count.get_feature_names_out()

    # Take the labels in the frame's own row order. The score columns follow
    # that order, so re-sorting the labels here would attach words to the wrong
    # cluster whenever `grouped` is not already sorted by label.
    labels = list(grouped["clusters"])

    scores_by_cluster = c_tf_idf.T
    # argsort is ascending, so the last n positions hold the best-scoring words.
    top_indices = scores_by_cluster.argsort()[:, -n:]

    return {
        label: [
            (words[word_idx], scores_by_cluster[row][word_idx])
            for word_idx in top_indices[row][::-1]
        ]
        for row, label in enumerate(labels)
    }

def print_to_words(top_words, columns=2):
    """Print the top words of every topic as a fixed-width text table.

    Topics are printed ``columns`` at a time, side by side, in ascending label
    order; the outlier label ``-1`` (if present) is always printed last under
    the heading ``Outliers (-1)``.

    Parameters
    ----------
    top_words : dict
        Maps a cluster label to a list of ``(word, score)`` tuples. Each score
        must provide a ``.round()`` method (e.g. NumPy floats).
    columns : int, default 2
        Number of topics shown side by side. Must be positive.

    Returns
    -------
    None
        The table is written to standard output.
    """
    # Regular topics in ascending order, outliers moved to the very end.
    ordered_ids = sorted(c_id for c_id in top_words if c_id != -1)
    if -1 in top_words:
        ordered_ids.append(-1)

    for start in range(0, len(ordered_ids), columns):
        row_ids = ordered_ids[start:start + columns]
        rule = "-" * (33 * len(row_ids) + 2)

        # Header line with one cell per topic in this group.
        print(rule)
        for pos, c_id in enumerate(row_ids):
            lead = "|  " if pos == 0 else ""
            end = "\n" if pos == len(row_ids) - 1 else ""
            title = f"Topic #{c_id}" if c_id != -1 else "Outliers (-1)"
            print(lead, title.ljust(29), "| ", end=end)
        print(rule)

        # zip stops at the shortest word list, so every printed rank has an
        # entry for each topic in the group.
        ranked_rows = zip(*(top_words[c_id] for c_id in row_ids))
        for rank, word_scores in enumerate(ranked_rows, start=1):
            for pos, (word, score) in enumerate(word_scores):
                lead = str(rank).ljust(3) if pos == 0 else ""
                print(lead, word.ljust(20), str(score.round(5)).ljust(8), "|", end=" ")
            # The line break is emitted once per rank, after all of its cells.
            print()
        print()
        

### Class-Based Term-Frequency Inverse-Document-Frequency (in answers)
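All answers in a cluster are joined into a single document, and each word is scored by how frequent it is within that cluster and how rare it is across the whole dataset. This lets us find the words that are most frequent in, and most relevant to, a cluster relative to the other clusters.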

In [10]:
# Join all answers that share a cluster label into one space-separated document per cluster.
grouped_answers = data.groupby(['clusters'], as_index=False).agg({'answers': ' '.join})
# Score every vocabulary word per cluster; `data` is passed only so its row count can feed the IDF term.
c_tf_idf_ans, count_ans = generate_c_tf_idf(grouped_answers, data, "answers")

grouped_answers

Unnamed: 0,clusters,answers
0,-1,A Samsung phone is sitting on a table. A pictu...
1,0,AAU. A. A picture of a man with a shirt that s...
2,1,A car with a license plate that says 10033. A3...
3,2,A Kern EMB 220-1 is on the table. A gallon of ...
4,3,A person is holding a phone that says Run Reco...
5,4,A book titled Take Your Girlie To The Movies. ...
6,5,A vehicle with a license plate that reads VE 8...
7,6,A Chinese Cuisine restaurant. 'a' A man sits a...
8,7,"Apr 22, 2010 Thursday PM A game is on the scre..."


#### Extract top n Words From Each Topic (in answers)

In [11]:
# Keep the `n_top_words` best-scoring answer words for each cluster, then print them as a side-by-side table.
top_words_in_answers = extract_top_n_words_per_topic(grouped_answers, c_tf_idf_ans, count_ans, n_top_words)

print_to_words(top_words_in_answers)

--------------------------------------------------------------------
|   Topic #0                      |  Topic #1                      | 
--------------------------------------------------------------------
1   box                  0.53436  |  boat                 0.35852  | 
2   holding              0.07417  |  written              0.17501  | 
3   says                 0.07346  |  helicopter           0.1511   | 
4   book                 0.07338  |  plane                0.1511   | 
5   10                   0.07142  |  white                0.11622  | 
6   00                   0.07005  |  a320                 0.09945  | 
7   person               0.06314  |  hat                  0.09945  | 
8   cup                  0.0467   |  a3200                0.09945  | 
9   ice                  0.0467   |  knife                0.09945  | 
10  clock                0.0467   |  tarmac               0.09945  | 

--------------------------------------------------------------------
|   Topic #2          

### Class-Based Term-Frequency Inverse-Document-Frequency (in questions)
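All questions in a cluster are joined into a single document, and each word is scored by how frequent it is within that cluster and how rare it is across the whole dataset. This lets us find the words that are most frequent in, and most relevant to, a cluster relative to the other clusters.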

In [12]:
# Join all questions that share a cluster label into one space-separated document per cluster.
grouped_questions = data.groupby(['clusters'], as_index=False).agg({'questions': ' '.join})
# Score every vocabulary word per cluster; `data` is passed only so its row count can feed the IDF term.
c_tf_idf_ques, count_ques = generate_c_tf_idf(grouped_questions, data, "questions")

grouped_questions

Unnamed: 0,clusters,questions
0,-1,how did the screen get cracked? what word is a...
1,0,what numbers are on the tail of the helicopter...
2,1,what is the box car's serial number? what is t...
3,2,what is written on the button with a blue outl...
4,3,what is the total distance in kilometers? what...
5,4,it is a romantic movie? what is lite? what s i...
6,5,what does the license plate read? what is the ...
7,6,what ethnicity of cuisine does this restaurant...
8,7,what time is on the screen? which subcategory ...


#### Extract top n Words From Each Topic (in questions)

In [13]:
# Keep the `n_top_words` best-scoring question words for each cluster, then print them as a side-by-side table.
top_words_in_questions = extract_top_n_words_per_topic(grouped_questions, c_tf_idf_ques, count_ques, n_top_words)
print_to_words(top_words_in_questions)

--------------------------------------------------------------------
|   Topic #0                      |  Topic #1                      | 
--------------------------------------------------------------------
1   does                 0.13097  |  boat                 0.47373  | 
2   letter               0.12831  |  plane                0.33721  | 
3   brand                0.11546  |  number               0.31545  | 
4   cup                  0.11431  |  airplane             0.27433  | 
5   kind                 0.10884  |  helicopter           0.25242  | 
6   bottle               0.10849  |  tail                 0.23687  | 
7   numbers              0.10849  |  letters              0.1994   | 
8   say                  0.1042   |  island               0.1559   | 
9   ice                  0.08121  |  serial               0.1559   | 
10  cream                0.08121  |  service              0.1559   | 

--------------------------------------------------------------------
|   Topic #2          

## Combine all the Data

In [14]:
# Export the augmented frame (original columns plus x, y and cluster label) as CSV.
# to_csv does not create missing directories, so make sure the output folder
# exists first; this keeps a fresh checkout runnable with Restart & Run All.
Path("./out_data").mkdir(parents=True, exist_ok=True)
data.to_csv("./out_data/embeds.csv", index=False)

In [15]:
def convert_topics_to_df(topics):
    """Flatten a ``{cluster: [(word, score), ...]}`` mapping into a long table.

    Parameters
    ----------
    topics : dict
        Maps a cluster label to its list of ``(word, score)`` tuples.

    Returns
    -------
    pandas.DataFrame
        One row per (cluster, word) pair with the columns ``clusters``,
        ``word`` and ``score``, in the iteration order of ``topics``.
    """
    flat_rows = [
        (topic_id, word, value)
        for topic_id, ranked_words in topics.items()
        for word, value in ranked_words
    ]

    return pd.DataFrame({
        "clusters": [row[0] for row in flat_rows],
        "word": [row[1] for row in flat_rows],
        "score": [row[2] for row in flat_rows],
    })

# Export the per-cluster top words for answers and for questions as long-format CSV files.
# to_csv does not create missing directories, so make sure the output folder exists first.
Path("./out_data").mkdir(parents=True, exist_ok=True)
convert_topics_to_df(top_words_in_answers).to_csv("./out_data/answer_topics.csv", index=False)
convert_topics_to_df(top_words_in_questions).to_csv("./out_data/question_topics.csv", index=False)