In [1]:
# import libraries
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from collections import Counter
import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the interview-question spreadsheet and flatten its "question" column
# into a plain Python list of strings.
df = pd.read_excel('../data/graduate_admission_interview_questions.xlsx')

# Drop rows with no question before casting, so missing values are removed
# rather than being turned into strings by astype(str).
question_column = df["question"].dropna()
texts = question_column.astype(str).tolist()
texts

['Can you tell me about yourself?',
 'Why have you chosen this course/institution?',
 'Do you think your undergraduate record reflects your effort and ability?',
 'What are your strengths?',
 'What are your weaknesses?',
 'Which academic or businessperson do you most admire and why?',
 'What difficult decision have you had to make in the last six months?',
 'How do you plan to fund your studies?',
 'What questions do you have for me, or about the course or university?',
 'Why this course?',
 'Why this university?',
 'Tell us about yourself',
 'What are your career aspirations',
 'Describe your strengths/weaknesses',
 'Describe a recent challenge you faced and how you approached it',
 'How you intend to fund your studies',
 'What questions do you have?',
 'Tell me about yourself',
 'How will you contribute to our program?',
 'What are your research interests?',
 'How have your undergraduate studies prepared you for this program?',
 'Have you worked on any projects that are particularly 

In [3]:
# KMeans clustering of the question embeddings.

# Encode every question into a dense vector with the sentence-transformer model.
model = SentenceTransformer('all-MiniLM-L6-v2')
raw_embeddings = model.encode(texts)

# Divide each row by its own L2 norm so every embedding has unit length.
row_norms = np.linalg.norm(raw_embeddings, axis=1, keepdims=True)
texts_embeddings = raw_embeddings / row_norms

# n_clusters is the number of question groups to produce; random_state pins
# the centroid initialisation so repeated runs give the same labels.
clustering_model = KMeans(n_clusters=15, random_state=42)
clustering_model.fit(texts_embeddings)

# labels_ holds one cluster id per input row, in the same order as `texts`.
cluster_assignment = clustering_model.labels_
print(cluster_assignment)

Loading weights: 100%|██████████| 103/103 [00:00<00:00, 186.57it/s, Materializing param=pooler.dense.weight]                             
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


[ 8  2 10 10 10  3  1 14  7  2  2  8  9 10 11 14  7  8  6 14  4  6  1  3
  9  5  6 11  3  8 10  3  9  4  3 10  3 12  9  4 14 13  4 14  9  1  3  6
 14  1  1  1 11  4 11 10  6  6  3  0  2  4  1  0  0  0  2  6  0  0 10  0
 11 11  0  5 13  5  5 12 10  9  7  8  2  6  3 13  6  1  7  0  3 14  2 12
  6  6  0  9  9  1  9  1 11  9  1  8  5 14  4 11  3  0 12  1 10  3 10 14
  3 12  2  9  3  9 12  4  0 14 14 14 11  1  1  8  6  1  1  1  5 14  4  9
  3  9  6  6  4  9  6  0  3  2  6  3  9  1  6 13  2  3 14 14 14  5 11  1
 11 13  8  6  0  6  9  0]




In [4]:
# Group the question texts by the cluster label assigned to each of them:
# {cluster label: [question, question, ...]}
clustered_sentences = {}

# The label at a given position of cluster_assignment is paired with the text
# at the same position of `texts`.
for position, label in enumerate(cluster_assignment):
    # setdefault creates the empty bucket the first time a label is seen,
    # then the question is appended to that label's bucket.
    clustered_sentences.setdefault(label, []).append(texts[position])
clustered_sentences

{np.int32(8): ['Can you tell me about yourself?',
  'Tell us about yourself',
  'Tell me about yourself',
  'Tell me about yourself.',
  'Tell Me About Yourself',
  'Take about a minute to tell me what I should really know about you?',
  'What should we know about you?',
  'Can you tell me a bit about yourself?'],
 np.int32(2): ['Why have you chosen this course/institution?',
  'Why this course?',
  'Why this university?',
  'Why did you get a poor grade in____?',
  'Can you explain why your admission test scores went up (down) when you took the test a second time?',
  'Why Our School/Program?',
  'Why did you choose this school?',
  'Why did you choose to apply to this university?',
  'Why did you choose to apply to our school/program?',
  'Why our school/program?'],
 np.int32(10): ['Do you think your undergraduate record reflects your effort and ability?',
  'What are your strengths?',
  'What are your weaknesses?',
  'Describe your strengths/weaknesses',
  'What do you consider your

In [5]:
# Top-1 and top-3 representative questions for one example cluster (label 11).
example_cluster = clustered_sentences[11]

# Embed only the questions that belong to this cluster.
cat_text_emb = model.encode(example_cluster)

# Centroid = mean embedding; keepdims keeps it as a single-row 2-D array,
# which is the input form passed to cosine_similarity below.
centroid = cat_text_emb.mean(axis=0, keepdims=True)

# Cosine similarity of every question to the centroid, flattened to 1-D.
similarities = cosine_similarity(cat_text_emb, centroid).flatten()
print(similarities)

# The question closest to the centroid is taken as the canonical one.
rep_idx = similarities.argmax()
print('Representative question idx:', rep_idx)

canonical_question = example_cluster[rep_idx]
print("Canonical question:", canonical_question)

# argsort is ascending, so reverse it and keep the first three positions
# (most similar first).
top3_idx = similarities.argsort()[::-1][:3]
top3_questions = [example_cluster[position] for position in top3_idx]
print("Top 3 representative questions:")
for question in top3_questions:
    print("-", question)

[0.72411555 0.7332921  0.7344887  0.6529205  0.5475418  0.68379056
 0.6305382  0.72673064 0.7170994  0.62781376 0.6528783 ]
Representative question idx: 2
Canonical question: Tell me how you handle stress.
Top 3 representative questions:
- Tell me how you handle stress.
- How do you handle challenges and setbacks in your studies?
- How do you manage stress and competing priorities in your life?


In [6]:
# Build one (canonical question, cluster size) tuple per cluster.
rep_questions = []

# Iterate over the labels actually present in the dict instead of
# range(len(...)): the index-based loop assumed the keys are exactly 0..n-1
# and raised KeyError as soon as a label was missing or non-contiguous.
# Sorting keeps the ascending-label order the index-based loop produced.
for cluster_id in sorted(clustered_sentences):
    questions = clustered_sentences[cluster_id]
    # embed the questions of this cluster
    cat_text_emb = model.encode(questions)
    # centroid = mean embedding, kept as a single-row 2-D array
    centroid = cat_text_emb.mean(axis=0, keepdims=True)
    # cosine similarity of every question to the centroid, flattened to 1-D
    similarities = cosine_similarity(cat_text_emb, centroid).flatten()
    # the question most similar to the centroid represents the cluster
    rep_idx = similarities.argmax()
    # collect the representative question together with the cluster size
    rep_questions.append((questions[rep_idx], len(questions)))



In [7]:
# Rank the canonical questions from the largest cluster to the smallest.
# Sorting ascending on the negated count is a stable descending sort, so
# clusters of equal size keep their existing relative order.
rep_questions = sorted(rep_questions, key=lambda pair: -pair[1])

# Print each canonical question with the size of the cluster it represents.
for question, n_similar in rep_questions:
    print(question)
    print(">> Similar questions appeared", n_similar, "times")
    print("-----")
    print("-----")

What is your most significant accomplishment?
>> Similar questions appeared 19 times
-----
How will you contribute to our program?
>> Similar questions appeared 19 times
-----
Why are you interested in this degree?
>> Similar questions appeared 18 times
-----
What are your career goals?
>> Similar questions appeared 17 times
-----
What are your research interests?
>> Similar questions appeared 16 times
-----
What courses have you enjoyed the most throughout your college career?
>> Similar questions appeared 15 times
-----
What are your strengths and weaknesses?
>> Similar questions appeared 11 times
-----
Tell me how you handle stress.
>> Similar questions appeared 11 times
-----
Why did you choose this school?
>> Similar questions appeared 10 times
-----
How do you feel your undergraduate studies have prepared you for this program?
>> Similar questions appeared 10 times
-----
Tell me about yourself
>> Similar questions appeared 8 times
-----
What would you change about yourself and wh