## Get All Questions

In [4]:
import json

# Read the StrategyQA training file. JSON text is UTF-8 by specification, so
# the encoding is pinned instead of being left to the platform locale default
# (which is not UTF-8 on every system).
with open('strategyqa-data/strategyqa_dataset/strategyqa_train.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Keep only the question text of each record, preserving file order.
questions = [item['question'] for item in data]

print('{} Questions Read'.format(len(questions)))

2290 Questions Read


In [5]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model applied to every question.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Embed all questions in a single call. batch_size=32 is the library default;
# it is written out so the batch count shown by the progress bar is explained.
corpus_embeddings = model.encode(
    questions,
    batch_size=32,
    show_progress_bar=True,
)

Batches:   0%|          | 0/72 [00:00<?, ?it/s]

## Cluster Them

In [7]:
from sklearn.cluster import AgglomerativeClustering
import numpy as np

In [8]:
# Scale every embedding row to unit L2 norm.
row_norms = np.linalg.norm(corpus_embeddings, axis=1, keepdims=True)
corpus_embeddings = corpus_embeddings / row_norms

# Agglomerative (hierarchical) clustering into a fixed number of clusters.
clustering_model = AgglomerativeClustering(n_clusters=22)
clustering_model.fit(corpus_embeddings)
cluster_assignment = clustering_model.labels_

# Group the question texts by their cluster label. Labels and questions are
# index-aligned, so walking them in lockstep keeps each cluster's questions in
# their original order and inserts clusters in order of first appearance.
clustered_sentences = {}
for cluster_id, question in zip(cluster_assignment, questions):
    clustered_sentences.setdefault(cluster_id, []).append(question)

In [11]:
# Dump every cluster in full: a 1-based cluster number, its size, then the raw
# list of questions. Cluster labels are 0-based, hence the +1 for display.
for label, members in clustered_sentences.items():
    cluster_number = label + 1
    print("Cluster ", cluster_number)
    print("Cluster Length: ", len(members))
    print(members)
    print("")

Cluster  1
Cluster Length:  203
['Are more people today related to Genghis Khan than Julius Caesar?', 'Do the anchors on Rede Globo speak Chinese?', 'Is the language used in Saint Vincent and the Grenadines rooted in English?', 'Did land owners elect their rulers in the Kingdom of Hungary?', 'Did Japanese serfdom have higher status than English counterpart?', 'Does Ukrainian Greek Catholic Church recognize Alexander Nevsky as a saint?', 'Can you write a whole Haiku in a single tweet?', "Will Tokyo Tower be repainted only once during President Trump's first term?", 'Does a person need to be a parent to become a grandparent?', 'Could Elizabeth I of England have seen the play Dido, Queen of Carthage ?', 'Would it be typical for a Rede Globo anchor to say Konnichiwa to the viewers?', 'Are most books written as a Haiku?', 'Could the main character of "Alice\'s Adventures in Wonderland" join a Masonic Lodge?', 'Did King James I despise fairy beings?', "Was Charlemagne's father instrumental i

In [23]:
# For each cluster, summarize a sample of its questions.
# NOTE(review): `summarizer` is not defined in the cells visible here; it is
# presumably created in an earlier cell and returns a list of dicts with a
# 'summary_text' key -- confirm before running on a fresh kernel.
for label, members in clustered_sentences.items():
    print("Cluster ", label + 1)
    print("Cluster Length: ", len(members))
    # Only the first 15 questions are joined into the text to summarize.
    sample = members[:15]
    text = " ".join(sample)
    print(text)
    # Run the summarizer and show the text of its first result.
    summary = summarizer(text)
    first_result = summary[0]
    print("Summary:", first_result['summary_text'])
    print("--")

Cluster  1
Cluster Length:  203
Are more people today related to Genghis Khan than Julius Caesar? Do the anchors on Rede Globo speak Chinese? Is the language used in Saint Vincent and the Grenadines rooted in English? Did land owners elect their rulers in the Kingdom of Hungary? Did Japanese serfdom have higher status than English counterpart? Does Ukrainian Greek Catholic Church recognize Alexander Nevsky as a saint? Can you write a whole Haiku in a single tweet? Will Tokyo Tower be repainted only once during President Trump's first term? Does a person need to be a parent to become a grandparent? Could Elizabeth I of England have seen the play Dido, Queen of Carthage ? Would it be typical for a Rede Globo anchor to say Konnichiwa to the viewers? Are most books written as a Haiku? Could the main character of "Alice's Adventures in Wonderland" join a Masonic Lodge? Did King James I despise fairy beings? Was Charlemagne's father instrumental in outcome of the Battle of Tours?
Summary:  A

In [30]:
import os

# Write each cluster's questions to its own JSON file, named with the 1-based
# cluster number (cluster labels are 0-based, hence the +1).
# The output directory is created first so this cell also works on a fresh
# checkout; without it open() raises FileNotFoundError.
os.makedirs('question_clusters', exist_ok=True)
for i, cluster in clustered_sentences.items():
    with open('question_clusters/cluster'+str(i+1)+'.json', 'w', encoding='utf-8') as f:
        json.dump(cluster, f)