In [27]:
import ast
import asyncio
import json
import os
from collections import defaultdict

import nest_asyncio
import numpy as np
import openai
import pandas as pd
import prisma
import requests
from dotenv import load_dotenv
from openai.embeddings_utils import get_embedding, cosine_similarity
from sklearn.cluster import KMeans


# NOTE(review): presumably patches asyncio so nested event loops work inside the
# notebook's already-running loop — confirm against the nest_asyncio docs.
nest_asyncio.apply()
# NOTE(review): presumably loads variables from a local .env file into os.environ;
# a later cell reads os.environ['PINECONE_API_KEY'].
load_dotenv()

True

# Cluster Topics

### Generating Search String Embeddings

In [4]:
# Read the OpenAI key from the environment (same pattern as PINECONE_API_KEY below)
# rather than embedding the secret in the notebook source.
# NOTE: any key that was ever committed in this cell must be treated as compromised
# and rotated.
openai.api_key = os.environ['OPENAI_API_KEY']

# Embed the free-text search query; the resulting vector is used to query the index.
search_string = 'Responsive Whiteboards'
search_vector = get_embedding(search_string, engine="text-search-babbage-query-001")

### Loading Vectors

In [8]:
top_k = 100
# Matches with a similarity score at or below this value are discarded.
min_score = 0.25

url = "https://terrarium-1ce80e9.svc.us-west1-gcp.pinecone.io/query"
data = {
    "vector": search_vector,
    "includeValues": True,
    "topK": top_k
}
headers = {
  "Content-Type": "application/json",
  "Api-Key": os.environ['PINECONE_API_KEY']
}

# A timeout stops the cell hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors directly instead of as a confusing
# KeyError on 'matches' further down.
response = requests.post(url, data=json.dumps(data), headers=headers, timeout=30)
response.raise_for_status()
pinecone_vectors = response.json()
filtered_vectors = [
    match for match in pinecone_vectors['matches'] if match["score"] > min_score
]

### Classifying Vectors

In [9]:
n_clusters = 10

vector_matrix = pd.DataFrame(filtered_vectors)

# Stack the per-match 'values' lists into one 2-D array, one row per match.
matrix = np.vstack(vector_matrix['values'].to_numpy())
# Sanity check on the embedding width before clustering.
assert matrix.shape[1] == 2048
# Fixed random_state keeps the cluster assignment reproducible between runs.
kmeans = KMeans(
    n_clusters=n_clusters,
    init="k-means++",
    random_state=42,
    n_init=10,
).fit(matrix)
labels = kmeans.labels_

### Converting to Clusters

In [50]:
# Drop the raw embedding values and attach the cluster label; labels[position]
# is index-aligned with filtered_vectors.
stripped_vectors = [
    {"id": match["id"], "score": match["score"], "cluster": labels[position]}
    for position, match in enumerate(filtered_vectors)
]

# Group ids by cluster label. Each id is stored wrapped in a one-element list;
# the next cell unwraps it.
tmp = defaultdict(list)
for entry in stripped_vectors:
    tmp[entry['cluster']].append([entry['id']])
output = dict(tmp)

In [55]:
# Convert cluster keys to plain ints and unwrap the one-element id lists so the
# mapping can be serialised with json.
final_output = {
    int(cluster_label): [wrapped_id[0] for wrapped_id in members]
    for cluster_label, members in output.items()
}

In [None]:
# Display the cluster label -> list of ids mapping.
final_output

In [56]:
# Persist the cluster -> ids mapping as JSON in the working directory.
with open('cluster-ids.json', "w") as outfile:
    outfile.write(json.dumps(final_output))

### Get All Feature Requests by IDs


In [5]:
# Use a context manager so the file handle is closed as soon as parsing finishes.
with open('./cluster-frs.json') as f:
    data = json.load(f)

# Every value is a list of JSON-encoded strings; decode each into a dict.
fr_data = {k: [json.loads(raw) for raw in v] for k, v in data.items()}

In [None]:
# Print each cluster key with its first record. The loop uses its own variable
# name so the module-level `data` dict loaded from JSON is not overwritten.
for cluster, frs in fr_data.items():
    print(cluster)
    print(frs[0])

### Analysing Cluster Content (Local Clusters)

In [7]:
# List the extracted feature-request text for every record in cluster '7'.
[record['fr'] for record in fr_data['7']]

['1. Collapsing whiteboards',
 '1. Multiple panes/whiteboards',
 '1. A whiteboard specifically for templates',
 '2. Share whiteboard with colleagues',
 '2. Great for using images on whiteboards',
 '2. Whiteboards with multiple cards',
 '2. Map showing recently edited whiteboards',
 '2. A hotkey to quickly access the whiteboard.',
 '2. An animation feature for whiteboards.',
 '1. More "snap to grid" on the whiteboard',
 '2. A whiteboard template with specific spacing/layout']

In [18]:
def shorten_prompt_data(unsorted_input_list, max_length=3700):
    """Join as many of the shortest strings as fit within a character budget.

    The inputs are sorted by length (shortest first) and appended, each prefixed
    with a newline, until adding the next one would reach ``max_length``. The
    newline separator is counted against the budget, so the returned string is
    always strictly shorter than ``max_length``.

    Args:
        unsorted_input_list: Iterable of strings (feature-request texts).
        max_length: Exclusive upper bound on the length of the returned string.

    Returns:
        A string of the form ``"\\nitem1\\nitem2..."``, or ``""`` when nothing fits
        or the input is empty.
    """
    selected = []
    used = 0
    for fr in sorted(unsorted_input_list, key=len):
        cost = len(fr) + 1  # +1 accounts for the "\n" separator
        if used + cost >= max_length:
            # Inputs are sorted ascending, so nothing later can fit either.
            break
        selected.append(fr)
        used += cost
    return "".join("\n" + fr for fr in selected)

In [25]:
def analyse_local_cluster(input_list, feature_title, top_p=0.15):
    """Ask the completion model for three themes shared by one cluster's requests.

    Args:
        input_list: Feature-request strings for a single cluster; they are
            trimmed with shorten_prompt_data before going into the prompt.
        feature_title: Feature name interpolated into the prompt text.
        top_p: Value forwarded as ``top_p`` to the completion call.

    Returns:
        The raw ``text`` of the first completion choice.
    """
    x = shorten_prompt_data(input_list)
    prompt = f"""
        What are three themes many of the feature requests below share about the feature '{feature_title}'?
        Don't use any verbs. Describe each theme in fewer than five words. Be specific and look for
        unique themes that aren't common. Provide your answer in the form of a Python list, and don't 
        include any newline  characters. Include no other commentary, and use fewer than five words. The 
        feature requests are below: {x}
    """
    completion = openai.Completion.create(
        model="text-davinci-003",
        prompt=prompt,
        top_p=top_p,
        max_tokens=200,
    )
    first_choice = completion['choices'][0]
    return first_choice['text']

In [26]:
# Collect the model's raw theme answer per cluster, echoing each as it arrives.
topic_responses = {}
for cluster, frs in fr_data.items():
    response = analyse_local_cluster(
        input_list=[record['fr'] for record in frs],
        feature_title='Responsive, Interactive Whiteboards',
    )
    print(response)
    topic_responses[cluster] = response


['Customization', 'Organization', 'Interconnectivity']

['Exportability', 'Collaboration', 'Customizability']

['Interactivity', 'Customization', 'Collaboration']

['Collaboration', 'Customization', 'Ease-of-Use']

['Interactivity', 'Customization', 'Organization']

['Zooming', 'Selection', 'Transparency']

['Interactivity', 'Organization', 'Connectivity']

['Collaboration', 'Organization', 'Interactivity']

['Interactivity', 'Connectivity', 'Flexibility']

2) The ability to create multiple whiteboards and easily switch between them.

3) The ability to add annotations to whiteboards and save them for future reference.

['Collaboration', 'Organization', 'Annotation']


In [302]:
# Show the current `response` text with leading/trailing whitespace removed.
response.strip()

"['Collapsing whiteboards', 'Multiple panes/whiteboards', 'Whiteboards with multiple cards']"

In [306]:
# Parse the model's Python-style list literal. ast.literal_eval safely evaluates
# literals only, and copes with items that contain apostrophes or double quotes,
# which swapping quote characters before json.loads would corrupt.
ast.literal_eval(response.strip())

['Collapsing whiteboards',
 'Multiple panes/whiteboards',
 'Whiteboards with multiple cards']

### Analysing Cluster Content (Global Clusters)

In [135]:
# Flatten the per-cluster record lists into one list, preserving cluster order.
flat_fr_data = [
    record
    for cluster_records in fr_data.values()
    for record in cluster_records
]

In [138]:
# Peek at the first flattened record to see which fields are available.
flat_fr_data[0]

{'fr_id': '1004812227775307786-1853161927541104386',
 'message_id': '1004812227775307786',
 'message': "hi alan. partially. but being able to expand more than one whiteboard in the sidebar will show more, at-a-glance, than breadcrumbs alone. breadcrumbs show me where i've been, but the sidebar will show me where i might need to go next.",
 'created_at': '2022-08-04T18:05:03.610000+00:00',
 'author': 'Sams_Here',
 'label': 'Request',
 'fr': '1. The ability to expand more than one whiteboard in the sidebar.',
 'kmeans_labels': 4,
 'userId': '110421822788553907926',
 'user': None,
 'features': None}

In [148]:
# Regroup the request texts by each record's 'kmeans_labels' value, using the
# label's string form as the key.
tmp = defaultdict(list)
for record in flat_fr_data:
    tmp[str(record['kmeans_labels'])].append(record['fr'])
kmeans_output = dict(tmp)

In [235]:
def analyse_kmeans_cluster(input_list, top_p):
    """Ask the completion model for three common themes in a list of requests.

    Args:
        input_list: Feature-request strings; trimmed with shorten_prompt_data
            before being appended to the instructions.
        top_p: Value forwarded as ``top_p`` to the completion call.

    Returns:
        The raw ``text`` of the first completion choice.
    """
    # Plain (non-f) string: the "{x}" below is sent to the model literally.
    instructions = "What are three things many of these feature requests asking for about whiteboards. Answer in the form Common Theme: {x} and use only nouns. List three common themes. \n Feature Requests:"
    request_text = shorten_prompt_data(input_list)
    completion = openai.Completion.create(
        model="text-davinci-002",
        prompt=instructions + request_text,
        top_p=top_p,
        max_tokens=200,
    )
    first_choice = completion['choices'][0]
    return first_choice['text']

In [236]:
# Try the prompt on the first ten request texts stored under label '41'.
analyse_kmeans_cluster(kmeans_output['41'][:10], top_p=0.15)

'\n\nCommon Theme: Zooming\nCommon Theme: Positioning of cards\nCommon Theme: Presentation'

# Other: Clustering the Local Test Dataset

In [14]:
# Use a context manager so the file handle is closed once the JSON is parsed.
# NOTE(review): this is an absolute path on one developer's machine; the cell will
# not run elsewhere until it is made relative or configurable.
with open("/Users/finnmacken/Desktop/TerrariumV2/machine-learning-pipeline/test-dataset.json") as f:
    data = json.load(f)

In [24]:
vector_matrix = pd.DataFrame(data)
# Stack the per-record 'values' lists into one 2-D array, one row per record.
matrix = np.vstack(vector_matrix['values'].to_numpy())
# Sanity check on the embedding width before clustering.
assert matrix.shape[1] == 2048
# Fixed random_state keeps the cluster assignment reproducible between runs.
kmeans = KMeans(
    n_clusters=20,
    init="k-means++",
    random_state=42,
    n_init=10,
).fit(matrix)
labels = kmeans.labels_

In [35]:
# Keep only id and score per record and attach its cluster label; labels[position]
# is index-aligned with data.
filtered_vectors = [
    {"id": record["id"], "score": record["score"], "cluster": labels[position]}
    for position, record in enumerate(data)
]

In [55]:
# Group ids by cluster label. Each id is stored wrapped in a one-element list,
# which the following cell indexes with [0].
tmp = defaultdict(list)
for entry in filtered_vectors:
    tmp[entry['cluster']].append([entry['id']])
output = dict(tmp)


In [None]:
# For every cluster, print the composite-key dicts pairing feature 17 with each
# feature-request id (the id is the single element of each wrapped entry).
for cluster, embeddings in output.items():
    new_embeddings = []
    for embedding in embeddings:
        composite_key = {
            "featureId": 17,
            "featureRequestId": embedding[0],
        }
        new_embeddings.append({"featureId_featureRequestId": composite_key})
    print(new_embeddings)