In [1]:
# Cluster info for pinecone databases
import os
import numpy as np
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import openai
import pinecone
from dotenv import load_dotenv, find_dotenv
from langchain.vectorstores import Pinecone
from langchain.embeddings import OpenAIEmbeddings
import random
import nltk
from nltk.tokenize import word_tokenize
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)  # for exponential backoff

  from tqdm.autonotebook import tqdm


In [2]:
load_dotenv(find_dotenv(),override=True)

# Pinecone
pinecone.init(
    api_key=os.getenv('PINECONE_API_KEY'),
    environment=os.getenv('PINECONE_ENVIRONMENT') 
)
index_name = 'langchain-quickstart'
embeddings_model = OpenAIEmbeddings(model="text-embedding-ada-002",openai_api_key=os.getenv('OPENAI_API_KEY'))
vectorstore = pinecone.Index(index_name)

In [3]:
def get_ids_from_query(index,input_vector):
  print("searching pinecone...")
  results = index.query(vector=input_vector, top_k=10000,include_values=False)
  ids = set()
  print(type(results))
  for result in results['matches']:
    ids.add(result['id'])
  return ids

def get_all_ids_from_index(index, num_dimensions, namespace=""):
  num_vectors = index.describe_index_stats()["namespaces"][namespace]['vector_count']
  all_ids = set()
  while len(all_ids) < num_vectors:
    print("Length of ids list is shorter than the number of total vectors...")
    input_vector = np.random.rand(num_dimensions).tolist()
    print("creating random vector...")
    ids = get_ids_from_query(index,input_vector)
    print("getting ids from a vector query...")
    all_ids.update(ids)
    print("updating ids set...")
    print(f"Collected {len(all_ids)} ids out of {num_vectors}.")

  return all_ids

all_ids = get_all_ids_from_index(vectorstore, num_dimensions=1536, namespace="")
all_ids=list(all_ids)

Length of ids list is shorter than the number of total vectors...
creating random vector...
searching pinecone...
<class 'pinecone.core.client.model.query_response.QueryResponse'>
getting ids from a vector query...
updating ids set...
Collected 10000 ids out of 11802.
Length of ids list is shorter than the number of total vectors...
creating random vector...
searching pinecone...
<class 'pinecone.core.client.model.query_response.QueryResponse'>
getting ids from a vector query...
updating ids set...
Collected 11551 ids out of 11802.
Length of ids list is shorter than the number of total vectors...
creating random vector...
searching pinecone...
<class 'pinecone.core.client.model.query_response.QueryResponse'>
getting ids from a vector query...
updating ids set...
Collected 11756 ids out of 11802.
Length of ids list is shorter than the number of total vectors...
creating random vector...
searching pinecone...
<class 'pinecone.core.client.model.query_response.QueryResponse'>
getting ids f

In [4]:
max_size=1000
scale_factor=0.1
subset_size = min(max_size,int(len(all_ids) * scale_factor))   # Calculate the number of indices you need (10% of the total size). If this exceeds 1000, it picks 1000.
random_subset_indices = random.sample(range(len(all_ids)), subset_size) # Get a random subset of indices
all_ids_slice = [all_ids[index] for index in random_subset_indices] # Downselect to the random subset

In [5]:
vector_slice=vectorstore.fetch(ids=all_ids_slice)

In [6]:
# print(vector_slice['vectors'][all_ids_slice[0]]['metadata']['page'])
# print(vector_slice['vectors'][all_ids_slice[0]]['metadata']['source'])
# print(vector_slice['vectors'][all_ids_slice[0]]['metadata']['text'])
# print(vector_slice['vectors'][all_ids_slice[0]]['values'])

vector_slice_text=[]
vector_slice_embeddings=[]
for id in all_ids_slice:
    vector_slice_text.append(vector_slice['vectors'][id]['metadata']['text'])
    vector_slice_embeddings.append(vector_slice['vectors'][id]['values'])

In [7]:
# Step 1: Preprocess the Data
# Assuming you have a numpy array called 'pinecone_vectors' with shape (num_vectors, vector_dim)
# Make sure your data is in the right format and normalized if necessary

# Step 2: Choose a Clustering Algorithm
num_clusters = 10  # Specify the desired number of clusters

# TODO: Step 3: Determine the Number of Clusters (optional)
# You can use techniques like the elbow method or silhouette analysis to determine the optimal number of clusters

# Step 4: Apply the Clustering Algorithm
kmeans = KMeans(n_clusters=num_clusters, init="k-means++", n_init=10)
cluster_labels = list(kmeans.fit_predict(vector_slice_embeddings))

# Step 5: Interpret the Clusters
# You can analyze the cluster centroids or representative vectors to understand the cluster's properties
cluster_centers = kmeans.cluster_centers_

In [8]:
cluster_data = [text for text, label in zip(vector_slice_text, cluster_labels) if label == 0]
print(cluster_data[1])

Figure 1.ISIS1/=-Scale Model Background Thewirebrakewasdesigned toenablethecontrolled deployment ofaninflatable boom. Ingeneral, inflatable boomsmustactasstructures duringdeployment, andtheymustdeployinacontrolled manner. Onewayofcreating aninflatable boomistomakeacylindrical tubewithendcaps.Theunpressurized tubecanbeflattened androlledontoaspool.Byintroducing inflation gasintothefreeendofthetube, thetubewillunrollfromthespool.Thespoolrotates astheboomisdeployed. Acontrolled deployment mechanism isusedtoregulate therotation rateofthespool.Theresistance ofthespoolissetatalevel thatallowsadequate pressure tobemaintained intheinflated portionoftheboomwhilethedeployment envelope andrateremainpredictable. Figure2showsthecontrolled deployment mechanism usedfor ISIS............... =_z =. Figure2.ISISControlled Deployment DevicewithBoom 232


In [13]:
# Step 6: Assign Automatic Descriptions using OpenAI
cluster_descriptions = []

def truncate_string_at_n_tokens(text, n):
    # Tokenize the text
    tokens = word_tokenize(text)
    # Retrieve the first N tokens
    truncated_tokens = tokens[:n]
    # Join the tokens back into a string
    truncated_string = ' '.join(truncated_tokens)
    return truncated_string

@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
def completion_with_backoff(**kwargs):
    return openai.Completion.create(**kwargs)

for i in range(num_clusters):
    cluster_data = [text for text, label in zip(vector_slice_text, cluster_labels) if label == i]
    cluster_content = ", ".join([str(vec) for vec in cluster_data])  # Modify this based on your specific data representation
    # print('Cluster content: \n'+cluster_content+'\n'+'---'+'\n')
    cluster_content=truncate_string_at_n_tokens(cluster_content,500)
    # print('Truncated cluster content: \n'+cluster_content+'\n'+'---'+'\n')

    prompt = f"Cluster {i+1} content:\n {cluster_content}.\n---\n Describe the common characteristics or theme of this cluster."
    print('Prompt: \n'+prompt+'\n'+'---'+'\n')
    response=completion_with_backoff(engine='gpt-3.5-turbo-instruct',
                                    prompt=prompt,
                                    max_tokens=300,
                                    temperature=0)
    # response = openai.Completion.create(
    #     engine='davinci-instruct-beta',
    #     prompt=prompt,
    #     temperature=0
    # )
    print(response)
    # description = response.choices[0].text.strip()
    # cluster_descriptions.append(description)
    # print('Description: \n'+description+'\n'+'---'+'\n')

Prompt: 
Cluster 1 content:
 their effective length and maximizing the elastic stiff ness with high-modulus materials ( e.g. , titanium ) . This also proved to be beneficial for thermal stability since it minimized the length of the higher CTE materials . Distributed preload sy stems were used in most applications since it was not practical from a mass and cost standpoint to have an independent preload device for each latch interface . However , the preload mechanisms were designed to be much more compliant than the interface fittings which transfer the operational loads . Providing this compliance keeps the preload device out of the primary stiffness path and maintains a relatively constant preload across the interfaces . Primary Mirror The primary mirror deployment and latch mechanisms consist of a pair of hinges and four latches for each wing . The deployment hinges and wing latches act as independent systems . Wing Deployment Hinges The hinges have a dual function . First , the hin

In [None]:
cluster_labels == i
vector_slice_text[cluster_labels == i]

In [None]:

# Step 7: Visualize the Clusters
# Reduce the dimensionality of the vectors for visualization using PCA
pca = PCA(n_components=2)
pca_vectors = pca.fit_transform(pinecone_vectors)

# Plot the clusters
plt.figure(figsize=(8, 6))
for i in range(num_clusters):
    cluster_data = pca_vectors[cluster_labels == i]
    plt.scatter(cluster_data[:, 0], cluster_data[:, 1], label=cluster_descriptions[i])
plt.title("Pinecone Vector Clustering")
plt.legend()
plt.show()

# Step 8: Evaluate and Refine (optional)
# You can use clustering evaluation metrics to assess the quality of the clustering results