In [10]:
# Cluster info for pinecone databases
import os
import numpy as np
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import openai
import pinecone
from dotenv import load_dotenv, find_dotenv
from langchain.vectorstores import Pinecone
from langchain.embeddings import OpenAIEmbeddings
import random
import nltk
from nltk.tokenize import word_tokenize

In [2]:
load_dotenv(find_dotenv(),override=True)

# Pinecone
pinecone.init(
    api_key=os.getenv('PINECONE_API_KEY'),
    environment=os.getenv('PINECONE_ENVIRONMENT') 
)
index_name = 'langchain-quickstart'
embeddings_model = OpenAIEmbeddings(model="text-embedding-ada-002",openai_api_key=os.getenv('OPENAI_API_KEY'))
vectorstore = pinecone.Index(index_name)

In [3]:
def get_ids_from_query(index,input_vector):
  print("searching pinecone...")
  results = index.query(vector=input_vector, top_k=10000,include_values=False)
  ids = set()
  print(type(results))
  for result in results['matches']:
    ids.add(result['id'])
  return ids

def get_all_ids_from_index(index, num_dimensions, namespace=""):
  num_vectors = index.describe_index_stats()["namespaces"][namespace]['vector_count']
  all_ids = set()
  while len(all_ids) < num_vectors:
    print("Length of ids list is shorter than the number of total vectors...")
    input_vector = np.random.rand(num_dimensions).tolist()
    print("creating random vector...")
    ids = get_ids_from_query(index,input_vector)
    print("getting ids from a vector query...")
    all_ids.update(ids)
    print("updating ids set...")
    print(f"Collected {len(all_ids)} ids out of {num_vectors}.")

  return all_ids

all_ids = get_all_ids_from_index(vectorstore, num_dimensions=1536, namespace="")
all_ids=list(all_ids)

Length of ids list is shorter than the number of total vectors...
creating random vector...
searching pinecone...
<class 'pinecone.core.client.model.query_response.QueryResponse'>
getting ids from a vector query...
updating ids set...
Collected 10000 ids out of 11802.
Length of ids list is shorter than the number of total vectors...
creating random vector...
searching pinecone...
<class 'pinecone.core.client.model.query_response.QueryResponse'>
getting ids from a vector query...
updating ids set...
Collected 11556 ids out of 11802.
Length of ids list is shorter than the number of total vectors...
creating random vector...
searching pinecone...
<class 'pinecone.core.client.model.query_response.QueryResponse'>
getting ids from a vector query...
updating ids set...
Collected 11731 ids out of 11802.
Length of ids list is shorter than the number of total vectors...
creating random vector...
searching pinecone...
<class 'pinecone.core.client.model.query_response.QueryResponse'>
getting ids f

In [4]:
subset_size = min(1000,int(len(all_ids) * 0.1))   # Calculate the number of indices you need (10% of the total size). If this exceeds 1000, it picks 1000.
random_subset_indices = random.sample(range(len(all_ids)), subset_size) # Get a random subset of indices
all_ids_slice = [all_ids[index] for index in random_subset_indices] # Downselect to the random subset

In [5]:
vector_slice=vectorstore.fetch(ids=all_ids_slice)

In [6]:
# print(vector_slice['vectors'][all_ids_slice[0]]['metadata']['page'])
# print(vector_slice['vectors'][all_ids_slice[0]]['metadata']['source'])
# print(vector_slice['vectors'][all_ids_slice[0]]['metadata']['text'])
# print(vector_slice['vectors'][all_ids_slice[0]]['values'])

vector_slice_text=[]
vector_slice_embeddings=[]
for id in all_ids_slice:
    vector_slice_text.append(vector_slice['vectors'][id]['metadata']['text'])
    vector_slice_embeddings.append(vector_slice['vectors'][id]['values'])

In [13]:
# Step 1: Preprocess the Data
# Assuming you have a numpy array called 'pinecone_vectors' with shape (num_vectors, vector_dim)
# Make sure your data is in the right format and normalized if necessary

# Step 2: Choose a Clustering Algorithm
num_clusters = 10  # Specify the desired number of clusters

# TODO: Step 3: Determine the Number of Clusters (optional)
# You can use techniques like the elbow method or silhouette analysis to determine the optimal number of clusters

# Step 4: Apply the Clustering Algorithm
kmeans = KMeans(n_clusters=num_clusters, init="k-means++", n_init=10)
cluster_labels = list(kmeans.fit_predict(vector_slice_embeddings))

# Step 5: Interpret the Clusters
# You can analyze the cluster centroids or representative vectors to understand the cluster's properties
cluster_centers = kmeans.cluster_centers_

In [15]:
# Step 6: Assign Automatic Descriptions using OpenAI
cluster_descriptions = []

def truncate_string_at_n_tokens(text, n):
    # Tokenize the text
    tokens = word_tokenize(text)
    # Retrieve the first N tokens
    truncated_tokens = tokens[:n]
    # Join the tokens back into a string
    truncated_string = ' '.join(truncated_tokens)
    return truncated_string

for i in range(num_clusters):
    cluster_data = vector_slice_text[cluster_labels == i]
    cluster_content = ", ".join([str(vec) for vec in cluster_data])  # Modify this based on your specific data representation
    cluster_content=truncate_string_at_n_tokens(cluster_content,2000)
    print('Cluster content: '+cluster_content+'\n'+'---'+'\n')
    prompt = f"Cluster {i+1} content: {cluster_content}. Describe the common characteristics or theme of this cluster."
    response = openai.Completion.create(
        engine='davinci-instruct-beta-v3',
        prompt=prompt,
        max_tokens=50,
        temperature=0
    )
    description = response.choices[0].text.strip()
    cluster_descriptions.append(description)
    print('\n'+description+'\n')

['The common characteristics or theme of this cluster is that all of the words are related to figures or images.', 'The common characteristics or theme of this cluster is that all of the words are related to the concept of a "figure." This could include things like shapes, numbers, or illustrations.', 'This cluster is about different types of valves.', 'This cluster is about different types of valves.', 'This cluster is about different types of valves.', 'The common characteristics or theme of this cluster is that all of the words are related to figures or images.', 'The common characteristics or theme of this cluster is that all of the words are related to figures or images.', 'The common characteristics or theme of this cluster is that all of the words are related to figures or images.', 'The common characteristics or theme of this cluster is that all of the words are related to figures or images.', 'The common characteristics or theme of this cluster is that all of the words are rel

In [None]:

# Step 7: Visualize the Clusters
# Reduce the dimensionality of the vectors for visualization using PCA
pca = PCA(n_components=2)
pca_vectors = pca.fit_transform(pinecone_vectors)

# Plot the clusters
plt.figure(figsize=(8, 6))
for i in range(num_clusters):
    cluster_data = pca_vectors[cluster_labels == i]
    plt.scatter(cluster_data[:, 0], cluster_data[:, 1], label=cluster_descriptions[i])
plt.title("Pinecone Vector Clustering")
plt.legend()
plt.show()

# Step 8: Evaluate and Refine (optional)
# You can use clustering evaluation metrics to assess the quality of the clustering results