In [3]:
# Cluster info for pinecone databases
import os
import numpy as np
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import openai
import pinecone
from dotenv import load_dotenv, find_dotenv
from langchain.vectorstores import Pinecone
from langchain.embeddings import OpenAIEmbeddings
import random
import nltk
from nltk.tokenize import word_tokenize
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)  # for exponential backoff

In [4]:
# Read settings from the nearest .env file; override=True lets values from the
# file replace variables that are already present in the process environment.
load_dotenv(find_dotenv(), override=True)

# Name of the Pinecone index this notebook analyses.
index_name = 'ams'

# Pinecone client setup followed by a handle on the index.
# NOTE: despite its name, `vectorstore` is a raw pinecone.Index handle.
pinecone.init(
    api_key=os.getenv('PINECONE_API_KEY'),
    environment=os.getenv('PINECONE_ENVIRONMENT'),
)
vectorstore = pinecone.Index(index_name)

# OpenAI embedding model wrapper (text-embedding-ada-002).
embeddings_model = OpenAIEmbeddings(
    model="text-embedding-ada-002",
    openai_api_key=os.getenv('OPENAI_API_KEY'),
)

In [5]:
def get_ids_from_query(index, input_vector, namespace="", top_k=10000):
  """Return the ids of the matches the index reports for ``input_vector``.

  Args:
    index: Object exposing a Pinecone-style ``query`` method.
    input_vector: Query vector as a list of floats.
    namespace: Namespace to search. When empty, no ``namespace`` argument is
      sent, so the call is the same one this function always made.
    top_k: Maximum number of matches requested from the index.

  Returns:
    set: The ``id`` of every entry in the response's ``matches``.
  """
  print("searching pinecone...")
  query_kwargs = {"vector": input_vector, "top_k": top_k, "include_values": False}
  if namespace:
    # Without this the search would always hit the default namespace, even
    # when the caller is counting vectors in a different one.
    query_kwargs["namespace"] = namespace
  results = index.query(**query_kwargs)
  return {match['id'] for match in results['matches']}

def get_all_ids_from_index(index, num_dimensions, namespace="", top_k=10000, max_stalled_queries=50):
  """Collect the ids stored in one namespace by probing with random vectors.

  Random vectors are queried repeatedly and the returned ids accumulated
  until as many ids have been seen as the index stats report for the
  namespace.

  Args:
    index: Object exposing Pinecone-style ``describe_index_stats`` and
      ``query`` methods.
    num_dimensions: Length of the random probe vectors.
    namespace: Namespace to enumerate ("" for the default namespace).
    top_k: Maximum number of matches requested per probe query.
    max_stalled_queries: Give up after this many consecutive queries that
      contribute no new id, instead of looping forever.

  Returns:
    set: The ids collected. Empty if the namespace is absent from the index
    stats; possibly incomplete if the stall limit was reached (a message is
    printed in both cases).
  """
  namespaces = index.describe_index_stats()["namespaces"]
  if namespace not in namespaces:
    print(f"Namespace {namespace!r} is not present in the index stats; nothing to collect.")
    return set()
  num_vectors = namespaces[namespace]['vector_count']

  all_ids = set()
  stalled_queries = 0
  while len(all_ids) < num_vectors:
    print("Length of ids list is shorter than the number of total vectors...")
    # np.random.rand draws each component from [0, 1).
    input_vector = np.random.rand(num_dimensions).tolist()
    print("creating random vector...")
    ids = get_ids_from_query(index, input_vector, namespace=namespace, top_k=top_k)
    print("getting ids from a vector query...")
    count_before = len(all_ids)
    all_ids.update(ids)
    print("updating ids set...")
    print(f"Collected {len(all_ids)} ids out of {num_vectors}.")

    # Track consecutive queries that found nothing new so the loop cannot
    # spin forever when the remaining ids are never returned.
    stalled_queries = stalled_queries + 1 if len(all_ids) == count_before else 0
    if stalled_queries >= max_stalled_queries:
      print(f"Stopping early: {stalled_queries} consecutive queries returned no new ids.")
      break

  return all_ids

# Collect every id in the default namespace; 1536 is the length of the random
# probe vectors used for the queries.
# sorted() instead of list(set): set iteration order is not stable between
# kernel restarts, which would shuffle the rows fed to the clustering cells and
# make seeded results differ from run to run.
all_ids = sorted(get_all_ids_from_index(vectorstore, num_dimensions=1536, namespace=""))

Length of ids list is shorter than the number of total vectors...
creating random vector...
searching pinecone...
<class 'pinecone.core.client.model.query_response.QueryResponse'>
getting ids from a vector query...
updating ids set...
Collected 7546 ids out of 7546.


In [6]:
max_size_query=1000 # Batch size for fetch calls: the fetch function will top out around 1000 entries.

def iterate_in_chunks(lst, chunk_size):
    """Lazily yield consecutive slices of ``lst`` holding up to ``chunk_size`` items.

    Slices are taken at offsets 0, chunk_size, 2 * chunk_size, ...; the final
    slice may be shorter than the others.
    """
    offsets = range(0, len(lst), chunk_size)
    yield from (lst[start:start + chunk_size] for start in offsets)

# Fetch every vector in batches of max_size_query ids. The raw responses are
# kept in `vectors`; `vector_text` and `vector_embeddings` are parallel lists
# filled in the same order as all_ids.
vectors=[]
vector_text=[]
vector_embeddings=[]
for chunk in iterate_in_chunks(all_ids, max_size_query):
    vector_temp=vectorstore.fetch(ids=chunk)
    vectors.append(vector_temp)
    fetched=vector_temp['vectors']
    # `vector_id` rather than `id`: a top-level loop variable named `id` would
    # leave the builtin id() shadowed for the rest of the kernel session.
    for vector_id in chunk:
        record=fetched[vector_id]
        vector_text.append(record['metadata']['text'])
        vector_embeddings.append(record['values'])

In [7]:
# Sweep candidate cluster counts, recording the KMeans inertia (WCSS) for the
# elbow plot and the silhouette score used to pick the best count.

# Convert once up front instead of letting every fit/score call re-convert the list.
embedding_matrix = np.asarray(vector_embeddings)
n_samples = len(embedding_matrix)

# silhouette_score accepts at most n_samples - 1 distinct labels, so drop the
# candidates this data set is too small to support.
i_clusters = [k for k in range(20, 1001, 20) if k < n_samples]
if not i_clusters:
    raise ValueError(f"Need more than 20 embeddings to sweep cluster counts; got {n_samples}.")

wcss = []
silhouette_scores = []
for i in i_clusters:
    kmeans = KMeans(n_clusters=i, init='k-means++', n_init='auto',random_state=42)
    kmeans.fit(embedding_matrix)
    wcss.append(kmeans.inertia_)

    labels = kmeans.labels_
    silhouette_scores.append(silhouette_score(embedding_matrix, labels))

plt.plot(i_clusters, wcss)
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

# The candidate with the highest silhouette score wins.
best_num_clusters = i_clusters[int(np.argmax(silhouette_scores))]

plt.plot(i_clusters, silhouette_scores)
plt.title('Silhouette Scores')
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette Score')
plt.show()
print(best_num_clusters)

In [None]:
# TODO: Add cluster labeling using a free, open-source (or otherwise cheap) LLM. Try BLOOM: https://lancerninja.com/open-source-models-with-langchain/

In [None]:
# Step 1: Preprocess the Data
# The clustering input is `vector_embeddings`, the list of embedding values
# fetched from the index.
# Make sure your data is in the right format and normalized if necessary

# Step 2: Choose a Clustering Algorithm
# num_clusters = best_num_clusters  # Specify the desired number of clusters
num_clusters = 50  # Specify the desired number of clusters

# Step 4: Apply the Clustering Algorithm
# random_state is pinned to the same seed the cluster-count sweep uses, so the
# labels (and everything derived from them) are reproducible between runs.
kmeans = KMeans(n_clusters=num_clusters, init="k-means++", n_init='auto', random_state=42)
# Kept as a plain list; convert with np.asarray before building boolean masks.
cluster_labels = list(kmeans.fit_predict(vector_embeddings))

# Step 5: Interpret the Clusters
# You can analyze the cluster centroids or representative vectors to understand the cluster's properties
cluster_centers = kmeans.cluster_centers_

In [None]:
# Step 6: Assign Automatic Descriptions using OpenAI
# One generated description per cluster is appended here, in cluster-index
# order, by the loop at the end of this cell.
cluster_descriptions = []

def truncate_string_at_n_tokens(text, n):
    """Keep only the first ``n`` NLTK word tokens of ``text``.

    The kept tokens are re-joined with single spaces, so the spacing of the
    result can differ from the spacing of the input.
    """
    return ' '.join(word_tokenize(text)[:n])

@retry(stop=stop_after_attempt(6), wait=wait_random_exponential(min=1, max=60))
def completion_with_backoff(**kwargs):
    """Forward ``kwargs`` to ``openai.Completion.create`` and return its response.

    The tenacity decorator retries a failed call with a randomized exponential
    wait (min=1, max=60) and stops after 6 attempts.
    """
    completion = openai.Completion.create(**kwargs)
    return completion

# Ask the completion model for a short theme description of each cluster.
for i in range(num_clusters):
    # `vector_text` is the list built in the same order as the embeddings that
    # produced `cluster_labels`, so zipping the two pairs each text with its label.
    cluster_data = [text for text, label in zip(vector_text, cluster_labels) if label == i]
    cluster_content = ", ".join(str(text) for text in cluster_data)  # Modify this based on your specific data representation
    # Cap the prompt payload at the first 500 word tokens.
    cluster_content = truncate_string_at_n_tokens(cluster_content, 500)

    prompt = f"Cluster {i+1} content:\n {cluster_content}.\n---\n Describe the common characteristics or theme of this cluster."
    response = completion_with_backoff(engine='gpt-3.5-turbo-instruct',
                                       prompt=prompt,
                                       max_tokens=100,
                                       temperature=0)
    description = response.choices[0].text.strip()
    cluster_descriptions.append(description)
    print(f'Cluster {i+1} Description: \n'+description+'\n'+'---'+'\n')

In [None]:
# Scratch check: list the texts assigned to cluster `i` (`i` is whatever value
# the most recently executed loop left behind).
# cluster_labels is a plain list, so `cluster_labels == i` would be a single
# bool; converting to an array gives an element-wise mask instead.
cluster_mask = np.asarray(cluster_labels) == i
[text for text, in_cluster in zip(vector_text, cluster_mask) if in_cluster]

In [None]:

# Step 7: Visualize the Clusters
# Reduce the dimensionality of the vectors for visualization using PCA.
# The projection is fitted on `vector_embeddings`, the same data the cluster
# labels were computed from, so rows and labels line up.
pca = PCA(n_components=2)
pca_vectors = pca.fit_transform(np.asarray(vector_embeddings))

# cluster_labels is a plain list; comparing a list to an int yields a single
# bool, so convert to an array to get per-row masks.
label_array = np.asarray(cluster_labels)

# Plot the clusters
plt.figure(figsize=(8, 6))
for i in range(num_clusters):
    cluster_data = pca_vectors[label_array == i]
    plt.scatter(cluster_data[:, 0], cluster_data[:, 1], label=cluster_descriptions[i])
plt.title("Pinecone Vector Clustering")
plt.legend()
plt.show()

# Step 8: Evaluate and Refine (optional)
# You can use clustering evaluation metrics to assess the quality of the clustering results