In [11]:
# data preparation
import pandas as pd
import os
import json

In [12]:
# --- Configuration ---
# Location of the scikit-learn API reference JSON consumed by the loader cell.
# The default is the original machine-specific absolute path; set the
# APIGENIE_JSON_PATH environment variable to run the notebook elsewhere
# without editing this cell.
JSON_FILE_PATH = os.environ.get(
    "APIGENIE_JSON_PATH",
    "/Users/dhirendrachoudhary/Desktop/Workstation/Research/APIGenie/data/scikit-learn-api-reference.json",
)
CHROMA_DB_PATH = "./chroma_db"  # Path to store ChromaDB files
# Passed to create_and_populate_vector_db as the collection name.
CHROMA_COLLECTION_NAME = "sklearn_apis"
# Passed to initialize_embedding_model.
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'  # or 'all-mpnet-base-v2'

In [13]:
# Read the API reference at JSON_FILE_PATH through the project helper and keep
# the result for the later cells (it is dumped to JSON and passed to the
# vector-DB builder).
# NOTE(review): presumably a flat collection of per-API records — confirm
# against src/prepare_data.py.
from src.prepare_data import load_and_flatten_data
api_document = load_and_flatten_data(JSON_FILE_PATH)

In [14]:
# Persist the flattened API data as pretty-printed JSON (4-space indent) so it
# can be inspected outside the notebook. The path is relative to the kernel's
# working directory.
with open("data/flattened_apis.json", mode="w") as flattened_file:
    json.dump(api_document, flattened_file, indent=4)

In [15]:
from src.vectordb_embedding import (
    create_and_populate_vector_db,
    initialize_embedding_model,
    retrieve_relevant_apis,
)

# Step 2: load the embedding model named in the configuration cell.
embedding_model = initialize_embedding_model(EMBEDDING_MODEL_NAME)

# Step 3: create/populate the vector DB collection from the flattened API
# data, using the configured ChromaDB path and collection name.
api_collection = create_and_populate_vector_db(
    api_document,
    embedding_model,
    CHROMA_DB_PATH,
    CHROMA_COLLECTION_NAME,
)

  from .autonotebook import tqdm as notebook_tqdm


Loading embedding model: all-MiniLM-L6-v2...
Embedding model loaded.
Initializing ChromaDB client...
Using existing collection: sklearn_apis
Generating embeddings for 575 API documents...


Batches: 100%|██████████| 18/18 [00:03<00:00,  5.51it/s]


No new documents to add. All documents might already exist.


In [29]:
import re

# Natural-language task descriptions used to exercise the retrieval step.
test_queries = [
    "Build a classifier for multi-class text data, data is sparse",
    "I need to preprocess numerical features that have different scales, preparing for an SVM.",
    "Find a clustering algorithm suitable for a large number of samples and features.",
    "How to perform feature selection to improve my regression model?",
    "Combine preprocessing and a classification model into a single unit."
]

# 4. Retrieve relevant APIs for the first test query.
# Helper signature, for reference:
#   retrieve_relevant_apis(query_text, model, collection, n_results=5)
first_query = test_queries[0]
print(f"Query: {first_query}")
relevant_apis = retrieve_relevant_apis(
    first_query,
    embedding_model,
    api_collection,
    n_results=5,
)

Query: Build a classifier for multi-class text data, data is sparse

User Query: 'Build a classifier for multi-class text data, data is sparse'


In [30]:
# Show the raw retrieval result for the first test query. The captured output
# is a dict with 'ids', 'embeddings' and 'documents' entries; the full repr is
# large because each document embeds the API signature and example text.
relevant_apis

{'ids': [['121', '177', '108', '175', '423']],
 'embeddings': None,
 'documents': [["API Name: LatentDirichletAllocation. Belongs to module: sklearn.decomposition. Signature: class sklearn.decomposition.LatentDirichletAllocation(n_components=10, *, doc_topic_prior=None, topic_word_prior=None, learning_method='batch', learning_decay=0.7, learning_offset=10.0, max_iter=10, batch_size=128, evaluate_every=-1, total_samples=1000000.0, perp_tol=0.1, mean_change_tol=0.001, max_doc_update_iter=100, n_jobs=None, verbose=0, random_state=None). Example Usage: from sklearn.decomposition import LatentDirichletAllocation\nfrom sklearn.datasets import make_multilabel_classification\n# This produces a feature matrix of token counts, similar to what\n# CountVectorizer would produce on text.\nX, _ = make_multilabel_classification(random_state=0)\nlda = LatentDirichletAllocation(n_components=5,\n    random_state=0)\nlda.fit(X)\nLatentDirichletAllocation(...)\n# get topics for some given samples:\nlda.tra