In [1]:
# Load environment variables from a `.env` file. The file must define
# GOOGLE_API_KEY. Alternatively, assign the key directly to `google_api_key`
# for a local experiment -- but never commit the key to git.
from dotenv import load_dotenv
import os
load_dotenv()  # Automatically looks for a `.env` file in current dir
# Debug only: uncommenting the next line prints the secret into the cell output.
#print(os.environ['GOOGLE_API_KEY'])

True

In [2]:
import faiss
import google.generativeai as genai
import numpy as np
from pyspark.sql import SparkSession
import os
# --- Configuration -----------------------------------------------------------
# CSV file whose first column is read as the list of standard diagnosis strings.
FILE_LOCATION_STANDARD_DIAGNOSIS = "symptom_descriptions_top10K.csv"

# API key is taken from the environment; this is None when the variable is unset.
google_api_key = os.getenv('GOOGLE_API_KEY')
MODEL = "models/embedding-001"
# Number of strings handed to one generate_embeddings_in_batches() call.
BATCH_SIZE_FOR_GOOGLE_API = 1000
# Vector width used when the FAISS index is created.
dimension = 768

# --- Spark session (local mode) ----------------------------------------------
spark = (
    SparkSession.builder
    .appName("ParallelEmbeddingGeneration")
    .master("local[*]")
    .getOrCreate()
)

def configure_genai():
    """Configure the google.generativeai client with the module-level API key."""
    api_key = google_api_key
    genai.configure(api_key=api_key)

def generate_embeddings_in_batches(batch):
    """Embed one batch of texts as retrieval documents.

    Args:
        batch: The texts passed as `content` to one `genai.embed_content` call.

    Returns:
        The value stored under the "embedding" key of the API response
        (a list of vectors, one per input text).
    """
    configure_genai()
    request_kwargs = dict(
        model=MODEL,
        content=batch,
        task_type="retrieval_document",
    )
    return genai.embed_content(**request_kwargs)["embedding"]

def gen_standard_embeddings(
    standard_diagnosis_list,
    index_path="faiss_standard_strings_embeddings.index",
    num_slices=6,
):
    """Embed the standard diagnosis strings and save them as a FAISS index.

    The strings are split into Spark partitions, each partition is embedded in
    chunks of BATCH_SIZE_FOR_GOOGLE_API, and the vectors are collected on the
    driver, L2-normalised and stored in an inner-product index (inner product
    of unit vectors equals cosine similarity).

    Args:
        standard_diagnosis_list: Iterable of strings to embed. Row ``i`` of the
            saved index corresponds to element ``i`` of this sequence.
        index_path: Where the FAISS index is written. Defaults to the path the
            notebook has always used.
        num_slices: Number of Spark partitions to spread the strings over.

    Returns:
        None. The index is written to ``index_path`` and a summary is printed.

    Raises:
        ValueError: If the input is empty, if the number of embeddings returned
            differs from the number of input strings, or if the embedding width
            does not match ``dimension``.
    """
    standard_diagnosis_list = list(standard_diagnosis_list)
    if not standard_diagnosis_list:
        # An empty array is 1-D and would fail obscurely inside FAISS.
        raise ValueError("standard_diagnosis_list is empty; nothing to embed.")

    # Create RDD and process partitions in Spark
    rdd = spark.sparkContext.parallelize(standard_diagnosis_list, numSlices=num_slices)

    def process_partition(partition):
        """Embed every string of one partition, chunk by chunk."""
        configure_genai()
        texts = list(partition)
        partition_embeddings = []
        for start in range(0, len(texts), BATCH_SIZE_FOR_GOOGLE_API):
            batch = texts[start:start + BATCH_SIZE_FOR_GOOGLE_API]
            partition_embeddings.extend(generate_embeddings_in_batches(batch))
        return partition_embeddings

    # Step 1: Collect all embeddings back to the driver.
    all_embeddings = rdd.mapPartitions(process_partition).collect()

    # Matching later looks labels up by row position, so a count mismatch would
    # silently pair vectors with the wrong strings.
    if len(all_embeddings) != len(standard_diagnosis_list):
        raise ValueError(
            f"Expected {len(standard_diagnosis_list)} embeddings but received "
            f"{len(all_embeddings)}."
        )

    # Step 2: Convert to a NumPy array and build the FAISS index on the driver.
    embeddings_np = np.array(all_embeddings, dtype='float32')
    if embeddings_np.ndim != 2 or embeddings_np.shape[1] != dimension:
        raise ValueError(
            f"Embeddings have shape {embeddings_np.shape}; expected "
            f"(n, {dimension})."
        )
    faiss.normalize_L2(embeddings_np)  # normalises in place

    index = faiss.IndexFlatIP(dimension)
    index.add(embeddings_np)

    # Step 3: Save the index.
    faiss.write_index(index, index_path)
    print("Saved FAISS index with", index.ntotal, "vectors.")




In [3]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import faiss
import pandas as pd

def gen_matched_strings(standard_embeddings_path, standard_diagnosis_list_filename, to_be_matched_strings):
    """Match each input string to its closest standard diagnosis string.

    The standard vectors are read back from the FAISS index, each input string
    is embedded with the Google model, and the standard string with the highest
    cosine similarity is chosen for every input.

    Args:
        standard_embeddings_path: Path of the FAISS index holding the standard
            embeddings.
        standard_diagnosis_list_filename: CSV whose first column lists the
            standard strings, in the same order as the rows of the index.
        to_be_matched_strings: Iterable of strings to match.

    Returns:
        ``(similarities, matched_strings)``: two lists aligned with the input,
        holding the best cosine similarity and the matching standard string.
        Both lists are empty when no input strings are given.

    Raises:
        ValueError: If the index is empty or its size differs from the number
            of strings in the CSV.

    NOTE(review): queries are embedded with task_type="retrieval_query". In the
    recorded run, inputs identical to a standard string scored only ~0.76-0.78;
    presumably the asymmetric query/document task types cause this -- confirm
    whether a symmetric task type would suit this use case better.
    """
    to_be_matched_strings = list(to_be_matched_strings)
    if not to_be_matched_strings:
        return [], []

    # Load the index and the labels first so that a bad path or a mismatched
    # file fails before any embedding API call is made.
    index = faiss.read_index(standard_embeddings_path)
    labels_df = pd.read_csv(standard_diagnosis_list_filename)
    standard_diagnosis_list = labels_df.iloc[:, 0].astype(str).tolist()

    if index.ntotal == 0:
        raise ValueError(f"FAISS index '{standard_embeddings_path}' contains no vectors.")
    if index.ntotal != len(standard_diagnosis_list):
        # Labels are looked up by row position; a size mismatch means the index
        # and the CSV are out of sync and matches would be wrong.
        raise ValueError(
            f"FAISS index has {index.ntotal} vectors but "
            f"'{standard_diagnosis_list_filename}' has "
            f"{len(standard_diagnosis_list)} rows."
        )

    # Reconstruct all vectors from the FAISS index
    standard_vectors = index.reconstruct_n(0, index.ntotal)

    # Generate embeddings for the input strings (one request per string)
    genai.configure(api_key=google_api_key)
    query_embeddings = np.array(
        [
            genai.embed_content(
                model=MODEL,
                content=text,
                task_type="retrieval_query"
            )["embedding"]
            for text in to_be_matched_strings
        ],
        dtype='float32',
    )

    # One vectorised call: row i holds the similarity of input i to every
    # standard vector. cosine_similarity normalises both sides itself, so no
    # explicit L2 normalisation is needed here.
    similarity_matrix = cosine_similarity(query_embeddings, standard_vectors)
    best_indices = similarity_matrix.argmax(axis=1)
    best_scores = similarity_matrix[np.arange(len(best_indices)), best_indices]

    similarities = list(best_scores)
    matched_strings = [standard_diagnosis_list[i] for i in best_indices]

    return similarities, matched_strings
    

In [4]:
import pandas as pd
# Read the standard-diagnosis CSV. pandas uses the first row as the header by default.
# NOTE(review): the recorded run saved 9999 vectors from a file named "top10K";
# confirm the CSV really has a header row, otherwise its first entry is dropped.
df = pd.read_csv(FILE_LOCATION_STANDARD_DIAGNOSIS)
# Extract the first column as a list of strings
first_column = df.iloc[:, 0].astype(str).tolist()


# Embed every standard string and write the FAISS index to disk.
# The function returns nothing; it prints how many vectors were saved.
gen_standard_embeddings(first_column)
# (No embeddings array is returned here, so there is no shape to print.)

                                                                                

Saved FAISS index with 9999 vectors.


In [5]:
# Sample inputs: exact matches, abbreviations, paraphrases and a blank string.
to_be_matched_strings = [
    "sudden leg pain while sitting",
    "T2 diabetes",
    "acute arm rash after walk",
    "intermittent joints tingling after walk",
    "mild head inflam while sitting",
    "severe throat stiffness in morning",
    " ",
]

faiss_loc = "faiss_standard_strings_embeddings.index"
standard_diag_file_loc = FILE_LOCATION_STANDARD_DIAGNOSIS

similarities, matched_strings = gen_matched_strings(
    faiss_loc, standard_diag_file_loc, to_be_matched_strings
)

# The loop variable is `input_string`, not `input`: a top-level loop variable
# stays in the notebook namespace and would shadow the builtin input().
for input_string, matched, similarity in zip(to_be_matched_strings, matched_strings, similarities):
    print(f"Test Input: {input_string}")
    print(f"\tBest Match: {matched}")
    print(f"\tSimilarity Score: {similarity:.4f}\n")

Test Input: sudden leg pain while sitting
	Best Match: sudden leg pain while sitting
	Similarity Score: 0.7597

Test Input: T2 diabetes
	Best Match: progressive leg burning post surgery - case 2120
	Similarity Score: 0.6122

Test Input: acute arm rash after walk
	Best Match: sudden arm rash after walking
	Similarity Score: 0.7928

Test Input: intermittent joints tingling after walk
	Best Match: sudden joints tingling after walking
	Similarity Score: 0.7933

Test Input: mild head inflam while sitting
	Best Match: mild head inflammation while sitting - case 2781
	Similarity Score: 0.7781

Test Input: severe throat stiffness in morning
	Best Match: severe throat stiffness in morning
	Similarity Score: 0.7810

Test Input:  
	Best Match: mild throat inflammation post surgery - case 3520
	Similarity Score: 0.6015

