In [None]:
import yaml
import pymongo
import json
import pickle
from sentence_transformers import SentenceTransformer
import numpy as np
import sys
from pathlib import Path

# Make the project's `src` package importable without a machine-specific
# absolute path. This assumes the kernel's working directory sits directly
# under the repository root -- the same assumption the '../config.yaml' and
# '../input/ucce.json' paths in this notebook already make.
_repo_root = str(Path("..").resolve())
if _repo_root not in sys.path:  # keep re-runs of this cell from piling up duplicates
    sys.path.append(_repo_root)



In [None]:
import os
from llama_index.core import SimpleDirectoryReader

In [None]:
from src.data_helper import insert_data, update_data, update_terms

In [None]:
# from src.pipeline import vector_search_pipeline, search_pipeline

In [None]:
# Instantiate the sentence-embedding model. It is used by generate_embedding
# and is also handed to the insert_data / update_data helpers further down.
model = SentenceTransformer('nli-mpnet-base-v2')

In [None]:
def generate_embedding(text):
    """Encode ``text`` with the notebook-level ``model`` and return the result as a list.

    The value returned by ``model.encode`` is converted with ``.tolist()`` so
    it can be embedded directly in an aggregation pipeline.
    """
    encoded = model.encode(text)
    return encoded.tolist()

In [None]:
# Load the configuration from the config.yaml file (the path is relative to
# the notebook's working directory).
with open('../config.yaml', 'r') as stream:
    config = yaml.safe_load(stream)

# Pull the MongoDB connection settings out of the `mongodb` section.
mongo_uri = config['mongodb']['mongo_uri']
db_name = config['mongodb']['db_name']
collection_name = config['mongodb']['ucce_collection_name']

In [None]:
# Connect to your Atlas cluster
mongodb_client = pymongo.MongoClient(mongo_uri)
collection = mongodb_client[db_name][collection_name]
# Index names passed to the vector-search and full-text-search cells below.
# NOTE(review): neither index is created in this notebook -- presumably both
# already exist on the collection; confirm before running the searches.
vector_search_index = "ucce_vector_index"
search_index = "ucce_search_index"

In [None]:
# Print the collection handle as a quick sanity check of the connection settings.
print(collection)

In [None]:
# Read the input records from JSON (the path is relative to the notebook's
# working directory).
with open("../input/ucce.json", "r") as f:
    data = json.load(f)

In [None]:
# Peek at the first loaded record to check its structure.
data[0]

In [None]:
# collection.delete_many({})

In [None]:
# Insert the loaded records through the project helper from src.data_helper.
# NOTE(review): the argument order here is (data, model, collection), whereas
# update_data is called as (model, data, collection) -- confirm both calls
# match the helpers' signatures.
insert_data(data,model,collection)

In [None]:
# Update the stored records through the project helper; what exactly gets
# updated is defined in src.data_helper, not in this notebook.

update_data(model, data, collection)

In [None]:
# Run the project's update_terms helper over the loaded records.
# NOTE(review): presumably this writes the `terms` field that the next cell
# filters on -- verify against src.data_helper.
update_terms( data, collection)

In [None]:
# Spot-check: print the `query` field of up to five documents that have a
# `terms` field (prints None for documents without a `query` field).
for doc in collection.find({'terms':{"$exists": True}}).limit(5):
	print(doc.get('query'))

### Vector Search

In [None]:
# Search text used by both the vector search and the keyword search below.
# TODO: set this to a real query -- it is currently an empty placeholder.
query = ""


In [None]:
def perform_search(collection, query, index_name, path, num_candidates=20, limit=10):
    """Run a ``$vectorSearch`` aggregation for ``query`` against one embedding field.

    Args:
        collection: Collection object exposing ``aggregate``.
        query: Text to embed with ``generate_embedding`` and use as the query vector.
        index_name: Name of the vector search index to query.
        path: Document field whose stored embeddings are compared with the query vector.
        num_candidates: Value sent as ``numCandidates`` (defaults to the
            previously hard-coded 20).
        limit: Maximum number of documents to return (defaults to the
            previously hard-coded 10).

    Returns:
        list: Result documents containing ``_id``, ``sentence`` and ``score``
        (the ``vectorSearchScore`` metadata).

    Raises:
        ValueError: If ``limit`` is below 1 or exceeds ``num_candidates``.
    """
    # Fail early with a clear message instead of sending a request the
    # $vectorSearch stage rejects (limit may not exceed numCandidates).
    if limit < 1 or num_candidates < limit:
        raise ValueError(
            f"expected 1 <= limit <= num_candidates, got limit={limit}, "
            f"num_candidates={num_candidates}"
        )

    pipeline = [
        {
            "$vectorSearch": {
                "index": index_name,
                "path": path,
                "queryVector": generate_embedding(query),
                "numCandidates": num_candidates,
                "limit": limit
            }
        },
        {
            "$project": {
                "_id": 1,
                "sentence": 1,
                "score": {"$meta": "vectorSearchScore"}
            }
        }
    ]
    return list(collection.aggregate(pipeline))

In [None]:
def merge_results(results):
    """Collapse several result lists into one entry per ``_id``.

    The first document seen for an id is kept (and returned as the same
    object); when the id shows up again, only its ``score`` is raised to the
    larger of the two values. Output order follows first appearance.
    """
    best_by_id = {}
    for hit in (entry for batch in results for entry in batch):
        doc_id = hit['_id']
        if doc_id in best_by_id:
            kept = best_by_id[doc_id]
            # Only the score is merged; every other field stays from the first hit.
            kept['score'] = max(kept['score'], hit['score'])
            continue
        best_by_id[doc_id] = hit
    return [*best_by_id.values()]

In [None]:
# Run the vector search once per embedding field and collect one result list
# per field; the lists are merged in the next cell.
results = []
paths = ["sen_embedding", "query_embedding1", "query_embedding2", "query_embedding3", "query_embedding4"]
for path in paths:
    results.append(perform_search(collection, query, vector_search_index, path))

In [None]:
# Collapse the per-field result lists into a single entry per document id.
vector_results = merge_results(results)

In [None]:
# Map each document id to its vector score, rounded to two decimals.
# NOTE(review): rounding happens before normalisation, so close scores can
# collapse to the same value -- confirm this is intended.
vector_scores = {doc['_id']: round(doc['score'],2) for doc in vector_results}

In [None]:
# Display the id -> vector score mapping.
vector_scores

### Keyword Search (Atlas Search full-text)

In [None]:
def search_pipeline(query, index_name):
    """Build (but do not run) the full-text search aggregation pipeline.

    Args:
        query: Text passed to the ``$search`` stage's ``text`` operator.
        index_name: Name of the search index to query.

    Returns:
        list: Five aggregation stages, in order: ``$search`` over all paths
        (wildcard), ``$project``, ``$addFields`` for the search score,
        ``$setWindowFields`` for the maximum score, and ``$addFields`` for the
        score divided by that maximum.
    """
    text_search = {
        "$search": {
            "index": index_name,
            "text": {
                "query": query,
                "path": {"wildcard": "*"},
            },
        }
    }
    # NOTE(review): this projects "paragraph.sentence" while the vector search
    # and the final report read "sentence" -- confirm which field the
    # documents actually carry.
    keep_fields = {
        "$project": {
            "_id": 1,
            "paragraph.sentence": 1,
            "score": 1,
            "normalizedScore": 1,
        }
    }
    attach_score = {"$addFields": {"score": {"$meta": "searchScore"}}}
    # No partitioning is specified, so the maximum is taken in a single window.
    attach_max_score = {
        "$setWindowFields": {"output": {"maxScore": {"$max": "$score"}}}
    }
    attach_normalized = {
        "$addFields": {"normalizedScore": {"$divide": ["$score", "$maxScore"]}}
    }
    return [text_search, keep_fields, attach_score, attach_max_score, attach_normalized]

In [None]:
# Bind the built pipeline to its own name: assigning it back to
# `search_pipeline` would overwrite the function and make this cell fail the
# second time it is run.
keyword_pipeline = search_pipeline(query, search_index)

# Materialise the results into a list so they can be read more than once.
search_result = list(collection.aggregate(keyword_pipeline))

In [None]:
# Map each document id to its normalizedScore from the search pipeline,
# rounded to two decimals.
keyword_scores = {doc['_id']: round(doc['normalizedScore'],2) for doc in search_result}

In [None]:
# Display the id -> keyword score mapping.
keyword_scores

### Distribution-Based Score Fusion (DBSF)

In [None]:
def normalize(scores):
    """Min-max scale a ``{doc_id: score}`` mapping into the range [0, 1].

    Args:
        scores: Mapping of document id to numeric score.

    Returns:
        dict: New mapping with the same keys and rescaled values. An empty
        input gives an empty dict. When every score is equal (including the
        single-entry case) there is no range to scale over, so each document
        gets 1.0 -- a matched document should still outrank one that is
        absent from the result set and therefore counted as 0 during fusion.
    """
    if not scores:
        return {}

    min_score = min(scores.values())
    max_score = max(scores.values())
    score_range = max_score - min_score

    # Guard the degenerate case that previously raised ZeroDivisionError.
    if score_range == 0:
        return {doc_id: 1.0 for doc_id in scores}

    return {doc_id: (score - min_score) / score_range for doc_id, score in scores.items()}

In [None]:
# Rescale both score sets with normalize() so they are on a common scale
# before being added together.
normalized_vector = normalize(vector_scores)
normalized_keyword = normalize(keyword_scores)

In [None]:
# Combine scores: fused score = normalised vector score + normalised keyword
# score, computed over the union of ids from both result sets. A document
# missing from one result set contributes 0 for that side.
fused_scores = {doc_id: (normalized_vector.get(doc_id, 0) + normalized_keyword.get(doc_id, 0)) 
                for doc_id in set(normalized_vector.keys()).union(normalized_keyword.keys())}

In [None]:
def retrieve_documents(fused_scores):
    """Fetch the documents named in ``fused_scores`` and rank them by fused score.

    Args:
        fused_scores: Mapping of document ``_id`` to fused score.

    Returns:
        list: Documents read from the notebook-level ``collection``, each with
        an added ``fused_score`` key, sorted by that score in descending
        order. Ids with no matching document are skipped; ties keep the
        iteration order of ``fused_scores``.
    """
    if not fused_scores:
        return []

    # One round trip for all ids instead of one find_one call per id.
    fetched = {
        document['_id']: document
        for document in collection.find({'_id': {'$in': list(fused_scores)}})
    }

    documents = []
    # Walk fused_scores (not the query result) so tie ordering stays the same
    # as with per-id lookups.
    for doc_id, score in fused_scores.items():
        document = fetched.get(doc_id)
        if document is None:
            continue
        document['fused_score'] = score
        documents.append(document)

    # Sort documents by fused score in descending order
    documents.sort(key=lambda x: x['fused_score'], reverse=True)
    return documents

In [None]:
# Fetch the fused documents, ordered from highest to lowest fused score.
sorted_documents = retrieve_documents(fused_scores)

In [None]:
# Attach the raw and normalised vector/keyword scores to every fused result so
# the final report can show how each fused score came about. An id missing
# from a score mapping falls back to 0.
for doc in sorted_documents:
    doc_id = doc['_id']
    doc.update({
        'vector_score': vector_scores.get(doc_id, 0),
        'keyword_score': keyword_scores.get(doc_id, 0),
        'normalized_vector_score': normalized_vector.get(doc_id, 0),
        'normalized_keyword_score': normalized_keyword.get(doc_id, 0),
    })


In [None]:
# Display the ranked documents with all attached scores (shows every field of
# each document).
sorted_documents

In [None]:
# Display the sorted documents and their scores.
# Each document's 'sentence' field is read with doc['sentence'], so a
# retrieved document without that field raises KeyError here.
for doc in sorted_documents:
    print(f"Document ID: {doc['_id']}\n paragraph: {doc['sentence']}\n  DBSF_Score: {round(doc['fused_score'],2)}\n vector_Score: {round(doc['vector_score'],2)}\n normalized_vector_score: {round(doc['normalized_vector_score'],2)}\n keyword_score: {doc['keyword_score']}\n normalized_keyword_score: {round(doc['normalized_keyword_score'],2)}\n")


