In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util
import numpy as np

# Load pre-trained Sentence Transformer model
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

data = '../data/patsnap_data.xlsx'
# Read patent data from Excel file
patent_data = pd.read_excel(data, sheet_name='sheet1')

# Extract relevant fields (title, abstract, claims) from the patent data
patent_documents = patent_data[['Title', 'Abstract', 'Claims']].values.tolist()

# Example query
query = "unmanned aerial vehicle with autonomous navigation system"

# Generate embeddings for the query and patent documents
query_embedding = model.encode(query)
patent_embeddings = model.encode(patent_documents)

# Calculate cosine similarity between the query and patent documents
cosine_scores = util.pytorch_cos_sim(query_embedding, patent_embeddings)

# Convert cosine similarity scores to numpy array for easier manipulation
cosine_scores = cosine_scores.numpy()

# Rank the patent documents based on similarity scores
ranked_documents_indices = np.argsort(-cosine_scores, axis=0)

# Define the number of top-ranked documents to display
num_documents_to_display = 5

# Print ranked patent documents along with their similarity scores
print(f"Top {num_documents_to_display} Ranked Patent Documents:")
for i in range(num_documents_to_display):
    # Check if there are documents available
    if len(ranked_documents_indices) > 0:
        document_index = ranked_documents_indices[i][0]
        similarity_score = cosine_scores[document_index][0]
        print(f"Rank {i+1}: {patent_documents[document_index][0]} (Similarity Score: {similarity_score:.4f})")
    else:
        print("No relevant patent documents found.")


Top 5 Ranked Patent Documents:
Rank 1: Sonotube compatible unmanned aerial vehicle and system (Similarity Score: 0.5556)


IndexError: index 1 is out of bounds for axis 0 with size 1