In [1]:
import math
import string

import numpy as np
import pandas as pd

In [2]:
# Path to the patent spreadsheet. It is relative, so it resolves against the
# kernel's working directory.
data = '../data/patsnap_data.xlsx'
# Read the worksheet named 'sheet1' into a DataFrame.
patent_data = pd.read_excel(data, sheet_name='sheet1')

In [5]:
# Preview the first three rows of the loaded sheet.
patent_data.head(3)

Unnamed: 0,Number,Publication Number,Title,Legal Status & Events,Current Assignee,Application Date,IPC,Patent Valuation,Abstract,Abstract (Translated)(English),Claims,Title (Translated)(English),CPC
0,1,US6056237A,Sonotube compatible unmanned aerial vehicle an...,Non-payment,1281329 ALBERTA LTD.,1997-06-25,B64D1/02 | B64D1/00 | B64D33/02 | B64C39/00 | ...,-,The present invention is generally comprised o...,The present invention is generally comprised o...,I claim:_x000D_\n1. A sonotube compatible unma...,Sonotube compatible unmanned aerial vehicle an...,B64C3/40 | B64C5/12 | B64C39/024 | B64D1/02 | ...
1,2,US8511606B1,Unmanned aerial vehicle base station,Granted,THE BOEING COMPANY,2009-12-09,B64D41/00,"$ 56,000","A method and apparatus comprising a platform, ...","A method and apparatus comprising a platform, ...",1. An apparatus comprising:_x000D_\na platform...,Unmanned aerial vehicle base station,B64C39/028 | B64C39/024 | B64C2201/066 | B64C2...
2,3,US8948935B1,Providing a medical support device via an unma...,Granted | Transfer,WING AVIATION LLC,2013-01-02,G06Q10/00 | B64C39/02 | G16H40/67,"$ 79,000",Embodiments described herein may relate to an ...,Embodiments described herein may relate to an ...,1. An unmanned aerial vehicle (UAV) comprising...,Providing a medical support device via an unma...,A61B5/00 | A61B19/0264 | B64C39/024 | G06F19/3...


In [7]:
# Count the missing values in each column.
patent_data.isnull().sum()

Number                            0
Publication Number                0
Title                             0
Legal Status & Events             0
Current Assignee                  0
Application Date                  0
IPC                               0
Patent Valuation                  0
Abstract                          0
Abstract (Translated)(English)    0
Claims                            0
Title (Translated)(English)       0
CPC                               0
dtype: int64

In [8]:
# Translation table that deletes every ASCII punctuation character. Built once
# here rather than on every call, since clean_text runs once per DataFrame row.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """
    Normalise text by lowercasing it and deleting punctuation.

    Characters in ``string.punctuation`` are removed, not replaced by spaces,
    so ``"take-off"`` becomes ``"takeoff"``. Whitespace is left as it is.
    No stemming or lemmatization is applied.

    Args:
        text: The text to clean. ``None`` and float NaN (what an empty
            spreadsheet cell becomes) are treated as empty text. Any other
            non-string value is converted with ``str()`` first.

    Returns:
        str: The cleaned text; ``""`` for missing input.
    """
    # Missing cells would otherwise crash on .lower() and abort the whole
    # Series.apply call.
    if text is None or (isinstance(text, float) and math.isnan(text)):
        return ''

    if not isinstance(text, str):
        text = str(text)

    return text.lower().translate(_PUNCTUATION_TABLE)


In [9]:
# The Title and Abstract columns hold free text, so run both through
# clean_text and store each result in its own new column.
for cleaned_col, raw_col in (('Cleaned_Title', 'Title'),
                             ('Cleaned_Abstract', 'Abstract')):
    patent_data[cleaned_col] = patent_data[raw_col].apply(clean_text)

In [10]:
# Show the first five rows, which now include the Cleaned_Title and
# Cleaned_Abstract columns.
patent_data.head()

Unnamed: 0,Number,Publication Number,Title,Legal Status & Events,Current Assignee,Application Date,IPC,Patent Valuation,Abstract,Abstract (Translated)(English),Claims,Title (Translated)(English),CPC,Cleaned_Title,Cleaned_Abstract
0,1,US6056237A,Sonotube compatible unmanned aerial vehicle an...,Non-payment,1281329 ALBERTA LTD.,1997-06-25,B64D1/02 | B64D1/00 | B64D33/02 | B64C39/00 | ...,-,The present invention is generally comprised o...,The present invention is generally comprised o...,I claim:_x000D_\n1. A sonotube compatible unma...,Sonotube compatible unmanned aerial vehicle an...,B64C3/40 | B64C5/12 | B64C39/024 | B64D1/02 | ...,sonotube compatible unmanned aerial vehicle an...,the present invention is generally comprised o...
1,2,US8511606B1,Unmanned aerial vehicle base station,Granted,THE BOEING COMPANY,2009-12-09,B64D41/00,"$ 56,000","A method and apparatus comprising a platform, ...","A method and apparatus comprising a platform, ...",1. An apparatus comprising:_x000D_\na platform...,Unmanned aerial vehicle base station,B64C39/028 | B64C39/024 | B64C2201/066 | B64C2...,unmanned aerial vehicle base station,a method and apparatus comprising a platform a...
2,3,US8948935B1,Providing a medical support device via an unma...,Granted | Transfer,WING AVIATION LLC,2013-01-02,G06Q10/00 | B64C39/02 | G16H40/67,"$ 79,000",Embodiments described herein may relate to an ...,Embodiments described herein may relate to an ...,1. An unmanned aerial vehicle (UAV) comprising...,Providing a medical support device via an unma...,A61B5/00 | A61B19/0264 | B64C39/024 | G06F19/3...,providing a medical support device via an unma...,embodiments described herein may relate to an ...
3,4,US20100250022A1,Useful unmanned aerial vehicle,Withdrawn-Deemed,"AIR RECON, INC.",2006-12-29,G05D1/00 | B64C13/20 | G06F3/048,-,An unmanned aerial vehicle (UAV) addresses rem...,An unmanned aerial vehicle (UAV) addresses rem...,1-17. (canceled)_x000D_\n18. A method of opera...,Useful unmanned aerial vehicle,B64C2201/141 | B64C2201/145 | G05D1/0094 | G05...,useful unmanned aerial vehicle,an unmanned aerial vehicle uav addresses remot...
4,5,US20110084162A1,Autonomous Payload Parsing Management System a...,Abandoned-Undetermined,HONEYWELL INTERNATIONAL INC.,2009-10-09,B64C29/00 | G01M1/12 | B64C17/10 | B64D37/14 |...,-,An unmanned aerial vehicle (UAV) for making pa...,An unmanned aerial vehicle (UAV) for making pa...,1. An unmanned aerial vehicle (UAV) for making...,Autonomous Payload Parsing Management System a...,B64C39/024 | B64C2201/027 | B64C2201/088 | B64...,autonomous payload parsing management system a...,an unmanned aerial vehicle uav for making part...


In [11]:
# Install sentence-transformers from inside the notebook. %pip targets the
# running kernel's environment; the kernel may need a restart afterwards.
# NOTE(review): the version is not pinned, and --user installs into the user
# site-packages - confirm that is what this environment needs.
#!pip install sentence-transformers
%pip install sentence-transformers --user

Note: you may need to restart the kernel to use updated packages.


In [12]:
from sentence_transformers import SentenceTransformer

# Load the pretrained BERT sentence-embedding model used by the encode calls
# in the cells below.
# Models trained on Natural Language Inference (NLI):
#   https://github.com/UKPLab/sentence-transformers/blob/master/docs/pretrained-models/nli-models.md
# Models trained on Semantic Textual Similarity (STS):
#   https://github.com/UKPLab/sentence-transformers/blob/master/docs/pretrained-models/sts-models.md
# Models to test:
#   'all-mpnet-base-v2'
#   'paraphrase-distilroberta-base-v2'
#   'bert-base-nli-mean-tokens'  (the one loaded here)
model = SentenceTransformer('bert-base-nli-mean-tokens')

Downloading .gitattributes:   0%|          | 0.00/445 [00:00<?, ?B/s]

Downloading 1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading README.md:   0%|          | 0.00/3.99k [00:00<?, ?B/s]

Downloading added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/399 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [13]:
# Plain Python lists of the cleaned text, in DataFrame row order.
title_list = patent_data['Cleaned_Title'].tolist()
abstract_list = patent_data['Cleaned_Abstract'].tolist()

In [14]:
# One flat list: every title followed by every abstract, so it is twice as
# long as either input. It does not pair each title with its own abstract.
combined_list = title_list + abstract_list

In [15]:
# Embed every cleaned title. Encoding title_list directly keeps row i of the
# result lined up with title_list[i].
title_embeddings = model.encode(title_list)

In [16]:
# Report how many components one title embedding has.
print('Title embedding vector - length', len(title_embeddings[0]))

Title embedding vector - length 768


In [17]:
# Embed every cleaned abstract. Encoding abstract_list directly keeps row i of
# the result lined up with abstract_list[i].
abstract_embeddings = model.encode(abstract_list)

In [18]:
# Report how many components one abstract embedding has.
print('Abstract embedding vector - length', len(abstract_embeddings[0]))

Abstract embedding vector - length 768


In [21]:
# Embeddings for combined_list (every title followed by every abstract).
# combined_list is exactly title_list + abstract_list, and both halves were
# already encoded in the cells above. Stacking those results row-wise gives
# the same layout without running the model over every text a second time.
combined_list_embeddings = np.concatenate([title_embeddings, abstract_embeddings], axis=0)

In [22]:
# Join each title vector with the abstract vector from the same row, side by
# side (axis=1), giving one wider vector per patent.
# NOTE(review): these rows are twice as wide as a single model.encode() vector,
# so they cannot be compared directly with a query embedding - confirm how this
# array is meant to be used.
combined_embeddings = np.concatenate([title_embeddings, abstract_embeddings], axis=1)

In [23]:
# Import the submodule explicitly: a bare `import scipy` does not guarantee
# that `scipy.spatial` has been loaded.
import scipy.spatial.distance
#@title Semantic Search Form

# code adapted from https://github.com/UKPLab/sentence-transformers/blob/master/examples/application_semantic_search.py

query = 'UAV Model aerospace flight' #@param {type: 'string'}

queries = [query]
query_embeddings = model.encode(queries)

# How many of the closest titles to report for each query.
number_top_matches = 5 #@param {type: "number"}

print("Semantic Search Results")

# One row of cosine distances per query, one column per title.
distance_matrix = scipy.spatial.distance.cdist(query_embeddings, title_embeddings, "cosine")

for query_text, distances in zip(queries, distance_matrix):
    # Smallest distance first. The stable sort keeps tied titles in corpus order.
    closest = np.argsort(distances, kind="stable")[:number_top_matches]

    print("\n\n======================\n\n")
    print("Query:", query_text)
    # Report the configured count instead of a hard-coded "5".
    print(f"\nTop {number_top_matches} most similar sentences in corpus:")

    for idx in closest:
        # Cosine similarity is 1 minus cosine distance.
        print(title_list[idx].strip(), "(Cosine Score: %.4f)" % (1 - distances[idx]))

Semantic Search Results




Query: UAV Model aerospace flight

Top 5 most similar sentences in corpus:
unmanned aerial vehicle uav takeoff and landing method device and system and uav (Cosine Score: 0.9169)
vertical takeoff and landing model aircraft unmanned aerial vehicle (Cosine Score: 0.9133)
the monitering system for a vessle using a uav (Cosine Score: 0.8977)
unmanned aerial vehicle uav flight control method and device (Cosine Score: 0.8920)
navigating a uav (Cosine Score: 0.8904)
