In [53]:
!pip install elasticsearch



In [54]:
import pandas as pd
from elasticsearch import Elasticsearch

In [55]:
es = Elasticsearch('http://localhost:9200/')

In [56]:
es.ping()

True

In [57]:
import tensorflow

In [58]:
pip show elasticsearch

Name: elasticsearch
Version: 8.11.0
Summary: Python client for Elasticsearch
Home-page: https://github.com/elastic/elasticsearch-py
Author: Elastic Client Library Maintainers
Author-email: client-libs@elastic.co
License: Apache-2.0
Location: c:\users\honor\anaconda3\lib\site-packages
Requires: elastic-transport
Required-by: 
Note: you may need to restart the kernel to use updated packages.


# Prepare the data

In [59]:
df = pd.read_json("../data/HarvardHealthBlog/Harvard Health Blog - Live a Healthier Lifestyle - Harvard Health page 1.json").loc[:100]

In [60]:
# df_pm = pd.read_csv("data/Pubmed_200k_RCT/train.csv").loc[:100]

In [61]:
df.head()

Unnamed: 0,Title,Title_URL,Image,fontbold_URL,fontbold,block,Field
0,"Magnets, sound, and batteries: Choosing safe toys",https://www.health.harvard.edu/blog/magnets-so...,https://domf5oio6qrcr.cloudfront.net/medialibr...,https://www.health.harvard.edu/topics/child-an...,Child & Teen Health,"\n Updated December 13, 2023\n",If you're choosing gifts to give or donate to ...
1,"No-cost, low-cost, and bigger splurges for cli...",https://www.health.harvard.edu/blog/no-cost-lo...,https://domf5oio6qrcr.cloudfront.net/medialibr...,https://www.health.harvard.edu/topics/staying-...,Staying Healthy,"\n Published December 11, 2023\n","If you're looking for gifts to give or donate,..."
2,What to do if you think your child has the flu,https://www.health.harvard.edu/blog/what-to-do...,https://domf5oio6qrcr.cloudfront.net/medialibr...,https://www.health.harvard.edu/topics/child-an...,Child & Teen Health,"\n Updated September 12, 2023\n","If you hear your child start coughing, it's na..."
3,When should you hire in-home help or health ai...,https://www.health.harvard.edu/blog/when-shoul...,https://domf5oio6qrcr.cloudfront.net/medialibr...,https://www.health.harvard.edu/topics/staying-...,Staying Healthy,"\n Published December 6, 2023\n",Most people want to live at home for as long a...
4,"Small pets are delightful, but some carry dang...",https://www.health.harvard.edu/blog/small-pets...,https://domf5oio6qrcr.cloudfront.net/medialibr...,https://www.health.harvard.edu/topics/staying-...,Staying Healthy,"\n Published December 4, 2023\n","Small animals like turtles, iguanas, and frogs..."


In [62]:
df.Image[0]

'https://domf5oio6qrcr.cloudfront.net/medialibrary/10369/GettyImages-853938328.jpg'

In [63]:
df.isna().sum().sum()

0

# Convert text to Vector using S-BERT Model

In [64]:
from tqdm import tqdm
tqdm.pandas()


In [65]:
from sentence_transformers import SentenceTransformer

In [66]:
model = SentenceTransformer('all-mpnet-base-v2')

In [67]:
df["description_vector"] = df["Field"].progress_apply(lambda x: model.encode(x))

100%|██████████████████████████████████████████████████████████████████████████████████| 60/60 [00:08<00:00,  6.68it/s]


In [68]:
df.head(2)

Unnamed: 0,Title,Title_URL,Image,fontbold_URL,fontbold,block,Field,description_vector
0,"Magnets, sound, and batteries: Choosing safe toys",https://www.health.harvard.edu/blog/magnets-so...,https://domf5oio6qrcr.cloudfront.net/medialibr...,https://www.health.harvard.edu/topics/child-an...,Child & Teen Health,"\n Updated December 13, 2023\n",If you're choosing gifts to give or donate to ...,"[-0.0070151784, 0.0033397812, -0.018809972, 0...."
1,"No-cost, low-cost, and bigger splurges for cli...",https://www.health.harvard.edu/blog/no-cost-lo...,https://domf5oio6qrcr.cloudfront.net/medialibr...,https://www.health.harvard.edu/topics/staying-...,Staying Healthy,"\n Published December 11, 2023\n","If you're looking for gifts to give or donate,...","[0.041044563, 0.05344586, 0.017541107, 0.07421..."


In [69]:
df.description_vector[:5]

0    [-0.0070151784, 0.0033397812, -0.018809972, 0....
1    [0.041044563, 0.05344586, 0.017541107, 0.07421...
2    [-0.01417845, 0.00091768993, -0.0012399706, 0....
3    [-0.0047708116, 0.0065720384, 0.013290583, -0....
4    [0.025648113, 0.044721454, 0.010994425, -0.055...
Name: description_vector, dtype: object

In [70]:
es.ping()

True

# Create new index

In [71]:
from indexMapping import indexMapping

In [72]:
es.indices.create(index="all_documents", mappings= indexMapping)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'all_documents'})

In [73]:
records_list = df.to_dict("records")

In [74]:
for record in records_list:
    try:
        es.index(index="all_documents", document=record)
    except Exception as e:
        print(e)

In [75]:
es.count(index="all_documents")

ObjectApiResponse({'count': 53, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}})

# Searching documents:

In [78]:
user_input = "toys"
vector_input = model.encode(user_input)

query = {
    "field": "description_vector",
    "query_vector": vector_input,
    "k":5,
    "num_candidates": 60,
}

res = es.knn_search(index="all_documents", knn=query, source=["Title","Field", "Image"])

results = res["hits"]["hits"]

for result in results:
    if "_source" in result:
        try:
            print(f"Document score: {result['_score']}")
            print(f"Document Title: {result['_source']['Title']}")
            print(f"Document Text: {result['_source']['Field']}")
            print(50*"_")
        except Exception as e:
            print(e)
    

results
# relevants_docs
# score = res["hits"][:max_score]
# score

Document score: 0.51011306
Document Title: Magnets, sound, and batteries: Choosing safe toys
Document Text: If you're choosing gifts to give or donate to children this year, be sure safety is on the list. Here are tips for toys to consider — those that encourage creativity, imagination, and movement  — and toys to try to avoid, due to safety concerns or for other reasons.
__________________________________________________
Document score: 0.4094914
Document Title: Treating erectile dysfunction with penile implants
Document Text: Penile implants, an option patients with erectile dysfunction probably hear little about, might offer a lasting and satisfying “cure.” Abraham Morgentaler, M.D., director of Men’s Health Boston, explains how.
__________________________________________________
Document score: 0.38130474
Document Title: Kidneys, eyes, ears, and more: Why do we have a spare?
Document Text: The human body has excess capacity — that is, our organs have more reserve than most of us wi

  res = es.knn_search(index="all_documents", knn=query, source=["Title","Field", "Image"])


[{'_index': 'all_documents',
  '_id': '4kuKlowBIhxR8BoTFHbN',
  '_score': 0.51011306,
  '_ignored': ['Field.keyword'],
  '_source': {'Title': 'Magnets, sound, and batteries: Choosing safe toys',
   'Image': 'https://domf5oio6qrcr.cloudfront.net/medialibrary/10369/GettyImages-853938328.jpg',
   'Field': "If you're choosing gifts to give or donate to children this year, be sure safety is on the list. Here are tips for toys to consider — those that encourage creativity, imagination, and movement\xa0 — and toys to try to avoid, due to safety concerns or for other reasons."}},
 {'_index': 'all_documents',
  '_id': 'GkuKlowBIhxR8BoTKXfJ',
  '_score': 0.4094914,
  '_source': {'Title': 'Treating erectile dysfunction with penile implants',
   'Image': 'https://www.health.harvard.edu/img/misc/image-placeholder.svg',
   'Field': 'Penile implants, an option patients with erectile dysfunction probably hear little about, might offer a lasting and satisfying “cure.” Abraham Morgentaler, M.D., directo