# INDEX THE DOCUMENTS

In [1]:
!pip install elasticsearch



## Import Libraries

In [72]:
import pandas as pd
from elasticsearch import Elasticsearch
import tensorflow
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
tqdm.pandas()

## Connect to elasticsearch

In [36]:
es = Elasticsearch('http://localhost:9200/')

In [37]:
es.ping()

True

# EDA

# Prepare the data

In [38]:
df = pd.read_json("../data/HarvardHealthBlog/HarvardHealthBlog_part2.json")

In [39]:
df.shape

(187, 21)

In [40]:
df.head(2)

Unnamed: 0,Title1,Image1,inlineblock_URL,fontbold_URL1,fontbold1,block1,Field1,Title,Title_URL,Image,fontbold_URL,fontbold,block,Field,Title2,Image2,inlineblock_URL1,fontbold_URL2,fontbold2,block2,Field2
0,Immune boosts or busts? From IV drips and deto...,https://domf5oio6qrcr.cloudfront.net/medialibr...,https://www.health.harvard.edu/blog/immune-boo...,https://www.health.harvard.edu/topics/staying-...,Staying Healthy,"\n Published September 11, 2023\n",Ads for products that promise to supercharge t...,,,,,,,,,,,,,,
1,"After prostate cancer treatment, a new standar...",https://domf5oio6qrcr.cloudfront.net/medialibr...,https://www.health.harvard.edu/blog/after-pros...,https://www.health.harvard.edu/topics/mens-health,Men's Health,"\n Published December 20, 2023\n",When prostate cancer recurs after initial trea...,,,,,,,,,,,,,,


### Keep the essential columns

In [41]:
df.columns

Index(['Title1', 'Image1', 'inlineblock_URL', 'fontbold_URL1', 'fontbold1',
       'block1', 'Field1', 'Title', 'Title_URL', 'Image', 'fontbold_URL',
       'fontbold', 'block', 'Field', 'Title2', 'Image2', 'inlineblock_URL1',
       'fontbold_URL2', 'fontbold2', 'block2', 'Field2'],
      dtype='object')

In [42]:
df = df[['Title1', 'Image1', 'inlineblock_URL', 'fontbold1', 'block1', 'Field1']]

In [43]:
df.head(2)

Unnamed: 0,Title1,Image1,inlineblock_URL,fontbold1,block1,Field1
0,Immune boosts or busts? From IV drips and deto...,https://domf5oio6qrcr.cloudfront.net/medialibr...,https://www.health.harvard.edu/blog/immune-boo...,Staying Healthy,"\n Published September 11, 2023\n",Ads for products that promise to supercharge t...
1,"After prostate cancer treatment, a new standar...",https://domf5oio6qrcr.cloudfront.net/medialibr...,https://www.health.harvard.edu/blog/after-pros...,Men's Health,"\n Published December 20, 2023\n",When prostate cancer recurs after initial trea...


### Rename columns

In [59]:
df.columns

Index(['title', 'imageURL', 'articleURL', 'category', 'publishedDate',
       'content'],
      dtype='object')

In [57]:
df = df.rename(columns={"Title1": "title",
                   "Image1": "imageURL",
                   "inlineblock_URL": "articleURL",
                   "fontbold1": "category",
                   "block1": "publishedDate",
                   "Field1": "content"
                  })

In [60]:
df.head(2)

Unnamed: 0,title,imageURL,articleURL,category,publishedDate,content
0,Immune boosts or busts? From IV drips and deto...,https://domf5oio6qrcr.cloudfront.net/medialibr...,https://www.health.harvard.edu/blog/immune-boo...,Staying Healthy,"\n Published September 11, 2023\n",Ads for products that promise to supercharge t...
1,"After prostate cancer treatment, a new standar...",https://domf5oio6qrcr.cloudfront.net/medialibr...,https://www.health.harvard.edu/blog/after-pros...,Men's Health,"\n Published December 20, 2023\n",When prostate cancer recurs after initial trea...


### Pre-process

In [62]:
df.publishedDate[0]

'\n            Published September 11, 2023\n    '

- remove the newline character ("\n"), spaces, and the word "Published|Updated" from the "df['publishedDate']" column

In [66]:
df['publishedDate'] = df['publishedDate'].str.replace(r'\n|\s|Published|Updated', '', regex=True)

In [67]:
df['publishedDate']

0      September11,2023
1       December20,2023
2       December19,2023
3       December18,2023
4         January5,2022
             ...       
182       August17,2022
183       August16,2022
184       August15,2022
185       August11,2022
186       August23,2022
Name: publishedDate, Length: 187, dtype: object

### Check NAN Values

In [68]:
df.isna().sum().sum()

0

In [69]:
df.head()

Unnamed: 0,title,imageURL,articleURL,category,publishedDate,content
0,Immune boosts or busts? From IV drips and deto...,https://domf5oio6qrcr.cloudfront.net/medialibr...,https://www.health.harvard.edu/blog/immune-boo...,Staying Healthy,"September11,2023",Ads for products that promise to supercharge t...
1,"After prostate cancer treatment, a new standar...",https://domf5oio6qrcr.cloudfront.net/medialibr...,https://www.health.harvard.edu/blog/after-pros...,Men's Health,"December20,2023",When prostate cancer recurs after initial trea...
2,Chronic fatigue syndrome is rising,https://domf5oio6qrcr.cloudfront.net/medialibr...,https://www.health.harvard.edu/blog/chronic-fa...,Diseases & Conditions,"December19,2023",The CDC estimates that 3.3 million Americans h...
3,How healthy is sugar alcohol?,https://domf5oio6qrcr.cloudfront.net/medialibr...,https://www.health.harvard.edu/blog/how-health...,Nutrition,"December18,2023",Food products advertised as being lower in sug...
4,"Are poinsettias, mistletoe, or holly plants da...",https://domf5oio6qrcr.cloudfront.net/medialibr...,https://www.health.harvard.edu/blog/are-poinse...,Staying Healthy,"January5,2022",It's commonly believed that poinsettia plants ...


# Convert text to Vector using S-BERT Model

In [73]:
model = SentenceTransformer('all-mpnet-base-v2')

In [75]:
df["description_vector"] = df["content"].progress_apply(lambda x: model.encode(x))

100%|████████████████████████████████████████████████████████████████████████████████| 187/187 [00:26<00:00,  7.17it/s]


In [76]:
df.head(2)

Unnamed: 0,title,imageURL,articleURL,category,publishedDate,content,description_vector
0,Immune boosts or busts? From IV drips and deto...,https://domf5oio6qrcr.cloudfront.net/medialibr...,https://www.health.harvard.edu/blog/immune-boo...,Staying Healthy,"September11,2023",Ads for products that promise to supercharge t...,"[0.03048955, 0.015904073, -0.009512083, 0.0183..."
1,"After prostate cancer treatment, a new standar...",https://domf5oio6qrcr.cloudfront.net/medialibr...,https://www.health.harvard.edu/blog/after-pros...,Men's Health,"December20,2023",When prostate cancer recurs after initial trea...,"[0.03491495, 0.05657588, -0.00996043, -0.02672..."


In [77]:
es.ping()

True

# Create new index

In [78]:
from indexMapping import indexMapping

In [94]:
# es.indices.create(index="all_documents", mappings= indexMapping)

In [98]:
index_name = "medical_documents"
if not es.indices.exists(index=index_name):
    es.indices.create(index=index_name, mappings=indexMapping)
    print(f"Index '{index_name}' created successfully.")
else:
    print(f"Index '{index_name}' already exists.")

Index 'medical_documents' created successfully.


In [99]:
records_list = df.to_dict("records")

In [100]:
len(records_list)

187

In [101]:
for record in records_list:
    try:
        es.index(index= index_name, document=record)
    except Exception as e:
        print(e)

In [102]:
es.count(index= index_name)

ObjectApiResponse({'count': 182, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}})

# Searching documents:

In [103]:
df.columns

Index(['title', 'imageURL', 'articleURL', 'category', 'publishedDate',
       'content', 'description_vector'],
      dtype='object')

In [105]:
user_input = "toys"
vector_input = model.encode(user_input)

query = {
    "field": "description_vector",
    "query_vector": vector_input,
    "k":5,
    "num_candidates": 200,
}

res = es.knn_search(index= index_name , knn=query, source=['title', 'imageURL', 'articleURL', 'category', 'publishedDate',
       'content'])

results = res["hits"]["hits"]

for result in results:
    if "_source" in result:
        try:
            print(f"Document score: {result['_score']}")
            print(f"Document Title: {result['_source']['title']}")
            print(f"Document Text: {result['_source']['content']}")
            print(50*"_")
        except Exception as e:
            print(e)
    

# results
# relevants_docs
# score = res["hits"][:max_score]
# score

Document score: 0.51011306
Document Title: Magnets, sound, and batteries: Choosing safe toys
Document Text: If you're choosing gifts to give or donate to children this year, be sure safety is on the list. Here are tips for toys to consider — those that encourage creativity, imagination, and movement  — and toys to try to avoid, due to safety concerns or for other reasons.
__________________________________________________
Document score: 0.42306823
Document Title: Play helps children practice key skills and build their strengths
Document Text: As devices become more pervasive, and as many children become more scheduled with lessons and organized activities, making time for device-free play can be a challenge. Here's why it's important prioritize free play in a child's life.
__________________________________________________
Document score: 0.40329963
Document Title: What happens when a drug goes viral?
Document Text: 
__________________________________________________
Document score: 0

  res = es.knn_search(index= index_name , knn=query, source=['title', 'imageURL', 'articleURL', 'category', 'publishedDate',
