# Setup and Load dataset



In [None]:
%%capture
# Install the Elasticsearch Python client at a pinned version.
# (The Elasticsearch server itself is downloaded separately, further down.)
!pip install elasticsearch==7.14.0
# Install a JDK; apt's stdout is discarded, and %%capture hides the rest of
# this cell's output.
!apt install default-jdk > /dev/null

In [None]:
!pip install sentence_transformers 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 KB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m65.0 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m38.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub>=0.4.0
  Downloading huggingface_hub-0.12.0-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3

In [None]:
!pip install --upgrade --no-cache-dir gdown

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gdown
  Downloading gdown-4.6.2-py3-none-any.whl (14 kB)
Installing collected packages: gdown
  Attempting uninstall: gdown
    Found existing installation: gdown 4.4.0
    Uninstalling gdown-4.4.0:
      Successfully uninstalled gdown-4.4.0
Successfully installed gdown-4.6.2


In [None]:
!gdown --id 1pxg131HVz6t1lGPDiT70SIf8LnVJlVy4

Downloading...
From: https://drive.google.com/uc?id=1pxg131HVz6t1lGPDiT70SIf8LnVJlVy4
To: /content/medium_data.zip
100% 520k/520k [00:00<00:00, 159MB/s]


In [None]:
!unzip -o "medium_data.zip"  -d  "/content"

Archive:  medium_data.zip
  inflating: /content/medium_data.csv  


In [None]:
import os
# Send signal 9 (SIGKILL) to the kernel's own process.
# NOTE(review): presumably this forces the Colab runtime to restart so the
# freshly installed packages are picked up — confirm. Execution stops here,
# so "Run All" will not continue past this cell; the cells below have to be
# run again after the restart.
os.kill(os.getpid(), 9)

## Data Loading

In [None]:
from datetime import datetime
import os
import matplotlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')

from scipy import sparse
from scipy.sparse import csc_matrix
from sklearn.decomposition import TruncatedSVD

np.random.seed(0)

#### Name of the file that contains all the item properties

In [None]:
file="medium_data.csv"

###### Run the cell below

In [None]:
# Load the comma-separated item table into a DataFrame.
content_df = pd.read_csv(file)

In [None]:
content_df.head(1)

Unnamed: 0,id,url,title,subtitle,image,claps,responses,reading_time,publication,date
0,1,https://towardsdatascience.com/a-beginners-gui...,A Beginner’s Guide to Word Embedding with Gens...,,1.png,850,8,8,Towards Data Science,2019-05-30


## Details about dataset

In [None]:
itemid="id"

In [None]:
# Text columns that are concatenated into the searchable field.
features = ["title", "subtitle"]

In [None]:
# Item id column first, followed by every feature column.
allcols = [itemid, *features]

# Setup

In [None]:
# Build one free-text field per item by concatenating the feature columns.
# Each column's text is prefixed with a single space, so the result starts
# with a leading space (removed later only if the cleaning step strips it).
content_df['NewTag'] = ""
for col in features:
    # Missing values become a single space so the concatenation never yields NaN.
    content_df[col] = content_df[col].fillna(' ')
    content_df['NewTag'] += ' ' + content_df[col]
content_df['NewTag'] = content_df['NewTag'].astype(str)

In [None]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [None]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re

In [None]:
def clean_text(text):
    """Normalise a raw string for indexing.

    The text is lower-cased, then each of the following is replaced by a
    single space, in this order: every "'s" sequence, every CR-LF pair, and
    every character that is neither a word character nor whitespace.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string. Runs of spaces are not collapsed.
    """
    cleaned = text.lower()
    for pattern in ("\'s", "\\r\\n", r"[^\w\s]"):
        cleaned = re.sub(pattern, ' ', cleaned)
    return cleaned

In [None]:
# English stop words as a set, for fast membership tests.
# The corpus reader is re-imported under a private alias because this
# assignment rebinds the name `stopwords` from the NLTK module to a set;
# without the alias, running this cell a second time would fail with
# AttributeError ('set' object has no attribute 'words').
from nltk.corpus import stopwords as _stopwords_corpus
stopwords = set(_stopwords_corpus.words('english'))

In [None]:
def tokenizer(sentence, min_words=4, max_words=200, stopwords=stopwords, lemmatize=True):
    """Split a sentence into filtered (and optionally lemmatized) tokens.

    Args:
        sentence: Text to tokenize with NLTK's ``word_tokenize``.
        min_words: Exclusive lower bound on token length in characters
            (despite the name, it is compared against ``len(token)``).
        max_words: Exclusive upper bound on token length in characters.
        stopwords: Collection of tokens to discard.
        lemmatize: When true, each token is passed through
            ``WordNetLemmatizer`` before filtering.

    Returns:
        The list of tokens whose length lies strictly between ``min_words``
        and ``max_words`` and that are not in ``stopwords``.
    """
    words = word_tokenize(sentence)
    if lemmatize:
        lemmatizer = WordNetLemmatizer()
        words = [lemmatizer.lemmatize(w) for w in words]
    # Return the filtered list; the previous version built it but then
    # returned the unfiltered tokens, so no filtering ever took effect.
    return [w for w in words
            if min_words < len(w) < max_words and w not in stopwords]

In [None]:
# Normalise the combined text of every row into the `clean` column.
content_df['clean'] = content_df['NewTag'].map(clean_text)
# Token-level preprocessing is currently disabled:
# content_df['token_lem_sentence'] = content_df['clean'].apply(
#         lambda x: tokenizer(x))

# Elasticsearch setup

In [None]:
import elasticsearch
from elasticsearch import Elasticsearch
from elasticsearch import helpers


In [None]:
# Download & extract Elasticsearch 7.0.0

# -q silences wget's progress output.
!wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.0.0-linux-x86_64.tar.gz -q
# Unpack into ./elasticsearch-7.0.0
!tar -xzf elasticsearch-7.0.0-linux-x86_64.tar.gz
# Hand the extracted tree to the `daemon` user and group (recursively); the
# server is later launched under uid 1 rather than as root.
!chown -R daemon:daemon elasticsearch-7.0.0

In [None]:
# Creating daemon instance of elasticsearch
# Launch the Elasticsearch server as a background child process of the kernel.
import os
from subprocess import Popen, PIPE, STDOUT
# stderr is merged into stdout and stdout is attached to a pipe.
# NOTE(review): none of the visible cells read es_server.stdout; if the
# server writes more than the OS pipe buffer holds it could block — confirm,
# or redirect the output to a log file / DEVNULL.
es_server = Popen(['elasticsearch-7.0.0/bin/elasticsearch'], 
                  stdout=PIPE, stderr=STDOUT,
                  preexec_fn=lambda: os.setuid(1)  # as daemon
                 )

In [None]:
# This part is important, since it takes a little amount of time for instance to load
# Poll the HTTP endpoint until the node answers, instead of sleeping for a
# fixed 20 seconds: this returns as soon as the server is reachable and
# fails loudly if it never comes up.
import http.client
import time
import urllib.request

ES_STARTUP_TIMEOUT = 120  # seconds to wait before giving up
_es_deadline = time.monotonic() + ES_STARTUP_TIMEOUT
while True:
    try:
        with urllib.request.urlopen("http://localhost:9200/", timeout=2):
            break
    except (OSError, http.client.HTTPException):
        # Connection refused / reset / timed out: the node is not ready yet.
        if time.monotonic() >= _es_deadline:
            raise TimeoutError(
                f"Elasticsearch did not respond on localhost:9200 within "
                f"{ES_STARTUP_TIMEOUT} seconds"
            )
        time.sleep(1)

In [None]:
%%bash
# List every process whose command line mentions "elasticsearch".
# A java process owned by `daemon` indicates the server was launched; the
# grep command itself may also show up in the listing.
ps -ef | grep elasticsearch

daemon      1867    1677 99 04:29 ?        00:00:21 /content/elasticsearch-7.0.0/jdk/bin/java -Xms1g -Xmx1g -XX:+UseConcMarkSweepGC -XX:CMSInitiatingOccupancyFraction=75 -XX:+UseCMSInitiatingOccupancyOnly -Des.networkaddress.cache.ttl=60 -Des.networkaddress.cache.negative.ttl=10 -XX:+AlwaysPreTouch -Xss1m -Djava.awt.headless=true -Dfile.encoding=UTF-8 -Djna.nosys=true -XX:-OmitStackTraceInFastThrow -Dio.netty.noUnsafe=true -Dio.netty.noKeySetOptimization=true -Dio.netty.recycler.maxCapacityPerThread=0 -Dlog4j.shutdownHookEnabled=false -Dlog4j2.disable.jmx=true -Djava.io.tmpdir=/tmp/elasticsearch-11574500148561208233 -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=data -XX:ErrorFile=logs/hs_err_pid%p.log -Xlog:gc*,gc+age=trace,safepoint:file=logs/gc.log:utctime,pid,tags:filecount=32,filesize=64m -Djava.locale.providers=COMPAT -Dio.netty.allocator.type=unpooled -Des.path.home=/content/elasticsearch-7.0.0 -Des.path.conf=/content/elasticsearch-7.0.0/config -Des.distribution.flavor=default

In [None]:
# Check if elasticsearch is running
!curl -sX GET "localhost:9200/"

In [None]:
# Create a client for the server listening on localhost:9200.
es = Elasticsearch(hosts = [{"host":"localhost", "port":9200}])
# Check that Python can reach Elasticsearch. ping() is the last expression in
# the cell, so its boolean result is displayed as the cell output.
es.ping()

True

# Search BM25

In [None]:
# Define settings & mappings of Elasticsearch index
# Index definition: one primary shard, no replicas, and two fields
# ("id" and "longtext") that are both mapped as `text`.
_text_field = {"type": "text"}
Settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {
        "properties": {
            "id": dict(_text_field),
            "longtext": dict(_text_field),
        }
    },
}

In [None]:
indexname='new_index1'
index_type='_doc'

In [None]:

# Create the index using the settings and mappings in `Settings`.
# ignore=[400,404] makes the client return the error response for those HTTP
# statuses instead of raising.
# NOTE(review): presumably this is so that re-running the cell when the index
# already exists does not fail; in that case the existing index (and its
# mapping) is left untouched — confirm that is the intent.
index = es.indices.create(index=indexname, ignore=[400,404], body=Settings)


In [None]:
# One bulk-index action per DataFrame row: the item id plus its cleaned text.
json_list = [
    {
        '_index': indexname,
        '_type': index_type,
        '_source': {
            itemid: row[itemid],
            'longtext': row['clean'],
        },
    }
    for _, row in content_df.iterrows()
]

In [None]:
# For importing Data to elasticsearch we use elasticsearch's bulk API from elasticsearch.helpers
# Send all prepared actions through the bulk helper from elasticsearch.helpers.
# A failure is reported on stdout rather than propagated.
try:
    res = helpers.bulk(es, json_list)
except Exception as exc:
    print(f"error: {exc}")
else:
    print("successfully imported to elasticsearch.")



successfully imported to elasticsearch.


In [None]:
search_query="graph neural network"

In [None]:
# Complicated query
# Full-text search: up to 20 hits whose `longtext` field matches the query
# string. Further clauses (e.g. a "should" list) could be added next to "must".
search_body = {
    "size": 20,
    "query": {
        "bool": {
            "must": [{"match": {"longtext": search_query}}],
        }
    },
}
query = es.search(index=indexname, body=search_body)

# Flatten the list of hits into a DataFrame and display it.
output = pd.json_normalize(query['hits']['hits'])
output

Unnamed: 0,_index,_type,_id,_score,_source.id,_source.longtext
0,new_index1,_doc,JbUWSYYBHhi0keru38uW,11.091132,5860,sep how to build a simple neural network from...
1,new_index1,_doc,rbUWSYYBHhi0keru2775,10.660841,2668,sep an introduction to convolutional neural n...
2,new_index1,_doc,IbUWSYYBHhi0keru2ree,10.546149,736,sep neural networks training with approximate...
3,new_index1,_doc,FrUWSYYBHhi0keru2ree,9.416538,725,sep exploring how neural networks work and ma...
4,new_index1,_doc,hbUWSYYBHhi0keru3cbZ,9.359104,4676,sep practical graph neural networks for molec...
5,new_index1,_doc,UrUWSYYBHhi0keru4Mwo,9.14432,6161,sep neural network optimization sep covering ...
6,new_index1,_doc,ZrUWSYYBHhi0keru27oa,8.81608,1573,sep kohonen self organizing maps sep a specia...
7,new_index1,_doc,KrUWSYYBHhi0keru38uW,8.81608,5865,sep finding the right architecture for neural...
8,new_index1,_doc,JrUWSYYBHhi0keru3cfZ,8.771564,4837,sep evolution of graph neural networks for re...
9,new_index1,_doc,zrUWSYYBHhi0keru2rjY,8.510588,1165,sep is relu reluvant sep questioning very ba...
