# Setup and Load dataset



In [None]:
# Install the sentence-transformers package (imported later as `sentence_transformers`).
!pip install sentence_transformers 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 KB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m63.7 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m47.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub>=0.4.0
  Downloading huggingface_hub-0.12.0-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3

In [None]:
# Upgrade gdown (the downloader CLI used in the next cell), bypassing pip's cache.
!pip install --upgrade --no-cache-dir gdown

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gdown
  Downloading gdown-4.6.2-py3-none-any.whl (14 kB)
Installing collected packages: gdown
  Attempting uninstall: gdown
    Found existing installation: gdown 4.4.0
    Uninstalling gdown-4.4.0:
      Successfully uninstalled gdown-4.4.0
Successfully installed gdown-4.6.2


In [None]:
!gdown --id 15hmSSQBP0hPCJHrQRBjyfqd_bPU29Rwr

Downloading...
From: https://drive.google.com/uc?id=15hmSSQBP0hPCJHrQRBjyfqd_bPU29Rwr
To: /content/MIND.zip
100% 44.4M/44.4M [00:01<00:00, 30.6MB/s]


In [None]:
# Extract the downloaded archive into /content; -o overwrites existing files without prompting.
!unzip -o "MIND.zip"  -d  "/content"

Archive:  MIND.zip
  inflating: /content/MIND/behaviors.tsv  
  inflating: /content/MIND/news.tsv  


## Data Loading

In [None]:
from datetime import datetime
import os
import matplotlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's 'whitegrid' style globally.
sns.set_style('whitegrid')

from scipy import sparse
from scipy.sparse import csc_matrix
from sklearn.decomposition import TruncatedSVD

# Seed NumPy's global random number generator.
np.random.seed(0)

#### Name of the file that contains all the item properties

In [None]:
# Relative path of the news (item metadata) file extracted from MIND.zip.
file="MIND/news.tsv"

###### Run the cell below

In [None]:
# Load the item metadata from the path configured in `file` instead of repeating
# the literal, so changing `file` actually changes what is loaded.
# `names` labels the columns (with explicit names pandas does not treat the first
# row as a header); `usecols` keeps only the columns this notebook uses.
content_df = pd.read_table(
    file,
    names=['newid', 'vertical', 'subvertical', 'title',
           'abstract', 'url', 'entities in title', 'entities in abstract'],
    usecols=['newid', 'vertical', 'subvertical', 'title', 'abstract'],
)

In [None]:
# Preview the first rows of the loaded frame as a quick sanity check
# (replaces a self-assignment that had no effect).
content_df.head()

## Details about the dataset

In [None]:
# Name of the column that holds each item's identifier.
itemid="newid"

In [None]:
# Text columns that get concatenated into the document text to embed.
features=['abstract']

In [None]:
# Columns shown when displaying results: the id column followed by every feature column.
allcols = [itemid, *features]

# Setup

In [None]:
# Build the text to embed: replace missing feature values with a blank, then
# join every feature column into one space-prefixed string per row.
for column in features:
    content_df[column] = content_df[column].fillna(' ')
content_df['NewTag'] = ""
for column in features:
    content_df['NewTag'] = content_df['NewTag'] + (" " + content_df[column])
content_df['NewTag'] = content_df['NewTag'].astype(str)

In [None]:
import nltk
# Fetch the NLTK resources for the text-preprocessing helpers defined below.
nltk.download('stopwords')  # stop-word lists, used to build the `stopwords` set
nltk.download('punkt')  # tokenizer data; presumably needed by word_tokenize (confirm)
nltk.download('wordnet')  # WordNet corpus; presumably needed by WordNetLemmatizer (confirm)
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re

In [None]:
def clean_text(text):
    """Return a lower-cased copy of ``text`` with noise replaced by spaces.

    Three substitutions are applied in order, each inserting a single space:
    possessive ``'s``, carriage-return/line-feed pairs, and every character
    that is neither a word character nor whitespace.
    """
    cleaned = text.lower()
    for pattern in ("\'s", "\\r\\n", r"[^\w\s]"):
        cleaned = re.sub(pattern, ' ', cleaned)
    return cleaned

In [None]:
# Import the corpus reader under a private alias: `stopwords` is rebound to a plain
# set below, so calling `stopwords.words(...)` on that name would raise AttributeError
# whenever this cell is executed a second time.
from nltk.corpus import stopwords as _stopwords_corpus
stopwords = set(_stopwords_corpus.words('english'))

In [None]:
def tokenizer(sentence, min_words=4, max_words=200, stopwords=stopwords, lemmatize=True):
    """Tokenize ``sentence`` and return only the tokens that pass the filter.

    Parameters
    ----------
    sentence : str
        Text to split with ``word_tokenize``.
    min_words : int
        Exclusive lower bound on a token's character length (despite the name).
    max_words : int
        Exclusive upper bound on a token's character length.
    stopwords : container of str
        Tokens to drop; defaults to the module-level ``stopwords`` object as
        bound at the time this function is defined.
    lemmatize : bool
        When true, each token is passed through ``WordNetLemmatizer.lemmatize``
        before filtering.

    Returns
    -------
    list of str
        The filtered tokens, in their original order.
    """
    words = word_tokenize(sentence)
    if lemmatize:
        lemmatizer = WordNetLemmatizer()
        words = [lemmatizer.lemmatize(w) for w in words]
    # Return the filtered list; previously it was computed but the unfiltered
    # tokens were returned, so the length and stop-word filters had no effect.
    return [w for w in words
            if min_words < len(w) < max_words and w not in stopwords]

In [None]:
# Normalise the concatenated text with clean_text and store it in a new 'clean' column.
content_df['clean'] = content_df['NewTag'].apply(clean_text)
# Disabled: optional tokenization pass over the cleaned text.
# content_df['token_lem_sentence'] = content_df['clean'].apply(
#         lambda x: tokenizer(x))

# Model loading and corpus embedding

In [None]:
from sentence_transformers import SentenceTransformer

# Load the pretrained sentence-embedding model by name.
# NOTE(review): the model name suggests it was tuned for dot-product scoring, while the
# hnswlib index in this notebook is built with space='cosine'. Confirm this is intended.
model = SentenceTransformer('msmarco-distilbert-base-dot-prod-v3')


Downloading (…)b6d67/.gitattributes:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)/2_Dense/config.json:   0%|          | 0.00/115 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

Downloading (…)13d78b6d67/README.md:   0%|          | 0.00/2.35k [00:00<?, ?B/s]

Downloading (…)d78b6d67/config.json:   0%|          | 0.00/554 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/265M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)b6d67/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/376 [00:00<?, ?B/s]

Downloading (…)13d78b6d67/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)78b6d67/modules.json:   0%|          | 0.00/341 [00:00<?, ?B/s]

In [None]:
# Embed every cleaned document; the result is returned as a NumPy array.
corpus_embeddings = model.encode(
    content_df['clean'].values,
    show_progress_bar=True,
    convert_to_numpy=True,
)

Batches:   0%|          | 0/1603 [00:00<?, ?it/s]

# hnswlib

In [None]:
!pip install hnswlib

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting hnswlib
  Downloading hnswlib-0.7.0.tar.gz (33 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: hnswlib
  Building wheel for hnswlib (pyproject.toml) ... [?25l[?25hdone
  Created wheel for hnswlib: filename=hnswlib-0.7.0-cp38-cp38-linux_x86_64.whl size=2122853 sha256=58d8b10e028f089da2863cc528f4e9975097d26169201724c12d781623195533
  Stored in directory: /root/.cache/pip/wheels/93/0d/13/bbdc55499ef621f8f722fad91050fbb1380709f0c62fa7719a
Successfully built hnswlib
Installing collected packages: hnswlib
Successfully installed hnswlib-0.7.0


In [None]:
import hnswlib
import torch

In [None]:

# n_trees=100
# annoy_index = AnnoyIndex(embedding_size, 'angular')

# for i in range(len(corpus_embeddings)):
#         annoy_index.add_item(i, corpus_embeddings[i])

# annoy_index.build(n_trees)


True

In [None]:

# Build an HNSW approximate-nearest-neighbour index over the corpus embeddings.
embedding_size = corpus_embeddings.shape[1]
index_path = 'hnswlib'
index = hnswlib.Index(space='cosine', dim=embedding_size)
index.init_index(max_elements=len(corpus_embeddings), ef_construction=400, M=64)
# Labels are the row positions of the embeddings, so ids returned by a query
# can be used directly as positional indices into the corpus.
index.add_items(corpus_embeddings, list(range(len(corpus_embeddings))))
# Widen the query-time candidate list. `ef` drives the recall/speed trade-off and
# must be at least the k passed to knn_query; it was previously never set, which
# leaves it at the library default and costs recall.
index.set_ef(50)
index.save_index(index_path)

In [None]:
# Free-text query, embedded with the same model as the corpus.
# NOTE(review): the corpus was embedded from clean_text output but the query is encoded raw. Confirm this is intended.
query="Shop the notebooks, jackets"
query_embedding = model.encode(query)


In [None]:
# Retrieve the TOP_K approximate nearest neighbours of the query.
TOP_K = 10
corpus_ids, distances = index.knn_query(query_embedding, k=TOP_K)
# Turn each returned distance d into a score of 1 - d and rank best-first.
hits = sorted(
    ({'corpus_id': cid, 'score': 1 - dist}
     for cid, dist in zip(corpus_ids[0], distances[0])),
    key=lambda hit: hit['score'],
    reverse=True,
)

In [None]:
# Collect the ids of the ranked hits, then print the cleaned text of each one.
hnswlib_hit_corpus = [hit['corpus_id'] for hit in hits[:TOP_K]]
print("Results:")
for corpus_id in hnswlib_hit_corpus:
    print("\t\t{}".format(content_df.clean.values[corpus_id]))



Results:
		 i have done flights where passports  laptops  handbags  glasses  crutches and all manner of clothing has been forgotten 
		 you ll see lots of laptop deals in the coming weeks  doorbusters for notebooks under  500  maybe under  400 or  300 if you re lucky  and while there  no shame in putting on three layers in the cold on black friday morning only to elbow someone in best buy in the name of a bargain  no really  it can be fun   these aren t typically the models we d recommend if it didn t happen to be deals season  for our holiday gift guide  we skipped straight to   
		 a small new jersey bookshop got a visit from punk poet laureate patti smith over the weekend  smith on sunday afternoon stopped by haddonfield  inkwood books in what owner julie beddingfied called a  chance encounter  after seeing a copy of her own new book year of the monkey in the shop  window  smith  who grew up in germantown and south jersey  is back in the area for an appearance monday    
		 taylor s

In [None]:
# Map the ANN hit positions to item ids, then display the matching rows.
itemids = [content_df[itemid].iloc[i] for i in hnswlib_hit_corpus]
content_df.loc[content_df[itemid].isin(itemids), allcols]

Unnamed: 0,newid,abstract
20510,N6842,"I have done flights where passports, laptops, ..."
21338,N41207,Renee Steinaker said she was working on a flig...
22720,N59962,We sorted through Amazon's gift guides to find...
25380,N46510,A small New Jersey bookshop got a visit from p...
27318,N17559,The teacher is shown in a video posted to Twit...
29676,N63411,Meghan Markle stepped out for an important gue...
32279,N45871,Best Buy's Black Friday doorbuster deals conti...
34442,N7090,Taylor Swift made a surprise appearance at Blu...
44461,N57898,Getting back to the days when the MacBook Air ...
46408,N48375,You'll see lots of laptop deals in the coming ...


# Compare hnswlib results with Cosine similarity
- Approximate Nearest Neighbor (ANN) search is not exact: it might miss entries with high cosine similarity.

In [None]:
from sentence_transformers import util

# Exact top-K search over all corpus embeddings; used as the reference that the
# ANN results are compared against.
correct_hits = util.semantic_search(
    query_embedding,
    corpus_embeddings,
    top_k=TOP_K,
)[0]
correct_hits_ids = {hit['corpus_id'] for hit in correct_hits}

In [None]:
# Display the corpus ids returned by the exact search.
correct_hits_ids

{0, 535, 11143, 20510, 24802, 25544, 32078, 34074, 40905, 46408}

In [None]:
# Map the exact-search hit positions to item ids, then display the matching rows.
corect_hit_corpus = [hit['corpus_id'] for hit in correct_hits[:TOP_K]]
itemids = [content_df[itemid].iloc[i] for i in corect_hit_corpus]
content_df.loc[content_df[itemid].isin(itemids), allcols]


Unnamed: 0,newid,abstract
0,N55528,"Shop the notebooks, jackets, and more that the..."
535,N44067,"Shop these self-care friendly finds, plus info..."
11143,N65055,An Indiana man has a clerk in a clothing store...
20510,N6842,"I have done flights where passports, laptops, ..."
24802,N8724,"Today's open discussion thread, complete with ..."
25544,N45409,"Today's open discussion thread, complete with ..."
32078,N30995,"Today's open discussion thread, complete with ..."
34074,N8866,"Today's open discussion thread, complete with ..."
40905,N33064,"We hereby declare the internet utterly broken,..."
46408,N48375,You'll see lots of laptop deals in the coming ...


In [None]:
# Convert the ANN hit ids to a set so they can be intersected with the exact ids, then display it.
hnswlib_hit_corpus=set(hnswlib_hit_corpus)
hnswlib_hit_corpus

{20510, 21338, 22720, 25380, 27318, 29676, 32279, 34442, 44461, 46408}

In [None]:

# Recall = share of the exact top-K ids that the ANN search also returned.
n_shared = len(hnswlib_hit_corpus & correct_hits_ids)
recall = n_shared / len(correct_hits_ids)
print("recall: "+str(recall))

recall: 0.2
