# Setup and Load dataset



In [None]:
# Environment setup: install the sentence-transformers package that provides the
# SentenceTransformer model and `util` helpers imported later in this notebook.
!pip install sentence_transformers 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 KB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m58.6 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m31.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub>=0.4.0
  Downloading huggingface_hub-0.12.0-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.

In [None]:
# Upgrade gdown (bypassing pip's cache); it is used next to fetch the dataset archive.
!pip install --upgrade --no-cache-dir gdown

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gdown
  Downloading gdown-4.6.2-py3-none-any.whl (14 kB)
Installing collected packages: gdown
  Attempting uninstall: gdown
    Found existing installation: gdown 4.4.0
    Uninstalling gdown-4.4.0:
      Successfully uninstalled gdown-4.4.0
Successfully installed gdown-4.6.2


In [None]:
!gdown --id 15hmSSQBP0hPCJHrQRBjyfqd_bPU29Rwr

Downloading...
From: https://drive.google.com/uc?id=15hmSSQBP0hPCJHrQRBjyfqd_bPU29Rwr
To: /content/MIND.zip
100% 44.4M/44.4M [00:00<00:00, 144MB/s]


In [None]:
# Extract MIND.zip into /content, overwriting existing files without prompting (-o).
!unzip -o "MIND.zip"  -d  "/content"

Archive:  MIND.zip
  inflating: /content/MIND/behaviors.tsv  
  inflating: /content/MIND/news.tsv  


## Data Loading

In [None]:
from datetime import datetime
import os
import matplotlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's 'whitegrid' style to plots.
sns.set_style('whitegrid')

from scipy import sparse
from scipy.sparse import csc_matrix
from sklearn.decomposition import TruncatedSVD

# Seed NumPy's global random number generator with a fixed value.
np.random.seed(0)

#### Name of the file which contains all the item properties

In [None]:
# Relative path of the TSV file holding the item (news article) properties.
file = "MIND/news.tsv"

###### Run the cell below

In [None]:
# Load the item catalogue from the path configured in `file` (previously the path
# literal was repeated here, so changing `file` had no effect).
# Column names are supplied explicitly for all eight fields; only the five listed
# in `usecols` are kept in the resulting frame.
content_df = pd.read_table(
    file,
    names=['newid', 'vertical', 'subvertical', 'title',
           'abstract', 'url', 'entities in title', 'entities in abstract'],
    usecols=['newid', 'vertical', 'subvertical', 'title', 'abstract'],
)

In [None]:
# NOTE(review): this self-assignment has no effect on content_df. It looks like a
# leftover placeholder (e.g. for sampling/filtering) — confirm and remove if unneeded.
content_df=content_df

## Details about dataset

In [None]:
# Column of content_df that uniquely names each item.
itemid = "newid"

In [None]:
# Text columns that are concatenated to form the searchable text of each item.
features = ['abstract']

In [None]:
# Columns shown when displaying results: the item id followed by every feature column.
allcols = [itemid, *features]

# Setup

In [None]:
# Replace missing feature values with a single space so concatenation never meets NaN.
for col in features:
    content_df[col] = content_df[col].fillna(' ')

# Join the feature columns into one text field, each value prefixed by a space,
# and store it as strings in the 'NewTag' column.
combined = pd.Series("", index=content_df.index)
for col in features:
    combined = combined + (" " + content_df[col])
content_df['NewTag'] = combined.astype(str)

In [None]:
import nltk
# Download the NLTK resources needed by the preprocessing helpers in this notebook:
# the stop-word lists, the 'punkt' tokenizer data and WordNet for the lemmatizer.
# NOTE(review): 'omw-1.4' is presumably required alongside WordNet by the installed
# NLTK version — confirm.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re

In [None]:
def clean_text(text):
    """Lowercase ``text`` and blank out possessive 's, CR-LF pairs and punctuation.

    Every match is replaced by one space, applied in that order. Whitespace is
    neither collapsed nor trimmed, so the result may contain runs of spaces.
    """
    cleaned = text.lower()
    for pattern in ("\'s", "\\r\\n", r"[^\w\s]"):
        cleaned = re.compile(pattern).sub(' ', cleaned)
    return cleaned

In [None]:
# Build the English stop-word set. The corpus reader is imported under a private
# alias because this cell rebinds the name `stopwords` to the set; reading from the
# alias keeps the cell re-runnable (the original raised AttributeError on a second
# run, since `stopwords` was by then a set without `.words`).
from nltk.corpus import stopwords as _nltk_stopwords
stopwords = set(_nltk_stopwords.words('english'))

In [None]:
def tokenizer(sentence, min_words=4, max_words=200, stopwords=stopwords, lemmatize=True):
    """Tokenize ``sentence`` and return only the tokens that pass the filters.

    Parameters
    ----------
    sentence : str
        Text to split with ``word_tokenize``.
    min_words, max_words : int
        Exclusive bounds on the character length of a kept token: a token is
        kept when ``min_words < len(token) < max_words``.
    stopwords : collection of str
        Tokens to discard. Defaults to the ``stopwords`` object in scope when
        this function is defined.
    lemmatize : bool
        If true, each token is passed through ``WordNetLemmatizer.lemmatize``
        before filtering.

    Returns
    -------
    list of str
        The filtered tokens, in their original order.
    """
    tokens = word_tokenize(sentence)
    if lemmatize:
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(w) for w in tokens]
    # Return the filtered list; the previous version built it but returned the
    # unfiltered tokens, so the length and stop-word filters had no effect.
    return [w for w in tokens
            if min_words < len(w) < max_words and w not in stopwords]

In [None]:
# Run clean_text over each concatenated text and keep the result in a 'clean' column.
content_df['clean'] = content_df['NewTag'].map(clean_text)
# Token column derived from the cleaned text — currently disabled:
# content_df['token_lem_sentence'] = content_df['clean'].apply(
#         lambda x: tokenizer(x))

# Load pretrained model and encode the corpus

In [None]:
from sentence_transformers import SentenceTransformer

# Load the sentence-embedding checkpoint by name; the files are downloaded on first use.
model = SentenceTransformer(
    model_name_or_path='msmarco-distilbert-base-dot-prod-v3'
)


Downloading (…)b6d67/.gitattributes:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)/2_Dense/config.json:   0%|          | 0.00/115 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

Downloading (…)13d78b6d67/README.md:   0%|          | 0.00/2.35k [00:00<?, ?B/s]

Downloading (…)d78b6d67/config.json:   0%|          | 0.00/554 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/265M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)b6d67/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/376 [00:00<?, ?B/s]

Downloading (…)13d78b6d67/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)78b6d67/modules.json:   0%|          | 0.00/341 [00:00<?, ?B/s]

In [None]:
# Embed every cleaned text in one call and keep the result as a tensor.
corpus_embeddings = model.encode(
    sentences=content_df['clean'].values,
    convert_to_tensor=True,
)

# Search with Cosine Similarity

In [None]:
from sentence_transformers import SentenceTransformer, util
import torch

In [None]:
# Free-text query to look up in the corpus.
query = "Shop the notebooks, jackets"

In [None]:
query_embedding = model.encode(query, convert_to_tensor=True)

# Score the query against every corpus embedding with cosine similarity and keep
# the 10 best matches. k is capped at the number of scores because torch.topk
# raises when asked for more elements than exist.
cos_scores = util.pytorch_cos_sim(query_embedding, corpus_embeddings)[0]
top_results = torch.topk(cos_scores, k=min(10, len(cos_scores)))

In [None]:
# Map each top-k corpus position back to its item id, best match first.
# top_results[1] holds the positions; they are converted to plain Python ints and
# looked up on the id column directly. The previous `content_df[[itemid]].iloc[idx][0]`
# relied on integer-position fallback on a label-indexed Series, which pandas has
# deprecated, and also extracted a score that was never used.
recommedations_list = [
    content_df[itemid].iloc[pos] for pos in top_results[1].cpu().tolist()
]

In [None]:
# Show the rows whose item id is among the recommendations (in DataFrame order).
content_df.loc[content_df[itemid].isin(recommedations_list)]

Unnamed: 0,newid,vertical,subvertical,title,abstract,NewTag,clean
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...","Shop the notebooks, jackets, and more that th...",shop the notebooks jackets and more that th...
535,N44067,lifestyle,shop-holidays,Shop 40 Creative Gifts for Mom on Amazon,"Shop these self-care friendly finds, plus info...","Shop these self-care friendly finds, plus inf...",shop these self care friendly finds plus inf...
11143,N65055,news,newsgoodnews,'It feels good to do the right thing': Store c...,An Indiana man has a clerk in a clothing store...,An Indiana man has a clerk in a clothing stor...,an indiana man has a clerk in a clothing stor...
20510,N6842,travel,travelnews,What Happens When You Leave Your Belongings Be...,"I have done flights where passports, laptops, ...","I have done flights where passports, laptops,...",i have done flights where passports laptops ...
24802,N8724,sports,icehockey_nhl,Monday Morning Fly By: Could everyone please s...,"Today's open discussion thread, complete with ...","Today's open discussion thread, complete with...",today open discussion thread complete with ...
25544,N45409,sports,icehockey_nhl,Tuesday Morning Fly By: Oh these guys again,"Today's open discussion thread, complete with ...","Today's open discussion thread, complete with...",today open discussion thread complete with ...
32078,N30995,sports,icehockey_nhl,Monday Morning Fly By: Third place baby!,"Today's open discussion thread, complete with ...","Today's open discussion thread, complete with...",today open discussion thread complete with ...
34074,N8866,sports,icehockey_nhl,Wednesday Morning Fly By: It's big test time,"Today's open discussion thread, complete with ...","Today's open discussion thread, complete with...",today open discussion thread complete with ...
40905,N33064,lifestyle,lifestylecelebstyle,Harry Styles's Princess Diana-Inspired Sheep S...,"We hereby declare the internet utterly broken,...",We hereby declare the internet utterly broken...,we hereby declare the internet utterly broken...
46408,N48375,news,newsscienceandtechnology,The best laptops and 2-in-1s to give as gifts,You'll see lots of laptop deals in the coming ...,You'll see lots of laptop deals in the coming...,you ll see lots of laptop deals in the coming...


# Semantic search


In [None]:
from sentence_transformers import SentenceTransformer, util
import torch

In [None]:
# Embed every cleaned text as a tensor for semantic search.
# NOTE(review): this repeats the encode call from the cosine-similarity section;
# kept so this section can be run on its own.
corpus_embeddings = model.encode(
    sentences=content_df['clean'].values,
    convert_to_tensor=True,
)


In [None]:
# Query text and its embedding (as a tensor) for the semantic-search call.
query = "Shop the notebooks, jackets"
query_embedding = model.encode(sentences=query, convert_to_tensor=True)


In [None]:

# Rank corpus entries against the query using dot-product scores.
hits = util.semantic_search(
    query_embeddings=query_embedding,
    corpus_embeddings=corpus_embeddings,
    score_function=util.dot_score,
)

In [None]:
# semantic_search returns one list of hit dicts per query, so iterate both levels.
# Indexing the outer list's elements with 'corpus_id' directly raises TypeError.
for query_hits in hits:
    for hit in query_hits:
        print(content_df.clean.values[hit['corpus_id']])


 shop the notebooks  jackets  and more that the royals can t live without 
 you ll see lots of laptop deals in the coming weeks  doorbusters for notebooks under  500  maybe under  400 or  300 if you re lucky  and while there  no shame in putting on three layers in the cold on black friday morning only to elbow someone in best buy in the name of a bargain  no really  it can be fun   these aren t typically the models we d recommend if it didn t happen to be deals season  for our holiday gift guide  we skipped straight to   
 i have done flights where passports  laptops  handbags  glasses  crutches and all manner of clothing has been forgotten 
 we hereby declare the internet utterly broken  courtesy of harry styles and his quirky printed sweater vest 
 a small new jersey bookshop got a visit from punk poet laureate patti smith over the weekend  smith on sunday afternoon stopped by haddonfield  inkwood books in what owner julie beddingfied called a  chance encounter  after seeing a copy o

# Search with FAISS

In [None]:
!pip install faiss-cpu
!pip install faiss-gpu

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting faiss-cpu
  Downloading faiss_cpu-1.7.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.0/17.0 MB[0m [31m24.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.7.3
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2


In [None]:
import faiss
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('msmarco-distilbert-base-dot-prod-v3')

# FAISS expects a C-contiguous float32 matrix with one row per document.
encoded_data = np.ascontiguousarray(
    model.encode(content_df.clean.tolist()), dtype='float32'
)

# Inner-product index wrapped in an ID map so each vector is stored under its row
# position in content_df. The dimension is taken from the matrix being indexed
# (not from the separate `corpus_embeddings` tensor), and ids are explicitly int64
# as required by add_with_ids.
index = faiss.IndexIDMap(faiss.IndexFlatIP(encoded_data.shape[1]))
index.add_with_ids(encoded_data, np.arange(len(content_df), dtype=np.int64))

# Persist the index to disk.
faiss.write_index(index, 'item.index')

### Fetch the item IDs of the top-k matching documents

In [None]:

def search(query, k, index, model):
    """Return the item ids of the ``k`` nearest documents to ``query``.

    Parameters
    ----------
    query : str
        Free-text query; encoded with ``model.encode``.
    k : int
        Number of neighbours requested from the index.
    index
        Index object exposing ``search(vectors, k)``; its ids are used as row
        positions in the module-level ``content_df``.
    model
        Encoder exposing ``encode(list_of_str)``.

    Returns
    -------
    list
        Values of the ``itemid`` column for the matches, best match first, with
        duplicates removed. May hold fewer than ``k`` entries.
    """
    qv = np.asarray(model.encode([query]), dtype='float32')
    _, neighbour_ids = index.search(qv, k)
    # Keep the rank order returned by the index (np.unique used to re-sort the ids
    # numerically) and drop the -1 placeholders FAISS emits when it finds fewer
    # than k neighbours, which .iloc would otherwise read as "last row".
    ranked_ids = dict.fromkeys(i for i in neighbour_ids[0].tolist() if i >= 0)
    return [content_df[itemid].iloc[i] for i in ranked_ids]

### search

In [None]:
from pprint import pprint
# Run the FAISS-backed search and display the id and feature columns of the matches.
query = "Shop the notebooks, jackets"
results = search(query, k=10, index=index, model=model)
content_df.loc[content_df[itemid].isin(results), allcols]

Unnamed: 0,newid,abstract
0,N55528,"Shop the notebooks, jackets, and more that the royals can't live without."
11143,N65055,An Indiana man has a clerk in a clothing store to thank for the return of several thousand dollars that he nearly lost in his jacket pocket.
20510,N6842,"I have done flights where passports, laptops, handbags, glasses, crutches and all manner of clothing has been forgotten."
25380,N46510,"A small New Jersey bookshop got a visit from punk poet laureate Patti Smith over the weekend. Smith on Sunday afternoon stopped by Haddonfield's Inkwood Books in what owner Julie Beddingfied called a ""chance encounter"" after seeing a copy of her own new book Year of the Monkey in the shop's window. Smith, who grew up in Germantown and South Jersey, is back in the area for an appearance Monday ..."
27318,N17559,"The teacher is shown in a video posted to Twitter Friday wearing a white turtleneck and black jacket, as Common did in a Microsoft commercial last year."
33108,N17126,"CLEVELAND, Ohio A couple of releases are set at Phoenix Brewing, which has its winter coat drive coming up, Heinen's and Market Garden Brewery have collaborated to mark the store's 90th anniversary, and many other beer-centric events are found in our calendar, which is updated each week. Deadline to send info on events is Thursday; email me. Most dinners do not include tax and tip. Always ..."
34042,N47454,"Checkers, Ruby Tuesday's and Hungry Howie's in Central Florida were among the restaurants temporarily closed by state inspectors over health violations in the past month. Thirteen Central Florida restaurants were temporarily closed between Oct. 4 and Nov. 5 for violations that included rodents, roaches and flying insects. A Checkers at 11816 E. Colonial Drive, Orlando, was temporarily closed Oct. 8 with an inspection finding small, flying..."
40905,N33064,"We hereby declare the internet utterly broken, courtesy of Harry Styles and his quirky printed sweater vest."
46408,N48375,"You'll see lots of laptop deals in the coming weeks: doorbusters for notebooks under $500, maybe under $400 or $300 if you're lucky. And while there's no shame in putting on three layers in the cold on Black Friday morning only to elbow someone in Best Buy in the name of a bargain (no really, it can be fun), these aren't typically the models we'd recommend if it didn't happen to be deals season. For our holiday gift guide, we skipped straight to..."
49280,N46344,"Sam's Club has made it easy in recent years to skip the cashier and purchase items via your own smartphone in-store except when it comes to alcohol. But Sam's Club announced this week Tampa Bay shoppers can skip the line, use the retailer's smartphone app and cash themselves out even if their order includes beer, wine or liquor. This is using the same ""Scan & Go"" feature Sam's Club shoppers ..."
