### Setup and Load dataset



In [None]:
# Install or upgrade gdown, the CLI used in the next cell to fetch the dataset archive from Google Drive.
!pip install --upgrade --no-cache-dir gdown

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!gdown --id 15hmSSQBP0hPCJHrQRBjyfqd_bPU29Rwr

Downloading...
From: https://drive.google.com/uc?id=15hmSSQBP0hPCJHrQRBjyfqd_bPU29Rwr
To: /content/MIND.zip
100% 44.4M/44.4M [00:00<00:00, 50.7MB/s]


In [None]:
# Extract the archive into /content; -o overwrites existing files without prompting.
!unzip -o "MIND.zip"  -d  "/content"

Archive:  MIND.zip
  inflating: /content/MIND/behaviors.tsv  
  inflating: /content/MIND/news.tsv  


## Data Loading

In [None]:
# NOTE(review): datetime, os, matplotlib, sparse, csc_matrix and TruncatedSVD are
# not referenced in the cells visible here — confirm before pruning.
from datetime import datetime
import os
import matplotlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Use seaborn's white grid theme for any plots drawn later.
sns.set_style('whitegrid')

from scipy import sparse
from scipy.sparse import csc_matrix
from sklearn.decomposition import TruncatedSVD

# Seed NumPy's global random generator so NumPy-based randomness is repeatable.
np.random.seed(0)

#### Name of the file that contains all the item properties

In [None]:
# Relative path to the news metadata TSV extracted from MIND.zip.
file="MIND/news.tsv"

###### Run the cell below

In [None]:
# Load the news metadata. The TSV has no header row, so all eight column names
# are supplied explicitly and only the five columns used later are kept.
# The path comes from `file` (set in the cell above) instead of being hard-coded
# a second time, so changing the dataset location only needs one edit.
content_df = pd.read_table(
    file,
    names=['newid', 'vertical', 'subvertical', 'title',
           'abstract', 'url', 'entities in title', 'entities in abstract'],
    usecols=['newid', 'vertical', 'subvertical', 'title', 'abstract'],
)

In [None]:
# No-op self-assignment: the frame is left unchanged.
# NOTE(review): presumably a placeholder for sub-sampling or filtering — confirm or remove.
content_df=content_df

## Details about the dataset

In [None]:
# Name of the column that identifies each item; used later to look up and filter recommended rows.
itemid="newid"

In [None]:
# Text columns that are concatenated into the 'NewTag' field used for embedding.
features=['abstract']

# Setup

In [None]:
# Install sentence-transformers, which provides SentenceTransformer and util used in the Transformer section.
!pip install sentence_transformers 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Build one text field per item by concatenating the selected feature columns,
# each preceded by a single space.
content_df['NewTag'] = ""
for feature in features:
    # A missing value would turn the whole concatenation into NaN, which the
    # final astype(str) renders as the literal text "nan". Treat missing
    # values as empty strings so they contribute no text.
    content_df['NewTag'] += " " + content_df[feature].fillna("").astype(str)
content_df['NewTag'] = content_df['NewTag'].astype(str)

In [None]:
import nltk
# Fetch the NLTK resources used by the tokenizer, stop-word list and lemmatizer.
nltk.download('stopwords')
nltk.download('punkt')
# Newer NLTK releases load word_tokenize's data from 'punkt_tab' rather than
# 'punkt'; downloading both keeps the notebook working across versions.
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [None]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re

In [None]:
def clean_text(text):
    """Normalize a raw string for downstream tokenization and embedding.

    The text is lowercased, then each of the following is replaced by a
    single space, in this order: the possessive suffix ``'s``, a CR+LF line
    break, and any character that is neither a word character nor whitespace.
    Runs of spaces created by the replacements are not collapsed.

    Parameters
    ----------
    text : str
        The raw text to clean.

    Returns
    -------
    str
        The cleaned, lowercased text.
    """
    # Order matters: the possessive must go before the apostrophe is removed
    # by the generic punctuation pattern.
    substitutions = ("\'s", "\\r\\n", r"[^\w\s]")
    cleaned = text.lower()
    for pattern in substitutions:
        cleaned = re.compile(pattern).sub(' ', cleaned)
    return cleaned

In [None]:
# English stop words as a set for fast membership tests.
# The list is read through `nltk.corpus.stopwords` rather than the imported name
# `stopwords`, because this line rebinds `stopwords` to the set; going through
# the module keeps the cell safe to re-run.
stopwords = set(nltk.corpus.stopwords.words('english'))

In [None]:
def tokenizer(sentence, min_words=4, max_words=200, stopwords=stopwords, lemmatize=True):
    """Split a sentence into tokens, optionally lemmatize, and filter them.

    Parameters
    ----------
    sentence : str
        Text to tokenize with ``word_tokenize``.
    min_words : int
        Despite the name, this is a character-length bound: a token is kept
        only if its length is strictly greater than this value.
    max_words : int
        Character-length bound: a token is kept only if its length is
        strictly less than this value.
    stopwords : collection of str
        Tokens found in this collection are dropped.
    lemmatize : bool
        When true, each token is passed through ``WordNetLemmatizer`` before
        filtering.

    Returns
    -------
    list of str
        The tokens that pass the length and stop-word filters.
    """
    raw_tokens = word_tokenize(sentence)
    if lemmatize:
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(w) for w in raw_tokens]
    else:
        tokens = list(raw_tokens)
    # Return the filtered list; previously the filter result was computed and
    # then discarded, so stop words and short tokens were never removed.
    return [w for w in tokens
            if min_words < len(w) < max_words and w not in stopwords]

In [None]:
# Normalize the combined text, then tokenize it with tokenizer's default settings.
cleaned_text = content_df['NewTag'].apply(clean_text)
content_df['clean'] = cleaned_text
content_df['token_lem_sentence'] = content_df['clean'].apply(tokenizer)

# Transformer

In [None]:
from sentence_transformers import SentenceTransformer, util
import torch


In [None]:
# Load the pretrained sentence encoder and embed every cleaned item text as a tensor.
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
corpus_sentences = content_df['clean'].values
corpus_embeddings = model.encode(corpus_sentences, convert_to_tensor=True)


# Search

In [None]:
# Free-text query that is embedded and matched against the corpus embeddings.
query="Shop the notebooks, jackets"

In [None]:
query_embedding = model.encode(query, convert_to_tensor=True)

# Cosine similarity between the query and every corpus embedding; [0] selects
# the first row of the similarity matrix, i.e. the scores for the one query encoded above.
cos_scores = util.pytorch_cos_sim(query_embedding, corpus_embeddings)[0]
# Keep the 10 highest-scoring items, capped at the corpus size so torch.topk
# does not raise when fewer than 10 items are available.
top_results = torch.topk(cos_scores, k=min(10, len(cos_scores)))

In [None]:
# top_results[1] holds the row positions of the best matches, best score first.
# Select the id column by name and index it by position, instead of taking
# `[0]` from a label-indexed row, which relies on a deprecated positional fallback.
top_positions = top_results[1].tolist()
recommedations_list = content_df[itemid].iloc[top_positions].tolist()

In [None]:
# Show the recommended rows. A boolean mask keeps the frame's own row order,
# so the rows are not sorted by similarity score.
is_recommended = content_df[itemid].isin(recommedations_list)
content_df[is_recommended]

Unnamed: 0,newid,vertical,subvertical,title,abstract,NewTag,clean,token_lem_sentence
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, and Prince Philip Swear By","Shop the notebooks, jackets, and more that the royals can't live without.","Shop the notebooks, jackets, and more that the royals can't live without.",shop the notebooks jackets and more that the royals can t live without,"[shop, the, notebook, jacket, and, more, that, the, royal, can, t, live, without]"
535,N44067,lifestyle,shop-holidays,Shop 40 Creative Gifts for Mom on Amazon,"Shop these self-care friendly finds, plus info on how to hide your tracks after you browse.","Shop these self-care friendly finds, plus info on how to hide your tracks after you browse.",shop these self care friendly finds plus info on how to hide your tracks after you browse,"[shop, these, self, care, friendly, find, plus, info, on, how, to, hide, your, track, after, you, browse]"
3236,N65016,lifestyle,lifestylehoroscope,What's In Each Zodiac Sign's Bag?,What are the belongings that can be found in your sign's bag?,What are the belongings that can be found in your sign's bag?,what are the belongings that can be found in your sign bag,"[what, are, the, belonging, that, can, be, found, in, your, sign, bag]"
4873,N5954,sports,football_nfl,"Worthington, Minn., schools a test of immigration policy","Worthington, Minn. As students grab their bags and stream out of the science classroom at Worthington High School, Ellen Baker-Merrigan packs up, too. She gathers notes, student papers, pens and markers, and loads them onto a two-shelf cart. Then she trundles down the hall to her desk in the old storage room she shares with two other teachers. Boxes of books line the wall under a metal ...","Worthington, Minn. As students grab their bags and stream out of the science classroom at Worthington High School, Ellen Baker-Merrigan packs up, too. She gathers notes, student papers, pens and markers, and loads them onto a two-shelf cart. Then she trundles down the hall to her desk in the old storage room she shares with two other teachers. Boxes of books line the wall under a metal ...",worthington minn as students grab their bags and stream out of the science classroom at worthington high school ellen baker merrigan packs up too she gathers notes student papers pens and markers and loads them onto a two shelf cart then she trundles down the hall to her desk in the old storage room she shares with two other teachers boxes of books line the wall under a metal,"[worthington, minn, a, student, grab, their, bag, and, stream, out, of, the, science, classroom, at, worthington, high, school, ellen, baker, merrigan, pack, up, too, she, gather, note, student, paper, pen, and, marker, and, load, them, onto, a, two, shelf, cart, then, she, trundle, down, the, hall, to, her, desk, in, the, old, storage, room, she, share, with, two, other, teacher, box, of, book, line, the, wall, under, a, metal]"
5274,N54857,autos,autosenthusiasts,Exclusive Access: Hut Garage of Thailand,Let's make one thing abundantly clear: This is one man's personal collection and not a tuning shop.,Let's make one thing abundantly clear: This is one man's personal collection and not a tuning shop.,let make one thing abundantly clear this is one man personal collection and not a tuning shop,"[let, make, one, thing, abundantly, clear, this, is, one, man, personal, collection, and, not, a, tuning, shop]"
5319,N34234,lifestyle,lifestylefashion,"6 things you should never buy secondhand, according to a professional stylist","From fake designer pieces to clothes that need major alterations, leave these things on the rack during your next thrift store expedition.","From fake designer pieces to clothes that need major alterations, leave these things on the rack during your next thrift store expedition.",from fake designer pieces to clothes that need major alterations leave these things on the rack during your next thrift store expedition,"[from, fake, designer, piece, to, clothes, that, need, major, alteration, leave, these, thing, on, the, rack, during, your, next, thrift, store, expedition]"
5671,N1578,lifestyle,shop-holidays,You Can Actually Shop Walmart's Pre-Black Friday Sales Right Now,"Right now you can save on clothing, electronics, and more.","Right now you can save on clothing, electronics, and more.",right now you can save on clothing electronics and more,"[right, now, you, can, save, on, clothing, electronics, and, more]"
7594,N44785,foodanddrink,newstrends,Everything You Need For Holiday Baking Is In This Line And It's All Less Than $30 A Piece,Shop Delish Essentials today!,Shop Delish Essentials today!,shop delish essentials today,"[shop, delish, essential, today]"
19685,N24061,lifestyle,lifestylehomeandgarden,25 Beautiful Bookcases You'll Want in Your House,Add storage and style with one of these amazing bookcases for your home.,Add storage and style with one of these amazing bookcases for your home.,add storage and style with one of these amazing bookcases for your home,"[add, storage, and, style, with, one, of, these, amazing, bookcase, for, your, home]"
46764,N43016,finance,finance-savemoney,Online Shopping Secrets Retailers Don't Want You To Know,Use these hacks to save every time you shop online.,Use these hacks to save every time you shop online.,use these hacks to save every time you shop online,"[use, these, hack, to, save, every, time, you, shop, online]"


In [None]:
# Lift the column-width limit so long text fields are displayed in full.
pd.set_option('display.max_colwidth', None)
# Look up a single item by its id.
lookup_ids = ['N42317']
content_df.loc[content_df[itemid].isin(lookup_ids)]

Unnamed: 0,newid,vertical,subvertical,title,abstract,NewTag,clean,token_lem_sentence
993,N42317,foodanddrink,foodnews,All the Snacks Bon Appétit Staffers Stash at Their Desks,Will work for snacks.,Will work for snacks.,will work for snacks,"[will, work, for, snack]"
