### Setup and Load dataset



In [1]:
!pip install --upgrade --no-cache-dir gdown

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gdown
  Downloading gdown-4.6.0-py3-none-any.whl (14 kB)
Installing collected packages: gdown
  Attempting uninstall: gdown
    Found existing installation: gdown 4.4.0
    Uninstalling gdown-4.4.0:
      Successfully uninstalled gdown-4.4.0
Successfully installed gdown-4.6.0


In [2]:
!gdown --id 15hmSSQBP0hPCJHrQRBjyfqd_bPU29Rwr

Downloading...
From: https://drive.google.com/uc?id=15hmSSQBP0hPCJHrQRBjyfqd_bPU29Rwr
To: /content/MIND.zip
100% 44.4M/44.4M [00:01<00:00, 38.0MB/s]


In [3]:
!unzip -o "MIND.zip"  -d  "/content"

Archive:  MIND.zip
  inflating: /content/MIND/behaviors.tsv  
  inflating: /content/MIND/news.tsv  


## Data Loading

In [4]:
from datetime import datetime
import os
import matplotlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')

from scipy import sparse
from scipy.sparse import csc_matrix
from sklearn.decomposition import TruncatedSVD

np.random.seed(0)

#### Name of the file which contain all the item properties

In [5]:
file="MIND/news.tsv"

###### Run below cell

In [6]:
content_df  = pd.read_table( 'MIND/news.tsv',
            names=['newid', 'vertical', 'subvertical', 'title',
                            'abstract', 'url', 'entities in title', 'entities in abstract'],
                     usecols = ['newid','vertical', 'subvertical', 'title', 'abstract'])

In [7]:
content_df=content_df[:10000]

## Details about dataset

In [8]:
itemid="newid"

In [9]:
features=['vertical',	'subvertical','title']

# Content based filtering

In [None]:
!pip install sentence_transformers 

In [11]:
content_df['NewTag']=""
for i in features:
  content_df['NewTag']+=(" "+content_df[i])
content_df['NewTag']=content_df['NewTag'].astype(str)

In [24]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [12]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re

In [18]:
def clean_text(text):
    
    text = text.lower()  # lowercase text
    # replace the matched string with ' '
    text = re.sub( re.compile("\'s"), ' ', text)
    text = re.sub(re.compile("\\r\\n"), ' ', text)
    text = re.sub(re.compile(r"[^\w\s]"), ' ', text)
    return text

In [15]:
stopwords=set(stopwords.words('english'))

In [16]:
def tokenizer(sentence, min_words=4, max_words=200, stopwords=stopwords, lemmatize=True):
    
    if lemmatize:
        stemmer = WordNetLemmatizer()
        tokens = [stemmer.lemmatize(w) for w in word_tokenize(sentence)]
    else:
        tokens = [w for w in word_tokenize(sentence)]
    token = [w for w in tokens if (len(w) > min_words and len(w) < max_words
                                                        and w not in stopwords)]
    return tokens    

In [25]:
content_df['clean'] = content_df['NewTag'].apply(clean_text)
content_df['token_lem_sentence'] = content_df['clean'].apply(
        lambda x: tokenizer(x))

# Transformer

In [26]:
from sentence_transformers import SentenceTransformer, util
import torch


In [28]:
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
corpus_embeddings = model.encode(content_df.clean.values, convert_to_tensor=True)


## Recommendation

In [29]:
content_df[content_df.newid=='N55528']

Unnamed: 0,newid,vertical,subvertical,title,abstract,NewTag,clean,token_lem_sentence
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",lifestyle lifestyleroyals The Brands Queen El...,lifestyle lifestyleroyals the brands queen el...,"[lifestyle, lifestyleroyals, the, brand, queen..."


In [32]:
query=content_df[content_df.newid=='N55528'].clean[0]

In [35]:
query_embedding = model.encode(query, convert_to_tensor=True)

# We use cosine-similarity and torch.topk to find the highest 3 scores
cos_scores = util.pytorch_cos_sim(query_embedding, corpus_embeddings)[0]
top_results = torch.topk(cos_scores, k=10)

In [37]:
for score, idx in zip(top_results[0], top_results[1]):
    score = score.cpu().data.numpy() 
    idx = idx.cpu().data.numpy()
    display(content_df[['newid', 'vertical', 'subvertical','title']].iloc[idx])

newid                                                     N55528
vertical                                               lifestyle
subvertical                                      lifestyleroyals
title          The Brands Queen Elizabeth, Prince Charles, an...
Name: 0, dtype: object

newid                                                      N7716
vertical                                               lifestyle
subvertical                                      lifestyleroyals
title          What Do Prince George & Princess Charlotte Kno...
Name: 397, dtype: object

newid                                        N42777
vertical                                  lifestyle
subvertical                         lifestyleroyals
title          Prince George's Royal Life in Photos
Name: 46, dtype: object

newid                                                     N28614
vertical                                               lifestyle
subvertical                                      lifestyleroyals
title          A Look Back at Prince Charles and Camilla's Re...
Name: 1667, dtype: object

newid                                                     N23937
vertical                                               lifestyle
subvertical                                      lifestyleroyals
title          6 gorgeous royal family heirlooms that Kate Mi...
Name: 208, dtype: object

newid                                                     N17326
vertical                                               lifestyle
subvertical                                      lifestyleroyals
title          Here's what Kate Middleton and Prince William'...
Name: 3452, dtype: object

newid                                                     N18957
vertical                                               lifestyle
subvertical                                      lifestyleroyals
title          People Can't Stop Debating Who Princess Charlo...
Name: 4414, dtype: object

newid                                                     N61811
vertical                                               lifestyle
subvertical                                      lifestyleroyals
title          Archie's Photo Album: Prince Harry, Duchess Me...
Name: 2503, dtype: object

newid                                              N47552
vertical                                        lifestyle
subvertical                               lifestyleroyals
title          20 of Princess Diana's most daring outfits
Name: 7219, dtype: object

newid                                                      N5672
vertical                                               lifestyle
subvertical                                      lifestyleroyals
title          Why Prince William Is Making Headlines for His...
Name: 6980, dtype: object