### Setup and Load dataset



In [None]:
!pip install --upgrade --no-cache-dir gdown

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gdown
  Downloading gdown-4.6.0-py3-none-any.whl (14 kB)
Installing collected packages: gdown
  Attempting uninstall: gdown
    Found existing installation: gdown 4.4.0
    Uninstalling gdown-4.4.0:
      Successfully uninstalled gdown-4.4.0
Successfully installed gdown-4.6.0


In [None]:
!gdown --id 15hmSSQBP0hPCJHrQRBjyfqd_bPU29Rwr

Downloading...
From: https://drive.google.com/uc?id=15hmSSQBP0hPCJHrQRBjyfqd_bPU29Rwr
To: /content/MIND.zip
100% 44.4M/44.4M [00:01<00:00, 23.4MB/s]


In [None]:
!unzip -o "MIND.zip"  -d  "/content"

Archive:  MIND.zip
  inflating: /content/MIND/behaviors.tsv  
  inflating: /content/MIND/news.tsv  


## Data Loading

In [None]:
from datetime import datetime
import os
import matplotlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')

from scipy import sparse
from scipy.sparse import csc_matrix
from sklearn.decomposition import TruncatedSVD

np.random.seed(0)

#### Name of the file which contain all the item properties

In [None]:
file="MIND/news.tsv"

###### Run below cell

In [None]:
content_df  = pd.read_table( 'MIND/news.tsv',
            names=['newid', 'vertical', 'subvertical', 'title',
                            'abstract', 'url', 'entities in title', 'entities in abstract'],
                     usecols = ['newid','vertical', 'subvertical', 'title', 'abstract'])

In [None]:
content_df

Unnamed: 0,newid,vertical,subvertical,title,abstract
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the..."
1,N19639,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...
2,N61837,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...
3,N53526,health,voices,I Was An NBA Wife. Here's How It Affected My M...,"I felt like I was a fraud, and being an NBA wi..."
4,N38324,health,medical,"How to Get Rid of Skin Tags, According to a De...","They seem harmless, but there's a very good re..."
...,...,...,...,...,...
51277,N16909,weather,weathertopstories,"Adapting, Learning And Soul Searching: Reflect...",Woolsey Fire Anniversary: A community is forev...
51278,N47585,lifestyle,lifestylefamily,Family says 13-year-old Broadway star died fro...,
51279,N7482,sports,more_sports,St. Dominic soccer player tries to kick cancer...,"Sometimes, what happens on the sidelines can b..."
51280,N34418,sports,soccer_epl,How the Sounders won MLS Cup,"Mark, Jeremiah and Casey were so excited they ..."


## Details about dataset

In [None]:
itemid="newid"

In [None]:
features=['vertical',	'subvertical','title']

# Content based filtering

In [None]:
content_df['NewTag']=""
for i in features:
  content_df['NewTag']+=(" "+content_df[i])
content_df['NewTag']=content_df['NewTag'].astype(str)

## Word2Vec

In [None]:
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.tokenize import RegexpTokenizer
import re
import string
from io import BytesIO
import matplotlib.pyplot as plt
from gensim.models import Word2Vec, KeyedVectors
from gensim.models.phrases import Phrases, Phraser
from matplotlib import pyplot
# Function for removing ASCII characters
def _removeNonAscii(s):
    return "".join(i for i in s if  ord(i)<128)

# Function for converting to lower case
def make_lower_case(text):
    return text.lower()

# Function for removing stop words
def remove_stop_words(text):
    text = text.split()
    stops = set(stopwords.words("english"))
    text = [w for w in text if not w in stops]
    text = " ".join(text)
    return text

# Function for removing html
def remove_html(text):
    html_pattern = re.compile('<.*?>')
    return html_pattern.sub(r'', text)

# Function for removing punctuation
def remove_punctuation(text):
    tokenizer = RegexpTokenizer(r'\w+')
    text = tokenizer.tokenize(text)
    text = " ".join(text)
    return text

content_df['Cleaned'] = content_df['NewTag'].apply(_removeNonAscii)
content_df['Cleaned'] = content_df.Cleaned.apply(func = make_lower_case)
content_df['Cleaned'] = content_df.Cleaned.apply(func = remove_stop_words)
content_df['Cleaned'] = content_df.Cleaned.apply(func = remove_punctuation)
content_df['Cleaned'] = content_df.Cleaned.apply(func = remove_html)

In [None]:
corpus = []
for words in content_df['Cleaned']:
    corpus.append(words.split())

In [None]:
# Downloading the Google's pretrained Word2Vec Model
!wget --save-cookies cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=0B7XkCwpI5KDYNlNUTTlSS21pQmM' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/Code: \1\n/p'
!wget --load-cookies cookies.txt 'https://docs.google.com/uc?export=download&confirm=YOURCODEID&id=0B7XkCwpI5KDYNlNUTTlSS21pQmM' -O GoogleNews-vectors-negative300.bin.gz



--2023-02-08 10:04:03--  https://docs.google.com/uc?export=download&id=0B7XkCwpI5KDYNlNUTTlSS21pQmM
Resolving docs.google.com (docs.google.com)... 74.125.196.101, 74.125.196.100, 74.125.196.102, ...
Connecting to docs.google.com (docs.google.com)|74.125.196.101|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/html]
Saving to: ‘STDOUT’

-                       [<=>                 ]       0  --.-KB/s               -                       [ <=>                ]   2.24K  --.-KB/s    in 0s      

2023-02-08 10:04:03 (57.8 MB/s) - written to stdout [2289]

Code: t
--2023-02-08 10:04:03--  https://docs.google.com/uc?export=download&confirm=YOURCODEID&id=0B7XkCwpI5KDYNlNUTTlSS21pQmM
Resolving docs.google.com (docs.google.com)... 74.125.196.101, 74.125.196.100, 74.125.196.102, ...
Connecting to docs.google.com (docs.google.com)|74.125.196.101|:443... connected.
HTTP request sent, awaiting response... 303 See Other
Location: https://doc-0g-8s-docs.go

In [None]:
EMBEDDING_FILE = 'GoogleNews-vectors-negative300.bin.gz'
google_word2vec = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary = True)

# Training our corpus with the model
google_model = Word2Vec(size = 300, window = 5, min_count = 2, workers = -1)
google_model.build_vocab(corpus)

google_model.intersect_word2vec_format(EMBEDDING_FILE, lockf = 1.0, binary = True)
google_model.train(corpus, total_examples = google_model.corpus_count, epochs = 5)



(0, 0)

In [None]:
word_embeddings = []
# Reading the each 'Description'
for line in content_df['Cleaned']:
    avgword2vec = None
    count = 0
    for word in line.split():
        if word in google_model.wv.vocab:
            count += 1
            if avgword2vec is None:
                avgword2vec = google_model[word]
            else:
                avgword2vec = avgword2vec + google_model[word]
            
    if avgword2vec is not None:
        avgword2vec = avgword2vec / count
        word_embeddings.append(avgword2vec)

  avgword2vec = google_model[word]
  avgword2vec = avgword2vec + google_model[word]


In [None]:
 # Finding cosine similarity for the vectors
cosine_similarities = cosine_similarity(word_embeddings, word_embeddings)

In [None]:
# Recommending the Top 5 similar movies
def recommendations(itemkey,k=10):
    
    # Calling the function vectors
  
    # Taking the Title and Movie Image Link and store in new dataframe called 'movies'
    items = content_df[[itemid]]

    # Reverse mapping of the index
    indices = pd.Series(content_df.index, index = content_df[itemid]).drop_duplicates()
     
    idx = indices[itemkey]
 
    sim_scores = list(enumerate(cosine_similarities[idx]))
   
    sim_scores = sorted(sim_scores, key = lambda x: x[1], reverse = True)
    sim_scores = sim_scores[1:k]
    item_indices = [i[0] for i in sim_scores]
    recommend = items.iloc[item_indices]
    return recommend

## Recommendation

In [None]:
recomm=list(recommendations('N55528',10)['newid'])

In [None]:
content_df[content_df.newid=='N55528']

Unnamed: 0,newid,vertical,subvertical,title,abstract,NewTag,Cleaned
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",lifestyle lifestyleroyals The Brands Queen El...,lifestyle lifestyleroyals brands queen elizabe...


In [None]:
content_df[content_df.newid.isin(recomm)]

Unnamed: 0,newid,vertical,subvertical,title,abstract,NewTag,Cleaned
397,N7716,lifestyle,lifestyleroyals,What Do Prince George & Princess Charlotte Kno...,Do Prince William and Kate Middleton's kids kn...,lifestyle lifestyleroyals What Do Prince Geor...,lifestyle lifestyleroyals prince george prince...
4414,N18957,lifestyle,lifestyleroyals,People Can't Stop Debating Who Princess Charlo...,The 4-year-old is a beautiful little girl who ...,lifestyle lifestyleroyals People Can't Stop D...,lifestyle lifestyleroyals people can t stop de...
14081,N3046,lifestyle,lifestyleroyals,See all the best photos of Prince William and ...,"In October 2019, the Duke and Duchess of Cambr...",lifestyle lifestyleroyals See all the best ph...,lifestyle lifestyleroyals see best photos prin...
22924,N12732,lifestyle,lifestyleroyals,"Meghan Markle, Prince Harry, Kate Middleton an...","Meghan Markle, Prince Harry, Kate, William Reu...","lifestyle lifestyleroyals Meghan Markle, Prin...",lifestyle lifestyleroyals meghan markle prince...
28360,N9056,lifestyle,lifestyleroyals,This Is What Queen Elizabeth Is Doing About th...,"According to royal insiders, Queen Elizabeth h...",lifestyle lifestyleroyals This Is What Queen ...,lifestyle lifestyleroyals queen elizabeth prin...
29974,N60671,lifestyle,lifestyleroyals,Prince Charles Teared Up When Prince William T...,"Frankly, it reduced me to tears, he said.",lifestyle lifestyleroyals Prince Charles Tear...,lifestyle lifestyleroyals prince charles teare...
31743,N38133,lifestyle,lifestyleroyals,The cutest photos of royal children and their ...,See all the cute photos of royal children with...,lifestyle lifestyleroyals The cutest photos o...,lifestyle lifestyleroyals cutest photos royal ...
43853,N56051,lifestyle,lifestyleroyals,Meghan Markle and Prince Harry Reunite with Ka...,"Meghan Markle, Harry Reunite Kate and William ...",lifestyle lifestyleroyals Meghan Markle and P...,lifestyle lifestyleroyals meghan markle prince...
43870,N42457,lifestyle,lifestyleroyals,Meghan Markle and Prince Harry Won't Spend Chr...,They'll hang out with baby Archie and Meghan's...,lifestyle lifestyleroyals Meghan Markle and P...,lifestyle lifestyleroyals meghan markle prince...
