In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import string
import re

In [2]:
import gensim
from gensim import corpora, models, similarities

import nltk
from nltk.corpus import stopwords



In [3]:
import scipy
from scipy.spatial.distance import cdist
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds
from scipy.linalg import svd

In [4]:
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import silhouette_score, silhouette_samples
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
from sklearn.model_selection import train_test_split

# Reading data sets

1. news_articles - Contains the raw articles only, with no user data attached
2. user_interest - Contains the article fields combined with user interaction data (UserId, SessionId, Article Rank, Click, Time Spent), with `Article_id` as the foreign key

## news_articles

In [5]:
# Load the raw news articles table and preview its first rows.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where this exact directory exists.
news_articles_csv = r'/Users/harman/Desktop/EDA_NLP/Recommendation System/data/0_news_articles.csv'
data = pd.read_csv(news_articles_csv)
data.head()

Unnamed: 0,Article_id,Title,Description,Date,Category,URL
0,0,Fire at Vaishno Devi shrine complex; cash coun...,"No one was injured in the fire, which broke ou...","June 8, 2021 7:28:32 pm",India,https://indianexpress.com/article/india/vaishn...
1,1,"Had not gone to meet Nawaz Sharif, says Uddhav...",Uddhav Thackeray led a delegation of his cabin...,"June 8, 2021 6:56:40 pm",India,https://indianexpress.com/article/india/had-no...
2,2,Corruption case: Former Haryana I-T deputy com...,It was in 2016 that the CBI had arrested Nitin...,"June 8, 2021 6:25:24 pm",India,https://indianexpress.com/article/india/corrup...
3,3,Kannur MP K Sudhakaran appointed chief of Cong...,Sudhakaran will replace Ramachandran who had a...,"June 8, 2021 5:04:40 pm",India,https://indianexpress.com/article/india/sudhak...
4,4,"Kerala girl of Class 5 writes to CJI, lauds SC...",Chief Justice N V Ramana responded to the Clas...,"June 8, 2021 4:43:10 pm",India,https://indianexpress.com/article/india/kerala...


## user_interest

In [10]:
# Load the article/user interaction table and report its shape as read from disk.
# NOTE(review): this is an absolute, machine-specific path.
user_interest_csv = r'/Users/harman/Desktop/EDA_NLP/Recommendation System/data/2_user_interest.csv'
user = pd.read_csv(user_interest_csv)
print(user.shape)

# Discard the first column of the file (presumably a saved index column; confirm
# against the CSV) and preview the remaining columns.
user = user.drop(columns=user.columns[0])
user.head()


(2250, 11)


Unnamed: 0,Article_id,Title,Description,Date,URL,UserId,SessionId,Article Rank,Click,Time Spent (seconds)
0,0,Fire at Vaishno Devi shrine complex; cash coun...,"No one was injured in the fire, which broke ou...","June 8, 2021 7:28:32 pm",https://indianexpress.com/article/india/vaishn...,1,1,1,False,0
1,1,"Had not gone to meet Nawaz Sharif, says Uddhav...",Uddhav Thackeray led a delegation of his cabin...,"June 8, 2021 6:56:40 pm",https://indianexpress.com/article/india/had-no...,1,1,2,True,53
2,2,Corruption case: Former Haryana I-T deputy com...,It was in 2016 that the CBI had arrested Nitin...,"June 8, 2021 6:25:24 pm",https://indianexpress.com/article/india/corrup...,1,1,3,False,0
3,3,Kannur MP K Sudhakaran appointed chief of Cong...,Sudhakaran will replace Ramachandran who had a...,"June 8, 2021 5:04:40 pm",https://indianexpress.com/article/india/sudhak...,1,1,4,False,0
4,4,"Kerala girl of Class 5 writes to CJI, lauds SC...",Chief Justice N V Ramana responded to the Clas...,"June 8, 2021 4:43:10 pm",https://indianexpress.com/article/india/kerala...,1,1,5,True,27


## Content Based

In [11]:
# Build word-level TF-IDF features from the article titles; each row of the
# resulting matrix corresponds to one row of `user`.
titles = user['Title']
vectorizer = TfidfVectorizer(analyzer='word')
tfidf_matrix = vectorizer.fit_transform(titles)
tfidf_matrix.shape

(2250, 6723)

In [12]:
# Row labels of the Title column, stored as a Series so a row position can be
# looked up to obtain its label.
title_index = user['Title'].index

# Pairwise title-to-title similarity: the linear kernel (dot product) of the
# TF-IDF matrix with itself, used here as the cosine-similarity matrix.
# NOTE(review): this assignment rebinds the name `cosine_similarity`, hiding the
# sklearn function of the same name imported at the top of the notebook.
cosine_similarity = linear_kernel(tfidf_matrix, tfidf_matrix)

indices = pd.Series(title_index)

In [13]:
# making predictions

def recommendation(index, method, top_n=10):
    """Return the titles of the articles most similar to a given article.

    Parameters
    ----------
    index : int
        Row position of the query article; it is looked up in the module-level
        ``indices`` Series to obtain the row used in ``method``.
    method : 2-D array-like
        Pairwise similarity matrix; ``method[i][j]`` is the similarity between
        articles ``i`` and ``j``.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        Up to ``top_n`` entries of ``user['Title']``, ordered from most to
        least similar. The query article itself is never included.

    Raises
    ------
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    article_id = indices[index]

    # Exclude the query article by its index rather than by assuming it sorts
    # first: with duplicate titles or tied scores another row can take the top
    # spot, which would drop a real match and recommend the query itself.
    candidates = (
        (position, score)
        for position, score in enumerate(method[article_id])
        if position != article_id
    )

    # sorted() is stable, so equally scored articles keep their original order.
    ranked = sorted(candidates, key=lambda pair: pair[1], reverse=True)

    # Positions of the top-N most similar articles.
    news_index = [position for position, _ in ranked[:top_n]]

    # Return the titles of the most similar articles.
    return user['Title'].iloc[news_index]

In [14]:
# Draw a single random row from `user` to serve as an example query.
# NOTE(review): `input` shadows the Python builtin of the same name, and no
# random_state is given, so a different row is drawn on every run.
input = user.sample(n=1)
input

Unnamed: 0,Article_id,Title,Description,Date,URL,UserId,SessionId,Article Rank,Click,Time Spent (seconds)
1740,1740,Monsoon tips: Here’s how to keep your skin hea...,It is essential to tweak your routine accordin...,"May 28, 2021 4:40:14 pm",https://indianexpress.com/article/lifestyle/li...,1730,1725,1,False,0


In [15]:
# Recommend articles for a fixed row position (224).
# NOTE(review): the row sampled into `input` is not what gets passed here.
query_position = 224
recommendation(query_position, cosine_similarity)

172     Editors Guild welcomes SC judgment on sedition...
356     SC restrains AP police from coercive action ag...
455     Reasoned order must for granting protection fr...
337     Dumping of bodies in rivers echos in SC, court...
183     Supreme Court underlines its ruling to protect...
111     J&K journalist booked for WhatsApp status over...
2102    What type of mask should you wear for better p...
151     SC rejects plea against bail to Varun Hiremath...
679     Narada case: CBI goes to SC, tells HC of Benga...
1654    Genetic health conditions every woman should k...
Name: Title, dtype: object