In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import string
import re

In [2]:
import gensim
from gensim import corpora, models, similarities

import nltk
from nltk.corpus import stopwords



In [3]:
from lightfm import LightFM

ModuleNotFoundError: No module named 'lightfm'

In [4]:
import scipy
from scipy.spatial.distance import cdist
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds
from scipy.linalg import svd

In [5]:
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import silhouette_score, silhouette_samples
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
from sklearn.model_selection import train_test_split

# Reading data sets

1. rated_articles - Contains user_interests with ratings
2. news_articles - Contains raw articles without user data amalgamation

## news_articles

In [6]:
data = pd.read_csv(r'/Users/harman/Desktop/EDA_NLP/Recommendation System/data/0_news_articles.csv')
data.head()

Unnamed: 0,Article_id,Title,Description,Date,Category,URL
0,0,Fire at Vaishno Devi shrine complex; cash coun...,"No one was injured in the fire, which broke ou...","June 8, 2021 7:28:32 pm",India,https://indianexpress.com/article/india/vaishn...
1,1,"Had not gone to meet Nawaz Sharif, says Uddhav...",Uddhav Thackeray led a delegation of his cabin...,"June 8, 2021 6:56:40 pm",India,https://indianexpress.com/article/india/had-no...
2,2,Corruption case: Former Haryana I-T deputy com...,It was in 2016 that the CBI had arrested Nitin...,"June 8, 2021 6:25:24 pm",India,https://indianexpress.com/article/india/corrup...
3,3,Kannur MP K Sudhakaran appointed chief of Cong...,Sudhakaran will replace Ramachandran who had a...,"June 8, 2021 5:04:40 pm",India,https://indianexpress.com/article/india/sudhak...
4,4,"Kerala girl of Class 5 writes to CJI, lauds SC...",Chief Justice N V Ramana responded to the Clas...,"June 8, 2021 4:43:10 pm",India,https://indianexpress.com/article/india/kerala...


## Collaborative Filtering

**Need:** Ratings Matrix so I generated user profile with ratings

In [7]:
rating = pd.read_csv(r'data/3_user_rated_articles.csv')
print(rating.shape)
rating.drop(columns= rating.columns[0], 
        axis=1, 
        inplace=True)
rating.head()

(2250, 8)


Unnamed: 0,Article_id,Title,UserId,SessionId,Article Rank,Time Spent (seconds),Ratings
0,0,Fire at Vaishno Devi shrine complex; cash coun...,1,1,1,0,3
1,1,"Had not gone to meet Nawaz Sharif, says Uddhav...",1,1,2,53,1
2,2,Corruption case: Former Haryana I-T deputy com...,1,1,3,0,2
3,3,Kannur MP K Sudhakaran appointed chief of Cong...,1,1,4,0,2
4,4,"Kerala girl of Class 5 writes to CJI, lauds SC...",1,1,5,27,3


In [8]:
rating.tail()

Unnamed: 0,Article_id,Title,UserId,SessionId,Article Rank,Time Spent (seconds),Ratings
2245,2245,"Malaika Arora shares yoga asanas for healthy, ...",2235,2230,6,0,4
2246,2246,COVID-19 diet: Khichdi is a ‘safe’ option; oth...,2236,2231,7,14,3
2247,2247,‘Keep listening. The world wants to hear your ...,2237,2232,8,0,1
2248,2248,"Forget cold drinks, switch to refreshing bael ...",2238,2233,9,0,1
2249,2249,‘Love wins’: Rita Wilson and Tom Hanks celebra...,2239,2234,10,0,5


In [9]:
n_users = int(rating.UserId.nunique())
n_article = int(rating.Article_id.nunique())
print("Number of users: " , n_users)
print("Number of articles: ", n_article)

Number of users:  2239
Number of articles:  2250


In [10]:
user_pivot = rating.pivot_table(index = 'UserId', columns = 'Article_id', values = 'Ratings')
user_pivot.head()

Article_id,0,1,2,3,4,5,6,7,8,9,...,2240,2241,2242,2243,2244,2245,2246,2247,2248,2249
UserId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3.0,1.0,2.0,2.0,3.0,2.0,5.0,5.0,3.0,2.0,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [11]:
user_pivot.shape

(2239, 2250)

In [12]:
user_pivot = user_pivot.fillna(0)
user_pivot_matrix = user_pivot.values
user_pivot_matrix

array([[3., 1., 2., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 5.]])

In [13]:
user_item_pivot_sparse = csr_matrix(user_pivot)

In [14]:
n_factors = 150
U, sigma, Vt = svds(user_pivot_matrix, k = n_factors)

sigma = np.diag(sigma)
sigma.shape

(150, 150)

In [15]:
all_user_ratings = np.dot(np.dot(U, sigma), Vt) 
all_user_ratings_norm = (all_user_ratings - all_user_ratings.min()) / (all_user_ratings.max() - all_user_ratings.min())
all_user_ratings_norm

array([[0.65056069, 0.30112139, 0.47584104, ..., 0.12640174, 0.12640174,
        0.12640174],
       [0.12640174, 0.12640174, 0.12640174, ..., 0.12640174, 0.12640174,
        0.12640174],
       [0.12640174, 0.12640174, 0.12640174, ..., 0.12640174, 0.12640174,
        0.12585864],
       ...,
       [0.12640174, 0.12640174, 0.12640174, ..., 0.12640174, 0.12640174,
        0.12640174],
       [0.12640174, 0.12640174, 0.12640174, ..., 0.12640174, 0.12640174,
        0.12640174],
       [0.12640174, 0.12640174, 0.12640174, ..., 0.12640174, 0.12640174,
        0.12650087]])

In [16]:
cf_preds_df = pd.DataFrame(all_user_ratings_norm, columns = user_pivot.columns).transpose()

In [17]:
class Collaborative:
    
    name = "Collaborative Filter"
    
    def __init__(self, predictions, items = None):
        self.predictions = predictions
        self.items = items
        
    def get_model_name(self):
        return self.name
            
    def recommend_items(self, user_id, items_ignore = [], topn = 10, verbose = False):
        sorted_preds = self.predictions[user_id].sort_values(ascending = False).reset_index()

        recommendations = sorted_preds[~sorted_preds['Article_id'].isin(items_ignore)].head(topn)

        if verbose:
            if self.items is None:
                raise Exception('"items" required in verbose mode')

            recommendations = recommendations.merge(self.items, how = 'left', left_on = 'Article_id', 
                                                    right_on = 'Article_id')[['Article_id', 'Title']]


        return recommendations

In [18]:
model = Collaborative(cf_preds_df, data)

In [19]:
model.recommend_items(user_id = 224, verbose = True)

Unnamed: 0,Article_id,Title
0,51,"BSY to BJP MLAs: Focus on Covid, refrain from ..."
1,222,Serum Institute of India seeks DCGI’s nod to m...
2,319,Row over wedding party in UP village: many say...
3,371,"Govt’s Year 8 begins with dented public trust,..."
4,2015,Why is the second dose of COVID-19 vaccine so ...
5,1052,"Champions League: Havertz sorry for F-bomb, Ev..."
6,742,Post COVID complication among children a new c...
7,203,"Parents, elders biggest counsellors: Goa BJP o..."
8,1297,"Shapovalov, Ruud advance to final at clay-cour..."
9,948,"No pressure on me, time to enjoy the WTC final..."


## Hybrid Recommendor System

**Using:** LightRF

In [20]:
user_pivot.head()

Article_id,0,1,2,3,4,5,6,7,8,9,...,2240,2241,2242,2243,2244,2245,2246,2247,2248,2249
UserId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3.0,1.0,2.0,2.0,3.0,2.0,5.0,5.0,3.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
u_id = list(user_pivot.index)
u_dict = {}
counter = 0

for i in u_id:
    u_dict[i] = counter
    counter += 1

In [22]:
len(u_dict)

2239

In [23]:
# convert to csr matrix
u_interaction_csr = csr_matrix(user_pivot.values)
u_interaction_csr

<2239x2250 sparse matrix of type '<class 'numpy.float64'>'
	with 2250 stored elements in Compressed Sparse Row format>

In [24]:
item_dict ={}
df = rating[['Article_id', 'Title']].sort_values('Article_id').reset_index()

for i in range(df.shape[0]):
    item_dict[(df.loc[i,'Article_id'])] = df.loc[i,'Title']

In [25]:
len(item_dict)

2250

### LightFM not working, so hopes low

In [None]:
model = LightFM(loss = 'warp', random_state = 2016, learning_rate = 0.90, no_components = 150, user_alpha = 0.000005)
model = model.fit(u_interaction_csr, epochs = 100, num_threads = 16, verbose = False)

In [26]:
title = list(data['Title'])
title[:10]

['Fire at Vaishno Devi shrine complex; cash counter damaged',
 'Had not gone to meet Nawaz Sharif, says Uddhav Thackeray as he plays down one-on-one meeting with PM Modi',
 'Corruption case: Former Haryana I-T deputy commissioner gets 4 years in prison',
 'Kannur MP K Sudhakaran appointed chief of Congress in Kerala',
 'Kerala girl of Class 5 writes to CJI, lauds SC for saving lives in fight with Covid',
 'Madhya Pradesh govt gets HC notice on communal clashes during fundraising for Ram temple',
 'Uddhav Thackeray meets PM Modi; discusses Maratha quota issue, GST compensation',
 'New Covid-19 vaccination guidelines out, allocation based on state population',
 'Dantewada: 24-year-old tribal woman killed in ‘maoist encounter’; family claims it was staged, alleges rape',
 'Fire at TMC MLA Madan Mitra’s residence in Kolkata']

In [27]:
total = data.isnull().sum().sort_values(ascending = False)
percent = (data.isnull().sum()/data.isnull().count()).sort_values(ascending = False)
missing_data = pd.concat([total, percent], axis = 1, keys = ['Total', 'Percent'])
missing_data.head()

Unnamed: 0,Total,Percent
Article_id,0,0.0
Title,0,0.0
Description,0,0.0
Date,0,0.0
Category,0,0.0
