In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
## Details 

"""

Function 1:

      Call
      recommend(diversity,article_id)
      diversity = 1 after pop_up if the user selects to diversify
      diversity =0 otherwise

      article_id is the index of article, for which the user clicked "show related"

      returns 
      1. indices of news articles to be recommended
      2. ILD for the recommended list

      Example call: recommended_list, ild = recommend(0,313)
      **Note: Example run is also shown in last section 

"""

'\n\nFunction 1:\n\n      Call\n      recommend(diversity,article_id)\n      diversity = 1 after pop_up if the user selects to diversify\n      diversity =0 otherwise\n\n      article_id is the index of article, for which the user clicked "show related"\n\n      returns \n      1. indices of news articles to be recommended\n      2. ILD for the recommended list\n\n      Example call: recommended_list, ild = recommend(0,313)\n      **Note: Example run is also shown in last section \n\n\n'

# 1. Importing libraries

In [2]:
import numpy as np
import pandas as pd
import string

import os
import math
import time
import random

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.figure_factory as ff
import plotly.graph_objects as go
import plotly.express as px

#for storing the model
import pickle

# Below libraries are for text processing using NLTK
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Below libraries are for feature representation using sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

# Below libraries are for similarity matrices using sklearn
from sklearn.metrics.pairwise import cosine_similarity  
from sklearn.metrics import pairwise_distances

from sklearn.metrics.pairwise import cosine_similarity

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
nltk.download('punkt')
nltk.download("wordnet")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

## Definig functions

In [3]:
# fuction for text preprocessing
def clean_text(text):
      le=WordNetLemmatizer()
      word_tokens=word_tokenize(text)
      tokens=[le.lemmatize(w) for w in word_tokens if w not in stop_words and len(w)>3]
      tokens_cleaned = [token.translate(str.maketrans('', '', string.punctuation)) for token in tokens]
      cleaned_text=" ".join(tokens_cleaned)
      return cleaned_text


In [4]:
# news_articles_with_features = pd.read_pickle('/content/drive/MyDrive/IR Project - Group 49/Pickle Files/dataset_features.pkl')

In [5]:
# Function for recommendation

def tfidf_based_model(row_index, num_similar_items,cs_given):
    couple_dist = cs_given[row_index]
    indices = np.argsort(couple_dist.ravel())[0:num_similar_items]
    return indices

In [6]:
# Function to calculate cosine similarity of one vs all 
def cosine_similarity_ova (one, all,cs):
  sum = 0
  n = len(all)
  cs_list = cs[one][all]
  for item in cs_list:
    sum += item
  c = sum/n
  return c

In [7]:
# recommended list is the indices of top 50 of similarity based result of an item.

def diversify(recommended_list, thresh,cs):

  # initializing the diversified list
  diversified_list = []
  diversified_list.append(recommended_list[0])

  # initialising candidate_list 
  candidate_list = recommended_list

  # loop from 2 to n in candidate_list
  for i in range(1,len(candidate_list)):
    candidate_item = candidate_list[i]

    # set of items in candidate_list that didnot occur in diversifeid list so far(i.e in diversified_list[:i])
    candidate_temp = [article for article in candidate_list if article not in diversified_list[:i]]

    diversity_values = []
    #compute diversity metric of each ele in candidate temp wrt all items in diversified list
    for ele in candidate_temp:
      t = (1-cosine_similarity_ova(ele,diversified_list,cs),ele)
      diversity_values.append(t)

    #sort the diversity score in reverse
    diversity_values.sort(reverse=True)

    list1={}
    for i in range(len(diversity_values)):
      list1[diversity_values[i][1]] = i

    # calculate weights 
    weights = []
    for i in range(len(candidate_temp)):
      pos_og = recommended_list.index(candidate_temp[i])
      pos_div = list1[candidate_temp[i]]
      # pos_div = diversity_values.index(candidate_list[i])
      w = ( (pos_og*(1-thresh)) + (pos_div)*thresh, candidate_temp[i] )
      weights.append(w)

    weights.sort()

    if (len(weights) != 0):
      diversified_list.append(weights[0][1])

  return diversified_list

In [8]:
 # Function to calculate ILD

def ILD(li,cs):
  sum = 0
  n = len(li)

  for i in range(len(li)):
    for j in range(i):
      if(i!=j):
        sum += cs[li[i]][li[j]]

  ILD = sum/(n*(n-1))

  return (1-ILD)

In [9]:
def recommend(is_diverse, article_id):

  news_articles_with_features = pd.read_pickle('/content/drive/MyDrive/IR Project - Group 49/Pickle Files/dataset_features.pkl')

  col_extracted = ['TextBlob_Subjectivity', 'TextBlob_Polarity','TextBlob_Analysis','topic'] 
  col_given = news_articles_with_features.columns[:520].tolist()
  col_given.append('category')
  cs_given = cosine_similarity(news_articles_with_features[col_given])
  cs = cosine_similarity(news_articles_with_features[col_extracted])

  recomm_list, ild = helper(is_diverse, article_id, cs, cs_given)
  return recomm_list, ild

In [10]:
# Defining function for final recommendations

def helper(is_diverse, article_id,cs,cs_given):

  if(is_diverse not in [0,1]):
    raise Exception("Wrong value of is_diverse! Enter 0 or 1 as is_dierse")

  if(article_id <0 or article_id>815):
    raise Exception("Wrong value of article_id! Enter value between 0 to815")

  top_200 =  list(tfidf_based_model(article_id, 200,cs_given))

  if(is_diverse == 0):
    
    recomm_list = top_200[:10]
    ild = ILD(recomm_list,cs)
    return recomm_list, ild

  elif(is_diverse ==1):
    recomm_list = diversify(top_200,0.9,cs)

    # check
    if (len(recomm_list) !=10):
      for i in range(10-len(recomm_list)):
        recomm_list.sppend(top_200[-i])
    ild = ILD(recomm_list[:10],cs)
    return recomm_list, ild

## Example run

In [24]:
li, ild = recommend(0,112)
li,ild

([494, 254, 256, 257, 258, 259, 127, 126, 125, 124], 0.5261398453626552)

In [25]:
li2, ild2 = recommend(1,112)
ild2

0.7382830452933069

In [26]:
articles.iloc[li[:10]]['category']

494    0
254    0
256    0
257    0
258    0
259    0
127    0
126    0
125    0
124    0
Name: category, dtype: int64

In [27]:
articles.iloc[li2[:10]]['category']

494    0
197    0
378    0
321    0
126    0
416    1
322    0
121    0
23     1
67     1
Name: category, dtype: int64

In [28]:
articles.iloc[li[:10]]['topic']

494    3
254    7
256    5
257    3
258    3
259    5
127    3
126    3
125    5
124    3
Name: topic, dtype: int64

In [29]:
articles.iloc[li2[:10]]['topic']

494    3
197    0
378    5
321    1
126    3
416    0
322    5
121    3
23     0
67     8
Name: topic, dtype: int64

In [13]:
# get data 
articles = pd.read_pickle('/content/drive/MyDrive/IR Project - Group 49/Final_project_deliverables/dataset_preprocessed.pkl')

In [14]:
articles.head()

Unnamed: 0,index,short_description,link,headlines,category,topic,article_text,TextBlob_Subjectivity,TextBlob_Polarity,TextBlob_Analysis,cleaned_text,clean_headlines
0,0,Uorfi Javed criticised Sonali Kulkarni for her...,https://www.freepressjournal.in/amp/entertainm...,How insensitive: Uorfi on Sonali's 'many girls...,2,3,Bigg Boss OTT fame Urfi Javed lashed out at po...,0.569923,0.122619,2,Uorfi Javed criticised Sonali Kulkarni recent ...,insensitive Uorfi Sonali many girl lazy remark
1,1,Shakti Arora spoke about quitting the show 'Ku...,https://www.hindustantimes.com/entertainment/t...,Didn't want to play father to 28-yr-old: Shakt...,2,5,Actor Shakti Arora has confirmed that his jour...,0.455716,0.143154,2,Shakti Arora spoke quitting show Kundali Bhagy...,want play father 28yrold Shakti quitting show
2,2,"'Naatu Naatu' singer Kaala Bhairava, who misse...",https://twitter.com/kaalabhairava7/status/1636...,Singer Kaala Bhairava apologises for not thank...,2,2,JavaScript is not available.\n\nWe’ve detected...,0.225,-0.15,0,Naatu Naatu singer Kaala Bhairava missed ackno...,Singer Kaala Bhairava apologises thanking Char...
3,3,The makers of Ishaan Khatter and Mrunal Thakur...,https://indianexpress.com/article/entertainmen...,Makers of 'Pippa' refute reports of film's dir...,2,6,News Entertainment Bollywood Ishaan Khatter's ...,0.503175,0.101587,2,maker Ishaan Khatter Mrunal Thakurstarrer Pipp...,Makers Pippa refute report film direct release
4,4,"Guneet Monga, the producer of Oscar-winning do...",https://www.news18.com/amp/movies/guneet-monga...,India's moment taken away: Guneet Monga on not...,2,5,Producer Guneet Monga finally reacted to not b...,0.474411,0.259848,2,Guneet Monga producer Oscarwinning documentary...,India moment taken away Guneet Monga allowed s...


In [17]:
get_data = articles.iloc[li]
get_data

Unnamed: 0,index,short_description,link,headlines,category,topic,article_text,TextBlob_Subjectivity,TextBlob_Polarity,TextBlob_Analysis,cleaned_text,clean_headlines
314,314,Apple supplier Pegatron is in discussions to e...,https://www.reuters.com/technology/apple-inc-s...,Apple supplier Pegatron planning to open 2nd I...,0,3,"\n\n\n\n\n\n\n\n\n\n\n\n\n\nNEW DELHI, March 2...",0.345241,0.100816,2,Apple supplier Pegatron discussion establish s...,Apple supplier Pegatron planning open Indian p...
634,634,Adani Total Gas Limited (ATGL) has slashed the...,https://www.hindustantimes.com/business/adani-...,"Adani Total Gas cuts CNG price by ₹8.13/kg, PN...",0,3,Adani Total Gas Limited (ATGL) has reduced the...,0.367777,0.083832,2,Adani Total Limited ATGL slashed price compres...,Adani Total cut price ₹813kg rate ₹506SCM
254,254,The government is reportedly looking to tax ga...,https://www.bqprime.com/business/government-mo...,Govt likely to remove long-term tax benefits f...,0,7,The Indian government is moving to treat gains...,0.343524,0.119238,2,government reportedly looking gain made debt m...,Govt likely remove longterm benefit debt mutua...
253,253,The investments made in three Adani subsidiari...,https://www.reuters.com/business/finance/total...,Investments in Adani firms comply with Indian ...,0,3,"\n\n\n\n\n\n\n\n\n\n\n\n\n\nPARIS, March 23 (R...",0.436111,0.1875,2,investment made three Adani subsidiary includi...,Investments Adani firm comply Indian law Total...
563,563,UN Food and Agriculture Organization (FAO) sai...,https://www.reuters.com/business/retail-consum...,World food prices decline for 12th straight mo...,0,3,"\n\n\n\n\n\n\n\n\n\n\n\n\n\nROME, April 7 (Reu...",0.35012,0.069816,2,Food Agriculture Organization said world food ...,World food price decline 12th straight month M...
315,315,Maruti Suzuki India Chairman RC Bhargava said ...,https://www.moneycontrol.com/news/business/jap...,IIMs must teach Japan's manufacturing competit...,0,2,"Maruti Suzuki India Ltd, which commands over 4...",0.370338,0.090222,2,Maruti Suzuki India Chairman Bhargava said Jap...,IIMs must teach Japan manufacturing competitiv...
252,252,"Hyundai and Kia are recalling over 5,70,000 US...",https://www.reuters.com/business/autos-transpo...,"Hyundai, Kia ask 5 lakh US owners to park outs...",0,9,"\n\n\n\n\n\n\n\n\n\n\n\n\n\nWASHINGTON, March ...",0.509253,0.04724,2,Hyundai recalling 570000 vehicle fire risk own...,Hyundai lakh owner park outside fire risk
251,251,The Lok Sabha on Friday passed the Finance Bil...,https://indianexpress.com/article/india/parlia...,Lok Sabha passes Finance Bill 2023,0,5,"Parliament Budget Session 2023, Highlights, Ma...",0.335714,0.057143,2,Sabha Friday passed Finance Bill 2023 tabled U...,Sabha pass Finance Bill 2023
554,554,The RBI said on Friday that India's foreign ex...,https://www.outlookindia.com/business/india-s-...,India's forex reserves fall by $329 million to...,0,3,India's Forex Reserves Drop By $329 Million To...,0.450455,0.058724,2,said Friday India foreign exchange reserve dro...,India forex reserve fall million 57845 billion
316,316,Nykaa's Chief Commercial Operations Officer Ma...,https://www.reuters.com/world/india/five-execu...,"Nykaa's CEO of wholesale business, 4 other exe...",0,2,"[1/4] Beauty products by Nykaa, an Indian beau...",0.38572,0.049508,2,Nykaa Chief Commercial Operations Officer Mano...,Nykaa wholesale business executive resign
