## 1. Importing necessary Libraries

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd

import os
import math
import time

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.figure_factory as ff
import plotly.graph_objects as go
import plotly.express as px

#for storing the model
import pickle

# Below libraries are for text processing using NLTK
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# Below libraries are for feature representation using sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Below libraries are for similarity matrices using sklearn
from sklearn.metrics.pairwise import cosine_similarity  
from sklearn.metrics import pairwise_distances


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


## 2. Loading Data

In [None]:
# Shamba code 




# #news_articles = pd.read_json("/content/News_Category_Dataset_v2.json", lines = True)
# news_articles = pd.read_json("/content/drive/MyDrive/News_Category_Dataset_v2.json", lines = True)
# df=news_articles
db ={}
folder = '/content/drive/MyDrive/NewsData/'
for filename in os.listdir(folder):  
  myfile = open(folder+filename,"rb")
  db[os.path.splitext(filename)[0]]= pickle.load(myfile)
  #os.path.splitext(filename)[0] = pd.dataframe(db[os.path.splitext(filename)[0]])
  myfile.close()
  # print(db[os.path.splitext(filename)[0]])
db

In [None]:
res = pd.DataFrame(columns=['short_description', 'link', 'headlines', 'category'])
for keys, values in db.items():
  #print(values)
  keys = pd.DataFrame.from_dict(values) 
  #print(keys)
  frames = [res, keys]
  res = pd.concat(frames, ignore_index = True)
#news_articles_temp

## 5. Text Preprocessing

### 5.a Stopwords removal

Stop words are not much helpful in analyis and also their inclusion consumes much time during processing so let's remove these. 

In [None]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
res

Unnamed: 0,short_description,link,headlines,category
0,Filmmaker Ramesh Talwar said that actress Jaya...,https://www.hindustantimes.com/entertainment/b...,Jaya said 'I shouldn't flop in play': Ramesh o...,ENTERTAINMENT
1,"Actress Juhi Chawla, who signed a ₹1 lakh bond...",https://www.news18.com/amp/movies/ipl-rinku-si...,Thought it was right thing to do: Juhi on help...,ENTERTAINMENT
2,Actress Shehnaaz Gill said she cried a lot aft...,https://www.hindustantimes.com/entertainment/b...,"Got rejected for my 1st music video, working w...",ENTERTAINMENT
3,Actor Zeeshan Khan reacted to an Instagram use...,https://www.hindustantimes.com/entertainment/t...,Instagram user calls Zeeshan Khan a 'terrorist...,ENTERTAINMENT
4,Salman Khan shared that Bhumika Chawla was pro...,https://indianexpress.com/article/entertainmen...,"Bhumika was scared, thought I'd run after her:...",ENTERTAINMENT
...,...,...,...,...
1165,Chinese Foreign Ministry said the US has made ...,https://www.moneycontrol.com/news/world/china-...,US presumption of guilt against TikTok is base...,TECHNOLOGY
1166,Chinese search giant Baidu on Monday cancelled...,https://www.reuters.com/technology/chinas-baid...,China's Baidu cancels public launch of its Cha...,TECHNOLOGY
1167,Twitter CEO Elon Musk responded after Canadian...,https://twitter.com/elonmusk/status/1640188239...,No different standard for celebs: Musk to acto...,TECHNOLOGY
1168,"Parts of Twitter's source code, the computer c...",https://apnews.com/article/twitter-source-code...,Computer code used to run Twitter leaked online,TECHNOLOGY


In [None]:
def clean_text(headline):
      le=WordNetLemmatizer()
      word_tokens=word_tokenize(headline)
      tokens=[le.lemmatize(w) for w in word_tokens if w not in stop_words and len(w)>3]
      cleaned_text=" ".join(tokens)
      return cleaned_text


rev={}
res['cleaned_text']=res['short_description'].apply(clean_text)
res['clean_headlines']=res['headlines'].apply(clean_text)


In [None]:
res.head

<bound method NDFrame.head of                                                                                                                                                                                                                                                                                                                                                                                                   short_description  \
245   Stylist Law Roach, who has styled several celebrities including Megan Thee Stallion, Priyanka Chopra and Zendaya, has announced his retirement on Instagram. "If this business was just about the clothes, I'd do it for...rest of my life but unfortunately it's not! The politics...lies...false narratives finally got me! You win. I'm out," he captioned the post.                                         
1149  Stylist Law Roach, who has styled several celebrities including Megan Thee Stallion, Priyanka Chopra and Zendaya, has announced his retirement on Inst

In [None]:
#removing duplicates
res.drop_duplicates(inplace = True)

In [None]:
print("value counts:", res.category.value_counts())

value counts: EDUCATION        170
BUSINESS         130
ENTERTAINMENT    127
SPORTS           126
POLITICS         122
TECHNOLOGY       112
FASHION          29 
Name: category, dtype: int64


In [None]:
res = res.reset_index()

## 6. Headline based similarity on new articles

Generally, we assess **similarity** based on **distance**. If the **distance** is minimum then high **similarity** and if it is maximum then low **similarity**.
To calculate the **distance**, we need to represent the headline as a **d-dimensional** vector. Then we can find out the **similarity** based on the **distance** between vectors.

There are multiple methods to represent a **text** as **d-dimensional** vector like **Bag of words**, **TF-IDF method**, **Word2Vec embedding** etc. Each method has its own advantages and disadvantages. 

Let's see the feature representation of headline through all the methods one by one.

### 6.a Using Bag of Words method

A **Bag of Words(BoW)** method represents the occurence of words within a **document**. Here, each headline can be considered as a **document** and set of all headlines form a **corpus**.

Using **BoW** approach, each **document** is represented by a **d-dimensional** vector, where **d** is total number of **unique words** in the corpus. The set of such unique words forms the **Vocabulary**.

In [None]:
headline_vectorizer = CountVectorizer()
headline_features   = headline_vectorizer.fit_transform(res['cleaned_text'])

#### Saving the model

In [None]:
# # save the model to disk
# filename = 'Count-Vectorizer features.sav'
# pickle.dump(headline_features, open(filename, 'wb'))

In [None]:
headline_features

<816x6692 sparse matrix of type '<class 'numpy.int64'>'
	with 23961 stored elements in Compressed Sparse Row format>

The output **BoW matrix**(headline_features) is a sparse matrix.

In [None]:
pd.set_option('display.max_colwidth', -1)  # To display a very long headline completely

  pd.set_option('display.max_colwidth', -1)  # To display a very long headline completely


In [None]:
def bag_of_words_based_model(row_index, num_similar_items):
    couple_dist = pairwise_distances(headline_features,headline_features[row_index])
    indices = np.argsort(couple_dist.ravel())[0:num_similar_items]
    df = pd.DataFrame({'headline':res['headlines'][indices].values,
                'Euclidean similarity with the queried article': couple_dist[indices].ravel()})
    print("="*30,"Queried article details","="*30)
    print('headline : ',res['headlines'][indices[0]])
    print("\n","="*25,"Recommended articles : ","="*23)
    #return df.iloc[1:,1]
    return df.iloc[1:,]

bag_of_words_based_model(100, 11) # Change the row index for any other queried article

headline :  Took a break, now I'm in a good headspace: Krunal on 3/18 vs SRH



Unnamed: 0,headline,Euclidean similarity with the queried article
1,SRH record lowest total of IPL 2023 as LSG register their 2nd win,6.480741
2,It's like Manchester United vs Liverpool: Moeen Ali on CSK-MI game,6.855655
3,Don't know how to exaggerate about my work: Yami Gautam on PR,6.855655
4,"Senior guys need to step up, starting with me: Rohit Sharma",6.855655
5,I don't care: Sara on being trolled for visiting Shiva temples,6.928203
6,"'Lost half my money in banking crisis,' says Sharon Stone; video shows her breaking down",7.0
7,"Playing the way I used to now, still room to get to my best: Kohli",7.071068
8,"When I fell in love, I couldn't see the red flags: Dhawan on divorce with Aesha Mukerji",7.071068
9,India-Australia ODI series is not going to be a 3-0 series: Aakash,7.071068
10,Don't think Tottenham Hotspur are going to sack me: Manager Conte,7.071068


In [None]:
bag_of_words_based_model(10, 11)

headline :  World food prices decline for 12th straight month in March: FAO



Unnamed: 0,headline,Euclidean similarity with the queried article
1,"Bonuses will be paid, offices remain open: Credit Suisse to staff",8.306624
2,"When I fell in love, I couldn't see the red flags: Dhawan on divorce with Aesha Mukerji",8.485281
3,I don't care: Sara on being trolled for visiting Shiva temples,8.485281
4,"Took a break, now I'm in a good headspace: Krunal on 3/18 vs SRH",8.485281
5,"Adani Total Gas cuts CNG price by ₹8.13/kg, PNG rates by ₹5.06/SCM",8.544004
6,I've no regrets about marrying Reena Roy: Ex-Pak cricketer Mohsin Khan on divorce,8.544004
7,Rainfall to continue in Mumbai till March 23; AQI 'satisfactory',8.602325
8,Indian Health Ministry's site targeted by Russian hackers: Report,8.602325
9,I'll stop doing films: Jr NTR on being asked about his next film,8.602325
10,It's like Manchester United vs Liverpool: Moeen Ali on CSK-MI game,8.660254


#### Load the saved data

In [None]:
# To LOAD THE SAVED MODEL
filename = 'Count-Vectorizer features.sav'
BoW_vectorizer_features = pickle.load(open(filename,'rb'))

def bag_of_words_based_model(row_index, num_similar_items):
    couple_dist = pairwise_distances(BoW_vectorizer_features,BoW_vectorizer_features[row_index])
    indices = np.argsort(couple_dist.ravel())[0:num_similar_items]
    df = pd.DataFrame({'headline': res['headlines'][indices].values,
                'Euclidean similarity with the queried article': couple_dist[indices].ravel()})
    print("="*30,"Queried article details","="*30)
    print('headline : ',res['headlines'][indices[0]])
    print("\n","="*25,"Recommended articles : ","="*23)
    #return df.iloc[1:,1]
    return df.iloc[1:,]

bag_of_words_based_model(500, 11)

FileNotFoundError: ignored

Above function recommends **10 similar** articles to the **queried**(read) article based on the headline. It accepts two arguments - index of already read artile and the total number of articles to be recommended.

Based on the **Euclidean distance** it finds out 10 nearest neighbors and recommends. 

**Disadvantages**
1. It gives very low **importance** to less frequently observed words in the corpus. Few words from the queried article like "employer", "flip", "fire" appear less frequently in the entire corpus so **BoW** method does not recommend any article whose headline contains these words. Since **trump** is commonly observed word in the corpus so it is recommending the articles with headline containing "trump".   
2. **BoW** method doesn't preserve the order of words.

To overcome the first disadvantage we use **TF-IDF** method for feature representation. 


### 6.b Using TF-IDF method

**TF-IDF** method is a weighted measure which gives more importance to less frequent words in a corpus. It assigns a weight to each term(word) in a document based on **Term frequency(TF)** and **inverse document frequency(IDF)**.

**TF(i,j)** = (# times word i appears in document j) / (# words in document j)

**IDF(i,D)** = log_e(#documents in the corpus D) / (#documents containing word i)

weight(i,j) = **TF(i,j)** x **IDF(i,D)**

So if a word occurs more number of times in a document but less number of times in all other documents then its **TF-IDF** value will be high.


In [None]:
tfidf_headline_vectorizer = TfidfVectorizer(min_df = 0)
tfidf_headline_features = tfidf_headline_vectorizer.fit_transform(res['cleaned_text'])

#### Saving the model

In [None]:
# # save the model to disk
# filename = 'tf-idf_vectorizer features.sav'
# pickle.dump(tfidf_headline_features, open(filename, 'wb'))

In [None]:
def tfidf_based_model(row_index, num_similar_items):
    couple_dist = pairwise_distances(tfidf_headline_features,tfidf_headline_features[row_index])
    indices = np.argsort(couple_dist.ravel())[0:num_similar_items]
    df = pd.DataFrame({'headline':res['headlines'][indices].values,
                'Euclidean similarity with the queried article': couple_dist[indices].ravel()})
    print("="*30,"Queried article details","="*30)
    print('headline : ',res['headlines'][indices[0]])
    print("\n","="*25,"Recommended articles : ","="*23)
    
    #return df.iloc[1:,1]
    return df.iloc[1:,]
tfidf_based_model(133, 11)

headline :  Take it down immediately: Adani to FT on story questioning its funding



Unnamed: 0,headline,Euclidean similarity with the queried article
1,Investments in Adani firms comply with Indian laws: TotalEnergies,1.249276
2,Should consider Ambani-Adani's contribution to country: Pawar,1.283818
3,Govt hasn't intervened anywhere: EAC-PM member Sanjeev Sanyal on Adani crisis,1.293161
4,"Rahul, Pawar's different views on Adani don't show Opp'n disunity: Sibal",1.315812
5,I try to take stories of Indian women to global platform: Rani,1.316096
6,LIC to cap investment exposures after Adani firms' stock rout: Report,1.318122
7,"Adani Group suspends work on ₹34,900 crore petchem project in Gujarat: Report",1.318949
8,Which countries have the highest number of billionaires?,1.333649
9,Lok Sabha & Rajya Sabha adjourned again amid ruckus,1.336169
10,"Sensex down over 800 points, Nifty below 16,900; Adani stocks fall",1.341726


In [None]:
tfidf_based_model(235, 11)

headline :  Rahul paying price for speaking fearlessly, govt rattled: Singhvi



Unnamed: 0,headline,Euclidean similarity with the queried article
1,"Rahul paying price for speaking fearlessly, govt rattled: Singhvi",0.0
2,Congress to launch Jan-Andolan across India as Rahul disqualified,1.210548
3,Sonia reaches Rahul's home after he is disqualified from Lok Sabha,1.213935
4,Rahul Gandhi's Kolar rally postponed again,1.222595
5,"Indira was disqualified too, then she got massive majority: Gehlot",1.222849
6,"Indira was disqualified too, then she got massive majority: Gehlot",1.222849
7,He's not Gandhi but 'Rahul gandagi': BJP MP over Savarkar remark,1.229784
8,He's not Gandhi but 'Rahul gandagi': BJP MP over Savarkar remark,1.229784
9,Rahul never demanded foreign forces to save our democracy: Tharoor,1.232257
10,It will take many lives for Rahul to be like Savarkar: Giriraj,1.235115


#### Load the saved data

In [None]:
# To LOAD THE SAVED MODEL
filename = 'tf-idf_vectorizer features.sav'
vectorizer_features = pickle.load(open(filename,'rb'))

def tfidf_based_model(row_index, num_similar_items):
    couple_dist = pairwise_distances(vectorizer_features,vectorizer_features[row_index])
    indices = np.argsort(couple_dist.ravel())[0:num_similar_items]
    df = pd.DataFrame({'headline':res['headlines'][indices].values,
                'Euclidean similarity with the queried article': couple_dist[indices].ravel()})
    print("="*30,"Queried article details","="*30)
    print('headline : ',res['headlines'][indices[0]])
    print("\n","="*25,"Recommended articles : ","="*23)
    
    #return df.iloc[1:,1]
    return df.iloc[1:,]
tfidf_based_model(133, 11)

headline :  Take it down immediately: Adani to FT on story questioning its funding



Unnamed: 0,headline,Euclidean similarity with the queried article
1,Investments in Adani firms comply with Indian laws: TotalEnergies,1.249276
2,Should consider Ambani-Adani's contribution to country: Pawar,1.283818
3,Govt hasn't intervened anywhere: EAC-PM member Sanjeev Sanyal on Adani crisis,1.293161
4,"Rahul, Pawar's different views on Adani don't show Opp'n disunity: Sibal",1.315812
5,I try to take stories of Indian women to global platform: Rani,1.316096
6,LIC to cap investment exposures after Adani firms' stock rout: Report,1.318122
7,"Adani Group suspends work on ₹34,900 crore petchem project in Gujarat: Report",1.318949
8,Which countries have the highest number of billionaires?,1.333649
9,Lok Sabha & Rajya Sabha adjourned again amid ruckus,1.336169
10,"Sensex down over 800 points, Nifty below 16,900; Adani stocks fall",1.341726


Compared to **BoW** method, here **TF-IDF** method recommends the articles with headline containing words like "employer", "fire", "flip" in top 5 recommendations and these words occur less frequently in the corpus.   

**Disadvantages :- **

**Bow** and **TF-IDF** method do not capture **semantic** and **syntactic** similarity of a given word with other words but this can be captured using **Word embeddings**.

For example: there is a good association between words like "trump" and "white house", "office and employee", "tiger" and "leopard", "USA" and "Washington D.C" etc. Such kind of **semantic** similarity can be captured using **word embedding** techniques.
**Word embedding** techniques like **Word2Vec**, **GloVe** and **fastText** leverage semantic similarity between words. 