**Loading dataset**

In [1]:
# kagglehub is the client used to fetch the dataset
import kagglehub

# Fetch the latest version of the dataset; `path` is reused by the loading cell
path = kagglehub.dataset_download("rmisra/news-category-dataset")

print(f"Path to dataset files: {path}")

Path to dataset files: C:\Users\gokul\.cache\kagglehub\datasets\rmisra\news-category-dataset\versions\3


In [2]:
import json
import os

import pandas as pd
import numpy as np

# Build the file path with os.path.join so the separator matches the host OS
# (the previous hard-coded backslash only worked on Windows).
json_file_path = os.path.join(path, "News_Category_Dataset_v3.json")

# The file is read as JSON Lines: one JSON object per line.
# An explicit UTF-8 encoding keeps decoding independent of the platform locale,
# and blank lines are skipped so they cannot crash json.loads.
with open(json_file_path, 'r', encoding='utf-8') as file:
    data = [json.loads(line) for line in file if line.strip()]

# Convert the parsed records to a pandas DataFrame (one row per record)
df = pd.DataFrame(data)

In [3]:
# Preview the first rows of the loaded DataFrame to check the columns
df.head()

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22


In [4]:
# Per-column summary; for this frame the output reports count, number of
# unique values, the most frequent value (top) and how often it occurs (freq)
df.describe()

Unnamed: 0,link,headline,category,short_description,authors,date
count,209527,209527,209527,209527.0,209527.0,209527
unique,209486,207996,42,187022.0,29169.0,3890
top,https://www.huffingtonpost.comhttps://www.wash...,Sunday Roundup,POLITICS,,,2014-03-25
freq,2,90,35602,19712.0,37418.0,100


**Vectorize input dataset**

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Text indexed for each article: its short description followed by its headline
corpus = df['short_description'] + " " + df['headline']

# TF-IDF vectorizer configured to drop English stop words
vectorizer = TfidfVectorizer(stop_words='english')

# Learn the vocabulary and build the TF-IDF matrix for the whole corpus
tfidf_matrix = vectorizer.fit_transform(corpus)

In [6]:
# Dimensions of the TF-IDF matrix as (rows, columns)
tfidf_matrix.shape

(209527, 88200)

**Recommender Function**

In [11]:
#Function to recommend articles based on the query
def recommender(query, top_n=5):
    '''
    Rank the articles in `df` by similarity to a free-text query.

    params: query - a string containing the query
            top_n - an integer specifying the number of results to return;
                    values <= 0 yield an empty result
    returns: a new DataFrame holding the top_n rows of `df`, best match first,
             with an extra 'similarity_score' column. The global `df` itself
             is left untouched.
    '''

    #Vectorize the query with the vocabulary already fitted on the corpus
    query_vec = vectorizer.transform([query])

    #Score every article against the query. Use the sparse-aware `@` product
    #rather than np.dot, which is not the supported way to multiply SciPy
    #sparse matrices.
    #NOTE(review): the dot product equals cosine similarity only while the
    #vectorizer L2-normalises its rows (the TfidfVectorizer default) - confirm
    #if the vectorizer settings change.
    scores = (tfidf_matrix @ query_vec.T).toarray().ravel()

    #Pick the positions of the top_n highest scores, best first. Slicing after
    #the reversal means top_n=0 gives no rows (the old `[-top_n:]` form gave
    #every row for top_n=0).
    limit = max(int(top_n), 0)
    top_idx = np.argsort(scores)[::-1][:limit]

    #Attach the scores to a copy of the selected rows so repeated calls do not
    #mutate the shared DataFrame
    recommendations = df.iloc[top_idx].copy()
    recommendations['similarity_score'] = scores[top_idx]

    return recommendations


#Function to display the results
def display_results(results):
    '''
    Print each recommended article as a numbered block of text.

    params: results - a DataFrame containing the results; it must provide the
                      'headline', 'short_description', 'similarity_score'
                      and 'link' columns
    returns: None
    '''
    #enumerate supplies the 1-based rank, so no manual counter is needed
    for rank, (_, row) in enumerate(results.iterrows(), start=1):
        print("Result #", rank)
        print(f"HEADLINE - {row['headline']}")
        print(f"DESCRIPTION - {row['short_description']}")
        #Read the score from the current row: a label lookup on `results`
        #would return several values if index labels were duplicated
        print(f"SIMILARITY SCORE - {row['similarity_score']}")
        print(f"LINK - {row['link']}")

**Sample Test Queries**

In [12]:
# Sample query: fetch the recommendations first, then print them
user_query = "health benefits of green tea"
recommendations = recommender(user_query)
display_results(recommendations)

Result # 1
HEADLINE - Tea Health Benefits: 8 Ways It Could Benefit Our Bodies
DESCRIPTION - A cup of tea anyone? It could do wonders for your health. A lot of research has focused on green tea in particular, Health.com
SIMILIARITY SCORE - 0.7114795090361802
LINK - https://www.huffingtonpost.com/entry/tea-health-benefits-cancer-heart-disease_us_5b9c2e55e4b03a1dcc7ce60b
Result # 2
HEADLINE - Green Tea Benefits: How The Drink Improves Your Health
DESCRIPTION - While experts agree that more research is still needed, this only adds to the body of work linking green tea to healthy, happy
SIMILIARITY SCORE - 0.6391623122003975
LINK - https://www.huffingtonpost.com/entry/green-tea-benefits-health_us_5b9c6ec9e4b03a1dcc7e965a
Result # 3
HEADLINE - Green Tea Could Help Functioning In Old Age: Study
DESCRIPTION - Green tea has long been eyed for possible health benefits, including its potential to decrease the risk of certain cancers
SIMILIARITY SCORE - 0.6155933359328156
LINK - https://www.huffin

In [11]:
# Sample query: fetch the recommendations first, then print them
user_query = "Latest advancements in artificial intelligence"
recommendations = recommender(user_query)
display_results(recommendations)

Result # 1
HEADLINE - Artificial Intelligence Is Here To Change Your Life
DESCRIPTION - The revolution began before you even realized it.
SIMILIARITY SCORE - 0.419997516127092
LINK - https://www.huffingtonpost.com/entry/artificial-intelligence-mit-tech-conference_us_56cb20ade4b0928f5a6c7463
Result # 2
HEADLINE - Google Just 'Open Sourced' Its Artificial Intelligence Engine
DESCRIPTION - Tech pundit Tim O’Reilly had just tried the new Google Photos app, and he was amazed by the depth of its artificial intelligence
SIMILIARITY SCORE - 0.38976526991553073
LINK - https://www.huffingtonpost.comhttp://www.wired.com/2015/11/google-open-sources-its-artificial-intelligence-engine/
Result # 3
HEADLINE - How To Stop Worrying And Love Artificial Intelligence
DESCRIPTION - A new book finds the heart in our cold, robotic future.
SIMILIARITY SCORE - 0.3283044407769491
LINK - https://www.huffingtonpost.com/entry/heartificial-intelligence-john-havens_us_56bb4bfee4b0b40245c4beac
Result # 4
HEADLINE - Wo

In [10]:
# Sample query: fetch the recommendations first, then print them
user_query = "music festival"
recommendations = recommender(user_query)
display_results(recommendations)

Result # 1
HEADLINE - This Woman Put A Game-Changing Twist On Your Average Music Festival
DESCRIPTION - The Other Festival makes up for the lack of gender inclusion in the festival circuit.
SIMILIARITY SCORE - 0.6237387836456741
LINK - https://www.huffingtonpost.com/entry/the-other-festival-new-york-city_us_575ab6b7e4b0ced23ca7c18d
Result # 2
HEADLINE - This Music Festival Is Helping To Combat Anti-LGBT Discrimination In NC
DESCRIPTION - "In some ways, we’re queering the typical music festival."
SIMILIARITY SCORE - 0.6079321697131563
LINK - https://www.huffingtonpost.com/entry/moogfest-anti-lgbt-discrimination_us_56feaadee4b0daf53aef8f53
Result # 3
HEADLINE - WATCH: Music Festival Dancing: The Best Of The Best
DESCRIPTION - 
SIMILIARITY SCORE - 0.5734272625670541
LINK - https://www.huffingtonpost.com/entry/music-festival-dancing_n_5329798.html
Result # 4
HEADLINE - Oyster's Guide to the ULTRA Music Festival (PHOTOS)
DESCRIPTION - Electronic music fans from around the world will be floc