# **1. Import Libraries**

In [1]:
# Import required libraries
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# **2. Load and Explore Dataset**

In [2]:
# Read the news articles stored in News.csv into the DataFrame `df`
df = pd.read_csv(filepath_or_buffer='News.csv')

In [3]:
# `df` was already loaded from News.csv in the previous cell, so the file is
# not read a second time here; just preview the first 5 rows of the dataset
print(df.head())

       ID News Category                                              Title  \
0  N88753     lifestyle  The Brands Queen Elizabeth, Prince Charles, an...   
1  N45436          news    Walmart Slashes Prices on Last-Generation iPads   
2  N23144        health                      50 Worst Habits For Belly Fat   
3  N86255        health  Dispose of unwanted prescription drugs during ...   
4  N93187          news  The Cost of Trump's Aid Freeze in the Trenches...   

                                             Summary  
0  Shop the notebooks, jackets, and more that the...  
1  Apple's new iPad releases bring big deals on l...  
2  These seemingly harmless habits are holding yo...  
3                                                NaN  
4  Lt. Ivan Molchanets peeked over a parapet of s...  


In [4]:
# Report the dataset dimensions (number of rows, number of columns)
print(f"Shape of dataset: {df.shape[0]} rows, {df.shape[1]} columns")

# List the column names on a single line (str() guards against non-string labels)
print("Column names:\n{}".format(', '.join(map(str, df.columns))))

# Summarize dtypes and non-null counts per column.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so appends a stray "None" line.
print("\nInformation about the dataset:")
df.info()

Shape of dataset: 32409 rows, 4 columns
Column names:
ID, News Category, Title, Summary

Information about the dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32409 entries, 0 to 32408
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   ID             32409 non-null  object
 1   News Category  32409 non-null  object
 2   Title          32409 non-null  object
 3   Summary        30847 non-null  object
dtypes: object(4)
memory usage: 1012.9+ KB
None


# **3. Data Preprocessing**

In [5]:
# Tally the NaN entries found in every column of the dataset
missing_values_count = df.isna().sum()

# Show the per-column totals
print(missing_values_count)

ID                  0
News Category       0
Title               0
Summary          1562
dtype: int64


In [6]:
# Discard every row that has a missing value in any column (per the counts
# above, only 'Summary' contains NaNs). The frame is modified in place and the
# surviving rows keep their original index labels, so the index has gaps
# afterwards.
df.dropna(axis='index', how='any', inplace=True)

# Preview the first 5 rows that remain after the removal
print(df.head())

       ID News Category                                              Title  \
0  N88753     lifestyle  The Brands Queen Elizabeth, Prince Charles, an...   
1  N45436          news    Walmart Slashes Prices on Last-Generation iPads   
2  N23144        health                      50 Worst Habits For Belly Fat   
4  N93187          news  The Cost of Trump's Aid Freeze in the Trenches...   
5  N75236        health  I Was An NBA Wife. Here's How It Affected My M...   

                                             Summary  
0  Shop the notebooks, jackets, and more that the...  
1  Apple's new iPad releases bring big deals on l...  
2  These seemingly harmless habits are holding yo...  
4  Lt. Ivan Molchanets peeked over a parapet of s...  
5  I felt like I was a fraud, and being an NBA wi...  


# **4. Calculate TF-IDF and Cosine Similarity**

In [7]:
# Create a TfidfVectorizer object to calculate the TF-IDF scores for the article titles.
# stop_words='english' asks the vectorizer to ignore common English stop words.
# fit_transform both fits the vectorizer on the titles and returns their
# TF-IDF matrix; rows follow the current row order of df['Title'].
vectorizer = TfidfVectorizer(stop_words='english')
title_matrix = vectorizer.fit_transform(df['Title'])

In [8]:
# Calculate the cosine similarity matrix between all pairs of article titles.
# Row/column i of the result corresponds to row i of title_matrix, i.e. to the
# i-th row of df by position.
# NOTE(review): this is presumably a dense n_titles x n_titles array; with
# ~30k titles that may need several GB of memory -- confirm before scaling up.
title_similarity_matrix = cosine_similarity(title_matrix)

# **5. Define Article Recommendation Function**

In [9]:
# Define a function to recommend articles based on a given article title
def recommend_articles(article_title, num_articles=5):
    """Recommend the articles whose titles are most similar to a given title.

    Relies on two notebook-level objects: ``df`` (the articles) and
    ``title_similarity_matrix`` (pairwise title similarities, with one
    row/column per row of ``df``, in the same positional order).

    Parameters
    ----------
    article_title : str
        Exact title of an article present in ``df['Title']``. If several rows
        share this title, the first one is used as the query.
    num_articles : int, default 5
        Maximum number of recommendations to return. Values <= 0 yield an
        empty result.

    Returns
    -------
    pandas.Series
        Titles of the most similar articles, most similar first, keeping
        their index labels from ``df``. The query article itself is never
        included.

    Raises
    ------
    IndexError
        If no article in ``df`` has exactly the given title.
    """
    # Locate the article by *position*, not by index label: rows dropped from
    # df leave gaps in its index, so a label is not a valid row number into
    # the similarity matrix.
    title_matches = (df['Title'] == article_title).to_numpy().nonzero()[0]
    if len(title_matches) == 0:
        raise IndexError(f"No article titled {article_title!r} was found")
    position = int(title_matches[0])

    # Similarity of the query article to every article (by position)
    scores = title_similarity_matrix[position]

    # Rank all other articles by descending similarity. The query is excluded
    # explicitly rather than assumed to sort first, since ties at the top
    # score could otherwise push it out of slot 0. sorted() is stable, so
    # equally similar articles keep their original order.
    ranked_positions = sorted(
        (i for i in range(len(scores)) if i != position),
        key=lambda i: scores[i],
        reverse=True,
    )

    # Keep the top N positions (a negative request is treated as zero)
    top_positions = ranked_positions[:max(num_articles, 0)]

    # Return the titles of the recommended articles
    return df['Title'].iloc[top_positions]

# **6. Test Article Recommendation Function**

In [10]:
# Try the recommender on one of the titles from the dataset preview
recommended_articles = recommend_articles(article_title='50 Worst Habits For Belly Fat')
print(recommended_articles)

304      Those Grueling Workouts May Not Help You Lose ...
14161              10 Ways to Burn Belly Fat in 10 Minutes
9463     If You Can't Seem to Lose Belly Fat, Experts A...
14619    I Used to Obsess About My Belly Fat, Until I S...
32225    3 Foods You Should Eat at Breakfast If You Wan...
Name: Title, dtype: object
