<a href="https://colab.research.google.com/github/dipensedawat/Complete-Python-Bootcamp/blob/main/BlogRecommendation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

# Load the raw blog export and keep only the columns the recommender reads.
keep_columns = ["_id", "title", "blogData", "blogMetaTag", "uniqueUrl"]
text_columns = ["title", "blogData", "blogMetaTag"]

df = pd.read_csv("blog_collection.csv")
df = df[keep_columns]

# Missing text becomes an empty string so the later concatenation never yields NaN.
df[text_columns] = df[text_columns].fillna("")

# Drop rows whose title is 5 characters or shorter, then renumber rows from 0.
has_real_title = df["title"].str.len().gt(5)
df = df.loc[has_real_title].reset_index(drop=True)

print("Total blogs:", len(df))


FileNotFoundError: [Errno 2] No such file or directory: 'blog_collection.csv'

In [None]:
# One text field per blog: title, meta tags and body joined by single spaces.
df["final_text"] = df["title"].str.cat(
    [df["blogMetaTag"], df["blogData"]],
    sep=" ",
)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectoriser settings: English stop words removed, unigrams and bigrams,
# vocabulary capped at 5000 features.
vectorizer_options = {
    "ngram_range": (1, 2),
    "max_features": 5000,
    "stop_words": "english",
}
tfidf = TfidfVectorizer(**vectorizer_options)

# Fit on the combined text and keep the document-term matrix for later cells.
tfidf_matrix = tfidf.fit_transform(df["final_text"])

print("TF-IDF matrix shape:", tfidf_matrix.shape)


TF-IDF matrix shape: (1148, 5000)


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Only one matrix is supplied, so each blog's TF-IDF row is compared with every
# row of the same matrix (the printed shape is square).
similarity_matrix = cosine_similarity(X=tfidf_matrix)

print("Similarity matrix shape:", similarity_matrix.shape)

Similarity matrix shape: (1148, 1148)


In [None]:
from sklearn.cluster import KMeans

NUM_CLUSTERS = 8

# Cluster the TF-IDF rows into NUM_CLUSTERS groups; the seed is fixed at 42.
kmeans = KMeans(random_state=42, n_clusters=NUM_CLUSTERS)
cluster_labels = kmeans.fit_predict(tfidf_matrix)

# Store each blog's cluster id next to its metadata.
df["topic_cluster"] = cluster_labels

df.loc[:, ["title", "topic_cluster"]].head()


Unnamed: 0,title,topic_cluster
0,How Custom Software is Solving Major Problems ...,3
1,Agriculture Software Development in 2025: How ...,3
2,ERP vs Custom CRM in Saudi Arabia: What Should...,3
3,Why Web Apps Are Becoming Popular Than Standal...,2
4,How to Deescalate an Angry Customer: Turning T...,3


In [None]:
import pickle

# Pickle the fitted vectorizer and the similarity matrix, in that order.
artifacts = (
    ("tfidf_vectorizer.pkl", tfidf),
    ("similarity_matrix.pkl", similarity_matrix),
)
for artifact_path, artifact in artifacts:
    with open(artifact_path, "wb") as handle:
        pickle.dump(artifact, handle)

# Save the enriched table (including any columns added by earlier cells) without the index.
df.to_csv("blog_content_intelligence.csv", index=False)

In [None]:
def recommend_similar_blogs(blog_index, top_n=5):
    """Return the ``top_n`` blogs most similar to the blog at ``blog_index``.

    Reads the module-level ``similarity_matrix`` and ``df``.

    Args:
        blog_index: Row position of the query blog. Negative positions count
            from the end, as with normal Python indexing.
        top_n: Maximum number of recommendations to return.

    Returns:
        A list of dicts with keys ``blog_id``, ``title``, ``url`` and
        ``similarity_score`` (a plain ``float`` rounded to 3 decimals),
        ordered from most to least similar.

    Raises:
        IndexError: If ``blog_index`` is outside the similarity matrix.
    """
    n_blogs = len(similarity_matrix)
    if not -n_blogs <= blog_index < n_blogs:
        raise IndexError(
            f"blog_index {blog_index} is out of range for {n_blogs} blogs"
        )
    if blog_index < 0:
        # Normalise so the self-exclusion test below compares like with like.
        blog_index += n_blogs

    # Exclude the query blog by position rather than assuming it sorts first:
    # another blog with an equal (or rounding-wise larger) score can otherwise
    # push the query blog into its own recommendations.
    candidates = [
        (idx, score)
        for idx, score in enumerate(similarity_matrix[blog_index])
        if idx != blog_index
    ]
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    top_matches = candidates[:max(top_n, 0)]

    results = []
    for idx, score in top_matches:
        row = df.iloc[idx]  # build the row once instead of once per field
        results.append({
            "blog_id": row["_id"],
            "title": row["title"],
            "url": row["uniqueUrl"],
            # float() keeps the value printable/serialisable as a plain number
            "similarity_score": round(float(score), 3)
        })

    return results


In [None]:
test_blog_index = 10

# Show the query blog's title, then its recommendations one per line.
print("Current Blog:")
print(df["title"].iloc[test_blog_index])

print("\nRecommended Blogs:")
recommendations = recommend_similar_blogs(test_blog_index)
for recommendation in recommendations:
    print(recommendation)

Current Blog:
Useful things to consider when hiring a web application developer 

Recommended Blogs:
{'blog_id': '6439146451d0fb05d13b2e94', 'title': 'What is a Web Application? Key Benefits for Your Business', 'url': 'web-apps', 'similarity_score': np.float64(0.538)}
{'blog_id': '5cd156b951d0fb568566bf36', 'title': 'Top Web Application Development Trends', 'url': 'Top-Web-Application-Development-Trends', 'similarity_score': np.float64(0.535)}
{'blog_id': '5cd6b2b551d0fb568566bf3c', 'title': 'Choosing The Best Technology Stack For Web App Development', 'url': 'How-to-Choose-The-Best-Technology-Stack-for-Web-App-Development-', 'similarity_score': np.float64(0.528)}
{'blog_id': '5cb6f05651d0fb5049e82b8c', 'title': 'Remote web developers from agencies vs freelancers, Whom to hire?', 'url': 'Remote-web-developers-from-agencies-vs-freelancers--whom-to-hire-', 'similarity_score': np.float64(0.521)}
{'blog_id': '62468d0551d0fb0641c82965', 'title': 'Web Application Development Process', 'url':

In [None]:
test_blog_index = 16

# Same check as the previous cell, for a different query blog.
print("Current Blog:")
print(df["title"].iloc[test_blog_index])

print("\nRecommended Blogs:")
recommendations = recommend_similar_blogs(test_blog_index)
for recommendation in recommendations:
    print(recommendation)


Current Blog:
How to hire a UI/UX designer?

Recommended Blogs:
{'blog_id': '66f2ab0951d0fb06468402f1', 'title': 'How to Hire UI UX Designer and Developer', 'url': 'how-to-hire-ui-ux-designer-developer', 'similarity_score': np.float64(0.748)}
{'blog_id': '65fbd17551d0fb05b25901b1', 'title': 'Latest UI/UX Design Trends for 2024', 'url': 'ui-ux-design-trends', 'similarity_score': np.float64(0.507)}
{'blog_id': '616804d351d0fb066cd05415', 'title': 'UX Design Problem For Students And Enterprises', 'url': 'ux-challenges', 'similarity_score': np.float64(0.408)}
{'blog_id': '6706491651d0fb06395920c2', 'title': 'Why User Experience is Important: A Comprehensive Guide', 'url': 'enhancing-user-experience', 'similarity_score': np.float64(0.404)}
{'blog_id': '614c3d4451d0fb066647d37c', 'title': 'Top 5 User Experience Trends in Web Application Design', 'url': 'user-experience-trends', 'similarity_score': np.float64(0.389)}


## Recommendations boosted by Tag Name and Category Name

In [None]:
import pandas as pd

# Reload the export that also carries the tag and category columns.
keep_columns = [
    "_id",
    "title",
    "blogData",
    "blogMetaTag",
    "uniqueUrl",
    "Tag Name",
    "Category Name",
]
text_columns = ["title", "blogData", "blogMetaTag", "Tag Name", "Category Name"]

df = pd.read_csv("blog_collection_with_category_tag.csv")
df = df[keep_columns]

# Missing text becomes an empty string so string operations downstream are safe.
df[text_columns] = df[text_columns].fillna("")

# Drop rows whose title is 5 characters or shorter, then renumber rows from 0.
has_real_title = df["title"].str.len().gt(5)
df = df.loc[has_real_title].reset_index(drop=True)

print("Total blogs:", len(df))


Total blogs: 1148


In [None]:
# One text field per blog: title, meta tags, tags, category and body,
# joined by single spaces in that order.
df["final_text"] = df["title"].str.cat(
    [
        df["blogMetaTag"],
        df["Tag Name"],
        df["Category Name"],
        df["blogData"],
    ],
    sep=" ",
)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectoriser settings: English stop words removed, unigrams and bigrams,
# vocabulary capped at 6000 features.
vectorizer_options = {
    "ngram_range": (1, 2),
    "max_features": 6000,
    "stop_words": "english",
}
tfidf = TfidfVectorizer(**vectorizer_options)

# Fit on the combined text and keep the document-term matrix for later cells.
tfidf_matrix = tfidf.fit_transform(df["final_text"])


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Only one matrix is supplied, so each blog's TF-IDF row is compared with every
# row of the same matrix.
similarity_matrix = cosine_similarity(X=tfidf_matrix)


In [None]:
def get_tag_set(tags):
    """Split a comma-separated tag string into a set of tags.

    Each piece is stripped of surrounding whitespace and lower-cased; pieces
    that are empty after stripping are dropped.
    """
    stripped_pieces = (piece.strip() for piece in tags.split(","))
    return {piece.lower() for piece in stripped_pieces if piece}


In [None]:
def recommend_similar_blogs(blog_index, top_n=10):
    """Recommend blogs by text similarity, boosted by shared tags and category.

    Reads the module-level ``similarity_matrix`` and ``df`` and calls
    ``get_tag_set``. Each candidate's score is::

        0.7 * cosine similarity
        + 0.2 * number of tags shared with the query blog
        + 0.1 if both blogs have the same non-empty category (case-insensitive)

    NOTE(review): the tag term uses the raw overlap count, so it is not capped
    at 0.2 when several tags are shared - confirm this is intended.

    Args:
        blog_index: Row position of the query blog. Negative positions count
            from the end, as with normal Python indexing.
        top_n: Maximum number of recommendations to return.

    Returns:
        A list of dicts with keys ``blog_id``, ``title``, ``url``,
        ``category``, ``tags`` and ``final_score`` (a plain ``float`` rounded
        to 3 decimals), ordered from highest to lowest score.

    Raises:
        IndexError: If ``blog_index`` is outside the similarity matrix.
    """
    n_blogs = len(similarity_matrix)
    if not -n_blogs <= blog_index < n_blogs:
        raise IndexError(
            f"blog_index {blog_index} is out of range for {n_blogs} blogs"
        )
    if blog_index < 0:
        # Normalise so the self-exclusion test below can match; a negative
        # position would otherwise never equal an enumerated index.
        blog_index += n_blogs

    # Pull the per-blog metadata out once instead of building a pandas row
    # for every candidate inside the loop.
    categories = df["Category Name"].str.lower().tolist()
    tag_sets = [get_tag_set(tags) for tags in df["Tag Name"]]

    current_category = categories[blog_index]
    current_tags = tag_sets[blog_index]

    recommendations = []
    for idx, sim_score in enumerate(similarity_matrix[blog_index]):
        if idx == blog_index:
            continue

        # Two empty categories do not count as a match.
        category_match = (current_category == categories[idx] and current_category != "")
        tag_overlap = len(current_tags.intersection(tag_sets[idx]))

        final_score = (
            0.7 * sim_score +
            0.2 * tag_overlap +
            0.1 * (1 if category_match else 0)
        )

        recommendations.append((idx, final_score))

    recommendations.sort(key=lambda pair: pair[1], reverse=True)

    results = []
    for idx, score in recommendations[:top_n]:
        row = df.iloc[idx]  # build the row once instead of once per field
        results.append({
            "blog_id": row["_id"],
            "title": row["title"],
            "url": row["uniqueUrl"],
            "category": row["Category Name"],
            "tags": row["Tag Name"],
            # float() keeps the value printable/serialisable as a plain number
            "final_score": round(float(score), 3)
        })

    return results


In [None]:
test_blog = 10

# Show the query blog's title, category and tags, then its recommendations.
query_row = df.iloc[test_blog]
print("Current Blog:")
print(query_row["title"])
print("Category:", query_row["Category Name"])
print("Tags:", query_row["Tag Name"])

print("\nRecommended Blogs:")
recommendations = recommend_similar_blogs(test_blog)
for recommendation in recommendations:
    print(recommendation)


Current Blog:
Useful things to consider when hiring a web application developer 
Category: Web & SaaS
Tags: web development

Recommended Blogs:
{'blog_id': '5cbfffd451d0fb5bba89c650', 'title': 'Why to Choose Popular Frameworks for Web Application Development ?', 'url': 'Why-to-Choose-Popular-Frameworks-for-Web-Application-Development--', 'category': 'Web & SaaS', 'tags': 'web development, choose popular frameworks', 'final_score': np.float64(0.663)}
{'blog_id': '6439146451d0fb05d13b2e94', 'title': 'What is a Web Application? Key Benefits for Your Business', 'url': 'web-apps', 'category': 'Web & SaaS', 'tags': 'web development', 'final_score': np.float64(0.663)}
{'blog_id': '5cd156b951d0fb568566bf36', 'title': 'Top Web Application Development Trends', 'url': 'Top-Web-Application-Development-Trends', 'category': 'Web & SaaS', 'tags': 'web development, trends', 'final_score': np.float64(0.661)}
{'blog_id': '5cd6b2b551d0fb568566bf3c', 'title': 'Choosing The Best Technology Stack For Web Ap

In [None]:
test_blog = 16

# Same check as the previous cell, for a different query blog.
query_row = df.iloc[test_blog]
print("Current Blog:")
print(query_row["title"])
print("Category:", query_row["Category Name"])
print("Tags:", query_row["Tag Name"])

print("\nRecommended Blogs:")
recommendations = recommend_similar_blogs(test_blog)
for recommendation in recommendations:
    print(recommendation)


Current Blog:
How to hire a UI/UX designer?
Category: Other
Tags: hire designer

Recommended Blogs:
{'blog_id': '66f2ab0951d0fb06468402f1', 'title': 'How to Hire UI UX Designer and Developer', 'url': 'how-to-hire-ui-ux-designer-developer', 'category': 'Other', 'tags': 'hire designer developer', 'final_score': np.float64(0.621)}
{'blog_id': '65fbd17551d0fb05b25901b1', 'title': 'Latest UI/UX Design Trends for 2024', 'url': 'ui-ux-design-trends', 'category': 'Other', 'tags': 'trends, latest design trends', 'final_score': np.float64(0.443)}
{'blog_id': '616804d351d0fb066cd05415', 'title': 'UX Design Problem For Students And Enterprises', 'url': 'ux-challenges', 'category': 'Other', 'tags': 'design problem students', 'final_score': np.float64(0.381)}
{'blog_id': '6481d22651d0fb065b5543de', 'title': 'Fundamentals of UX Design That Impact Your Business Site Rankings', 'url': 'site-rankings', 'category': 'Other', 'tags': 'fundamentals design impact', 'final_score': np.float64(0.283)}
{'blog_id

How to Hire UI UX Designer and Developer → 24/09/2024

Latest UI/UX Design Trends for 2024 → 21/03/2024

UX Design Problem For Students And Enterprises → 14/10/2021

Fundamentals of UX Design That Impact Your Business Site Rankings → 08/06/2023

Why User Experience is Important: A Comprehensive Guide → 09/10/2024

Unveiling the Masters of the Digital Canvas → 18/07/2023

Top 5 User Experience Trends in Web Application Design → 23/09/2021

Transpiring Web Development Technology Trends for 2023 → 30/11/2022

Design Excellence: An In-Depth Look at the Core Principles of Design → 16/08/2024

Top 10 Front-End Programming Technologies for 2024 → 05/03/2024

## Recommendations restricted to recently published blogs

In [None]:
import pandas as pd
from datetime import datetime, timedelta

# Reload the export, this time keeping the publish date as well.
df = pd.read_csv("blog_collection_with_category_tag.csv")


df = df[[
    "_id",
    "title",
    "blogData",
    "blogMetaTag",
    "uniqueUrl",
    "Tag Name",
    "Category Name",
    "publishDate"
]]

# Missing text becomes an empty string so string operations downstream are safe.
for col in ["title", "blogData", "blogMetaTag", "Tag Name", "Category Name"]:
    df[col] = df[col].fillna("")

# publishDate is stored as a Unix epoch in MILLISECONDS. Without an explicit
# unit, pandas reads bare numbers as nanoseconds, which places every blog within
# the first half hour of 1970-01-01 and makes any recency cutoff meaningless.
# to_numeric first so numeric strings are handled too; unparseable values -> NaT.
publish_ms = pd.to_numeric(df["publishDate"], errors="coerce")
df["publishDate"] = pd.to_datetime(
    publish_ms,
    unit="ms",
    errors="coerce",
    utc=True
)

# Rows without a usable date cannot be filtered by recency, so drop them.
df = df.dropna(subset=["publishDate"]).reset_index(drop=True)

print("Total blogs after cleaning:", len(df))


Total blogs after cleaning: 1148


In [None]:
# NOTE(review): 100000 days is roughly 274 years, so this cutoff falls before
# every publish date and the recency filter keeps all blogs. Lower it once the
# publish dates are confirmed to be parsed correctly.
LATEST_DAYS = 100000

now_utc = pd.Timestamp.now(tz="UTC")
cutoff_date = now_utc - pd.to_timedelta(LATEST_DAYS, unit="D")

print("Cutoff date:", cutoff_date)


# Number of blogs published on or after the cutoff.
print("Latest blogs count:", df["publishDate"].ge(cutoff_date).sum())


Cutoff date: 1752-03-03 07:09:04.124333+00:00
Latest blogs count: 1148


In [None]:
# One text field per blog: title, meta tags, tags, category and body,
# joined by single spaces in that order.
df["final_text"] = df["title"].str.cat(
    [
        df["blogMetaTag"],
        df["Tag Name"],
        df["Category Name"],
        df["blogData"],
    ],
    sep=" ",
)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Vectoriser settings: English stop words removed, unigrams and bigrams,
# vocabulary capped at 6000 features.
vectorizer_options = {
    "ngram_range": (1, 2),
    "max_features": 6000,
    "stop_words": "english",
}
tfidf = TfidfVectorizer(**vectorizer_options)

# Fit on the combined text, then compare every blog's row with every other row.
tfidf_matrix = tfidf.fit_transform(df["final_text"])
similarity_matrix = cosine_similarity(X=tfidf_matrix)


In [None]:
def get_tag_set(tags):
    """Turn a comma-separated tag string into a set of lower-cased tags.

    Surrounding whitespace is removed from each tag and blank entries are skipped.
    """
    normalised = set()
    for raw_tag in tags.split(","):
        tag = raw_tag.strip()
        if tag:
            normalised.add(tag.lower())
    return normalised


In [None]:
def recommend_latest_similar_blogs(blog_index, top_n=5, cutoff=None):
    """Recommend similar blogs published on or after a cutoff date.

    Reads the module-level ``df`` and ``similarity_matrix`` and calls
    ``get_tag_set``. Each candidate's score is::

        0.7 * cosine similarity
        + 0.2 * number of tags shared with the query blog
        + 0.1 if both blogs have the same non-empty category (case-insensitive)

    NOTE(review): the tag term uses the raw overlap count, so it is not capped
    at 0.2 when several tags are shared - confirm this is intended.

    Args:
        blog_index: Row position of the query blog. Negative positions count
            from the end, as with normal Python indexing.
        top_n: Maximum number of recommendations to return.
        cutoff: Earliest ``publishDate`` a candidate may have. Defaults to the
            module-level ``cutoff_date``.

    Returns:
        A list of dicts with keys ``blog_id``, ``title``, ``url``,
        ``category``, ``tags``, ``publishDate`` (as a string) and ``score``
        (a plain ``float`` rounded to 3 decimals), highest score first.

    Raises:
        IndexError: If ``blog_index`` is outside ``df``.
    """
    if cutoff is None:
        cutoff = cutoff_date

    n_blogs = len(df)
    if not -n_blogs <= blog_index < n_blogs:
        raise IndexError(
            f"blog_index {blog_index} is out of range for {n_blogs} blogs"
        )
    if blog_index < 0:
        # Normalise so the self-exclusion test below can match.
        blog_index += n_blogs

    # Work with row POSITIONS, not index labels: the similarity matrix and
    # ``iloc`` are positional, and labels only coincide with positions while
    # ``df`` keeps a fresh 0..n-1 index.
    is_recent = (df["publishDate"] >= cutoff).to_numpy()
    valid_positions = [pos for pos, keep in enumerate(is_recent) if keep]

    # Pull the per-blog metadata out once instead of building a pandas row
    # for every candidate inside the loop.
    categories = df["Category Name"].str.lower().tolist()
    tag_sets = [get_tag_set(tags) for tags in df["Tag Name"]]

    current_category = categories[blog_index]
    current_tags = tag_sets[blog_index]
    sim_row = similarity_matrix[blog_index]

    recommendations = []
    for idx in valid_positions:
        if idx == blog_index:
            continue

        sim_score = sim_row[idx]

        # Two empty categories do not count as a match.
        category_match = (current_category == categories[idx] and current_category != "")
        tag_overlap = len(current_tags.intersection(tag_sets[idx]))

        final_score = (
            0.7 * sim_score +
            0.2 * tag_overlap +
            0.1 * (1 if category_match else 0)
        )

        recommendations.append((idx, final_score))

    recommendations.sort(key=lambda pair: pair[1], reverse=True)

    results = []
    for idx, score in recommendations[:top_n]:
        row = df.iloc[idx]  # build the row once instead of once per field
        results.append({
            "blog_id": row["_id"],
            "title": row["title"],
            "url": row["uniqueUrl"],
            "category": row["Category Name"],
            "tags": row["Tag Name"],
            "publishDate": str(row["publishDate"]),
            # float() keeps the value printable/serialisable as a plain number
            "score": round(float(score), 3)
        })

    return results


In [None]:
test_blog = 16

# Show the query blog and its publish date, then each recommendation's date and title.
query_row = df.iloc[test_blog]
print("Current blog:")
print(query_row["title"])
print("Publish date:", query_row["publishDate"])

print("\nRecommended LATEST blogs:")
recs = recommend_latest_similar_blogs(test_blog)

for entry in recs:
    print(entry["publishDate"], " → ", entry["title"])


Current blog:
How to hire a UI/UX designer?
Publish date: 1970-01-01 00:25:56.150400+00:00

Recommended LATEST blogs:
1970-01-01 00:28:47.179529722+00:00  →  How to Hire UI UX Designer and Developer
1970-01-01 00:28:31.001973137+00:00  →  Latest UI/UX Design Trends for 2024
1970-01-01 00:27:14.169600+00:00  →  UX Design Problem For Students And Enterprises
1970-01-01 00:28:06.182400+00:00  →  Fundamentals of UX Design That Impact Your Business Site Rankings
1970-01-01 00:28:48.465174360+00:00  →  Why User Experience is Important: A Comprehensive Guide
