In [40]:
from utils.embedding_api import embedding_request_ada002
from utils.completion_api import completion_request
import os
import json
import requests
import numpy as np
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.metrics import silhouette_score

import plotly.express as px
import pandas as pd

from sklearn.manifold import TSNE
import umap.umap_ as umap

In [None]:
# Process JSON input to extract embeddings
def process_json(json_data):
    """Embed every article and collect the titles of the embedded ones.

    For each article the title, description and HTML are joined with single
    spaces and sent to ``embedding_request_ada002``.

    Args:
        json_data: Iterable of dict-like articles; the keys "title",
            "description" and "html" are read. Missing keys and JSON ``null``
            values are treated as empty strings.

    Returns:
        tuple: ``(embeddings, labels)`` where ``embeddings`` is a NumPy array
        built from the returned embeddings and ``labels`` is the list of
        matching article titles.

    Note:
        Articles whose embedding request returns ``None`` are skipped, so the
        outputs can be shorter than ``json_data`` and are then NOT
        index-aligned with it.
    """
    embeddings = []
    labels = []
    for article in json_data:
        # `.get(key, "")` still yields None when the key exists with a null
        # value, which would break the concatenation below; `or ""` covers both.
        title = article.get("title") or ""
        description = article.get("description") or ""
        html = article.get("html") or ""
        combined_text = title + " " + description + " " + html
        print(f"Generating embedding for article: {title[:10]}...")
        embedding = embedding_request_ada002(combined_text)
        if embedding is not None:
            embeddings.append(embedding)
            labels.append(title)  # Use the title as a label
    return np.array(embeddings), labels

In [18]:
# Dimensionality reduction using t-SNE or UMAP
def perform_dimensionality_reduction(embeddings, method="umap", n_components=3):
    """Project embeddings to ``n_components`` dimensions with t-SNE or UMAP.

    Args:
        embeddings: Array-like of shape (n_samples, n_features).
        method: "tsne" or "umap".
        n_components: Number of output dimensions.

    Returns:
        The result of the reducer's ``fit_transform`` on ``embeddings``.

    Raises:
        ValueError: If ``method`` is neither "tsne" nor "umap".

    Note:
        scikit-learn requires ``perplexity < n_samples``, so the t-SNE path
        needs more than 10 samples.
    """
    print(f"Performing dimensionality reduction using {method.upper()}...")
    if method == "tsne":
        # The iteration count is left at the library default (1000): the
        # keyword was renamed from `n_iter` to `max_iter` in scikit-learn 1.5
        # and the old name was later removed, so passing either name breaks
        # on some versions while the default is the same value on all of them.
        reducer = TSNE(n_components=n_components, random_state=42, perplexity=10)
    elif method == "umap":
        reducer = umap.UMAP(n_components=n_components, random_state=42, n_neighbors=15, min_dist=0.1)
    else:
        raise ValueError("Unsupported dimensionality reduction method. Use 'tsne' or 'umap'.")
    reduced_embeddings = reducer.fit_transform(embeddings)
    return reduced_embeddings

In [4]:
# Perform clustering
def perform_clustering(embeddings, n_clusters):
    """Cluster ``embeddings`` with KMeans and return one label per sample.

    Args:
        embeddings: Data passed to ``KMeans.fit_predict``.
        n_clusters: Number of clusters to fit.

    Returns:
        The cluster labels returned by ``fit_predict``.
    """
    print(f"Performing KMeans clustering with {n_clusters} clusters...")
    # Fixed seed so repeated runs produce the same assignment.
    model = KMeans(n_clusters=n_clusters, random_state=42)
    return model.fit_predict(embeddings)


In [10]:
# Automatically choose the number of clusters using silhouette score
def choose_optimal_clusters(embeddings, max_clusters=10):
    """Pick the KMeans cluster count with the highest silhouette score.

    Every k from 2 up to ``max_clusters`` is tried, capped at
    ``len(embeddings) - 1`` because the silhouette score is only defined for
    ``2 <= k <= n_samples - 1``.

    Args:
        embeddings: Sized array-like of samples to cluster.
        max_clusters: Largest cluster count to consider.

    Returns:
        int: The k with the best score; on ties the smallest k wins. Returns 2
        when no candidate can be evaluated (``max_clusters < 2`` or fewer than
        three samples).
    """
    print("Finding optimal number of clusters...")
    # Without this cap, asking for more clusters than n_samples - 1 makes
    # silhouette_score (or KMeans itself) raise part-way through the search.
    upper_k = min(max_clusters, len(embeddings) - 1)
    best_score = -1
    best_k = 2  # Minimum number of clusters
    for k in range(2, upper_k + 1):
        kmeans = KMeans(n_clusters=k, random_state=42)
        cluster_labels = kmeans.fit_predict(embeddings)
        score = silhouette_score(embeddings, cluster_labels)
        print(f"Silhouette score for {k} clusters: {score:.4f}")
        if score > best_score:
            best_score = score
            best_k = k
    print(f"Optimal number of clusters: {best_k}")
    return best_k


In [None]:
def visualize_embeddings_3d(embeddings, labels, clusters=None):
    """Show an interactive 3D Plotly scatter of reduced embeddings.

    Args:
        embeddings: Array-like with exactly three columns (one per axis).
        labels: One sliceable label (e.g. a title string) per row; only the
            first 10 characters are drawn next to each point.
        clusters: Optional cluster id per row, used for colouring. When
            omitted, every point is placed in a single "None" group.

    Returns:
        None. The figure is displayed via ``fig.show()``.
    """
    print("Visualizing embeddings interactively in 3D...")
    # Convert embeddings and metadata into a DataFrame
    data = pd.DataFrame(embeddings, columns=["PC1", "PC2", "PC3"])
    data["Label"] = [s[:10] for s in labels]
    if clusters is not None:
        # Cluster ids are categories, not quantities. Plotly Express colours
        # numeric columns on a continuous scale, so cast to strings to get one
        # discrete colour and legend entry per cluster.
        data["Cluster"] = [str(cluster_id) for cluster_id in clusters]
    else:
        data["Cluster"] = "None"

    # Create the 3D scatter plot
    fig = px.scatter_3d(
        data,
        x="PC1",
        y="PC2",
        z="PC3",
        color="Cluster",
        text="Label",
        title="Interactive 3D Embedding Visualization",
        labels={"Cluster": "Cluster ID"}
    )
    fig.update_traces(marker=dict(size=5, opacity=0.8))
    fig.show()

In [38]:
# Load the aggregated articles (swap in another file or an API response as needed)
json_file = "../aggregator/output.json"  # Replace with your JSON file path
with open(json_file, "r", encoding="utf-8") as file:
    json_data = json.loads(file.read())

# Embed the articles; `labels` holds the titles of the embedded articles
embeddings, labels = process_json(json_data)

# Search every cluster count from 2 up to one less than the number of embeddings
max_clusters = len(embeddings) - 1
optimal_clusters = choose_optimal_clusters(embeddings, max_clusters=max_clusters)

# Cluster in the original embedding space, before any dimensionality reduction
cluster_labels = perform_clustering(embeddings, n_clusters=optimal_clusters)

# Reduce to 3D purely for plotting
reduction_method = "umap"  # Choose between 'umap' and 'tsne'
reduced_embeddings = perform_dimensionality_reduction(embeddings, reduction_method)

# Plot the reduced points, coloured by the clusters found above
visualize_embeddings_3d(reduced_embeddings, labels, cluster_labels)

Generating embedding for article: 'Accident ...
Generating embedding for article: ‘There was...
Generating embedding for article: Electric s...
Generating embedding for article: Jeff Bezos...
Generating embedding for article: Nissan war...
Generating embedding for article: How electr...
Generating embedding for article: EU battery...
Generating embedding for article: BMW Upgrad...
Generating embedding for article: Kia EV6 GT...
Generating embedding for article: Trump’s an...
Generating embedding for article: Elon Musk ...
Generating embedding for article: 'Pothole g...
Generating embedding for article: Northvolt ...
Generating embedding for article: Northvolt ...
Generating embedding for article: Britons fa...
Generating embedding for article: Peter Carl...
Generating embedding for article: The new BM...
Generating embedding for article: Pilot for ...
Generating embedding for article: 2025 BMW F...
Finding optimal number of clusters...
Silhouette score for 2 clusters: 0.1332
Silhouette















Silhouette score for 7 clusters: 0.1831
Silhouette score for 8 clusters: 0.1874
Silhouette score for 9 clusters: 0.1834
Silhouette score for 10 clusters: 0.1494
Silhouette score for 11 clusters: 0.1487
Silhouette score for 12 clusters: 0.1541
















Silhouette score for 13 clusters: 0.1297
Silhouette score for 14 clusters: 0.1259
Silhouette score for 15 clusters: 0.0732
Silhouette score for 16 clusters: 0.0501
Silhouette score for 17 clusters: 0.0433














Silhouette score for 18 clusters: 0.0243
Optimal number of clusters: 8
Performing KMeans clustering with 8 clusters...
Performing dimensionality reduction using UMAP...
Visualizing embeddings interactively in 3D...





n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



In [32]:
# Inspect the cluster labels produced by perform_clustering
print(cluster_labels)

[4 0 3 2 3 0 0 1 3 3 2 4 0 0 4 0 1 5 1]


In [35]:
# Print cluster_labels again to check the current kernel state
print(cluster_labels)

[4 0 3 2 3 0 0 1 3 3 2 4 0 0 4 0 1 5 1]


### Generate an article for each cluster

In [None]:
def generate_article_from_cluster(cluster_labels, labels, descriptions, htmls, output_folder="output_articles"):
    """
    Generate one article per cluster and save each as ``cluster_<id>.json``.

    The title, description and HTML of every article in a cluster are combined
    into a single prompt for ``completion_request``; Markdown code fences are
    removed from the response before it is written to disk.

    Args:
        cluster_labels (sequence): The cluster label of each article.
        labels (sequence): The article titles, aligned with ``cluster_labels``.
        descriptions (sequence): The article descriptions, same alignment.
        htmls (sequence): The HTML content of each article, same alignment.
        output_folder (str): The folder where generated articles will be saved.
            Default is "output_articles".

    Returns:
        None

    Raises:
        ValueError: If the four input sequences do not have the same length,
            since the texts could then not be matched to their clusters.
    """
    if not (len(cluster_labels) == len(labels) == len(descriptions) == len(htmls)):
        raise ValueError(
            "cluster_labels, labels, descriptions and htmls must all have the same length"
        )

    # Create the output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    # Group the article texts by cluster in a single pass
    texts_by_cluster = {}
    for cluster_id, title, description, html in zip(cluster_labels, labels, descriptions, htmls):
        texts_by_cluster.setdefault(cluster_id, []).append(
            f"Title: {title}\nDescription: {description}\n\nHTML Content:\n{html}\n\n"
        )

    instructions = (
        "Given the following list of news articles with its title, description and html content, please generate a novel news article that elaborates"
        " them providing insightful opinions and making a very SEO optimized news article. Don't mention them directly, just use the knowledge gained from them."
        " Provide the output in a json format with the following style format: \n"
        " [ title: \"..\", body:[{\"title\":\"...\", \"text\":\"...\"}] ]"
        " Please provide only the json as response."
    )

    # Sorted so clusters are processed (and files written) in a stable order
    for cluster_id in sorted(texts_by_cluster):
        article_text = f"--- Cluster {cluster_id} ---\n\n" + "\n".join(texts_by_cluster[cluster_id])
        # A blank line keeps the instructions from running into the article text
        prompt = instructions + "\n\n" + article_text
        response = completion_request(prompt)
        if response is None:
            # NOTE(review): assumes the helper signals failure with None — confirm.
            print(f"No completion returned for Cluster {cluster_id}; skipping")
            continue
        response = response.replace("```json", "").replace("```", "")

        # Save the generated article to a JSON file
        output_file = os.path.join(output_folder, f"cluster_{cluster_id}.json")
        with open(output_file, "w", encoding="utf-8") as file:
            file.write(response)

        print(f"Article for Cluster {cluster_id} saved to {output_file}")


In [51]:
# Example JSON input (replace this with reading from a file or API)
json_file = "../aggregator/output.json"  # Replace with your JSON file path
with open(json_file, "r", encoding="utf-8") as file:
    json_data = json.load(file)

embeddings, labels = process_json(json_data)

# process_json skips articles whose embedding request returned None, so
# `labels` (and the cluster labels derived from `embeddings`) can be shorter
# than `json_data`. The descriptions and HTML below are taken from every
# article, so a skipped article would silently pair the wrong text with each
# later title. Fail loudly instead.
if len(labels) != len(json_data):
    raise ValueError(
        f"Only {len(labels)} of {len(json_data)} articles were embedded; "
        "descriptions and HTML would not line up with the cluster labels."
    )

# Automatically determine the optimal number of clusters
max_clusters = 10  # Set a reasonable upper limit for clusters
optimal_clusters = choose_optimal_clusters(embeddings, max_clusters)

# Perform clustering on raw embeddings
cluster_labels = perform_clustering(embeddings, optimal_clusters)

# Extract article descriptions and HTML content
descriptions = [article.get("description", "") for article in json_data]
htmls = [article.get("html", "") for article in json_data]

# Generate articles from clusters and save to files
generate_article_from_cluster(cluster_labels, labels, descriptions, htmls)

Generating embedding for article: 'Accident ...
Generating embedding for article: ‘There was...
Generating embedding for article: Electric s...
Generating embedding for article: Jeff Bezos...
Generating embedding for article: Nissan war...
Generating embedding for article: How electr...
Generating embedding for article: EU battery...
Generating embedding for article: BMW Upgrad...
Generating embedding for article: Kia EV6 GT...
Generating embedding for article: Trump’s an...
Generating embedding for article: Elon Musk ...
Generating embedding for article: 'Pothole g...
Generating embedding for article: Northvolt ...
Generating embedding for article: Northvolt ...
Generating embedding for article: Britons fa...
Generating embedding for article: Peter Carl...
Generating embedding for article: The new BM...
Generating embedding for article: Pilot for ...
Generating embedding for article: 2025 BMW F...
Finding optimal number of clusters...
Silhouette score for 2 clusters: 0.1332
Silhouette









Silhouette score for 4 clusters: 0.1437
Silhouette score for 5 clusters: 0.1838








Silhouette score for 6 clusters: 0.1864
Silhouette score for 7 clusters: 0.1831
Silhouette score for 8 clusters: 0.1874










Silhouette score for 9 clusters: 0.1834
Silhouette score for 10 clusters: 0.1494
Optimal number of clusters: 8
Performing KMeans clustering with 8 clusters...








Article for Cluster 0 saved to cluster_articles/cluster_0.json
Article for Cluster 1 saved to cluster_articles/cluster_1.json
Article for Cluster 2 saved to cluster_articles/cluster_2.json
Article for Cluster 3 saved to cluster_articles/cluster_3.json
Article for Cluster 4 saved to cluster_articles/cluster_4.json
Article for Cluster 5 saved to cluster_articles/cluster_5.json
Article for Cluster 6 saved to cluster_articles/cluster_6.json
Article for Cluster 7 saved to cluster_articles/cluster_7.json
