# NLTK-based Keyword Clustering.
This worked well when the number of clusters to form was specified explicitly, but it started to fail when the number of clusters was determined automatically by the algorithm.

In [14]:
# import nltk
# nltk.download('wordnet')

from nltk.cluster import KMeansClusterer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import silhouette_score
import numpy as np

def preprocess_text(text: str) -> str:
    """Normalize a keyword or phrase for vectorization.

    The text is lower-cased and tokenized, non-alphanumeric tokens are
    dropped, English stop words are removed, and the remaining tokens are
    lemmatized with WordNet.

    Args:
        text: The raw keyword or phrase.

    Returns:
        The surviving lemmas joined by single spaces. This is an empty
        string when every token is filtered out.
    """
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words("english"))

    # Filter stop words BEFORE lemmatizing: the default (noun) lemmatizer can
    # mangle a stop word into a form that is no longer in the stop list
    # (e.g. "was" -> "wa"), which then slipped through the original
    # lemma-only filter.
    lemmas = [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(text.lower())
        if token.isalnum() and token not in stop_words
    ]

    # Keep the original post-lemmatization filter too, so a token whose lemma
    # is itself a stop word is still removed.
    return " ".join(lemma for lemma in lemmas if lemma not in stop_words)

def determine_optimal_clusters(keywords):
    """Pick the number of clusters that maximizes the silhouette score.

    The keywords are preprocessed with ``preprocess_text``, vectorized with
    TF-IDF, and clustered with NLTK's k-means (cosine distance) for every
    candidate ``k`` from 2 up to ``len(keywords) // 2``. The candidate with
    the highest average silhouette score wins.

    Args:
        keywords: Sequence of keyword strings.

    Returns:
        The chosen number of clusters as an int. Falls back to 2 when there
        are fewer than four keywords (the candidate range is then empty) or
        when no candidate yields a scorable clustering.
    """
    # Imported locally: the file only does ``from nltk... import ...``, so the
    # bare name ``nltk`` used by the original code was never bound and the
    # call raised NameError.
    from nltk.cluster.util import cosine_distance

    preprocessed_keywords = [preprocess_text(keyword) for keyword in keywords]

    # Vectorize the preprocessed keywords using TF-IDF.
    # NOTE(review): a keyword that preprocesses to "" presumably becomes an
    # all-zero vector, for which cosine distance is undefined -- confirm
    # whether such inputs need to be filtered by the caller.
    vectorizer = TfidfVectorizer()
    keyword_vectors = vectorizer.fit_transform(preprocessed_keywords).toarray()

    max_clusters = len(keywords) // 2  # Upper limit for the number of clusters
    optimal_clusters = 2  # Default to 2 clusters
    max_silhouette_score = -1

    for num_clusters in range(2, max_clusters + 1):
        kmeans_clusterer = KMeansClusterer(
            num_clusters,
            distance=cosine_distance,
            repeats=25,
            # Without this, NLTK aborts as soon as any trial leaves a
            # cluster empty, which is likely for larger candidate k values.
            avoid_empty_clusters=True,
        )
        assigned_clusters = kmeans_clusterer.cluster(
            keyword_vectors, assign_clusters=True
        )

        # silhouette_score needs at least two distinct labels; skip
        # degenerate clusterings instead of letting it raise ValueError.
        if len(set(assigned_clusters)) < 2:
            continue

        # Score with the same metric that was used for clustering; the
        # default is Euclidean, which is inconsistent with cosine k-means.
        silhouette_avg = silhouette_score(
            keyword_vectors, assigned_clusters, metric="cosine"
        )

        if silhouette_avg > max_silhouette_score:
            max_silhouette_score = silhouette_avg
            optimal_clusters = num_clusters

    return optimal_clusters

def cluster_keywords(keywords):
    """Cluster keywords with k-means and print the members of each cluster.

    The number of clusters is chosen by ``determine_optimal_clusters``. The
    keywords are then preprocessed, vectorized with TF-IDF, and clustered
    with NLTK's k-means using cosine distance. One line is printed per
    cluster in the form ``Cluster <id>: <kw1>, <kw2>, ...``.

    Args:
        keywords: Sequence of keyword strings.

    Returns:
        None. Results are written to standard output; nothing is printed for
        an empty input.
    """
    # Imported locally: the file only does ``from nltk... import ...``, so the
    # bare name ``nltk`` used by the original code was never bound and the
    # call raised NameError.
    from nltk.cluster.util import cosine_distance

    if not keywords:
        return

    # Never request more clusters than there are keywords (the default of 2
    # would otherwise be used even for a single keyword).
    optimal_clusters = min(determine_optimal_clusters(keywords), len(keywords))
    preprocessed_keywords = [preprocess_text(keyword) for keyword in keywords]

    # Vectorize the preprocessed keywords using TF-IDF.
    vectorizer = TfidfVectorizer()
    keyword_vectors = vectorizer.fit_transform(preprocessed_keywords).toarray()

    # Perform clustering using k-means.
    kmeans_clusterer = KMeansClusterer(
        optimal_clusters,
        distance=cosine_distance,
        repeats=25,
        # Without this, NLTK aborts as soon as any trial leaves a cluster empty.
        avoid_empty_clusters=True,
    )
    assigned_clusters = kmeans_clusterer.cluster(
        keyword_vectors, assign_clusters=True
    )

    # Group the original (un-preprocessed) keywords by assigned cluster id.
    clusters = {}
    for keyword, cluster_id in zip(keywords, assigned_clusters):
        clusters.setdefault(cluster_id, []).append(keyword)

    # ``members`` rather than ``cluster_keywords``: the original loop variable
    # shadowed this function's own name.
    for cluster_id, members in clusters.items():
        print("Cluster {}: {}".format(cluster_id, ", ".join(members)))

# Sample keywords used to demonstrate the clustering.
keyword_list = ["apple", "banana", "orange", "cat", "dog", "elephant", "car", "bike", "train"]

# Run the demo only when executed as a script or notebook cell, so importing
# this module does not trigger clustering as a side effect.
if __name__ == "__main__":
    cluster_keywords(keyword_list)