<a href="https://colab.research.google.com/github/fahimku2020/fahimku2020/blob/main/Fast_while_clusterand_query_based_keywords_extractor_ipynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install faiss-gpu
!pip install sentence_transformers
!pip install  wikipedia



In [None]:
import numpy as np
import faiss
import wikipedia
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer
import re
import unicodedata
import functools
import nltk
nltk.download('punkt_tab')
nltk.download('stopwords')

# Download necessary NLTK resources
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)

class KeywordExtractor:
    def __init__(self, topic, num_clusters=5):
        # Initialize models and parameters
        self.model = SentenceTransformer('all-MiniLM-L6-v2')
        self.stop_words = set(stopwords.words('english'))
        self.num_clusters = num_clusters
        self.topic = topic

        # Fetch and preprocess document
        self.document = self.fetch_wikipedia_document()
        self.sentences = sent_tokenize(self.document)
        self.clean_sentences = [self.clean_text(sent) for sent in self.sentences]

        # Compute embeddings
        self.embeddings = self.model.encode(self.clean_sentences)

        # Clustering
        self.clusters = self.perform_clustering()

        # Extract keywords
        self.keywords_per_cluster = self.extract_keywords_per_cluster()

        # Create FAISS index for fast similarity search
        self.faiss_index = self.create_faiss_index()

    def fetch_wikipedia_document(self):
        """Fetch Wikipedia document for the given topic."""
        try:
            page = wikipedia.page(self.topic)
            return page.content
        except wikipedia.exceptions.DisambiguationError as e:
            print(f"Multiple matches found. Using first option: {e.options[0]}")
            page = wikipedia.page(e.options[0])
            return page.content

    @functools.lru_cache(maxsize=1000)
    def clean_text(self, text):
        """Preprocess and clean text."""
        # Remove special characters and digits
        text = re.sub(r'[^a-zA-Z\s]', '', text)

        # Convert to lowercase
        text = text.lower()

        # Remove extra whitespaces
        text = ' '.join(text.split())

        # Remove stopwords
        words = [word for word in text.split() if word not in self.stop_words]

        return ' '.join(words)

    def perform_clustering(self):
        """Perform K-means clustering on sentence embeddings."""
        kmeans = KMeans(n_clusters=self.num_clusters, random_state=42)
        cluster_labels = kmeans.fit_predict(self.embeddings)
        return cluster_labels

    def extract_keywords_per_cluster(self):
        """Extract keywords for each cluster using n-gram."""
        vectorizer = CountVectorizer(ngram_range=(2, 2), stop_words='english')

        keywords_per_cluster = {}
        for cluster in range(self.num_clusters):
            cluster_sentences = [
                sent for sent, label in zip(self.clean_sentences, self.clusters)
                if label == cluster
            ]

            # Vectorize cluster sentences
            if cluster_sentences:
                vectorized = vectorizer.fit_transform(cluster_sentences)
                keywords = vectorizer.get_feature_names_out()
                keywords_per_cluster[cluster] = keywords

        return keywords_per_cluster

    def create_faiss_index(self):
        """Create FAISS index for efficient similarity search."""
        index = faiss.IndexFlatL2(self.embeddings.shape[1])
        index.add(self.embeddings)
        return index

    def find_most_relevant_sentences(self):
        """Find most relevant sentences for each cluster."""
        cluster_sentences = {}
        for cluster in range(self.num_clusters):
            # Get cluster sentences
            cluster_mask = self.clusters == cluster
            cluster_sent_embeddings = self.embeddings[cluster_mask]
            cluster_sent_texts = [
                sent for sent, label in zip(self.clean_sentences, self.clusters)
                if label == cluster
            ]

            # Compute max-sum diversity
            selected_indices = self.max_sum_diversity(cluster_sent_embeddings)

            # Store results
            cluster_sentences[cluster] = {
                'keywords': self.keywords_per_cluster.get(cluster, []),
                'sentences': [cluster_sent_texts[idx] for idx in selected_indices]
            }

        return cluster_sentences

    def max_sum_diversity(self, embeddings, num_sentences=3):
        """Select diverse sentences using max-sum method."""
        if len(embeddings) <= num_sentences:
            return list(range(len(embeddings)))

        selected_indices = [0]  # Start with first sentence

        while len(selected_indices) < num_sentences:
            remaining_indices = list(set(range(len(embeddings))) - set(selected_indices))

            # Compute diversity score
            diversity_scores = []
            for idx in remaining_indices:
                candidate_embedding = embeddings[idx]
                min_distance = np.min([
                    np.linalg.norm(candidate_embedding - embeddings[sel_idx])
                    for sel_idx in selected_indices
                ])
                diversity_scores.append(min_distance)

            # Select most diverse sentence
            best_idx = remaining_indices[np.argmax(diversity_scores)]
            selected_indices.append(best_idx)

        return selected_indices

def main():
    # User input for topic
    topic = input("Enter Wikipedia topic for keyword extraction: ")

    # Initialize extractor
    extractor = KeywordExtractor(topic)

    # Find and print cluster sentences
    cluster_results = extractor.find_most_relevant_sentences()

    print("\n--- Cluster Analysis ---")
    for cluster, data in cluster_results.items():
        print(f"\nCluster {cluster}:")
        print("Keywords:", data['keywords'][:5])
        print("Representative Sentences:")
        for sent in data['sentences']:
            print(f"  - {sent}")

    # Interactive query
    while True:
        query = input("\nEnter query to search clusters (or 'exit' to quit): ")
        if query.lower() == 'exit':
            break

        # Find most similar cluster to query
        query_embedding = extractor.model.encode([extractor.clean_text(query)])[0]
        distances, indices = extractor.faiss_index.search(
            query_embedding.reshape(1, -1), k=5
        )

        most_similar_cluster = extractor.clusters[indices[0][0]]

        print(f"\nMost Relevant Cluster (Cluster {most_similar_cluster}):")
        print("Keywords:", cluster_results[most_similar_cluster]['keywords'][:5])
        print("Representative Sentences:")
        for sent in cluster_results[most_similar_cluster]['sentences'][:10 ]:
            print(f"  - {sent}")

if __name__ == "__main__":
    main()

  from tqdm.autonotebook import tqdm, trange
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Enter Wikipedia topic for keyword extraction: Amitabh Bachan 


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.



--- Cluster Analysis ---

Cluster 0:
Keywords: ['abcl main' 'abcl reported' 'abcl various' 'abcls operations'
 'abcls strategy']
Representative Sentences:
  - taking break acting resurgence marked mohabbatein
  - june became first living asian modelled wax londons madame tussauds wax museum
  - girls hang incredulous desperation bats

Cluster 1:
Keywords: ['ab legend' 'abcl event' 'abhishek actor' 'according raja'
 'accused using']
Representative Sentences:
  - amitabh bachchan born allahabad prayagraj hindi poet harivansh rai bachchan wife social activist teji bachchan
  - bachchan named panama papers paradise papers leaked confidential documents relating offshore investment
  - writing ndtv troy ribeiro indoasian news service ians stated amitabh bachchan deepak sehgall aged defence lawyer shines always restrained powerful performance

Cluster 2:
Keywords: ['aag released' 'aaj ka' 'aamir khan' 'aankhen baghban' 'aankhen kaante']
Representative Sentences:
  - educated sherwood college