<a href="https://colab.research.google.com/github/fahimku2020/fahimku2020/blob/main/Fast_keywords_extractor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install wikipedia
!pip install sentence_transformers
!pip install nltk
!pip install faiss-gpu
!pip install torch
!pip install tqdm
!pip install lru_cache


Collecting wikipedia
  Downloading wikipedia-1.4.0.tar.gz (27 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wikipedia
  Building wheel for wikipedia (setup.py) ... [?25l[?25hdone
  Created wheel for wikipedia: filename=wikipedia-1.4.0-py3-none-any.whl size=11679 sha256=0fe2f2367fbd07afe6ee315de091e926b7f210e07793ce578beeb0a4d14e6884
  Stored in directory: /root/.cache/pip/wheels/5e/b6/c5/93f3dec388ae76edc830cb42901bb0232504dfc0df02fc50de
Successfully built wikipedia
Installing collected packages: wikipedia
Successfully installed wikipedia-1.4.0
Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu
Succes

In [None]:
import wikipedia
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.util import ngrams
import nltk
import re
from typing import List, Dict, Tuple
from functools import lru_cache
import pandas as pd
from collections import defaultdict
import faiss
import torch
from concurrent.futures import ThreadPoolExecutor
import time
from tqdm import tqdm
nltk.download ('punkt_tab')

class AdvancedKeywordExtractor:
    """
    Extract key phrases and representative sentences from a Wikipedia article.

    The article is split into sentences, the sentences are embedded with a
    SentenceTransformer model and grouped with FAISS k-means. For every
    cluster, candidate n-grams are ranked by their summed cosine similarity to
    the cluster's sentences, and the cluster's sentences are then ranked
    against the selected keywords.
    """

    def __init__(self, model_name='all-MiniLM-L6-v2', num_clusters=5, cache_size=128):
        """
        Initialize with specified transformer model and clustering parameters

        Args:
            model_name: SentenceTransformer model identifier.
            num_clusters: Upper bound on the number of k-means clusters.
            cache_size: Maximum number of entries kept in each per-instance
                cache (fetched pages, text embeddings).
        """
        # Local import: the lock is only needed by this class's caches.
        import threading

        # Initialize NLTK resources
        nltk.download('punkt')
        nltk.download('stopwords')

        # Initialize sentence transformer model
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.model = SentenceTransformer(model_name, device=self.device)

        # Clustering parameters
        self.num_clusters = num_clusters
        self.stop_words = set(nltk.corpus.stopwords.words('english'))

        # Initialize FAISS index
        self.dimension = self.model.get_sentence_embedding_dimension()
        self.index = faiss.IndexFlatIP(self.dimension)

        # Per-instance bounded caches. They replace functools.lru_cache on the
        # methods, which would key on `self`, keep every instance alive for
        # the lifetime of the class, and ignore `cache_size`.
        self.cache_size = cache_size
        self._cache_lock = threading.Lock()  # clusters are processed in worker threads
        self._content_cache = {}
        self._embedding_cache = {}

    def _cache_get(self, cache: dict, key: str):
        """Return the cached value for `key` (marking it most recent), or None."""
        with self._cache_lock:
            if key not in cache:
                return None
            # Re-insert so that dict order reflects recency of use.
            value = cache.pop(key)
            cache[key] = value
            return value

    def _cache_put(self, cache: dict, key: str, value) -> None:
        """Store `value` under `key`, evicting least recently used entries."""
        with self._cache_lock:
            cache.pop(key, None)
            cache[key] = value
            while len(cache) > self.cache_size:
                cache.pop(next(iter(cache)))

    def fetch_wikipedia_content(self, topic: str) -> str:
        """
        Fetch and cache Wikipedia content

        Returns the page text, or an empty string when the lookup fails.
        Failures are reported on stdout and are not cached, so a later call
        for the same topic tries again.
        """
        cached = self._cache_get(self._content_cache, topic)
        if cached is not None:
            return cached

        try:
            content = wikipedia.page(topic).content
        except Exception as e:
            # Deliberate best-effort boundary: any lookup failure yields "".
            print(f"Error fetching Wikipedia content: {e}")
            return ""

        self._cache_put(self._content_cache, topic, content)
        return content

    def clean_text(self, text: str) -> str:
        """
        Clean and preprocess text

        Lowercases, strips punctuation and digits, collapses whitespace and
        removes English stopwords. The result may be an empty string.
        """
        # Convert to lowercase
        text = text.lower()

        # Remove special characters and numbers
        text = re.sub(r'[^\w\s]', '', text)
        text = re.sub(r'\d+', '', text)

        # Remove extra whitespace
        text = ' '.join(text.split())

        # Remove stopwords
        words = [word for word in word_tokenize(text) if word not in self.stop_words]

        return ' '.join(words)

    def _embed_many(self, texts: List[str]) -> np.ndarray:
        """
        Return a fresh (len(texts), dimension) float32 matrix of embeddings.

        Cache misses are encoded in a single batched model call instead of
        one call per text. The returned matrix is a copy, so callers may
        modify it in place without touching cached vectors.
        """
        if not texts:
            return np.zeros((0, self.dimension), dtype=np.float32)

        unique_texts = list(dict.fromkeys(texts))
        vectors = {}
        for text in unique_texts:
            hit = self._cache_get(self._embedding_cache, text)
            if hit is not None:
                vectors[text] = hit

        missing = [text for text in unique_texts if text not in vectors]
        if missing:
            encoded = self.model.encode(
                missing, convert_to_numpy=True, show_progress_bar=False
            )
            for text, vector in zip(missing, encoded):
                # Copy so a cached row does not keep the whole batch alive.
                vector = np.array(vector, dtype=np.float32)
                vectors[text] = vector
                self._cache_put(self._embedding_cache, text, vector)

        stacked = np.vstack([vectors[text] for text in texts])
        return np.ascontiguousarray(stacked, dtype=np.float32)

    def _normalized_embeddings(self, texts: List[str]) -> np.ndarray:
        """Embed `texts` and L2-normalize each row (dot product == cosine)."""
        matrix = self._embed_many(texts)
        if matrix.shape[0]:
            faiss.normalize_L2(matrix)
        return matrix

    def get_embedding(self, text: str) -> np.ndarray:
        """
        Get and cache sentence embeddings

        Returns a 1-D float32 vector for `text`.
        """
        return self._embed_many([text])[0]

    def generate_ngrams(self, text: str, n_range: Tuple[int, int] = (2, 2)) -> List[str]:
        """
        Generate n-grams efficiently

        Args:
            text: Text to tokenize with NLTK's word tokenizer.
            n_range: Inclusive (min_n, max_n) range of n-gram sizes.

        Returns:
            Space-joined n-grams in order of occurrence, duplicates included.
        """
        tokens = word_tokenize(text)
        all_ngrams = []

        for n in range(n_range[0], n_range[1] + 1):
            all_ngrams.extend(' '.join(gram) for gram in ngrams(tokens, n))

        return all_ngrams

    def _assign_clusters(self, embeddings: np.ndarray) -> np.ndarray:
        """
        Return one k-means cluster label per row of `embeddings`.

        The number of clusters is capped at the number of rows because FAISS
        k-means refuses to train with fewer points than centroids.
        """
        num_points = embeddings.shape[0]
        k = min(self.num_clusters, num_points)
        if k <= 1:
            return np.zeros(num_points, dtype=np.int64)

        # Only request GPU k-means when this FAISS build exposes GPU support.
        use_gpu = torch.cuda.is_available() and hasattr(faiss, 'StandardGpuResources')
        kmeans = faiss.Kmeans(embeddings.shape[1], k, niter=20, gpu=use_gpu)
        kmeans.train(embeddings)
        _, labels = kmeans.index.search(embeddings, 1)
        return labels[:, 0]

    def cluster_sentences(self, sentences: List[str]) -> Dict[int, List[str]]:
        """
        Cluster sentences using FAISS for efficient similarity search

        Returns a mapping from cluster id to the sentences assigned to it;
        an empty mapping when `sentences` is empty.
        """
        if not sentences:
            return {}

        embeddings = self._normalized_embeddings(sentences)
        labels = self._assign_clusters(embeddings)

        # Group sentences by cluster
        clusters = defaultdict(list)
        for sent, label in zip(sentences, labels):
            clusters[int(label)].append(sent)

        return dict(clusters)

    def calculate_similarity_matrix(self, sentences: List[str], keywords: List[str]) -> np.ndarray:
        """
        Calculate the cosine similarity between keywords and sentences

        Returns:
            Array of shape (len(keywords), len(sentences)); row i holds the
            similarities of keyword i to every sentence. All-zero (and
            possibly zero-sized) when either input is empty.
        """
        if not sentences or not keywords:
            return np.zeros((len(keywords), len(sentences)), dtype=np.float32)

        sent_embeddings = self._normalized_embeddings(sentences)
        keyword_embeddings = self._normalized_embeddings(keywords)

        # Rows are unit length, so the dot product is the cosine similarity.
        return keyword_embeddings @ sent_embeddings.T

    def max_sum_similarity(self, similarities: np.ndarray, top_n: int = 5, axis: int = 0) -> np.ndarray:
        """
        Select the top entries by summed similarity

        Args:
            similarities: (keywords, sentences) similarity matrix.
            top_n: Number of indices to return; non-positive yields none.
            axis: Axis summed over. 0 (default) scores each column, i.e. each
                sentence; 1 scores each row, i.e. each keyword.

        Returns:
            Indices of the best-scoring entries, highest score first.
        """
        if top_n <= 0:
            # A bare [-0:] slice would return every index instead of none.
            return np.array([], dtype=np.intp)

        totals = similarities.sum(axis=axis)
        return totals.argsort()[-top_n:][::-1]

    def extract_keywords_and_sentences(self, topic: str,
                                    max_keywords: int = 10,
                                    sentences_per_cluster: int = 3) -> Dict:
        """
        Main function to extract keywords and relevant sentences

        Args:
            topic: Wikipedia page title to look up.
            max_keywords: Maximum number of keywords reported per cluster.
            sentences_per_cluster: Maximum number of sentences per cluster.

        Returns:
            A dict mapping each cluster id to
            {'keywords', 'sentences', 'size'} plus a 'metadata' entry with
            'execution_time', 'num_clusters' and 'total_sentences'. An empty
            dict when no content could be fetched or no sentence survives
            cleaning.
        """
        start_time = time.time()

        # Fetch and preprocess content
        content = self.fetch_wikipedia_content(topic)
        if not content:
            return {}

        # Split into sentences and clean, keeping each original sentence
        # aligned with its cleaned form; sentences that clean to nothing
        # carry no signal and are dropped.
        sentences = sent_tokenize(content)
        originals = []
        cleaned = []
        for sentence in sentences:
            cleaned_sentence = self.clean_text(sentence)
            if cleaned_sentence:
                originals.append(sentence)
                cleaned.append(cleaned_sentence)
        if not cleaned:
            return {}

        # Embed every sentence once; clustering and ranking reuse the matrix.
        embeddings = self._normalized_embeddings(cleaned)
        labels = self._assign_clusters(embeddings)

        members = defaultdict(list)
        for position, label in enumerate(labels):
            members[int(label)].append(position)

        def process_cluster(positions):
            cluster_originals = [originals[i] for i in positions]

            # Candidate n-grams are built per sentence so none spans a
            # sentence boundary, then de-duplicated in first-seen order.
            seen = {}
            for i in positions:
                for gram in self.generate_ngrams(cleaned[i]):
                    seen.setdefault(gram, None)
            candidates = list(seen)

            if not candidates:
                # Nothing to rank against: report the first sentences unranked.
                return {
                    'keywords': [],
                    'sentences': cluster_originals[:max(sentences_per_cluster, 0)],
                    'size': len(positions)
                }

            # (candidates x sentences) cosine similarities
            sentence_embeddings = embeddings[positions]
            candidate_embeddings = self._normalized_embeddings(candidates)
            similarities = candidate_embeddings @ sentence_embeddings.T

            # Keywords are rows, so score them along axis 1.
            top_indices = self.max_sum_similarity(similarities, top_n=max_keywords, axis=1)
            top_keywords = [candidates[i] for i in top_indices]

            # Rank this cluster's sentences against the chosen keywords and
            # map the cluster-local indices back to the original sentences.
            top_sent_indices = self.max_sum_similarity(
                similarities[top_indices], top_n=sentences_per_cluster
            )
            top_sentences = [cluster_originals[i] for i in top_sent_indices]

            return {
                'keywords': top_keywords,
                'sentences': top_sentences,
                'size': len(positions)
            }

        results = {}

        # Process clusters in parallel
        with ThreadPoolExecutor() as executor:
            future_to_cluster = {
                executor.submit(process_cluster, positions): cluster_id
                for cluster_id, positions in members.items()
            }

            for future in tqdm(future_to_cluster, desc="Processing clusters"):
                results[future_to_cluster[future]] = future.result()

        # Add execution time
        results['metadata'] = {
            'execution_time': time.time() - start_time,
            'num_clusters': len(members),
            'total_sentences': len(sentences)
        }

        return results

# Example usage
if __name__ == "__main__":
    # Initialize extractor
    extractor = AdvancedKeywordExtractor()

    # Extract keywords and sentences
    topic = "Amitabh Bachchan"
    results = extractor.extract_keywords_and_sentences(
        topic,
        max_keywords=10,
        sentences_per_cluster=5
    )

    # The extractor returns an empty dict when the page could not be fetched,
    # so guard before reading results['metadata'].
    if not results:
        print(f"\nNo results for topic: {topic}")
    else:
        # Print results
        metadata = results['metadata']
        print(f"\nResults for topic: {topic}")
        print(f"Execution time: {metadata['execution_time']:.2f} seconds")
        print(f"Number of clusters: {metadata['num_clusters']}")

        for cluster_id, cluster_data in results.items():
            if cluster_id == 'metadata':
                continue
            print(f"\nCluster {cluster_id} (Size: {cluster_data['size']})")
            print("\nTop Keywords:")
            for keyword in cluster_data['keywords'][:5]:
                print(f"- {keyword}")
            print("\nRelevant Sentences:")
            for sentence in cluster_data['sentences']:
                print(f"- {sentence}")

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Processing clusters: 100%|██████████| 5/5 [00:45<00:00,  9.04s/it]


Results for topic: Amitabh  Bachan 
Execution time: 48.03 seconds
Number of clusters: 5

Cluster 1 (Size: 58)

Top Keywords:
- hailed shahenshah
- awards sixteen
- shahenshah bollywood
- six national
- bachchan often

Relevant Sentences:
- With a cinematic career spanning over five decades, he has played pivotal roles in over 200 films.
- During this time, he made a guest appearance in the film Guddi which starred his future wife Jaya Bhaduri.
- His first acting role was as one of the seven protagonists in the film Saat Hindustani, directed by Khwaja Ahmad Abbas and featuring Utpal Dutt, Anwar Ali (brother of comedian Mehmood), Madhu and Jalal Agha.
- Amitabh Bachchan (pronounced [əmɪˈt̪ɑːbʱ ˈbətːʃən] ; born Amitabh Srivastava; 11 October 1942) is an Indian actor who works in Hindi cinema.
- Harivansh's ancestors came from a village called Babupatti, in the Raniganj tehsil, in the Pratapgarh district, in the present-day state of Uttar Pradesh, in India.

Cluster 2 (Size: 81)

Top Keyw


