<a href="https://colab.research.google.com/github/fahimku2020/fahimku2020/blob/main/Fast_faiss_based_keywords_extractor_and_by_query.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install wikipedia
!pip install sentence_transformers
!pip install nltk


Collecting wikipedia
  Downloading wikipedia-1.4.0.tar.gz (27 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wikipedia
  Building wheel for wikipedia (setup.py) ... [?25l[?25hdone
  Created wheel for wikipedia: filename=wikipedia-1.4.0-py3-none-any.whl size=11679 sha256=c81b02e34fa9c97aaf46ce23054fb4b401195b28b999bf5dbc54503f299fe4b1
  Stored in directory: /root/.cache/pip/wheels/5e/b6/c5/93f3dec388ae76edc830cb42901bb0232504dfc0df02fc50de
Successfully built wikipedia
Installing collected packages: wikipedia
Successfully installed wikipedia-1.4.0


In [None]:
import wikipedia
import numpy as np
import nltk
import torch
from typing import List, Dict
from functools import lru_cache
from nltk.util import ngrams
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
nltk.download('punkt_tab')
nltk.download ('stopwords')

class AdvancedKeywordExtractor:
    def __init__(self, model_name='all-MiniLM-L6-v2', language='english'):
        nltk.download('punkt', quiet=True)
        nltk.download('stopwords', quiet=True)

        self.model = SentenceTransformer(model_name)
        self.stop_words = set(stopwords.words(language))

    @lru_cache(maxsize=100)
    def fetch_wikipedia_content(self, topic: str) -> str:
        try:
            page = wikipedia.page(topic)
            return page.content
        except Exception as e:
            print(f"Error fetching content: {e}")
            return ""

    def clean_text(self, text: str) -> str:
        text = ''.join(char.lower() for char in text if char.isalnum() or char.isspace())
        return text

    def split_into_sentences(self, text: str) -> List[str]:
        return nltk.sent_tokenize(text)

    def generate_ngrams(self, text: str, n: int = 2) -> List[str]:
        tokens = text.split()
        filtered_tokens = [token for token in tokens if token not in self.stop_words]
        return [' '.join(gram) for gram in list(ngrams(filtered_tokens, n))]

    def max_sum_keyword_selection(self, keywords: List[str], embeddings: np.ndarray, top_k: int = 5) -> List[str]:
        if len(keywords) <= top_k:
            return keywords

        similarity_matrix = cosine_similarity(embeddings)
        selected = [0]

        while len(selected) < top_k:
            diversity_scores = []
            for i in range(len(keywords)):
                if i not in selected:
                    min_sim = min(similarity_matrix[i][j] for j in selected)
                    diversity_scores.append(min_sim)
                else:
                    diversity_scores.append(-1)

            next_keyword = np.argmax(diversity_scores)
            selected.append(next_keyword)

        return [keywords[i] for i in selected]

    def extract_keywords_with_clustering(self, topic: str, n_clusters: int = 5, top_k_sentences: int = 3) -> Dict:
        content = self.fetch_wikipedia_content(topic)
        sentences = self.split_into_sentences(content)
        clean_sentences = [self.clean_text(sent) for sent in sentences]

        sentence_embeddings = self.model.encode(clean_sentences)

        kmeans = KMeans(n_clusters=n_clusters, random_state=42)
        cluster_labels = kmeans.fit_predict(sentence_embeddings)

        cluster_results = {}

        for cluster in range(n_clusters):
            cluster_mask = (cluster_labels == cluster)
            cluster_sentences = [clean_sentences[i] for i in range(len(clean_sentences)) if cluster_mask[i]]
            cluster_sentence_embeddings = sentence_embeddings[cluster_mask]

            potential_keywords = []
            for sent in cluster_sentences:
                potential_keywords.extend(self.generate_ngrams(sent))

            potential_keywords = list(set(potential_keywords))
            keyword_embeddings = self.model.encode(potential_keywords)

            selected_keywords = self.max_sum_keyword_selection(potential_keywords, keyword_embeddings)

            keyword_relevant_sentences = {}
            for keyword, keyword_emb in zip(selected_keywords, keyword_embeddings):
                similarities = cosine_similarity(keyword_emb.reshape(1, -1), cluster_sentence_embeddings)[0]

                top_sentence_indices = similarities.argsort()[-top_k_sentences:][::-1]
                top_sentences = [
                    {
                        'sentence': cluster_sentences[idx],
                        'similarity_score': similarities[idx]
                    }
                    for idx in top_sentence_indices
                ]

                keyword_relevant_sentences[keyword] = top_sentences

            cluster_results[cluster] = {
                'keywords': selected_keywords,
                'relevant_sentences': keyword_relevant_sentences
            }

        return cluster_results

    def find_most_relevant_cluster(self, results: Dict, user_query: str) -> Dict:
        query_embedding = self.model.encode([user_query])[0]

        cluster_similarities = {}
        for cluster, data in results.items():
            keywords = data['keywords']
            keyword_embeddings = self.model.encode(keywords)

            # Compute average similarity between query and cluster keywords
            cluster_sim = np.mean([cosine_similarity(query_embedding.reshape(1, -1), ke.reshape(1, -1))[0][0]
                                   for ke in keyword_embeddings])
            cluster_similarities[cluster] = cluster_sim

        # Find cluster with highest similarity
        most_relevant_cluster = max(cluster_similarities, key=cluster_similarities.get)
        return results[most_relevant_cluster]

def main():
    extractor = AdvancedKeywordExtractor()
    topic = input("Enter Wikipedia topic to analyze: ")

    # Extract keywords and clusters
    results = extractor.extract_keywords_with_clustering(topic)

    # User query for cluster retrieval
    user_query = input("Enter a query to find relevant cluster: ")

    # Find most relevant cluster based on user query
    relevant_cluster = extractor.find_most_relevant_cluster(results, user_query)

    print("\nMost Relevant Cluster:")
    print("Keywords:", relevant_cluster['keywords'])

    print("\nRelevant Sentences:")
    for keyword, sentences in relevant_cluster['relevant_sentences'].items():
        print(f"\nKeyword: {keyword}")
        for sent_data in sentences:
            print(f"  Sentence: {sent_data['sentence']}")
            print(f"  Highest Similarity Score: {sent_data['similarity_score']:.4f}")

if __name__ == "__main__":
    main()

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Enter Wikipedia topic to analyze: Amitabh Bachan 
Enter a query to find relevant cluster: Films

Most Relevant Cluster:
Keywords: ['bachchan family', 'family delhi', 'family also', 'family choose', '2013 family']

Relevant Sentences:

Keyword: bachchan family
  Sentence: the bachchan family also bought shares worth 252000 in meridian tech a consulting company in the us
  Highest Similarity Score: 0.5040
  Sentence: he and his family choose to stay away from the limelight
  Highest Similarity Score: 0.3647
  Sentence: in 2013 he and his family donated 25 million 42664 to a charitable trust plan india that works for the betterment of young girls in india
  Highest Similarity Score: 0.2691

Keyword: family delhi
  Sentence: despite significant expectations it had poor returns at the box office
  Highest Similarity Score: 0.5965
  Sentence: the fiasco and the consequent legal battles surrounding abcl and various entities after the event coupled with the fact that abcl was reported to have 

In [3]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/27.5 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.1/27.5 MB[0m [31m4.1 MB/s[0m eta [36m0:00:07[0m[2K   [91m━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/27.5 MB[0m [31m17.4 MB/s[0m eta [36m0:00:02[0m[2K   [91m━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.9/27.5 MB[0m [31m47.3 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/27.5 MB[0m [31m65.0 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━[0m [32m14.7/27.5 MB[0m [31m133.9 MB/s[0m eta [36m

In [14]:
import wikipedia
import numpy as np
import nltk
import faiss
from typing import List, Dict
from functools import lru_cache
from nltk.util import ngrams
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
nltk.download('punkt_tab')
nltk.download('stopwords')

class AdvancedKeywordExtractor:
    def __init__(self, model_name='all-MiniLM-L6-v2', language='english', batch_size=32):
        nltk.download('punkt', quiet=True)
        nltk.download('stopwords', quiet=True)

        self.model = SentenceTransformer(model_name)
        self.stop_words = set(stopwords.words(language))
        self.batch_size = batch_size

    @lru_cache(maxsize=100)
    def fetch_wikipedia_content(self, topic: str) -> str:
        try:
            page = wikipedia.page(topic)
            return page.content
        except Exception as e:
            print(f"Error fetching content: {e}")
            return ""

    def clean_text(self, text: str) -> str:
        return ''.join(char.lower() for char in text if char.isalnum() or char.isspace())

    def split_into_sentences(self, text: str) -> List[str]:
        return nltk.sent_tokenize(text)

    def generate_ngrams(self, text: str, n: int = 2) -> List[str]:
        tokens = text.split()
        filtered_tokens = [token for token in tokens if token not in self.stop_words]
        return [' '.join(gram) for gram in list(ngrams(filtered_tokens, n))]

    def batch_encode(self, texts: List[str]) -> np.ndarray:
        """Encode texts in batches for memory efficiency"""
        embeddings = []
        for i in range(0, len(texts), self.batch_size):
            batch = texts[i:i+self.batch_size]
            batch_embeddings = self.model.encode(batch)
            embeddings.append(batch_embeddings)
        return np.vstack(embeddings)

    def max_sum_keyword_selection(self, keywords: List[str], embeddings: np.ndarray, top_k: int = 5) -> List[str]:
        if len(keywords) <= top_k:
            return keywords

        similarity_matrix = cosine_similarity(embeddings)
        selected = [0]

        while len(selected) < top_k:
            diversity_scores = []
            for i in range(len(keywords)):
                if i not in selected:
                    min_sim = min(similarity_matrix[i][j] for j in selected)
                    diversity_scores.append(min_sim)
                else:
                    diversity_scores.append(-1)

            next_keyword = np.argmax(diversity_scores)
            selected.append(next_keyword)

        return [keywords[i] for i in selected]

    def create_faiss_index(self, embeddings: np.ndarray):
        """Create FAISS index for efficient similarity search"""
        index = faiss.IndexFlatL2(embeddings.shape[1])
        index.add(embeddings)
        return index

    def extract_keywords_with_clustering(self, topic: str, n_clusters: int = 10, top_k_sentences: int = 3) -> Dict:
        content = self.fetch_wikipedia_content(topic)
        sentences = self.split_into_sentences(content)
        clean_sentences = [self.clean_text(sent) for sent in sentences]

        sentence_embeddings = self.batch_encode(clean_sentences)

        kmeans = KMeans(n_clusters=n_clusters, random_state=42)
        cluster_labels = kmeans.fit_predict(sentence_embeddings)

        cluster_results = {}

        for cluster in range(n_clusters):
            cluster_mask = (cluster_labels == cluster)
            cluster_sentences = [clean_sentences[i] for i in range(len(clean_sentences)) if cluster_mask[i]]
            cluster_sentence_embeddings = sentence_embeddings[cluster_mask]

            potential_keywords = []
            for sent in cluster_sentences:
                potential_keywords.extend(self.generate_ngrams(sent))

            potential_keywords = list(set(potential_keywords))
            keyword_embeddings = self.batch_encode(potential_keywords)

            # Create FAISS index for efficient similarity search
            faiss_index = self.create_faiss_index(cluster_sentence_embeddings)

            selected_keywords = self.max_sum_keyword_selection(potential_keywords, keyword_embeddings)

            keyword_relevant_sentences = {}
            for keyword, keyword_emb in zip(selected_keywords, keyword_embeddings):
                # Use FAISS for similarity search
                _, indices = faiss_index.search(keyword_emb.reshape(1, -1), top_k_sentences)

                top_sentences = [
                    {
                        'sentence': cluster_sentences[idx],
                        'similarity_score': cosine_similarity(keyword_emb.reshape(1, -1),
                                                             cluster_sentence_embeddings[idx].reshape(1, -1))[0][0]
                    }
                    for idx in indices[0]
                ]

                keyword_relevant_sentences[keyword] = top_sentences

            cluster_results[cluster] = {
                'keywords': selected_keywords,
                'sentences': cluster_sentences,
                'relevant_sentences': keyword_relevant_sentences
            }

        return cluster_results

    def find_most_relevant_cluster(self, results: Dict, user_query: str) -> Dict:
        query_embedding = self.model.encode([user_query])[0]

        cluster_similarities = {}
        for cluster, data in results.items():
            keywords = data['keywords']
            keyword_embeddings = self.batch_encode(keywords)

            cluster_sim = np.mean([cosine_similarity(query_embedding.reshape(1, -1), ke.reshape(1, -1))[0][0]
                                   for ke in keyword_embeddings])
            cluster_similarities[cluster] = cluster_sim

        most_relevant_cluster = max(cluster_similarities, key=cluster_similarities.get)
        return results[most_relevant_cluster]

def main():
    extractor = AdvancedKeywordExtractor()

    # Get Wikipedia topic
    topic = input("Enter Wikipedia topic to analyze: ")

    # Extract keywords and clusters
    results = extractor.extract_keywords_with_clustering(topic)

    # Print all cluster information
    print("\nAll Cluster Details:")
    for cluster, data in results.items():
        print(f"\nCluster {cluster}:")
        print("Keywords:", data['keywords'])
        print("\nCluster Sentences:")
        for sent in data['sentences'][:5]:  # Print first 3 sentences
            print(f"  - {sent}")

    # User query for cluster retrieval
    user_query = input("\nEnter a query to find relevant cluster: ")

    # Find most relevant cluster based on user query
    relevant_cluster = extractor.find_most_relevant_cluster(results, user_query)

    print("\nMost Relevant Cluster:")
    print("Keywords:", relevant_cluster['keywords'])

    print("\nRelevant Sentences:")
    for keyword, sentences in relevant_cluster['relevant_sentences'].items():
        print(f"\nKeyword: {keyword}")
        for sent_data in sentences:
            print(f"  Sentence: {sent_data['sentence'][:200]}")
            print(f"  Highest Similarity Score: {sent_data['similarity_score']:.4f}")

if __name__ == "__main__":
    main()

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Enter Wikipedia topic to analyze: Amitabh Bachan 

All Cluster Details:

Cluster 0:
Keywords: ['bachchans character', 'bachchan nearfatal', 'coolie bachchan', 'coolie bachchans', 'amitabh bachchan']

Cluster Sentences:
  - it is thought that his mother might have had some influence on his choice of career for she always insisted that he should take centre stage
  - however they were struggling to find an actor for the lead angry young man role it was turned down by several actors owing to it going against the romantic hero image dominant in the industry at the time
  - on 26 july 1982 while filming a fight scene with coactor puneet issar for coolie bachchan had a nearfatal intestinal injury
  - however as he jumped towards the table the corner of the table struck his abdomen resulting in a splenic rupture from which he lost a significant amount of blood
  - the director manmohan desai altered the ending of coolie bachchans character was originally intended to have been killed off but a