<a href="https://colab.research.google.com/github/fahimku2020/fahimku2020/blob/main/Best_LSI_keywords_extractor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install sklearn
!pip install pandas
!pip install numpy

Collecting sklearn
  Downloading sklearn-0.0.post12.tar.gz (2.6 kB)
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Preparing metadata (setup.py) ... [?25l[?25herror
[1;31merror[0m: [1mmetadata-generation-failed[0m

[31m×[0m Encountered error while generating package metadata.
[31m╰─>[0m See above for output.

[1;35mnote[0m: This is an issue with the package mentioned above, not pip.
[1;36mhint[0m: See above for details.


In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer

class LSIKeywordExtractor:
    def __init__(self, max_features=1000, n_components=100, top_k_keywords=10):
        """
        Initialize LSI Keyword Extractor

        Parameters:
        - max_features: Maximum number of features (words) to consider
        - n_components: Number of semantic dimensions to reduce to
        - top_k_keywords: Number of top keywords to extract
        """
        self.max_features = max_features
        self.n_components = n_components
        self.top_k_keywords = top_k_keywords

        # TF-IDF Vectorizer
        self.vectorizer = TfidfVectorizer(
            max_features=self.max_features,
            stop_words='english',
            use_idf=True
        )

        # LSI Components
        self.lsi = None
        self.feature_names = None

    def fit(self, documents):
        """
        Fit the LSI model to the documents

        Parameters:
        - documents: List of text documents
        """
        # Create TF-IDF matrix
        tfidf_matrix = self.vectorizer.fit_transform(documents)
        self.feature_names = self.vectorizer.get_feature_names_out()

        # Perform Truncated SVD (LSI)
        self.lsi = TruncatedSVD(n_components=self.n_components, random_state=42)
        lsi_matrix = self.lsi.fit_transform(tfidf_matrix)

        # Normalize the LSI matrix
        self.lsi_normalized = Normalizer(copy=False).fit_transform(lsi_matrix)

        return self

    def extract_keywords(self, document):
        """
        Extract top keywords for a given document

        Parameters:
        - document: Text document to extract keywords from

        Returns:
        - List of top keywords with their weights
        """
        if self.lsi is None:
            raise ValueError("Model must be fit before extracting keywords")

        # Transform the document using TF-IDF
        doc_tfidf = self.vectorizer.transform([document])

        # Transform to LSI space
        doc_lsi = self.lsi.transform(doc_tfidf)
        doc_lsi_normalized = Normalizer().fit_transform(doc_lsi)

        # Calculate keyword importance
        keyword_importances = np.abs(self.lsi.components_.T @ doc_lsi_normalized.T).flatten()

        # Get top keywords
        top_indices = keyword_importances.argsort()[-self.top_k_keywords:][::-1]

        # Create results with keywords and their importances
        keywords = [
            (self.feature_names[idx], keyword_importances[idx])
            for idx in top_indices
        ]

        return keywords

    def extract_corpus_keywords(self, documents):
        """
        Extract top keywords for an entire corpus

        Parameters:
        - documents: List of text documents

        Returns:
        - DataFrame with top keywords across the corpus
        """
        # Fit the model first
        self.fit(documents)

        # Extract keywords for each document
        corpus_keywords = [
            self.extract_keywords(doc) for doc in documents
        ]

        # Create a DataFrame for better visualization
        df_keywords = pd.DataFrame({
            'document': range(len(documents)),
            'keywords': corpus_keywords
        })

        return df_keywords

# Example usage
def main():
    # Sample documents
    documents = [
        "Machine learning is a method of data analysis that automates analytical model building.",
        "Python is a popular programming language for data science and artificial intelligence.",
        "Natural language processing helps computers understand and interpret human language.",
        "Deep learning is a subset of machine learning based on artificial neural networks."
    ]

    # Initialize and fit the LSI Keyword Extractor
    extractor = LSIKeywordExtractor(
        max_features=1000,
        n_components=3,
        top_k_keywords=5
    )

    # Extract keywords for the corpus
    corpus_keywords = extractor.extract_corpus_keywords(documents)
    print("Corpus Keywords:")
    print(corpus_keywords)

    # Extract keywords for a specific document
    test_document = "Deep learning algorithms are revolutionizing artificial intelligence research."
    keywords = extractor.extract_keywords(test_document)
    print("\nKeywords for test document:")
    for keyword, weight in keywords:
        print(f"{keyword}: {weight:.4f}")

if __name__ == "__main__":
    main()

Corpus Keywords:
   document                                           keywords
0         0  [(learning, 0.5196712021288857), (machine, 0.3...
1         1  [(science, 0.38139984838473195), (python, 0.38...
2         2  [(language, 0.5120242788456416), (natural, 0.3...
3         3  [(learning, 0.5235880361379652), (machine, 0.3...

Keywords for test document:
learning: 0.3942
data: 0.3137
artificial: 0.3008
machine: 0.2671
science: 0.2203


In [None]:
import requests
import re
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from nltk.stem import WordNetLemmatizer

class WikipediaLSIKeywordExtractor:
    def __init__(self):
        # Download necessary NLTK resources
        nltk.download('punkt', quiet=True)
        nltk.download('stopwords', quiet=True)
        nltk.download('wordnet', quiet=True)

        # Initialize lemmatizer and stopwords
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))

    def preprocess_text(self, text):
        """
        Preprocess the text by:
        1. Lowercasing
        2. Removing special characters
        3. Tokenization
        4. Removing stopwords
        5. Lemmatization
        """
        # Convert to lowercase
        text = text.lower()

        # Remove special characters and digits
        text = re.sub(r'[^a-zA-Z\s]', '', text)

        # Tokenize
        tokens = word_tokenize(text)

        # Remove stopwords and lemmatize
        processed_tokens = [
            self.lemmatizer.lemmatize(token)
            for token in tokens
            if token not in self.stop_words and len(token) > 2
        ]

        return processed_tokens

    def generate_bigrams(self, tokens):
        """
        Generate bigrams from tokens
        """
        return list(ngrams(tokens, 2))

    def extract_keywords_lsi(self, text, num_keywords=20, num_topics=10):
        """
        Extract keywords using LSI technique
        """
        # Preprocess text
        processed_tokens = self.preprocess_text(text)

        # Generate unigrams and bigrams
        unigrams = processed_tokens
        bigrams = self.generate_bigrams(processed_tokens)

        # Convert bigrams to strings
        bigram_strings = [' '.join(bg) for bg in bigrams]

        # Combine unigrams and bigrams
        all_tokens = unigrams + bigram_strings

        # Create TF-IDF vectorizer
        vectorizer = TfidfVectorizer(token_pattern=r'\b\w+\b|\b\w+\s\w+\b')
        tfidf_matrix = vectorizer.fit_transform(all_tokens)

        # Perform LSI (Truncated SVD)
        lsi = TruncatedSVD(n_components=num_topics, random_state=42)
        lsi_matrix = lsi.fit_transform(tfidf_matrix)

        # Get feature names
        feature_names = vectorizer.get_feature_names_out()

        # Calculate keyword scores based on LSI components
        keyword_scores = np.abs(lsi.components_).sum(axis=0)

        # Get top keywords
        top_keyword_indices = keyword_scores.argsort()[-num_keywords:][::-1]
        top_keywords = [feature_names[i] for i in top_keyword_indices]

        return top_keywords

def fetch_wikipedia_content(topic):
    """
    Manually fetch Wikipedia content by constructing a URL
    Note: This is a simplified approach and might not work for all topics
    """
    base_url = "https://en.wikipedia.org/w/index.php"
    params = {
        "title": topic.replace(" ", "_"),
        "action": "render"
    }

    try:
        response = requests.get(base_url, params=params)
        response.raise_for_status()

        # Simple text extraction (very basic)
        text = re.sub(r'<.*?>', '', response.text)
        text = re.sub(r'\n+', ' ', text)

        return text
    except Exception as e:
        print(f"Error fetching Wikipedia content: {e}")
        return ""

def main():
    # User input for Wikipedia topic
    topic = input("Enter a Wikipedia topic to extract keywords: ")

    # Fetch Wikipedia content
    wiki_text = fetch_wikipedia_content(topic)

    if not wiki_text:
        print("Could not fetch content. Please try another topic.")
        return

    # Initialize keyword extractor
    extractor = WikipediaLSIKeywordExtractor()

    # Extract keywords
    keywords = extractor.extract_keywords_lsi(wiki_text)

    print(f"\nTop Keywords for '{topic}':")
    for keyword in keywords:
        print(keyword)

if __name__ == "__main__":
    main()

Enter a Wikipedia topic to extract keywords: Amitabh Bachan 

Top Keywords for 'Amitabh Bachan ':
archived
original
october
july
amitabh
bachchan
retrieved
india
film
khan
time
december
november
rukh
september
office
february
march
shah
award


In [None]:
import requests
from bs4 import BeautifulSoup
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import nltk
from nltk.util import ngrams
import re

# Download necessary NLTK resources
nltk.download('punkt', quiet=True)

def fetch_wikipedia_content(topic):
    """
    Fetch Wikipedia content by scraping the first section of the article.

    Args:
        topic (str): Wikipedia topic to search

    Returns:
        str: Extracted text content
    """
    # Construct Wikipedia URL
    url = f"https://en.wikipedia.org/wiki/{topic.replace(' ', '_')}"

    try:
        # Send request to Wikipedia
        response = requests.get(url)
        response.raise_for_status()

        # Parse HTML content
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract text from the first paragraphs
        paragraphs = soup.select('#mw-content-text p')

        # Combine paragraphs, limit to first 5 for processing
        content = ' '.join([p.get_text() for p in paragraphs[:5]])

        return content

    except Exception as e:
        print(f"Error fetching Wikipedia content: {e}")
        return ""

def preprocess_text(text):
    """
    Preprocess text by removing special characters and converting to lowercase.

    Args:
        text (str): Input text

    Returns:
        str: Preprocessed text
    """
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    return text

def extract_lsi_bigram_keywords(text, num_keywords=10):
    """
    Extract LSI keywords using bigrams.

    Args:
        text (str): Input text
        num_keywords (int): Number of keywords to extract

    Returns:
        list: Extracted bigram keywords
    """
    # Preprocess text
    preprocessed_text = preprocess_text(text)

    # Generate bigrams
    tokens = nltk.word_tokenize(preprocessed_text)
    bigram_tokens = list(ngrams(tokens, 2))

    # Convert bigrams to strings
    bigram_strings = [' '.join(bg) for bg in bigram_tokens]

    # Create TF-IDF Vectorizer
    vectorizer = TfidfVectorizer(ngram_range=(2, 2))
    tfidf_matrix = vectorizer.fit_transform(bigram_strings)

    # Perform LSI (Truncated SVD)
    lsi = TruncatedSVD(n_components=1, random_state=42)
    lsi_matrix = lsi.fit_transform(tfidf_matrix)

    # Get feature names (bigrams)
    feature_names = vectorizer.get_feature_names_out()

    # Calculate importance scores
    importance_scores = np.abs(lsi.components_[0])

    # Sort bigrams by importance
    sorted_indices = importance_scores.argsort()[::-1]

    # Remove duplicates while preserving order
    unique_keywords = []
    for idx in sorted_indices:
        keyword = feature_names[idx]
        if keyword not in unique_keywords:
            unique_keywords.append(keyword)

    # Return top N unique keywords
    return unique_keywords[:num_keywords]

def main():
    # Get user input for Wikipedia topic
    topic = input("Enter a Wikipedia topic to extract keywords: ")

    # Fetch Wikipedia content
    wiki_content = fetch_wikipedia_content(topic)

    if wiki_content:
        # Extract LSI bigram keywords
        keywords = extract_lsi_bigram_keywords(wiki_content)

        print(f"\nTop Bigram Keywords for '{topic}':")
        for i, keyword in enumerate(keywords, 1):
            print(f"{i}. {keyword}")
    else:
        print("Could not retrieve content for the given topic.")

if __name__ == "__main__":
    main()

Enter a Wikipedia topic to extract keywords: Amitabh Bachan 

Top Bigram Keywords for 'Amitabh Bachan ':
1. in the
2. national film
3. best actor
4. star of
5. he is
6. bachchan has
7. he has
8. of cinema
9. roles in
10. of his


In [None]:
!pip install wikipedia

Collecting wikipedia
  Downloading wikipedia-1.4.0.tar.gz (27 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wikipedia
  Building wheel for wikipedia (setup.py) ... [?25l[?25hdone
  Created wheel for wikipedia: filename=wikipedia-1.4.0-py3-none-any.whl size=11679 sha256=6c3b2e44e724b588632f70a1e5b87a8b35e62d8fa7a5970ab0b91db7d44260cc
  Stored in directory: /root/.cache/pip/wheels/5e/b6/c5/93f3dec388ae76edc830cb42901bb0232504dfc0df02fc50de
Successfully built wikipedia
Installing collected packages: wikipedia
Successfully installed wikipedia-1.4.0


In [None]:
import numpy as np
import nltk
import wikipedia
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer

# Download necessary NLTK resources
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)

def preprocess_text(text):
    """
    Preprocess text by:
    1. Splitting into sentences
    2. Removing stopwords
    3. Converting to lowercase
    """
    # Get English stopwords
    stop_words = set(stopwords.words('english'))

    # Tokenize sentences
    sentences = sent_tokenize(text)

    # Preprocess each sentence
    processed_sentences = []
    for sentence in sentences:
        # Tokenize words
        words = word_tokenize(sentence.lower())

        # Remove stopwords and non-alphabetic tokens
        filtered_words = [
            word for word in words
            if word.isalpha() and word not in stop_words
        ]

        processed_sentences.append(' '.join(filtered_words))

    return processed_sentences

def extract_ngrams(words, n):
    """
    Extract n-grams from a list of words
    """
    return [' '.join(words[i:i+n]) for i in range(len(words)-n+1)]

def lsi_keyword_extraction(text, n_keywords=20, n_grams=2):
    """
    Extract keywords using Latent Semantic Indexing

    Parameters:
    - text: Input text
    - n_keywords: Number of keywords to extract
    - n_grams: Length of n-grams to consider
    """
    # Preprocess text
    processed_sentences = preprocess_text(text)

    # Create TF-IDF Vectorizer
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(processed_sentences)

    # Perform LSI (SVD)
    lsi = TruncatedSVD(n_components=min(n_keywords, len(processed_sentences)-1))
    lsi_matrix = lsi.fit_transform(tfidf_matrix)

    # Normalize the LSI matrix
    lsi_matrix_normalized = Normalizer(copy=False).fit_transform(lsi_matrix)

    # Get feature names (words)
    feature_names = vectorizer.get_feature_names_out()

    # Extract top keywords based on LSI components
    keywords = []
    for component in lsi.components_:
        # Get top indices for this component
        top_indices = component.argsort()[::-1][:n_keywords]
        top_keywords = [feature_names[idx] for idx in top_indices]
        keywords.extend(top_keywords)

    # Remove duplicates while preserving order
    keywords = list(dict.fromkeys(keywords))

    # Generate n-grams
    all_words = []
    for sentence in processed_sentences:
        all_words.extend(sentence.split())

    ngrams = extract_ngrams(all_words, n_grams)

    return {
        'keywords': keywords[:n_keywords],
        'ngrams': ngrams[:n_keywords]
    }

def main():
    # Get Wikipedia article
    try:
        # Prompt user for Wikipedia topic
        topic = input("Enter a Wikipedia topic to extract keywords from: ")

        # Fetch Wikipedia page
        page = wikipedia.page(topic)

        # Extract keywords
        result = lsi_keyword_extraction(page.content)

        # Print results
        print("\n--- Keywords ---")
        for keyword in result['keywords']:
            print(keyword)

        print("\n--- N-grams ---")
        for ngram in result['ngrams']:
            print(ngram)

    except wikipedia.exceptions.DisambiguationError as e:
        print("Multiple pages found. Please be more specific.")
        print("Possible options:", e.options[:5])
    except wikipedia.exceptions.PageError:
        print("No Wikipedia page found for the given topic.")
    except Exception as e:
        print(f"An error occurred: {e}")

if __name__ == "__main__":
    main()

Enter a Wikipedia topic to extract keywords from: Amitabh Bachan 

--- Keywords ---
best
award
actor
filmfare
film
bachchan
performance
role
national
supporting
critics
second
year
awards
nominated
fourth
box
office
films
amitabh

--- N-grams ---
amitabh bachchan
bachchan pronounced
pronounced ˈbətːʃən
ˈbətːʃən born
born amitabh
amitabh srivastava
srivastava october
october indian
indian actor
actor works
works hindi
hindi cinema
cinema often
often considered
considered one
one greatest
greatest accomplished
accomplished commercially
commercially successful
successful actors


In [62]:
import wikipedia
import nltk
import gensim
from gensim import corpora, models, similarities
import re
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
import string
# Download necessary NLTK data (only needs to be done once)
nltk.download('punkt')
nltk.download('stopwords')

def extract_lsi_keywords(text, num_topics=5, num_keywords=5):
    """
    Extracts keywords from text using Latent Semantic Indexing (LSI).

    Args:
        text: The input text.
        num_topics: The number of topics to extract.
        num_keywords: The number of keywords per topic.

    Returns:
        A list of keywords.
    """
    sentences = sent_tokenize(text)

    additional_stopwords = ["may","used","use","us","said", "says", "would", "could", "should", "get", "go", "one", "two", "three", "many", "much", "also", "well", "even", "however", "therefore", "since", "although", "because", "though"]
    stop_words = set(stopwords.words('english')) | set(additional_stopwords)
    punctuation = set(string.punctuation)


    # Preprocessing: Tokenization, lowercasing, stop word removal, punctuation removal
    processed_sentences = []
    for sentence in sentences:
        words = word_tokenize(sentence.lower())
        words = [word for word in words if word.isalnum() and word not in stop_words ]
        processed_sentences.append(words)

    # Create a dictionary and corpus
    dictionary = corpora.Dictionary(processed_sentences)
    corpus = [dictionary.doc2bow(sentence) for sentence in processed_sentences]

    # Train LSI model
    lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=num_topics)

    # Extract keywords (top words from each topic)

    keywords = []
    for topic_id in range(num_topics):
        topic = lsi.print_topic(topic_id, num_keywords)
        topic_keywords = [word.split("*")[1].strip() for word in topic.split("+")]  #Extract words from gensim output.
        keywords.extend(topic_keywords)

    return keywords


def get_ngrams(tokens, n):
  """Generates n-grams from a list of tokens."""
  ngrams = []
  for i in range(len(tokens) - n + 1):
    ngrams.append(tuple(tokens[i:i+n]))
  return ngrams


def main():
    search_term = input("Enter a Wikipedia search term: ")
    try:
        page = wikipedia.page(search_term)
        text = page.content

        #Extract bigrams
        sentences = sent_tokenize(text)
        all_tokens = []
        for sentence in sentences:
            tokens = word_tokenize(sentence.lower())
            tokens = [w for w in tokens if w.isalnum() and w not in stopwords.words('english')]
            all_tokens.extend(tokens)
        bigrams = get_ngrams(all_tokens, 2)

        print("\nBigrams (2,2) ngrams:")
        print(bigrams[:10])

        keywords = extract_lsi_keywords(text)
        print("\nLSI Keywords:", keywords[:10])

    except wikipedia.exceptions.PageError:
        print(f"Error: Wikipedia page not found for '{search_term}'")
    except wikipedia.exceptions.DisambiguationError as e:
        print(f"Error: Disambiguation error for '{search_term}'.  Possible options: {e.options}")


if __name__ == "__main__":
    main()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Enter a Wikipedia search term: Amitabh Bachan 

Bigrams (2,2) ngrams:
[('amitabh', 'bachchan'), ('bachchan', 'pronounced'), ('pronounced', 'ˈbətːʃən'), ('ˈbətːʃən', 'born'), ('born', 'amitabh'), ('amitabh', 'srivastava'), ('srivastava', '11'), ('11', 'october'), ('october', '1942'), ('1942', 'indian')]

LSI Keywords: ['"bachchan"', '"film"', '"amitabh"', '"actor"', '"best"', '"film"', '"bachchan"', '"best"', '"actor"', '"award"']


In [57]:
import string

sentences = [
    "This is sentence 1 with 2023 and January.",
    "Another sentence with some special characters like !@#$%^&*()_-+={}[]:;\"'<,>.?/~`.",
    "Sentence 3.",
]

months = ["January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"]
punctuation = set(string.punctuation)


processed_sentences = [
    " ".join([word for word in sentence.lower().split()
              if not any(char in punctuation for char in word) and not word.isdigit() and word not in months])
    for sentence in sentences
]

print(processed_sentences)

['this is sentence with and', 'another sentence with some special characters like', 'sentence']
