<a href="https://colab.research.google.com/github/fahimku2020/fahimku2020/blob/main/Best_wordnet_keywordsextraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
import re
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

class KeywordExtractor:
    """
    Frequency-based keyword extractor built on NLTK.

    Pipeline (see :meth:`extract_keywords`): normalise the text, tokenize it,
    drop stopwords and short tokens, lemmatize, keep nouns and adjectives,
    then rank the remaining words by relative frequency.
    """

    # NLTK data packages used by word_tokenize, stopwords, pos_tag and the
    # WordNet lemmatizer. Recent NLTK releases look up 'punkt_tab' and
    # 'averaged_perceptron_tagger_eng' instead of the legacy pickled
    # 'punkt' / 'averaged_perceptron_tagger' packages, so both generations
    # are requested.
    _NLTK_RESOURCES = (
        'punkt',
        'punkt_tab',
        'stopwords',
        'averaged_perceptron_tagger',
        'averaged_perceptron_tagger_eng',
        'wordnet',
    )

    # One or more characters that are neither whitespace nor a letter:
    # punctuation/symbols ([^\w\s]) plus digits and underscore ([\d_]).
    # \w is Unicode-aware for str patterns, so accented / non-Latin letters
    # are preserved rather than stripped.
    _NON_LETTER_RE = re.compile(r'(?:[^\w\s]|[\d_])+')

    def __init__(self, language='english', min_word_length=3, max_keywords=20):
        """
        Initialize the KeywordExtractor with configurable parameters

        :param language: Language for stopwords and processing
        :param min_word_length: Minimum length of words to consider
        :param max_keywords: Maximum number of keywords to return
        """
        # Download necessary NLTK resources. With quiet=True a package that
        # cannot be fetched is reported through the return value, which is
        # ignored here; a missing resource then surfaces as a LookupError
        # from the NLTK call that needs it.
        for resource in self._NLTK_RESOURCES:
            nltk.download(resource, quiet=True)

        self.stop_words = set(stopwords.words(language))
        self.lemmatizer = WordNetLemmatizer()
        self.min_word_length = min_word_length
        self.max_keywords = max_keywords

    def preprocess_text(self, text):
        """
        Preprocess the input text: lowercase, strip non-letter characters

        Each run of punctuation, digits or underscores is replaced by a
        single space (not deleted) so that the words on either side stay
        separate tokens, e.g. "decision-making" -> "decision making".
        Existing whitespace is left untouched.

        :param text: Input text to preprocess
        :return: Cleaned text
        """
        return self._NON_LETTER_RE.sub(' ', text.lower())

    def tokenize_and_clean(self, text):
        """
        Tokenize text, remove stopwords and short tokens, then lemmatize

        :param text: Preprocessed text
        :return: List of cleaned, lemmatized tokens
        """
        # The stopword and length filters are applied to the raw token;
        # only tokens that pass are lemmatized.
        return [
            self.lemmatizer.lemmatize(token)
            for token in word_tokenize(text)
            if token not in self.stop_words
            and len(token) >= self.min_word_length
        ]

    def get_pos_filtered_tokens(self, tokens):
        """
        Filter tokens based on part of speech (nouns and adjectives)

        :param tokens: List of tokens
        :return: Tokens whose POS tag starts with NN or JJ, in input order
        """
        # Tag tokens with parts of speech
        pos_tags = nltk.pos_tag(tokens)

        # Keep nouns (NN*) and adjectives (JJ*)
        return [
            word for word, pos in pos_tags
            if pos.startswith(('NN', 'JJ'))
        ]

    def calculate_word_scores(self, tokens):
        """
        Calculate word scores based on frequency

        :param tokens: List of filtered tokens
        :return: Dictionary mapping each distinct word to its share of the
                 token list (count / total); empty dict for empty input
        """
        if not tokens:
            return {}

        total_words = len(tokens)
        return {
            word: freq / total_words
            for word, freq in Counter(tokens).items()
        }

    def extract_keywords(self, text):
        """
        Main method to extract keywords from text

        :param text: Input text; an empty string or None yields no keywords
        :return: List of at most ``max_keywords`` keywords, highest score
                 first; equally scored words keep first-occurrence order
        """
        # Nothing to analyse: avoid calling .lower() on None and skip the
        # NLTK pipeline entirely.
        if not text:
            return []

        cleaned_text = self.preprocess_text(text)
        tokens = self.tokenize_and_clean(cleaned_text)

        # NOTE(review): tagging runs on lower-cased, stopword-stripped
        # lemmas rather than on the original sentence, so the tagger sees
        # less context than usual and its tags may be less reliable.
        filtered_tokens = self.get_pos_filtered_tokens(tokens)

        word_scores = self.calculate_word_scores(filtered_tokens)

        # sorted() is stable, so ties stay in first-occurrence order.
        top_keywords = sorted(
            word_scores.items(),
            key=lambda item: item[1],
            reverse=True
        )[:self.max_keywords]

        return [keyword for keyword, _score in top_keywords]

def print_keywords(text):
    """
    Extract keywords from the given text and print them as a numbered list

    A fresh KeywordExtractor with default settings is created on every call.

    :param text: Input text to extract keywords from
    """
    # Extraction happens before anything is printed, so a failure in the
    # extractor leaves no partial output behind.
    keywords = KeywordExtractor().extract_keywords(text)

    print("Extracted Keywords:")
    # Numbering is 1-based.
    for i, keyword in enumerate(keywords, start=1):
        print(f"{i}. {keyword}")

# Example usage: a short passage used as demo input for print_keywords
# when this file is executed directly.
sample_text = """
Machine learning is a method of data analysis that automates analytical model building.
Python is a popular programming language used extensively in machine learning and data science.
Deep learning and neural networks are advanced techniques in artificial intelligence that enable
complex pattern recognition and decision-making processes.
"""

# Run the demo only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    print_keywords(sample_text)

Extracted Keywords:
1. machine
2. data
3. method
4. analysis
5. analytical
6. model
7. building
8. python
9. popular
10. programming
11. language
12. science
13. deep
14. learning
15. neural
16. network
17. technique
18. artificial
19. intelligence
20. enable
