In [1]:
import requests
from bs4 import BeautifulSoup
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import nltk

# Ensure you have the necessary NLTK data files
nltk.download('punkt')
nltk.download('stopwords')

def extract_keywords_from_url(url):
    # Fetch the web page content
    response = requests.get(url)
    content = response.text

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(content, 'html.parser')
    text = soup.get_text()

    # Tokenize the text
    tokens = word_tokenize(text.lower())

    # Remove stopwords and non-alphabetic tokens
    stop_words = set(stopwords.words('english'))
    words = [word for word in tokens if word.isalpha() and word not in stop_words]

    # Stem the words to unify variations
    stemmer = PorterStemmer()
    stemmed_words = [stemmer.stem(word) for word in words]

    # Count the occurrences of each stemmed word
    word_counts = Counter(stemmed_words)

    # Extract keywords and their counts
    keywords = {word: count for word, count in word_counts.items()}

    # Sort keywords by their counts in descending order
    sorted_keywords = dict(sorted(keywords.items(), key=lambda item: item[1], reverse=True))

    return sorted_keywords

# Example usage
url = 'https://zapier.com/blog/best-keyword-research-tool/'  # Replace with the actual URL
keywords = extract_keywords_from_url(url)
print(keywords)

[nltk_data] Downloading package punkt to C:\Users\RAHUL
[nltk_data]     CHOUDHARY\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\RAHUL
[nltk_data]     CHOUDHARY\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


{'keyword': 108, 'tool': 64, 'research': 49, 'best': 36, 'free': 34, 'use': 28, 'seo': 23, 'search': 18, 'app': 15, 'data': 15, 'content': 15, 'featur': 15, 'per': 15, 'semrush': 14, 'site': 14, 'includ': 13, 'plan': 13, 'googl': 13, 'ai': 12, 'zapier': 12, 'make': 12, 'rank': 12, 'help': 11, 'like': 11, 'upgrad': 10, 'planner': 10, 'paid': 9, 'need': 8, 'optim': 8, 'test': 8, 'suggest': 8, 'result': 8, 'time': 8, 'get': 8, 'volum': 8, 'manag': 7, 'simpl': 7, 'look': 7, 'competit': 7, 'number': 7, 'ad': 7, 'track': 7, 'audit': 7, 'new': 7, 'right': 6, 'offer': 6, 'straightforward': 6, 'one': 6, 'inform': 6, 'queri': 6, 'pro': 6, 'autom': 5, 'may': 5, 'also': 5, 'rang': 5, 'find': 5, 'year': 5, 'differ': 5, 'mani': 5, 'serp': 5, 'moz': 5, 'complet': 5, 'analysi': 5, 'limit': 5, 'start': 5, 'provid': 5, 'see': 5, 'relat': 5, 'organ': 5, 'even': 5, 'connect': 5, 'chatbot': 5, 'chatgpt': 5, 'design': 4, 'work': 4, 'campaign': 4, 'advanc': 4, 'basic': 4, 'super': 4, 'actual': 4, 'consid': 4