In [14]:
"""The app will check the following:
Warnings, Title, Meta Description, Headings, Image Alt, Keywords"""

# resource video PART1 https://www.youtube.com/watch?v=1Y-x59e90Nw
# resource video PART2 https://youtu.be/j7TLgyTrtp8

# resource static https://pythonology.eu/build-an-seo-analyzer-using-python/



In [15]:
from bs4 import BeautifulSoup
import pandas as pd
import requests
from keys import api_keys
# Own Functions <----
from var import res, soup, title, meta_d
from check_length_module import check_length # Fred's function


# nltk for natural language processing
# Used to look at the frequency of keywords
import nltk
from nltk.tokenize import word_tokenize # GEt a single token eg. single word from the text
nltk.download("stopwords") # Words that aren't helpful when analysing keywords eg. the
nltk.download("punkt")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/frederico/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/frederico/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [29]:
# Webpage to analyse.
# For the Streamlit version, read the address from the user instead:
#   url = input(f"Enter url to run analysis: ")
url = "https://unitedpropertyservices.au/"


def analyse_url(url):
    """Print an on-page SEO report: title, meta description, headings, image alts, keywords.

    The page is NOT fetched here. The body reads the names ``soup``, ``title``
    and ``meta_d`` from the enclosing (notebook/module) scope, so they must be
    defined before this function is called.

    Args:
        url: Address of the page being analysed. Currently unused by the body;
            kept so existing calls keep working.
            NOTE(review): presumably ``soup``/``title``/``meta_d`` were built
            from this same URL -- confirm, or fetch the page from ``url`` here.

    Returns:
        None. The findings are printed: the ten most common keywords, the
        ``ok`` list (elements that were found) and the ``warning`` list
        (elements that are missing).
    """
    warning = []  # Problems found: missing title, meta description, H1, alt text...
    ok = []       # Elements that are present: title, description, headings...

    # Title: record it when present; check_length is an imported helper whose
    # return value is not used here.
    if title:
        ok.append(f"Title exists: {title}")
        check_length(title)
    else:
        warning.append("Title is missing!")

    # Meta description: same treatment as the title.
    if meta_d:
        ok.append(f"Meta Description exists: {meta_d}")
        check_length(meta_d)
    else:
        warning.append("Meta Description is missing")

    # Headings: list every h1-h6 and make sure at least one h1 exists.
    hs = ["h1", "h2", "h3", "h4", "h5", "h6"]
    h_tags = []
    for h in soup.find_all(hs):
        ok.append(f"{h.name}-->{h.text.strip()}")
        h_tags.append(h.name)
    if "h1" not in h_tags:
        warning.append("No H1 found!")

    # Images: flag every <img> whose alt attribute is missing, empty or only
    # whitespace. (Filtering with alt=" " would only match an alt that is
    # exactly one space and would miss images with no alt at all.)
    images_missing_alt = [
        img for img in soup.find_all("img")
        if not (img.get("alt") or "").strip()
    ]
    for img in images_missing_alt:
        warning.append(f"No Alt: {img}")
    # Only claim success when nothing was flagged; a for/else branch would run
    # on every call because the loop never breaks.
    if not images_missing_alt:
        print("All images contain 'Alt Attribute'.\n")

    # Keywords: tokenise the body text; a page without <body> yields no text
    # instead of raising AttributeError.
    body = soup.find("body")
    bod = body.text if body is not None else ""
    words = [w.lower() for w in word_tokenize(bod)]

    # English stop words as a set so each membership test is O(1).
    sw = set(nltk.corpus.stopwords.words("english"))
    # Keep purely alphabetic tokens that are not stop words.
    keywords = [w for w in words if w.isalpha() and w not in sw]

    freq = nltk.FreqDist(keywords)  # Frequency distribution of the keywords

    print(f"Keywords FreqDist: {freq.most_common(10)}")  # Top 10 keywords
    print(f"OK: {ok}")
    print(f"WARNING: {warning}")



# Run the SEO report; analyse_url prints its findings and returns nothing.
analyse_url(url)



TITLE: 'Home - United Property Services' has 31 characters
The ideal length is 50-60 characters.

META DESCRIPTION: 'New name, same great service! We’ve rebranded, and Elite is now United, which marks the next chapter in our journey.' has 116 characters
The ideal length is 150-160 characters.

All images contain 'Alt Attribute'.

Keywords FreqDist: [('cleaning', 10), ('services', 8), ('damage', 7), ('concrete', 6), ('carpet', 6), ('united', 5), ('dirt', 5), ('clean', 5), ('water', 4), ('restoration', 4)]
OK: ['Title exists: Home - United Property Services', 'Meta Description exists: New name, same great service!\xa0We’ve rebranded, and Elite is now United,\xa0which marks the next chapter in our journey.', 'h1-->UNITED PROPERTY SERVICES', 'h2-->Comprehensive Cleaning & Restoration Services', 'h3-->Welcome to United', 'h2-->Our Services', 'h2-->CARPET & RUG DRY CLEANING', 'h2-->LOUNGES & UPHOLSTERY', 'h2-->TILE & GROUT CLEANING', 'h2-->CONCRETE CLEANING', 'h2-->CURTAINS', 'h2-->LEATHER C

In [18]:

# Placeholder for per-check recommendations (may turn out to be unnecessary;
# kept for now). Every check starts with a single-space string as its value.
rec = dict.fromkeys(
    ("title", "meta description", "headings", "img alt"),
    " ",
)



In [19]:
# Download the page. A timeout stops the cell from hanging forever on an
# unresponsive host, and raise_for_status() stops an HTTP error page
# (4xx/5xx) from being analysed as if it were the real content.
response = requests.get(url, timeout=10)
response.raise_for_status()
res = response.text
# Parse the HTML with Python's built-in parser.
soup = BeautifulSoup(res, "html.parser")

In [20]:
# Find the <body> tag in the parsed page and keep only its text content.
# NOTE(review): the intent is to leave out JS/CSS -- confirm that holds for the
# installed BeautifulSoup version and parser.
bod = soup.find("body").get_text()

In [21]:
# Tokenise the body text and lower-case every token.
words = list(map(str.lower, word_tokenize(bod)))
# Peek at the first 10 tokens (raise the slice to see more) to spot stop words
# and symbols that still need to be removed.
words[:10]

['home',
 'concrete',
 'cleaning',
 'water',
 'damage',
 '&',
 'restoration',
 'carpet',
 '&',
 'rug']

In [22]:
# NLTK's list of English stop words.
sw = nltk.corpus.stopwords.words("english")
# Set copy used only for membership tests, so each lookup is O(1) instead of a
# scan of the whole list; `sw` itself stays a list.
sw_lookup = set(sw)

# Keep purely alphabetic tokens (isalpha drops symbols and numbers) that are
# not stop words.
new_words = [w for w in words if w.isalpha() and w not in sw_lookup]

# Frequency distribution of the remaining keywords.
freq = nltk.FreqDist(new_words)
# Show the 10 most common keywords.
print(freq.most_common(10))



[('cleaning', 10), ('services', 8), ('damage', 7), ('concrete', 6), ('carpet', 6), ('united', 5), ('dirt', 5), ('clean', 5), ('water', 4), ('restoration', 4)]
