In [11]:
import json
import requests
from urllib.parse import urljoin, urlparse, urlunparse
from bs4 import BeautifulSoup
import urllib3
import re
import time
from requests.exceptions import RequestException, ConnectionError, Timeout
import socket

# Silence urllib3's InsecureRequestWarning, which would otherwise be emitted
# for every request in this file that is sent with verify=False.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# load urls from a json file
def load_urls(json_file):
    """Return the value stored under the ``unique_urls`` key of a JSON file.

    The file is read as UTF-8. A document without a ``unique_urls`` key
    raises ``KeyError``.
    """
    with open(json_file, 'r', encoding='utf-8') as handle:
        payload = json.loads(handle.read())
    return payload['unique_urls']

# load common words from a json file
def load_common_words(json_file, limit=10):
    """Return the first ``limit`` keys of the JSON object stored in a file.

    Args:
        json_file: Path of a UTF-8 encoded JSON file whose top level is an
            object (presumably word -> count; the values are never read).
        limit: Maximum number of keys to return, in file order. Defaults to
            10, the value that used to be hard-coded.

    Returns:
        A list with at most ``limit`` keys, or ``[]`` (after printing an
        error) when the file cannot be read, is not valid JSON, or does not
        hold an object at the top level.
    """
    error_message = "Error: Could not load common words from the JSON file."
    try:
        with open(json_file, 'r', encoding='utf-8') as file:
            data = json.load(file)
    except (OSError, ValueError):
        # OSError: missing/unreadable file. ValueError is the base class of
        # both json.JSONDecodeError and UnicodeDecodeError.
        print(error_message)
        return []
    if not isinstance(data, dict):
        # A list or scalar has no keys to take the words from.
        print(error_message)
        return []
    return list(data)[:limit]

# load url headings from a json file
def load_url_headings(json_file):
    """Parse ``json_file`` as JSON and return the decoded document.

    The file is read as UTF-8 first; if that raises ``UnicodeDecodeError``
    it is read again as Latin-1.
    """
    try:
        with open(json_file, 'r', encoding='utf-8') as handle:
            return json.load(handle)
    except UnicodeDecodeError:
        pass
    # UTF-8 decoding failed, so retry with the Latin-1 fallback.
    with open(json_file, 'r', encoding='latin1') as handle:
        return json.load(handle)

# load image data from a json file
def load_image_data(json_file):
    """Parse ``json_file`` as JSON and return the decoded document.

    UTF-8 is tried first; a ``UnicodeDecodeError`` triggers a second read
    of the same file as Latin-1.
    """
    def parse_with(encoding):
        # Read the whole file with the given text encoding and decode it.
        with open(json_file, 'r', encoding=encoding) as handle:
            return json.loads(handle.read())

    try:
        return parse_with('utf-8')
    except UnicodeDecodeError:
        return parse_with('latin1')

# load script data from a json file
def load_scripts(json_file):
    """Parse ``json_file`` as JSON and return the decoded document.

    The file is decoded as UTF-8; when that fails with
    ``UnicodeDecodeError`` the read is repeated with Latin-1.
    """
    try:
        with open(json_file, 'r', encoding='utf-8') as source:
            parsed = json.loads(source.read())
    except UnicodeDecodeError:
        with open(json_file, 'r', encoding='latin1') as source:
            parsed = json.loads(source.read())
    return parsed

# check if the website has a title and how many characters it is
def check_title(url, timeout=10):
    """Fetch ``url`` and grade the length of its ``<title>``.

    Args:
        url: Absolute URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so that an
            unresponsive host cannot stall the whole run.

    Returns:
        A tuple ``(has_title, title_length, title_status, title)`` where
        ``title_status`` is ``'too short'`` (< 20 characters), ``'too long'``
        (> 75 characters), ``'valid'``, ``'no title'`` when the page has no
        ``<title>`` element, or ``'error'`` when the request failed.
    """
    try:
        # SSL verification is deliberately disabled for this crawler.
        response = requests.get(url, verify=False, timeout=timeout)
        # Always decode as UTF-8 and drop undecodable bytes.
        content = response.content.decode('utf-8', errors='ignore')
        soup = BeautifulSoup(content, 'html.parser')
        title_tag = soup.find('title')
        if title_tag is None:
            return False, 0, 'no title', ''
        # Collapse source-formatting whitespace (indentation, newlines) so it
        # does not inflate the measured length.
        title = ' '.join(title_tag.text.split())
        title_length = len(title)
        if title_length < 20:
            title_status = 'too short'
        elif title_length > 75:
            title_status = 'too long'
        else:
            title_status = 'valid'
        return True, title_length, title_status, title
    except requests.exceptions.RequestException as e:
        print(f"error fetching {url}: {e}")
        return False, 0, 'error', ''

# check if the website has a description and how many characters it is
def check_description(url, timeout=10):
    """Fetch ``url`` and grade the length of its meta description.

    Args:
        url: Absolute URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so that an
            unresponsive host cannot stall the whole run.

    Returns:
        A tuple ``(has_description, description_length, description_status,
        description)`` where ``description_status`` is ``'too short'``
        (< 50 characters), ``'too long'`` (> 160 characters), ``'valid'``,
        ``'no description'`` when no ``<meta name="description">`` with a
        ``content`` attribute exists, or ``'error'`` when the request failed.
    """
    try:
        # SSL verification is deliberately disabled for this crawler.
        response = requests.get(url, verify=False, timeout=timeout)
        # Always decode as UTF-8 and drop undecodable bytes.
        content = response.content.decode('utf-8', errors='ignore')
        soup = BeautifulSoup(content, 'html.parser')
        description_tag = soup.find('meta', attrs={'name': 'description'})
        if description_tag is None or 'content' not in description_tag.attrs:
            return False, 0, 'no description', ''
        # Collapse padding/newlines so they do not inflate the length.
        description = ' '.join(description_tag['content'].split())
        description_length = len(description)
        if description_length < 50:
            description_status = 'too short'
        elif description_length > 160:
            description_status = 'too long'
        else:
            description_status = 'valid'
        return True, description_length, description_status, description
    except requests.exceptions.RequestException as e:
        print(f"error fetching {url}: {e}")
        return False, 0, 'error', ''

# check if the title or description contains any of the common words
def check_common_words(title, description, common_words):
    """Return True when any common word occurs in the title or description.

    Matching is case-insensitive and whole-word. Besides the raw
    whitespace-separated tokens, each token is also tried with surrounding
    punctuation removed, so ``"news,"`` matches the common word ``"news"``.

    Args:
        title: Page title text (may be empty).
        description: Meta description text (may be empty).
        common_words: Iterable of words to look for.

    Returns:
        bool: True on the first match, False when nothing matches or
        ``common_words`` is empty.
    """
    edge_punctuation = (
        '.,;:!?\'"()[]{}<>|/\\-'
        '\u2013\u2014\u2026\u00ab\u00bb\u201c\u201d\u2018\u2019'
    )

    def tokens(text):
        raw = text.lower().split()
        # Keep the raw tokens too so anything that matched before still does.
        stripped = {token.strip(edge_punctuation) for token in raw}
        return (set(raw) | stripped) - {''}

    present = tokens(title) | tokens(description)
    return any(word.lower() in present for word in common_words)

# check if the h1 tag has content
def check_h1_content(url, timeout=10):
    """Return True when the page at ``url`` has an ``<h1>`` with visible text.

    Args:
        url: Absolute URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so that an
            unresponsive host cannot stall the whole run.

    Returns:
        bool: True if at least one ``<h1>`` contains non-whitespace text;
        False if none does or the request failed.
    """
    try:
        # SSL verification is deliberately disabled for this crawler.
        response = requests.get(url, verify=False, timeout=timeout)
        # Always decode as UTF-8 and drop undecodable bytes.
        content = response.content.decode('utf-8', errors='ignore')
        soup = BeautifulSoup(content, 'html.parser')
        return any(h1.text.strip() for h1 in soup.find_all('h1'))
    except requests.exceptions.RequestException as e:
        print(f"error fetching {url}: {e}")
        return False

# check if the h2 tag has content
def check_h2_content(url, timeout=10):
    """Return True when the page at ``url`` has an ``<h2>`` with visible text.

    Args:
        url: Absolute URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so that an
            unresponsive host cannot stall the whole run.

    Returns:
        bool: True if at least one ``<h2>`` contains non-whitespace text;
        False if none does or the request failed.
    """
    try:
        # SSL verification is deliberately disabled for this crawler.
        response = requests.get(url, verify=False, timeout=timeout)
        # Always decode as UTF-8 and drop undecodable bytes.
        content = response.content.decode('utf-8', errors='ignore')
        soup = BeautifulSoup(content, 'html.parser')
        return any(h2.text.strip() for h2 in soup.find_all('h2'))
    except requests.exceptions.RequestException as e:
        print(f"error fetching {url}: {e}")
        return False

# check if all images have alt text
def check_images_alt_text(url, image_data):
    """Return True when every image recorded for ``url`` has alt text.

    Args:
        url: Page URL used as the lookup key in ``image_data``.
        image_data: Mapping of page URL to a list of image records; each
            record is expected to be a dict that may carry an ``'alt'`` entry.

    Returns:
        bool: False as soon as an image has a missing, ``None``, empty or
        whitespace-only ``alt``; True otherwise, including when no images
        are recorded for ``url``.
    """
    images = image_data.get(url, [])
    # `.get(...) or ''` treats an absent key and an explicit None alike,
    # instead of raising KeyError/AttributeError.
    return all((image.get('alt') or '').strip() for image in images)

# check if the website has a canonical tag
def check_canonical_tag(url, timeout=10):
    """Return True when the page at ``url`` declares a canonical URL.

    Args:
        url: Absolute URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so that an
            unresponsive host cannot stall the whole run.

    Returns:
        bool: True if a ``<link rel="canonical">`` with a non-empty ``href``
        exists; False if it does not or the request failed.
    """
    try:
        # SSL verification is deliberately disabled for this crawler.
        response = requests.get(url, verify=False, timeout=timeout)
        # Always decode as UTF-8 and drop undecodable bytes.
        content = response.content.decode('utf-8', errors='ignore')
        soup = BeautifulSoup(content, 'html.parser')
        canonical_tag = soup.find('link', rel='canonical')
        if canonical_tag is None:
            return False
        # A canonical link without a target tells crawlers nothing.
        return bool((canonical_tag.get('href') or '').strip())
    except requests.exceptions.RequestException as e:
        print(f"error fetching {url}: {e}")
        return False

# check if the website has a noindex header or meta tag
# returns True if there is no noindex tag which is intended
# as there shouldnt be any
def check_noindex_tag(url, timeout=10):
    """Return True when the page at ``url`` does NOT forbid indexing.

    Both the ``<meta name="robots">`` tags and the ``X-Robots-Tag`` response
    header are inspected. Directive lists such as ``"noindex, nofollow"`` and
    any letter casing are recognised, as is the bare ``none`` directive.

    Args:
        url: Absolute URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so that an
            unresponsive host cannot stall the whole run.

    Returns:
        bool: True if neither source forbids indexing (the desired state);
        False if one does or the request failed.
    """
    try:
        # SSL verification is deliberately disabled for this crawler.
        response = requests.get(url, verify=False, timeout=timeout)
        # Always decode as UTF-8 and drop undecodable bytes.
        content = response.content.decode('utf-8', errors='ignore')
        soup = BeautifulSoup(content, 'html.parser')
        robots_metas = soup.find_all(
            'meta', attrs={'name': re.compile(r'^\s*robots\s*$', re.IGNORECASE)}
        )
        meta_noindex = any(
            _forbids_indexing(tag.get('content') or '') for tag in robots_metas
        )
        header_noindex = _forbids_indexing(
            response.headers.get('X-Robots-Tag') or ''
        )
        return not (meta_noindex or header_noindex)
    except requests.exceptions.RequestException as e:
        print(f"error fetching {url}: {e}")
        return False


def _forbids_indexing(directives):
    """Return True when a comma-separated robots directive list blocks indexing."""
    for directive in directives.lower().split(','):
        directive = directive.strip()
        # Only a bare `none` counts: values such as "max-snippet: none"
        # use the same word with an unrelated meaning.
        if directive == 'none' or re.search(r'\bnoindex\b', directive):
            return True
    return False

# check if the www and non-www versions of the url go to the same site
def check_www_and_non_www(url, timeout=10):
    """Return True when the www and non-www variants of ``url`` end up at the same URL.

    The counterpart URL is built by adding or removing a leading ``www.`` on
    the host while keeping scheme, userinfo, port, path and query intact.
    Both URLs are fetched with redirects followed and their final URLs are
    compared.

    Args:
        url: Absolute URL to test.
        timeout: Seconds to wait for each server before giving up, so that
            an unresponsive host cannot stall the whole run.

    Returns:
        bool: True if both requests finish on the same final URL; False if
        they differ, a request failed, or ``url`` has no hostname.
    """
    parsed_url = urlparse(url)
    if not parsed_url.hostname:
        # Relative or malformed URL: there is no host to toggle.
        print(f"error: cannot derive www/non-www variant of {url}")
        return False
    non_www_url = urlunparse(
        parsed_url._replace(netloc=_toggle_www(parsed_url.netloc))
    )
    try:
        original_response = requests.get(
            url, verify=False, allow_redirects=True, timeout=timeout
        )
        alternate_response = requests.get(
            non_www_url, verify=False, allow_redirects=True, timeout=timeout
        )
        return original_response.url == alternate_response.url
    except requests.exceptions.RequestException as e:
        print(f"error fetching {url} or {non_www_url}: {e}")
        return False


def _toggle_www(netloc):
    """Add or remove a leading ``www.`` on a netloc, keeping userinfo and port."""
    userinfo, at, hostport = netloc.rpartition('@')
    if hostport.lower().startswith('www.'):
        hostport = hostport[4:]
    else:
        hostport = 'www.' + hostport
    return userinfo + at + hostport

# check if the website has a robots.txt file
def check_robots_txt(url, timeout=10):
    """Return True when the site hosting ``url`` serves ``/robots.txt``.

    Args:
        url: Any absolute URL on the site; only its scheme and netloc are
            used to build the robots.txt address.
        timeout: Seconds to wait for the server before giving up, so that an
            unresponsive host cannot stall the whole run.

    Returns:
        bool: True if the final response status is 200; False for any other
        status or when the request failed.
    """
    try:
        parsed_url = urlparse(url)
        robots_url = urlunparse(
            (parsed_url.scheme, parsed_url.netloc, '/robots.txt', '', '', '')
        )
        # SSL verification is deliberately disabled for this crawler.
        response = requests.get(robots_url, verify=False, timeout=timeout)
        return response.status_code == 200
    except requests.exceptions.RequestException as e:
        print(f"error fetching {robots_url}: {e}")
        return False

# check if the website has Open Graph meta tags
def check_open_graph_tags(url, timeout=10):
    """Return True when the page at ``url`` has at least one Open Graph tag.

    A tag counts when it is a ``<meta>`` element whose ``property``
    attribute starts with ``og:``.

    Args:
        url: Absolute URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so that an
            unresponsive host cannot stall the whole run.

    Returns:
        bool: True if such a tag exists; False if none does or the request
        failed.
    """
    try:
        # SSL verification is deliberately disabled for this crawler.
        response = requests.get(url, verify=False, timeout=timeout)
        # Always decode as UTF-8 and drop undecodable bytes.
        content = response.content.decode('utf-8', errors='ignore')
        soup = BeautifulSoup(content, 'html.parser')
        og_tags = soup.find_all(
            'meta', property=lambda value: value and value.startswith('og:')
        )
        return bool(og_tags)
    except requests.exceptions.RequestException as e:
        print(f"error fetching {url}: {e}")
        return False

# check if the website has structured data
def check_structured_data(url, timeout=10):
    """Return True when the page at ``url`` carries structured data markup.

    Three markers are looked for: ``<script type="application/ld+json">``
    (JSON-LD), any element with an ``itemscope`` attribute (microdata), and
    any element with a ``typeof`` attribute (RDFa).

    Args:
        url: Absolute URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so that an
            unresponsive host cannot stall the whole run.

    Returns:
        bool: True if at least one marker is present; False if none is or
        the request failed.
    """
    try:
        # SSL verification is deliberately disabled for this crawler.
        response = requests.get(url, verify=False, timeout=timeout)
        # Always decode as UTF-8 and drop undecodable bytes.
        content = response.content.decode('utf-8', errors='ignore')
        soup = BeautifulSoup(content, 'html.parser')
        json_ld = soup.find_all('script', type='application/ld+json')
        microdata = soup.find_all(attrs={"itemscope": True})
        rdfa = soup.find_all(attrs={"typeof": True})
        return bool(json_ld or microdata or rdfa)
    except requests.exceptions.RequestException as e:
        print(f"error fetching {url}: {e}")
        return False

# check if images have an expires header
def check_images_expires_header(url, image_data, timeout=10):
    """Return True when every image recorded for ``url`` is served with an ``Expires`` header.

    Each image is probed with a HEAD request.

    Args:
        url: Page URL used as the lookup key in ``image_data``.
        image_data: Mapping of page URL to a list of image records; each
            record must provide the image address under ``'url'``.
        timeout: Seconds to wait for each server before giving up, so that
            an unresponsive host cannot stall the whole run.

    Returns:
        bool: False as soon as one response lacks ``Expires`` or a request
        fails; True otherwise, including when no images are recorded.
    """
    try:
        images = image_data.get(url, [])
        for image in images:
            # SSL verification is deliberately disabled for this crawler.
            response = requests.head(image['url'], verify=False, timeout=timeout)
            if 'Expires' not in response.headers:
                return False
        return True
    except requests.exceptions.RequestException as e:
        print(f"error fetching image {image['url']}: {e}")
        return False

# check if all javascript files are minified
def check_js_minified(url, scripts_data, timeout=10):
    """Return True when every JavaScript file recorded for ``url`` looks minified.

    A script is treated as not minified when its body contains more than 20
    newline characters. Entries whose URL path does not end in ``.js`` are
    skipped; the path is examined without query string or fragment, so
    ``app.js?v=3`` is still checked.

    Args:
        url: Page URL used as the lookup key in ``scripts_data``.
        scripts_data: Mapping of page URL to a list of script URLs.
        timeout: Seconds to wait for each server before giving up, so that
            an unresponsive host cannot stall the whole run.

    Returns:
        bool: False as soon as one script exceeds the newline threshold or a
        request fails; True otherwise, including when nothing is checked.
    """
    try:
        scripts = scripts_data.get(url, [])
        for script_url in scripts:
            if not urlparse(script_url).path.lower().endswith('.js'):
                continue
            # SSL verification is deliberately disabled for this crawler.
            script_response = requests.get(script_url, verify=False, timeout=timeout)
            # Heuristic: minified bundles have very few line breaks.
            if script_response.text.count('\n') > 20:
                return False
        return True
    except requests.exceptions.RequestException as e:
        print(f"error fetching javascript {script_url}: {e}")
        return False


# main function to load urls and perform seo checks
def main():
    """Run every SEO check against each URL and print reports and scores.

    Inputs are read from ``urls.json``, ``text_common_words.json``,
    ``url_headings.json``, ``image_data.json`` and ``scripts.json``. Every
    entry of ``urls.json`` is resolved against ``base_url`` with ``urljoin``.

    For each URL a report of all check results is printed. Afterwards a score
    is printed per URL: the percentage of scored checks that passed. A valid
    title, a valid description and each boolean check are worth one point
    each, so a page passing everything scores 100%.
    """
    base_url = "https://www.haberturk.com"
    urls = load_urls('urls.json')
    common_words = load_common_words('text_common_words.json')
    # NOTE(review): loaded but not consumed by any check below; kept so a
    # missing or invalid file still surfaces at start-up as before.
    url_headings = load_url_headings('url_headings.json')
    image_data = load_image_data('image_data.json')
    scripts_data = load_scripts('scripts.json')

    # Boolean checks, in report order. Every key listed here is scored, so a
    # new check only has to be added in one place.
    boolean_checks = (
        'common_words_status',
        'h1_content_status',
        'h2_content_status',
        'images_alt_text_status',
        'canonical_tag_status',
        'noindex_tag_status',
        'www_non_www_status',
        'robots_txt_status',
        'open_graph_tags_status',
        'structured_data_status',
        'images_expires_header_status',
        'js_minified_status',
    )
    # Title and description validity are scored alongside the boolean checks.
    total_checks = 2 + len(boolean_checks)

    results = []
    total_urls = len(urls)

    for index, relative_url in enumerate(urls, start=1):
        full_url = urljoin(base_url, relative_url)
        try:
            has_title, title_length, title_status, title = check_title(full_url)
            (has_description, description_length,
             description_status, description) = check_description(full_url)
            result = {
                'url': full_url,
                'has_title': has_title,
                'title_length': title_length,
                'title_status': title_status,
                'has_description': has_description,
                'description_length': description_length,
                'description_status': description_status,
                'common_words_status': check_common_words(title, description, common_words),
                'h1_content_status': check_h1_content(full_url),
                'h2_content_status': check_h2_content(full_url),
                'images_alt_text_status': check_images_alt_text(full_url, image_data),
                'canonical_tag_status': check_canonical_tag(full_url),
                'noindex_tag_status': check_noindex_tag(full_url),
                'www_non_www_status': check_www_and_non_www(full_url),
                'robots_txt_status': check_robots_txt(full_url),
                'open_graph_tags_status': check_open_graph_tags(full_url),
                'structured_data_status': check_structured_data(full_url),
                'images_expires_header_status': check_images_expires_header(full_url, image_data),
                'js_minified_status': check_js_minified(full_url, scripts_data),
            }
        except (RequestException, socket.gaierror) as e:
            # requests' ConnectionError and Timeout are RequestException
            # subclasses, so this covers the same failures as before.
            print(f"error checking {full_url}: {e}")
            result = {
                'url': full_url,
                'has_title': False,
                'title_length': 0,
                'title_status': 'error',
                'has_description': False,
                'description_length': 0,
                'description_status': 'error',
                **dict.fromkeys(boolean_checks, False),
            }
        else:
            # print results for the current URL
            print(f"{index}/{total_urls} - url: {full_url}\n"
                  f"  has title: {result['has_title']}\n"
                  f"  title length: {result['title_length']} characters\n"
                  f"  title status: {result['title_status']}\n"
                  f"  has description: {result['has_description']}\n"
                  f"  description length: {result['description_length']} characters\n"
                  f"  description status: {result['description_status']}\n"
                  f"  common words status: {result['common_words_status']}\n"
                  f"  h1 content status: {result['h1_content_status']}\n"
                  f"  h2 content status: {result['h2_content_status']}\n"
                  f"  images alt text status: {result['images_alt_text_status']}\n"
                  f"  canonical tag status: {result['canonical_tag_status']}\n"
                  f"  noindex tag status: {result['noindex_tag_status']}\n"
                  f"  www and non-www status: {result['www_non_www_status']}\n"
                  f"  robots.txt status: {result['robots_txt_status']}\n"
                  f"  open graph tags status: {result['open_graph_tags_status']}\n"
                  f"  structured data status: {result['structured_data_status']}\n"
                  f"  images expires header status: {result['images_expires_header_status']}\n"
                  f"  js minified status: {result['js_minified_status']}\n")

        # store results for seo score calculation
        results.append(result)
        print("\n")  # add space between the results of each URL

    # calculate seo score
    for result in results:
        score = sum([
            result['title_status'] == 'valid',
            result['description_status'] == 'valid',
            *(bool(result[key]) for key in boolean_checks),
        ])
        # Divide by the number of checks actually performed; the previous
        # fixed divisor of 20 capped the best possible score at 70%.
        seo_score = (score / total_checks) * 100
        print(f"url: {result['url']}, seo score: {seo_score:.2f}%\n")

# Run the SEO audit only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()

1/536 - url: https://www.haberturk.com/images/common/manifest/180x180.png
  has title: False
  title length: 0 characters
  title status: no title
  has description: False
  description length: 0 characters
  description status: no description
  common words status: False
  h1 content status: False
  h2 content status: False
  images alt text status: False
  canonical tag status: False
  noindex tag status: True
  www and non-www status: True
  robots.txt status: True
  open graph tags status: False
  structured data status: False
  images expires header status: False
  js minified status: True



2/536 - url: https://www.haberturk.com/images/common/favicon/32x32.png
  has title: False
  title length: 0 characters
  title status: no title
  has description: False
  description length: 0 characters
  description status: no description
  common words status: False
  h1 content status: False
  h2 content status: False
  images alt text status: False
  canonical tag status: False
  noindex