# Website Quality Analysis

Analyze website quality using Google Lighthouse audit data, collected into pandas DataFrames and summarized with visualizations.

## Steps

1. Find all internal links on the website
1. Build a dataframe of the links
1. Assess website quality scores on a per-link basis, storing the results in a dataframe
1. Sort and display the dataframe results
1. Visualize the results as needed

In [None]:
import json
import os
import subprocess
from urllib.parse import urljoin, urlparse

import colorama
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import requests
from bs4 import BeautifulSoup
from plotly.colors import n_colors

In [None]:
# Root URL of the site to crawl and audit. Replace the placeholder with the
# full https:// address before running the cells that follow.
target_url = "https://<your website here>"

In [None]:
# Lighthouse json report audits don't include categories,
# so we run a single csv report to gather audit ids per category.
# The csv report is written to a file named after the target URL.
target_url_str = str(target_url).replace("https://", "").replace("/", "_")

# Pass the arguments as a list (no shell) so the URL and the file name are
# never interpreted by a shell; this also removes the need to quote the
# Chrome flag.
command = [
    "lighthouse",
    "--no-update-notifier",
    "--no-enable-error-reporting",
    "--output=csv",
    "--output-path={}".format(target_url_str),
    "--chrome-flags=--headless",
    target_url,
]

# check=True stops here with Lighthouse's exit status if the run fails,
# rather than failing later when the report file cannot be read.
p = subprocess.run(command, check=True)

In [None]:
# gather categories of audit ids from csv
df_cats = pd.read_csv(target_url_str)[["name", "category"]]
df_cats.head()

In [None]:
# scraping code courtesy of @x4nth055 from the following link:
# https://github.com/x4nth055/pythoncode-tutorials/blob/master/web-scraping/link-extractor/link_extractor.py

# init the colorama module
colorama.init()
# Foreground colour codes (and the reset code). In the cells shown here they
# are only referenced by the commented-out debug prints of the crawler.
GREEN = colorama.Fore.GREEN
GRAY = colorama.Fore.LIGHTBLACK_EX
RESET = colorama.Fore.RESET
YELLOW = colorama.Fore.YELLOW


def is_valid(url):
    """
    Checks whether `url` is a valid absolute URL.

    A URL counts as valid when it has both a scheme (e.g. ``https``) and a
    network location (e.g. ``example.com``). Input that `urlparse` rejects
    outright is reported as invalid instead of raising.

    params:
        url (str): the URL to check.
    returns:
        bool: True if `url` has both a scheme and a netloc, False otherwise.
    """
    try:
        parsed = urlparse(url)
    except ValueError:
        # urlparse raises ValueError for malformed input such as an
        # unbalanced IPv6 bracket ("http://[::1")
        return False
    return bool(parsed.netloc) and bool(parsed.scheme)


def get_all_website_links(url):
    """
    Returns all URLs found on the page at `url` that belong to the same website.

    Every ``<a href>`` on the page is resolved against `url`, stripped of its
    query string and fragment, and classified as internal or external.
    Internal links are recorded in the global `internal_urls` set and external
    links in the global `external_urls` set.

    params:
        url (str): absolute URL of the page to download and scan.
    returns:
        set: internal URLs that were seen for the first time on this page.
    """
    # internal URLs newly discovered on this page
    urls = set()
    # host name of the page being scanned (no scheme, port or credentials),
    # used to tell internal links from external ones
    domain_name = (urlparse(url).hostname or "").lower()
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36"
    }
    # NOTE(review): verify=False disables TLS certificate verification; it is
    # kept so sites with broken certificates can still be crawled - confirm
    # this is intended.
    # The timeout keeps one unresponsive page from hanging the whole crawl.
    response = requests.get(url, headers=headers, verify=False, timeout=30)
    soup = BeautifulSoup(response.content, "html.parser")

    for a_tag in soup.find_all("a"):
        href = a_tag.attrs.get("href")
        if not href:
            # href empty tag
            continue

        try:
            # join the URL if it's relative (not absolute link)
            parsed_href = urlparse(urljoin(url, href))
        except ValueError:
            # malformed href that cannot be parsed at all
            continue

        if parsed_href.scheme not in ("http", "https"):
            # mailto:, tel:, javascript: and similar links are not pages;
            # rebuilding them below would produce bogus "mailto://..." URLs
            # that pass the validity check and are later fetched
            continue

        # remove URL GET parameters, URL fragments, etc.
        href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path

        if not is_valid(href):
            # not a valid URL
            continue
        if href in internal_urls:
            # already in the set
            continue

        # compare host names rather than searching the whole URL, so a foreign
        # URL that merely mentions the domain in its path is not internal;
        # subdomains of the crawled host still count as internal
        link_host = (parsed_href.hostname or "").lower()
        is_internal = link_host == domain_name or link_host.endswith(
            "." + domain_name
        )
        if not is_internal:
            # external link
            # print(f"{GRAY}[!] External link: {href}{RESET}")
            external_urls.add(href)
            continue
        # print(f"{GREEN}[*] Internal link: {href}{RESET}")
        urls.add(href)
        internal_urls.add(href)

    return urls


# number of urls visited so far will be stored here
total_urls_visited = 0


def crawl(url, max_urls=1000):
    """
    Crawls a website starting at `url` and extracts all links.
    You'll find all links in `external_urls` and `internal_urls` global set variables.
    The global `total_urls_visited` counter is increased by one per page crawled.

    The starting page is always fetched; crawling then continues until the
    counter reaches `max_urls` or no uncrawled internal links remain.
    params:
        url (str): page to start crawling from.
        max_urls (int): number of max urls to crawl, default is 1000.
    """
    global total_urls_visited
    # An explicit stack replaces recursion so that a long chain of links
    # cannot exhaust Python's recursion limit.
    pending = [url]
    while pending:
        page = pending.pop()
        total_urls_visited += 1
        # print(f"{YELLOW}[*] Crawling: {page}{RESET}")
        pending.extend(get_all_website_links(page))
        # stop once the cap is reached (checking after the visit keeps the
        # total at max_urls instead of max_urls + 1)
        if total_urls_visited >= max_urls:
            break

In [None]:
# initialize the set of links (unique links)
internal_urls = set()
external_urls = set()
# Reset the visit counter together with the sets, so that re-running this cell
# performs a full crawl instead of stopping early on a count left over from a
# previous run.
total_urls_visited = 0

crawl(target_url)
print("[+] Total Internal links:", len(internal_urls))
print("[+] Total External links:", len(external_urls))
print("[+] Total URLs:", len(external_urls) + len(internal_urls))

In [None]:
# One row per internal URL. Sorting makes the row order reproducible between
# runs (set iteration order is not), and naming the column up front keeps the
# "URL" column present even when the crawl found nothing.
df = pd.DataFrame({"URL": pd.Series(sorted(internal_urls), dtype="object")})

# Drop phone, e-mail and non-https links. The patterns are literal text, so
# regex matching is switched off.
df = df[
    (~df["URL"].str.contains("tel:", regex=False))
    & (~df["URL"].str.contains("mailto:", regex=False))
    & (~df["URL"].str.contains("http://", regex=False))
]

# Treat "page" and "page/" as the same URL. assign() builds a new frame
# rather than writing into the filtered slice.
df = df.assign(URL=df["URL"].str.rstrip("/")).drop_duplicates()

df.info()

In [None]:
def get_lighthouse_audit(target_url: str) -> pd.DataFrame:
    """
    Runs a Lighthouse audit of `target_url` and returns the audits as a dataframe.

    The `lighthouse` command line tool is run with json output and headless
    Chrome flags, and its stdout is parsed. The result has one row per entry
    of the report's "audits" section, with `url` as the first column and a
    `category` column looked up by audit id in the global `df_cats` frame
    (NaN when the id is not listed there).

    params:
        target_url (str): URL of the page to audit.
    returns:
        pd.DataFrame: audit results for `target_url`.
    raises:
        subprocess.CalledProcessError: if lighthouse exits with a non-zero status.
        json.JSONDecodeError: if lighthouse's stdout is not valid json.
    """

    # Prepare the command as an argument list and run it without a shell: the
    # URL comes from crawled page content and must never be interpreted by a
    # shell.
    command = [
        "lighthouse",
        "--no-update-notifier",
        "--no-enable-error-reporting",
        "--output=json",
        "--chrome-flags=--headless",
        target_url,
    ]

    # run command from system; check=True surfaces a failed audit right away
    completed = subprocess.run(command, stdout=subprocess.PIPE, check=True)

    # transform stdout results to json dict
    json_result = json.loads(completed.stdout)

    # send audit results to pd.dataframe (one row per audit after transposing)
    df = pd.DataFrame(json_result["audits"]).T

    # add url to result audits
    df["url"] = target_url

    # reorder df to show url as first column
    df = df[["url"] + [col for col in df.columns if col != "url"]]

    # reset the index
    df = df.reset_index(drop=True)

    # set the categories per audit id
    category_by_audit = dict(zip(df_cats["name"], df_cats["category"]))
    df["category"] = df["id"].map(category_by_audit)

    return df

In [None]:
# Go through each url and gather audit results.
# A comprehension keeps the loop variable local, so the notebook-level
# `target_url` is not overwritten with the last audited URL.
df_list = [get_lighthouse_audit(page_url) for page_url in df["URL"]]

print(len(df_list))

In [None]:
# join the results as one dataframe, or relabel the only element as the result
results = df_list[0] if len(df_list) <= 1 else pd.concat(df_list)
results.head()

In [None]:
# keep only the audits we report on: those that have a category and whose
# score display mode is not one of the excluded modes
results_filtered = results[
    results["category"].notna()
    & ~results["scoreDisplayMode"].isin(["notApplicable", "manual", "informative"])
]

In [None]:
# aggregated score of each category by url:
# sum of the audit scores divided by the number of audits in the group
aggregate_scores = results_filtered.groupby(["url", "category"]).pipe(
    lambda grouped: grouped["score"].sum() / grouped["id"].count()
)
aggregate_scores.head()

In [None]:
# show mean of all urls (one row, one column per category)
average_score = aggregate_scores.unstack(level=-1).mean().to_frame().T

# Express scores as whole percentages. Scale first and round last: rounding
# to two decimals and then multiplying by 100 can leave values such as
# 56.99999999999999, which the integer cast would truncate to 56.
average_score = (average_score * 100).round().astype("int")

average_score

In [None]:
# 101 colours between the two end colours, indexed by the integer score 0-100
colors = n_colors("rgb(250, 0, 50)", "rgb(100, 200, 0)", 101, colortype="rgb")


# The cell columns are built from the same category columns as the header, so
# header and cells always line up and the table still renders when a category
# is absent from the report. Each cell is coloured (fill and border) by
# looking its score up in `colors`.
fig = go.Figure(
    data=[
        go.Table(
            header=dict(
                values=list(average_score.columns),
                fill_color="paleturquoise",
                align="center",
                font=dict(color="black", size=11),
            ),
            cells=dict(
                values=[
                    average_score[category] for category in average_score.columns
                ],
                line_color=[
                    np.array(colors)[average_score[category]]
                    for category in average_score.columns
                ],
                fill_color=[
                    np.array(colors)[average_score[category]]
                    for category in average_score.columns
                ],
                align="center",
                font=dict(color="white", size=11),
            ),
        )
    ]
)

fig.show()
fig.write_image("lighthouse_overall_average_score.png")

In [None]:
# per-url category scores, ordered from highest to lowest accessibility score
(
    aggregate_scores.unstack()
    .sort_values(by="Accessibility", ascending=False)
)

In [None]:
# same accessibility-ordered category scores, written to a csv file
(
    aggregate_scores.unstack()
    .sort_values(by="Accessibility", ascending=False)
    .to_csv("lighthouse_category_scores_by_url.csv")
)

In [None]:
# every reported audit score, ordered by url then category, written to csv
results_filtered.sort_values(by=["url", "category"]).to_csv(
    path_or_buf="lighthouse_scores_by_url.csv", index=False
)

In [None]:
# accessibility audits whose score is anything other than 1, written to csv
results_filtered.loc[
    lambda audits: (audits["category"] == "Accessibility") & (audits["score"] != 1)
].sort_values(by=["url", "category"]).to_csv(
    path_or_buf="lighthouse_accessibility_low_scores_by_url.csv", index=False
)