In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from scipy.stats import boxcox
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import classification_report, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

%matplotlib inline

In [2]:
# Render every figure at 200 DPI.
plt.rcParams.update({'figure.dpi': 200})

# Seaborn theme: 'darkgrid' style with the axes background overridden to '#faded9'.
sns.set(style='darkgrid', rc={'axes.facecolor': '#faded9'})

In [3]:
# Read the violation records from dataset.csv (resolved relative to the
# working directory) and display the resulting frame.
df = pd.read_csv("dataset.csv")
df

Unnamed: 0,id,web_URL_id,domain_category,web_URL,scrape_status,html_file_name,html_file_path,violation_count,violation_name,violation_score,violation_description,violation_description_url,affected_html_elements,violation_category,violation_impact,wcag_reference,supplementary_information
0,700_0,700,Government and Public Services,https://www.usa.gov/about-the-us,scraped,www_usa_gov_about_the_us.html,/content/workspace/FullPipeline/html_pages_asy...,4,color-contrast-enhanced,4,Ensures the contrast between foreground and ba...,https://dequeuniversity.com/rules/axe/4.4/colo...,"<a href=\/buy-from-government\"">How to buy fro...",Layout,serious,['1.4.6 Contrast (Enhanced)'],"{'fgColor': '#00bde3', 'bgColor': '#112f4e', '..."
1,700_1,700,Government and Public Services,https://www.usa.gov/about-the-us,scraped,www_usa_gov_about_the_us.html,/content/workspace/FullPipeline/html_pages_asy...,4,landmark-banner-is-top-level,3,Ensures the banner landmark is at top level,https://dequeuniversity.com/rules/axe/4.4/land...,"<div class=\usa-banner__header\"" role=\""banner...",Syntax,moderate,['1.3.1 Info and Relationships'],"<div class=\usa-banner__header\"" role=\""banner..."
2,700_2,700,Government and Public Services,https://www.usa.gov/about-the-us,scraped,www_usa_gov_about_the_us.html,/content/workspace/FullPipeline/html_pages_asy...,4,landmark-no-duplicate-banner,3,Ensures the document has at most one banner la...,https://dequeuniversity.com/rules/axe/4.4/land...,<header class=\usa-header usa-header--extended...,Syntax,moderate,['1.3.1 Info and Relationships'],11 <header> or role='banner' elements found:\n...
3,700_3,700,Government and Public Services,https://www.usa.gov/about-the-us,scraped,www_usa_gov_about_the_us.html,/content/workspace/FullPipeline/html_pages_asy...,4,landmark-unique,3,Landmarks should have a unique role or role/la...,https://dequeuniversity.com/rules/axe/4.4/land...,<header class=\usa-header usa-header--extended...,Syntax,moderate,['1.3.1 Info and Relationships'],Role 'search' found 2 times:\n<form accept-cha...
4,701_0,701,Government and Public Services,https://www.usa.gov/benefits,scraped,www_usa_gov_benefits.html,/content/workspace/FullPipeline/html_pages_asy...,4,color-contrast-enhanced,4,Ensures the contrast between foreground and ba...,https://dequeuniversity.com/rules/axe/4.4/colo...,"<a href=\/food-help\"">Learn about food assista...",Layout,serious,['1.4.6 Contrast (Enhanced)'],"{'fgColor': '#00bde3', 'bgColor': '#112f4e', '..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3519,6019,4643564980,TechnologyScienceResearch,https://dequeuniversity.com/rules/axe/4.10/ari...,scraped,4643564980.txt,/Volumes/MacPema/HIWI/WebAccessibility-main/20...,1,aria-braille-equivalent,4,Ensure aria-braillelabel and aria-brailleroled...,https://dequeuniversity.com/rules/axe/4.10/ari...,"<img alt=\\"" aria-braillelabel=\""****\"" src=\""...",,serious,"['4.1.2 Name, Role, Value']",
3520,6020,581517740,TechnologyScienceResearch,https://dequeuniversity.com/rules/axe/4.10/tar...,scraped,581517740.txt,/Volumes/MacPema/HIWI/WebAccessibility-main/20...,1,target-size,4,Ensure touch targets have sufficient size and ...,https://dequeuniversity.com/rules/axe/4.10/tar...,"<button id=\target\"">+</button>\n <button s...",Layout,serious,['2.5.5 Target Size'],
3521,6021,2182179087,TechnologyScienceResearch,https://dequeuniversity.com/rules/axe/4.10/emp...,scraped,2182179087.txt,/Volumes/MacPema/HIWI/WebAccessibility-main/20...,1,empty-table-header,2,Ensure table headers have discernible text,,https://dequeuniversity.com/rules/axe/4.10/emp...,Syntax,minor,"['1.3.1 Info and Relationships', '2.4.6 Headin...",
3522,6022,853326546,TechnologyScienceResearch,https://dequeuniversity.com/rules/axe/4.10/ari...,scraped,853326546.txt,/Volumes/MacPema/HIWI/WebAccessibility-main/20...,1,aria-meter-name,4,Ensure every ARIA meter node has an accessible...,https://dequeuniversity.com/rules/axe/4.10/ari...,"<div role=\meter\"" id=\""empty\""></div>\n\n<div...",Syntax,serious,['1.1.1 Non-text Content'],


In [18]:
# Rewrite the two run-together category labels into their spaced/hyphenated
# spellings; every other domain_category value is left as it is.
df["domain_category"] = df["domain_category"].replace(
    {
        "Ecommerce": "E-commerce",
        "TechnologyScienceResearch": "Technology Science and Research",
    }
)


In [25]:
# Distinct rule names present in the violation_name column.
df["violation_name"].unique()


array(['color-contrast-enhanced', 'landmark-banner-is-top-level',
       'landmark-no-duplicate-banner', 'landmark-unique',
       'color-contrast', 'duplicate-id-active', 'duplicate-id-aria',
       'duplicate-id', 'empty-heading', 'link-name', 'region',
       'scrollable-region-focusable', 'heading-order',
       'aria-valid-attr-value', 'image-alt', 'landmark-one-main',
       'page-has-heading-one', 'aria-allowed-role',
       'landmark-complementary-is-top-level', 'aria-allowed-attr',
       'button-name', 'document-title', 'html-has-lang',
       'aria-hidden-focus', 'aria-dialog-name', 'meta-viewport',
       'landmark-no-duplicate-contentinfo', 'skip-link',
       'presentation-role-conflict', 'image-redundant-alt', 'list',
       'landmark-contentinfo-is-top-level', 'svg-img-alt', 'label',
       'frame-title', 'aria-required-children', 'aria-roles',
       'label-title-only', 'nested-interactive', 'frame-title-unique',
       'aria-progressbar-name', 'landmark-main-is-top-le

In [26]:
# Number of distinct violation rules recorded for each web_URL value.
# NOTE(review): grouping uses the raw web_URL string, so variants such as
# "https://www.nbcnews.com" and "https://www.nbcnews.com/" are counted as
# separate pages — confirm whether URLs should be normalized upstream.
unique_rules_per_url = (
    df.groupby("web_URL")["violation_name"]
      .agg("nunique")
      .reset_index(name="unique_rules_per_url")
)

# Show the 20 URLs with the most distinct rules.
unique_rules_per_url.sort_values(by="unique_rules_per_url", ascending=False).head(20)


Unnamed: 0,web_URL,unique_rules_per_url
444,https://www.spss.com,18
473,https://www.thehindu.com,17
119,https://www.cloudacademy.com,17
454,https://www.tampabay.com,16
53,https://www.acefitness.org,15
380,https://www.philly.com,15
314,https://www.nbcnews.com/select,15
311,https://www.nbcnews.com,14
460,https://www.ted.com,14
312,https://www.nbcnews.com/,14


In [28]:
# Number of distinct violation rules observed within each domain category.
unique_rules_per_domain_category = (
    df.groupby("domain_category")
      .agg(unique_rules_per_domain=("violation_name", "nunique"))
      .reset_index()
)

# Categories ordered from most to fewest distinct rules (at most 20 rows shown).
unique_rules_per_domain_category.sort_values(by="unique_rules_per_domain", ascending=False).head(20)


Unnamed: 0,domain_category,unique_rules_per_domain
6,Technology Science and Research,76
1,Educational Platforms,59
4,News and Media,50
5,Streaming Platforms,47
0,E-commerce,47
2,Government and Public Services,46
3,Health and Wellness,30


In [30]:

# For every web_URL, count the violation rows whose impact is "serious" and
# those whose impact is "critical" (compared case-insensitively), and add a
# combined total of the two.
serious_critical_per_url = (
    df.assign(
        is_serious=lambda rows: rows["violation_impact"].str.lower().eq("serious"),
        is_critical=lambda rows: rows["violation_impact"].str.lower().eq("critical"),
    )
    .groupby("web_URL")[["is_serious", "is_critical"]]
    .sum()  # summing the boolean flags yields per-URL counts
    .rename(columns={"is_serious": "serious", "is_critical": "critical"})
    .assign(serious_critical=lambda counts: counts["serious"] + counts["critical"])
    .reset_index()
)

serious_critical_per_url.head()

Unnamed: 0,web_URL,serious,critical,serious_critical
0,https://act-rules.github.io/rules/047fe0,2,0,2
1,https://act-rules.github.io/rules/5b7ae0,1,0,1
2,https://act-rules.github.io/rules/5c01ea,3,0,3
3,https://act-rules.github.io/rules/674b10,1,0,1
4,https://act-rules.github.io/rules/80f0bf,1,0,1


In [27]:
# Hand-maintained grouping of violation rule names into broader topics.
# Each key is a topic label; each value lists the rule names assigned to it.
rule_topics = {
    "Color contrast": ["color-contrast", "color-contrast-enhanced"],
    "Duplicate IDs": ["duplicate-id", "duplicate-id-active", "duplicate-id-aria"],
    "Landmarks": [
        "landmark-banner-is-top-level", "landmark-no-duplicate-banner",
        "landmark-unique", "landmark-one-main",
        "landmark-complementary-is-top-level", "landmark-contentinfo-is-top-level",
        "landmark-main-is-top-level", "landmark-no-duplicate-contentinfo",
        "landmark-no-duplicate-main",
    ],
    "ARIA": [
        "aria-valid-attr-value", "aria-allowed-role", "aria-allowed-attr",
        "aria-hidden-focus", "aria-dialog-name", "aria-required-children",
        "aria-roles", "aria-progressbar-name", "aria-required-parent",
        "aria-input-field-name", "aria-toggle-field-name", "aria-required-attr",
        "aria-valid-attr", "aria-command-name", "aria-roledescription",
        "aria-hidden-body", "aria-conditional-attr", "aria-prohibited-attr",
        "aria-deprecated-role", "aria-tooltip-name", "aria-treeitem-name",
        "aria-braille-equivalent", "aria-meter-name", "aria-text",
    ],
    "Language": [
        "html-has-lang", "valid-lang", "html-xml-lang-mismatch",
        "lang-mismatch", "missing-lang-tag", "html-lang-valid",
    ],
    "Names & labels": [
        "link-name", "button-name", "label", "label-title-only", "select-name",
        "input-button-name", "summary-name", "frame-title", "frame-title-unique",
    ],
    "Images & alt text": [
        "image-alt", "image-redundant-alt", "svg-img-alt", "role-img-alt",
        "image-alt-not-descriptive", "input-image-alt", "area-alt", "object-alt",
    ],
    "Headings": [
        "empty-heading", "heading-order", "page-has-heading-one", "ambiguous-heading",
    ],
    "Lists & tables": [
        "list", "listitem", "th-has-data-cells", "td-headers-attr",
        "empty-table-header",
    ],
    "Navigation / bypass": ["skip-link", "bypass"],
    "Viewport / timing": [
        "meta-viewport", "meta-viewport-large", "meta-refresh",
        "no-autoplay-audio", "target-size",
    ],
    "Interactive / focus": [
        "scrollable-region-focusable", "nested-interactive", "tabindex",
        "frame-focusable-content",
    ],
    "Other": [
        "document-title", "presentation-role-conflict", "region",
        "avoid-inline-spacing", "accesskeys", "video-caption",
        "autocomplete-valid", "link-text-mismatch", "form-label-mismatch",
        "page-title-not-descriptive", "link-in-text-block",
        "form-field-multiple-labels",
    ],
}

# Side-by-side overview table: one column per topic, one rule per row.
# Wrapping each list in a Series lets topics of different lengths share a
# frame; the shorter columns are padded with NaN.
df_topics = pd.DataFrame(
    dict(zip(rule_topics, map(pd.Series, rule_topics.values())))
)

df_topics

Unnamed: 0,Color contrast,Duplicate IDs,Landmarks,ARIA,Language,Names & labels,Images & alt text,Headings,Lists & tables,Navigation / bypass,Viewport / timing,Interactive / focus,Other
0,color-contrast,duplicate-id,landmark-banner-is-top-level,aria-valid-attr-value,html-has-lang,link-name,image-alt,empty-heading,list,skip-link,meta-viewport,scrollable-region-focusable,document-title
1,color-contrast-enhanced,duplicate-id-active,landmark-no-duplicate-banner,aria-allowed-role,valid-lang,button-name,image-redundant-alt,heading-order,listitem,bypass,meta-viewport-large,nested-interactive,presentation-role-conflict
2,,duplicate-id-aria,landmark-unique,aria-allowed-attr,html-xml-lang-mismatch,label,svg-img-alt,page-has-heading-one,th-has-data-cells,,meta-refresh,tabindex,region
3,,,landmark-one-main,aria-hidden-focus,lang-mismatch,label-title-only,role-img-alt,ambiguous-heading,td-headers-attr,,no-autoplay-audio,frame-focusable-content,avoid-inline-spacing
4,,,landmark-complementary-is-top-level,aria-dialog-name,missing-lang-tag,select-name,image-alt-not-descriptive,,empty-table-header,,target-size,,accesskeys
5,,,landmark-contentinfo-is-top-level,aria-required-children,html-lang-valid,input-button-name,input-image-alt,,,,,,video-caption
6,,,landmark-main-is-top-level,aria-roles,,summary-name,area-alt,,,,,,autocomplete-valid
7,,,landmark-no-duplicate-contentinfo,aria-progressbar-name,,frame-title,object-alt,,,,,,link-text-mismatch
8,,,landmark-no-duplicate-main,aria-required-parent,,frame-title-unique,,,,,,,form-label-mismatch
9,,,,aria-input-field-name,,,,,,,,,page-title-not-descriptive
