In [1]:
from bs4 import BeautifulSoup, NavigableString, Tag, Comment
import pandas as pd
from tqdm import tqdm
import json
from urllib.parse import urlparse

In [2]:
# Goal: list every visible text string and accessibility label found under a tag.

def recursive_decomp(tag, i=0):
    """Collect the human-readable strings found under a BeautifulSoup node.

    The node is handled according to its type:

    * ``Comment`` -- ignored.
    * ``NavigableString`` -- kept as a plain ``str`` unless it is exactly
      ``'\\n'``.
    * ``Tag`` -- represented by the first of these that applies, without
      descending any further: its ``aria-label``; the text of the element its
      ``aria-labelledby`` points to (looked up among the tag's own
      descendants); its ``alt`` text when it is an ``img``.  ``script`` and
      ``style`` tags contribute nothing.  Any other tag is replaced by the
      strings collected from its children.
    * anything else -- treated as an iterable of nodes (for example
      ``tag.children``) and each item is processed in turn.

    Args:
        tag: a bs4 node, or an iterable of bs4 nodes.
        i: depth counter passed along when iterating a non-bs4 iterable; it
            is never read, and is kept only for backward compatibility.

    Returns:
        list: the collected strings, in document order.
    """
    tags = []
    if isinstance(tag, Comment):
        # Checked before NavigableString: bs4's Comment is a NavigableString
        # subclass and would otherwise be collected as text.
        pass
    elif isinstance(tag, NavigableString):
        if tag.string != '\n':
            tags.append(str(tag.string))
    elif isinstance(tag, Tag):
        if tag.has_attr('aria-label'):
            tags.append(tag['aria-label'])
        elif tag.has_attr('aria-labelledby'):
            label_tag = tag.find(id=tag['aria-labelledby'])
            if label_tag is not None:
                label_text = label_tag.string
                if label_text is None:
                    # `.string` is None when the label element holds nested
                    # markup; str(None) would add the bogus word 'None'.
                    # Use the concatenated text instead, skipping empty labels.
                    label_text = label_tag.get_text() or None
                if label_text is not None:
                    tags.append(str(label_text))
        elif tag.name == u'img' and tag.has_attr('alt'):
            tags.append(tag['alt'])
        elif tag.name in [u'script', u'style']:
            pass
        else:
            for child in tag.children:
                tags += recursive_decomp(child)
    else:
        for child in tag:
            tags += recursive_decomp(child, i+1)
    return tags

def get_element_location(element):
    """Return the name of the nearest ``header``/``body``/``footer`` ancestor.

    Walks up the ``.parent`` chain starting from ``element``'s parent (the
    element itself is never considered) and stops at the first ancestor whose
    ``name`` is one of the three landmark names.

    Args:
        element: a node exposing ``.parent``; each ancestor exposes ``.name``.

    Returns:
        ``'header'``, ``'body'`` or ``'footer'``, or ``None`` when no such
        ancestor exists.
    """
    landmark_names = ('header', 'body', 'footer')
    ancestor = element.parent
    while ancestor:
        if ancestor.name in landmark_names:
            return ancestor.name
        ancestor = ancestor.parent
    return None

def navs_with_labels(soup):
    """Return every navigation landmark in ``soup`` that carries an ARIA label.

    Two kinds of element are considered: ``<div role="navigation">`` and
    ``<nav>``.  An element is kept when ``get_aria_label`` returns something
    other than ``None`` for it.

    Args:
        soup: the parsed document to search.

    Returns:
        list: all matching ``div`` elements first, followed by all matching
        ``nav`` elements, each group in the order ``find_all`` yields them.
    """
    navs = []
    # `find_all` is the current bs4 spelling; `findAll` is a legacy alias.
    candidate_groups = (
        soup.find_all("div", {"role" : "navigation"}),
        soup.find_all("nav"),
    )
    for candidates in candidate_groups:
        navs.extend(
            element for element in candidates
            if get_aria_label(element, soup) is not None
        )
    return navs

def get_aria_label(tag, soup = None):
    """Return the ARIA label declared on ``tag``, or ``None`` if it has none.

    ``aria-label`` wins when present.  Otherwise, if the tag has
    ``aria-labelledby`` and ``soup`` is given, the element with that id is
    looked up in ``soup`` and its text is returned.

    Args:
        tag: element exposing ``has_attr`` and item access for attributes.
        soup: the document used to resolve ``aria-labelledby``; without it
            only ``aria-label`` can be resolved.

    Returns:
        The label as a plain ``str``, or ``None`` when the tag has no label,
        the referenced element does not exist, or that element has no text.
    """
    if tag.has_attr('aria-label'):
        return tag['aria-label']
    if not tag.has_attr('aria-labelledby') or soup is None:
        return None

    label = soup.find(id=tag['aria-labelledby'])
    if label is None:
        return None

    text = label.string
    if text is None:
        # `.string` is None when the label element contains nested markup;
        # fall back to its concatenated text so such labels are not lost.
        text = label.get_text()
        if not text:
            return None
    # Return a plain str: a NavigableString keeps a reference to the whole
    # parse tree, which would stay alive for as long as the label is stored.
    return str(text)



In [3]:
import unicodedata
import re
# Strip accents from a Unicode string, thanks to
# http://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    """Return ``s`` with its combining marks removed.

    The string is decomposed with NFD normalisation and every character in
    Unicode category ``Mn`` is dropped.  Characters that have no
    decomposition are returned unchanged, so the result is not guaranteed to
    be pure ASCII.
    """
    decomposed = unicodedata.normalize('NFD', s)
    base_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(base_chars)

# Lowercase, trim, strip accents, and collapse non-letter characters
def normalizeString(s):
    """Normalise ``s`` into lowercase letters and sentence punctuation.

    Steps, in order: lowercase and strip surrounding whitespace, remove
    accents via ``unicodeToAscii``, put a space in front of each ``.``, ``!``
    or ``?``, then replace every run of other non-letter characters with a
    single space.  The result is not stripped again afterwards.
    """
    cleaned = unicodeToAscii(s.lower().strip())
    substitutions = (
        (r"([.!?])", r" \1"),
        (r"[^a-zA-Z.!?]+", r" "),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

In [7]:
# Build one dataset entry per labelled navigation landmark in every listed
# page, then write all entries to 'dataset.json'.
dataset = []
sites_dir = './sites'  # renamed from `dir`, which shadowed the builtin

# 'useful.txt' is a JSON document; iterating it yields the file names to read
# from `sites_dir`.  It is closed before the long-running loop starts.
with open('useful.txt', 'r') as ufile:
    data = json.load(ufile)

for file in tqdm(data):
    # NOTE(review): pages are decoded with the platform default encoding --
    # confirm that matches how the files in ./sites were saved.
    with open(f'{sites_dir}/{file}', 'r') as f:
        soup = BeautifulSoup(f, 'html.parser')

    for i, nav in enumerate(navs_with_labels(soup)):
        dataset.append({
            # str() detaches the label from the parse tree so `dataset` does
            # not keep every parsed page alive.  navs_with_labels only yields
            # elements whose label is not None.
            'label': str(get_aria_label(nav, soup)),
            'index': i,
            'location': get_element_location(nav),
            'words': recursive_decomp(nav.children),
        })

with open('dataset.json', 'w') as ofile:
    json.dump(dataset, ofile)


100%|██████████| 1508/1508 [02:14<00:00, 11.22it/s]
