In [2]:
import ast
import re
from statistics import mean, median

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from bs4.element import Comment

# NOTE(review): the readability-lxml distribution is normally imported as
# `readability`; confirm that `readability_lxml` resolves in this environment.
import readability_lxml
from readability_lxml import Document

# Import website URLs and CSS stylesheet links

In [None]:
# Load the per-firm table; the 'URL' and 'Firm' columns drive every cell below.
links_df = pd.read_csv(filepath_or_buffer=r'data/3-css-stylesheet-links.csv')
# NOTE (from the original author): this file is reported to contain only
# stylesheet links and no page URLs -- verify the 'URL' column before running.

# To run in batches, replace the assignment below with a slice and adjust the
# row range, e.g.:
# links = links_df.iloc[10:15]['URL']
# firms = links_df.iloc[10:15]['Firm']

links, firms = (links_df[column] for column in ('URL', 'Firm'))

# Ratio of bold text & ratio of text in numbered / bulleted lists

In [None]:
def tag_visible(element):
    """Report whether a parsed text node counts as visible page text.

    A node is treated as hidden when its parent's tag name is one of the
    containers listed below, or when the node itself is a ``Comment``.

    Args:
        element: a node exposing ``.parent.name`` (as produced by a
            BeautifulSoup text search).

    Returns:
        ``False`` for hidden nodes, ``True`` otherwise.
    """
    hidden_parents = ['style', 'script', 'head', 'title', 'meta', '[document]']
    # Parent check comes first so the comment test only runs when needed.
    is_hidden = element.parent.name in hidden_parents or isinstance(element, Comment)
    return not is_hidden


def text_from_html(texts):
    """Join the visible text nodes of ``texts`` into one string.

    Each node accepted by ``tag_visible`` is stripped of surrounding
    whitespace; the results are joined with single spaces.

    Args:
        texts: iterable of text nodes (string-like objects with ``.strip()``).

    Returns:
        The space-joined visible text, or ``""`` when nothing is visible.
    """
    kept_fragments = [node.strip() for node in texts if tag_visible(node)]
    return u" ".join(kept_fragments)

# Seconds to wait for a page; without a timeout one stalled server hangs the run.
REQUEST_TIMEOUT_S = 30


def _texts_per_tag(soup, tag_name):
    """Return the visible text of each ``<tag_name>`` element, one string per element.

    NOTE(review): ``find_all`` also returns nested elements, so text inside a
    nested list appears once for every enclosing list -- confirm this
    double-counting is intended.
    """
    return [text_from_html(node.find_all(text=True)) for node in soup.find_all(tag_name)]


def _count_words(strings):
    """Count whitespace-separated words across every string in ``strings``."""
    return sum(len(s.split()) for s in strings)


def _list_and_bold_measures(link):
    """Download ``link`` and measure its list and bold text.

    Args:
        link: URL of the page to analyse.

    Returns:
        Tuple ``(bullet_texts, numbered_texts, ul_ol_wordcount, ul_ol_ratio,
        bold_texts, bold_wordcount, bold_ratio)``. The ratios are ``np.nan``
        when the summary contains no tokens.

    Raises:
        Any exception raised by the download (including HTTP error statuses)
        or by parsing; the caller decides how to record the failure.
    """
    r = requests.get(link, timeout=REQUEST_TIMEOUT_S)
    # An HTTP error page is not the firm's content; do not measure it.
    r.raise_for_status()
    page_content = Document(r.text).summary()
    soup = BeautifulSoup(page_content, features="html.parser")

    bulletpoints_text = _texts_per_tag(soup, "ul")
    numberedpoints_text = _texts_per_tag(soup, "ol")
    bolded_texts = _texts_per_tag(soup, "strong")

    ul_ol_wordcount = _count_words(bulletpoints_text) + _count_words(numberedpoints_text)
    boldtext_wordcount = _count_words(bolded_texts)

    # NOTE(review): the denominator counts whitespace-separated tokens of the
    # summary HTML string, markup included -- confirm that is the intended
    # definition of "total words".
    total_words = len(page_content.split())
    if total_words:
        ul_ol_ratio = ul_ol_wordcount / total_words
        boldtext_ratio = boldtext_wordcount / total_words
    else:
        ul_ol_ratio = boldtext_ratio = np.nan

    return (bulletpoints_text, numberedpoints_text, ul_ol_wordcount, ul_ol_ratio,
            bolded_texts, boldtext_wordcount, boldtext_ratio)


# One entry per link, in the same order as `links`.
all_bulletpoints_text = []
all_numberedpoints_text = []
all_ul_ol_wordcount = []
all_ul_ol_ratio = []
all_bolded_text = []
all_bolded_wordcount = []
all_bolded_ratio = []

for link in links:
    try:
        measures = _list_and_bold_measures(link)
    except Exception as exc:
        # `Exception`, not a bare except, so Ctrl-C still interrupts the scrape.
        # The row is kept as NaN so the lists stay aligned with `links`.
        print(f"Skipping {link}: {type(exc).__name__}: {exc}")
        measures = (np.nan,) * 7

    (bullets, numbered, list_words, list_ratio,
     bold, bold_words, bold_ratio) = measures
    all_bulletpoints_text.append(bullets)
    all_numberedpoints_text.append(numbered)
    all_ul_ol_wordcount.append(list_words)
    all_ul_ol_ratio.append(list_ratio)
    all_bolded_text.append(bold)
    all_bolded_wordcount.append(bold_words)
    all_bolded_ratio.append(bold_ratio)

In [None]:
# Assemble one row per firm from the list/bold measures collected above.
df = pd.DataFrame({'Firm': firms,
                   'Link': links,
                   # Selected via links.index so this column stays aligned with
                   # `links` when a batch slice of links_df is in use.
                   'CSS Stylesheets': links_df.loc[links.index, 'CSS Stylesheets'],
                   'Bullet points': all_bulletpoints_text,
                   'Numbered points': all_numberedpoints_text,
                   'Bullet/numbered point word count': all_ul_ol_wordcount,
                   'Bullet/numbered point Ratio': all_ul_ol_ratio,
                   'Bolded text': all_bolded_text,
                   'Bolded word count': all_bolded_wordcount,
                   'Bold text ratio': all_bolded_ratio})

# Ratio of Header Text

In [None]:
# Seconds to wait for a page; without a timeout one stalled server hangs the run.
HEADER_REQUEST_TIMEOUT_S = 30

# Heading tags searched, in the order their text is collected (all h1, then h2, ...).
HEADER_TAGS = [f'h{level}' for level in range(1, 7)]


def _header_measures(link):
    """Download ``link`` and measure the text inside its h1-h6 headings.

    Args:
        link: URL of the page to analyse.

    Returns:
        Tuple ``(headers_text, header_wordcount, header_ratio)``. The ratio is
        ``np.nan`` when the summary contains no tokens.

    Raises:
        Any exception raised by the download (including HTTP error statuses)
        or by parsing; the caller decides how to record the failure.
    """
    r = requests.get(link, timeout=HEADER_REQUEST_TIMEOUT_S)
    # An HTTP error page is not the firm's content; do not measure it.
    r.raise_for_status()
    page_content = Document(r.text).summary()
    soup = BeautifulSoup(page_content, features="html.parser")

    # Newlines become spaces (not ''), so words separated only by a line break
    # are not fused together and undercounted.
    headers_text = [header.text.replace('\n', ' ')
                    for tag in HEADER_TAGS
                    for header in soup.find_all(tag)]
    header_wordcount = sum(len(text.split()) for text in headers_text)

    # NOTE(review): the denominator counts whitespace-separated tokens of the
    # summary HTML string, markup included -- confirm that is intended.
    total_words = len(page_content.split())
    header_ratio = header_wordcount / total_words if total_words else np.nan

    return headers_text, header_wordcount, header_ratio


# One entry per link, in the same order as `links`.
all_headers_text = []
all_headers_wordcount = []
all_headers_ratio = []

for link in links:
    try:
        headers_text, header_wordcount, header_ratio = _header_measures(link)
    except Exception as exc:
        # `Exception`, not a bare except, so Ctrl-C still interrupts the scrape.
        # The row is kept as NaN so the lists stay aligned with `links`.
        print(f"Skipping {link}: {type(exc).__name__}: {exc}")
        headers_text = header_wordcount = header_ratio = np.nan

    all_headers_text.append(headers_text)
    all_headers_wordcount.append(header_wordcount)
    all_headers_ratio.append(header_ratio)

In [None]:
# Attach the header measures to the results table, one column per measure.
for column_name, column_values in (
    ('Headers text', all_headers_text),
    ('Headers word count', all_headers_wordcount),
    ('Headers ratio', all_headers_ratio),
):
    df[column_name] = column_values

# Median font size & Average ratio of text line height to font size

In [None]:
# Seconds to wait for a stylesheet; without a timeout one stalled server hangs the run.
CSS_REQUEST_TIMEOUT_S = 30


def _median_css_number(css_text, property_name):
    """Return the median numeric value declared for ``property_name`` in ``css_text``.

    Only the leading number of each declaration is read. Units (px, em, %,
    unitless) are ignored, so values in different units are mixed.
    NOTE(review): confirm that mixing units is acceptable for this measure.

    Returns ``np.nan`` when the property never occurs.
    """
    # `\s*` also accepts "prop: value", not only the minified "prop:value".
    pattern = property_name + r':\s*(\d+(?:[\.\,]\d{1,3})?)'
    # The pattern tolerates a comma decimal separator; normalise it so float()
    # does not raise on such a match.
    values = [float(raw.replace(',', '.')) for raw in re.findall(pattern, css_text)]
    return median(values) if values else np.nan


def _stylesheet_measures(stylesheet_url):
    """Download one stylesheet and return ``(median_line_height, median_font_size, ratio)``.

    All three values are ``np.nan`` when either property is missing or the
    median font size is zero, so a stylesheet contributes all of its measures
    or none of them.

    Raises:
        Any exception raised by the download, including HTTP error statuses.
    """
    r = requests.get(stylesheet_url, timeout=CSS_REQUEST_TIMEOUT_S)
    r.raise_for_status()
    # The stylesheet is CSS, not an HTML article, so its raw text is searched
    # directly rather than being passed through Document/BeautifulSoup.
    css_text = r.text

    median_line_height = _median_css_number(css_text, 'line-height')
    median_font_size = _median_css_number(css_text, 'font-size')
    if np.isnan(median_line_height) or np.isnan(median_font_size) or median_font_size == 0:
        return (np.nan, np.nan, np.nan)
    return (median_line_height, median_font_size, median_line_height / median_font_size)


def _mean_ignoring_nan(values):
    """Return the mean of the non-NaN entries of ``values``, or ``np.nan`` if there are none."""
    usable = [value for value in values if not np.isnan(value)]
    return mean(usable) if usable else np.nan


# One entry per firm, in the same order as `links`.
all_all_line_height = []
all_all_font_size = []
all_all_ratio = []

# Each 'CSS Stylesheets' cell is parsed with ast.literal_eval into the firm's
# stylesheet URLs. Rows are selected via links.index to stay aligned with
# `links` when a batch slice is in use.
for stylesheet_cell in links_df.loc[links.index, 'CSS Stylesheets']:
    all_line_height = []
    all_font_size = []
    all_ratio = []

    try:
        stylesheet_urls = ast.literal_eval(stylesheet_cell)
    except Exception as exc:
        # Missing or malformed cell: leave the per-firm lists empty so the
        # firm's three measures come out as NaN below.
        print(f"Cannot parse stylesheet list {stylesheet_cell!r}: {type(exc).__name__}: {exc}")
        stylesheet_urls = []

    for stylesheet_url in stylesheet_urls:
        try:
            line_height, font_size, lf_ratio = _stylesheet_measures(stylesheet_url)
        except Exception as exc:
            # `Exception`, not a bare except, so Ctrl-C still interrupts the run.
            print(f"Skipping stylesheet {stylesheet_url}: {type(exc).__name__}: {exc}")
            line_height = font_size = lf_ratio = np.nan
        all_line_height.append(line_height)
        all_font_size.append(font_size)
        all_ratio.append(lf_ratio)

    # Average the per-stylesheet medians across the firm's stylesheets.
    all_all_line_height.append(_mean_ignoring_nan(all_line_height))
    all_all_font_size.append(_mean_ignoring_nan(all_font_size))
    all_all_ratio.append(_mean_ignoring_nan(all_ratio))

In [None]:
# Attach the stylesheet measures to the results table, one column per measure.
font_column_names = ('Median Line Height', 'Median Font Size', 'Ratio')
font_column_values = (all_all_line_height, all_all_font_size, all_all_ratio)
for column_name, column_values in zip(font_column_names, font_column_values):
    df[column_name] = column_values

# Export

In [None]:
# Write the assembled legibility measures to disk; this call creates the CSV.
df.to_csv(path_or_buf='data/5-legibility-measures.csv')