In [None]:
import itertools
import pickle
from urllib.parse import urlparse

import pandas as pd
from bs4 import BeautifulSoup

## Load Pickle file

In [None]:
# Ask for the pickle location and load the {URL: HTML content} dictionary.
# The `with` block closes the file handle as soon as loading finishes.
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only point this at files that come from a trusted source.
with open(input('Path to pickle file: '), "rb") as file:
    output = pickle.load(file)

## Function definitions

In [None]:
def print_k_v(key, value):
    """Print a URL line, a blank line, then the value with outer whitespace removed."""
    trimmed = value.strip()
    print(f'URL: {key}\n\n{trimmed}')
    
def get_links(page):
    """Collect the ``href`` value of every ``<a>`` tag in an HTML document.

    Args:
        page: HTML markup; parsed with BeautifulSoup's "html.parser" backend.

    Returns:
        list: The ``href`` attribute values of the anchors found. Anchors
        that have no ``href`` attribute are left out.
    """
    soup = BeautifulSoup(page, "html.parser")
    # href=True limits the search to anchors that actually carry an href,
    # so there is no need to catch (and silently discard) lookup errors.
    return [anchor["href"] for anchor in soup.find_all("a", href=True)]

def element_test(page, tag, attr_type, attr_val):
    """Report whether an HTML document contains at least one matching element.

    Args:
        page: HTML markup; parsed with BeautifulSoup's "html.parser" backend.
        tag: Name of the tag to search for, e.g. ``"div"``.
        attr_type: Name of the attribute to filter on, e.g. ``"class"``.
        attr_val: Value the attribute has to match.

    Returns:
        bool: True if at least one ``tag`` element whose ``attr_type``
        matches ``attr_val`` exists in the document, otherwise False.
    """
    soup = BeautifulSoup(page, "html.parser")
    # Only existence matters, so find() (which stops at the first hit) is
    # enough; there is no need to collect every match with find_all().
    return soup.find(tag, attrs={attr_type: attr_val}) is not None

def reduce_dict_size(dictionary, N):
    """Return a new dict holding at most the first N items of ``dictionary``.

    Used to shrink the data set while testing the notebook. The input
    dictionary is not modified.

    Args:
        dictionary: Mapping to take items from, in its iteration order.
        N: Maximum number of items to keep.

    Returns:
        dict: The first ``N`` key/value pairs (all of them if the mapping
        has fewer than ``N`` items).
    """
    # Slice the argument itself rather than the notebook-level `output`,
    # so the function works for whatever mapping is passed in.
    return dict(itertools.islice(dictionary.items(), N))

In [None]:
# Reduce dictionary size - for testing purposes
# output = reduce_dict_size(output, 100)

## Loop over the main {URL: HTML content} dictionary

In [None]:
# Build lists: one Boolean per page, in the iteration order of `output`
more_link_list = []
accordion_list = []
tab_list = []

# Main loop
for url, page in output.items():
    # element_test already returns a bool, so its result is appended
    # directly instead of branching on it.

    # More-link
    more_link_list.append(
        element_test(page, tag="div", attr_type="class", attr_val="more-link")
    )

    # Accordion-wrapper
    accordion_list.append(
        element_test(page, tag="section", attr_type="class", attr_val="accordion-wrapper")
    )

    # Tab-placeholder
    tab_list.append(
        element_test(page, tag="div", attr_type="class", attr_val="tab-placeholder")
    )

## Create DataFrame

In [None]:
pd.set_option('display.max_rows', 1000) # Notebook display option

# Build the table directly from the Boolean lists, indexed by URL.
# The HTML content is never copied into the frame, so there is no column to
# drop afterwards, and an empty `output` yields an empty table rather than
# an IndexError.
df = pd.DataFrame(
    {
        "More-link": more_link_list,
        "Accordion-wrapper": accordion_list,
        "Tab-placeholder": tab_list,
    },
    index=list(output),
)

# Display notebook
df

## Output to Excel

In [None]:
# Export the results table to an Excel workbook; the path is relative, so
# the file is created in the kernel's current working directory.
df.to_excel(excel_writer='df_out.xlsx')