In [24]:
from bs4 import BeautifulSoup

def xpath_soup(element):
    # type: (typing.Union[bs4.element.Tag, bs4.element.NavigableString]) -> str
    """
    Build an absolute xpath string for a BeautifulSoup4 node.

    A node whose ``name`` is falsy (e.g. a text node) is located via its
    parent instead. Walking up through the ancestors, each step is the
    node's tag name; a 1-based ``[n]`` position among the same-named direct
    children of the ancestor is appended unless exactly one such child exists.

    :param element: BeautifulSoup4 element.
    :type element: bs4.element.Tag or bs4.element.NavigableString
    :return: xpath as string, always starting with ``/``
    :rtype: str
    """
    node = element if element.name else element.parent
    steps = []
    for ancestor in node.parents:  # type: bs4.element.Tag
        # Only direct children count: xpath positions are relative to siblings.
        same_name = ancestor.find_all(node.name, recursive=False)
        if len(same_name) == 1:
            step = node.name
        else:
            # Identity comparison: equal-looking siblings must not be confused.
            position = next(
                idx for idx, candidate in enumerate(same_name, 1)
                if candidate is node
            )
            step = '%s[%d]' % (node.name, position)
        steps.append(step)
        node = ancestor
    # Steps were collected leaf-first; emit them root-first.
    return '/%s' % '/'.join(reversed(steps))



In [89]:
# Parse the saved page. The context manager closes the file handle as soon as
# parsing is done instead of leaking it for the lifetime of the kernel.
# NOTE(review): no parser is named, so bs4 picks one itself; consider pinning
# one (e.g. 'html.parser') if the resulting xpaths must be reproducible.
with open('./html/ok-cadzand.html') as html_file:
    soup = BeautifulSoup(html_file)

# kill all script and style elements so their contents do not end up in soup.text
for script in soup(["script", "style"]):
    script.decompose()    # rip it out

In [90]:
import re

# Non-empty lines of the text remaining in the parsed page.
texts = [t for t in soup.text.split('\n') if t]
xpaths = []

# One record per text line that can be located again in the tree:
# {'text': <line>, 'xpath': <xpath of the containing element>}.
text_elements = []
for t in texts:
    # The line is arbitrary page text, not a pattern: escape it so characters
    # such as ( ) [ ] ? * + are matched literally instead of raising re.error
    # or matching the wrong node. The search is still a substring match.
    el = soup.find(text=re.compile(re.escape(t)))
    if el:
        xpath = xpath_soup(el)
        text_elements.append({
            'text': t,
            'xpath': xpath
        })

In [91]:
import pandas as pd
# Pin the columns so the frame has the same schema even when no text element
# was found (an empty list would otherwise yield a frame without 'xpath').
df = pd.DataFrame(text_elements, columns=['text', 'xpath'])

# Structural pattern of each xpath: every digit is dropped, so positional
# indices collapse to '[]'.
# NOTE(review): digits inside tag names are dropped too (h1/h2 -> 'h');
# confirm that grouping all heading levels together is intended.
df['xpath_nonumb'] = df['xpath'].map(
    lambda path: ''.join(ch for ch in path if not ch.isdigit())
)

In [94]:
# List every distinct digit-stripped xpath pattern, one per line, in sorted order.
pattern_counts = df.groupby(df['xpath_nonumb']).count().sort_index()
for pattern in pattern_counts.index:
    print(pattern)

/html/body/div[]/div
/html/body/div[]/div/div/button/span
/html/body/div[]/div/div/div/div/div/div[]/div/div[]/a
/html/body/div[]/div/div[]/div/div/div[]/div/div/div/article/div/div[]/div[]
/html/body/div[]/div/div[]/div/div/div[]/div/div/div/article/div/div[]/div[]/a
/html/body/div[]/div/div[]/div/div/div[]/div/div/div/article/div/div[]/div[]/h
/html/body/div[]/div/div[]/div/div/div[]/div/div/div/article/div/div[]/h
/html/body/div[]/div/div[]/div/div/div[]/div/div/div/article/div/div[]/p
/html/body/div[]/div/footer/div[]/div/div[]/div/div/p[]
/html/body/div[]/div/footer/div[]/div/ul/li[]/a
/html/body/div[]/div/footer/div[]/div/ul/li[]/a/span
/html/body/div[]/header/div[]/div[]/div[]/div/span[]
/html/body/div[]/header/div[]/div[]/nav/ul/li[]/a
/html/body/div[]/header/div[]/div[]/nav/ul/li[]/ul/li[]/a
/html/body/div[]/header/div[]/div[]/nav/ul/li[]/ul/li[]/ul/li[]/a
/html/head/title


In [92]:
# Number of rows per digit-stripped xpath pattern, sorted by pattern.
# Grouping by the Series (not the column name) keeps 'xpath_nonumb' itself
# among the counted columns.
rows_by_pattern = df.groupby(by=df['xpath_nonumb'])
rows_by_pattern.count().sort_index()

Unnamed: 0_level_0,text,xpath,xpath_nonumb
xpath_nonumb,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
/html/body/div[]/div,2,2,2
/html/body/div[]/div/div/button/span,1,1,1
/html/body/div[]/div/div/div/div/div/div[]/div/div[]/a,1,1,1
/html/body/div[]/div/div[]/div/div/div[]/div/div/div/article/div/div[]/div[],7,7,7
/html/body/div[]/div/div[]/div/div/div[]/div/div/div/article/div/div[]/div[]/a,2,2,2
/html/body/div[]/div/div[]/div/div/div[]/div/div/div/article/div/div[]/div[]/h,2,2,2
/html/body/div[]/div/div[]/div/div/div[]/div/div/div/article/div/div[]/h,1,1,1
/html/body/div[]/div/div[]/div/div/div[]/div/div/div/article/div/div[]/p,2,2,2
/html/body/div[]/div/footer/div[]/div/div[]/div/div/p[],16,16,16
/html/body/div[]/div/footer/div[]/div/ul/li[]/a,5,5,5


In [96]:
import requests
# Fetch the page. requests applies no timeout by default, so without one an
# unresponsive server would block this cell indefinitely.
r = requests.get('https://simonnouwens.nl', timeout=10)

In [100]:
import pickle
# Persist the current response object `r` to disk in pickle format.
with open('./test.pkl', 'wb') as pickle_file:
    pickle.dump(r, pickle_file)

In [110]:
import OpenSSL
import ssl, socket
# Retrieve the server's certificate and parse it as PEM with pyOpenSSL.
server_address = ('www.simonnouwens.nl', 443)
cert = ssl.get_server_certificate(server_address)
x509 = OpenSSL.crypto.load_certificate(
    OpenSSL.crypto.FILETYPE_PEM,
    cert,
)
# Displayed as a bytes timestamp, e.g. b'20210803120000Z'.
x509.get_notAfter()

b'20210803120000Z'

b'20200615000000Z'

In [122]:
# HEAD request: only the response headers are of interest here.
# An explicit timeout keeps the cell from hanging on an unresponsive server.
# Note: this rebinds `r`, replacing the earlier GET response.
r = requests.head('https://httpbin.org/get', timeout=10)

In [127]:
# Show the headers of the response currently bound to `r`
# (presumably the HEAD request to httpbin above; verify execution order).
r.headers

{'Date': 'Sat, 20 Jun 2020 12:39:48 GMT', 'Content-Type': 'application/json', 'Content-Length': '307', 'Connection': 'keep-alive', 'Server': 'gunicorn/19.9.0', 'Access-Control-Allow-Origin': '*', 'Access-Control-Allow-Credentials': 'true'}