In [None]:
from bs4 import BeautifulSoup
import urllib
import json   
import requests
import re

In [None]:
# HTTP headers passed to requests.get by get_html.
# NOTE(review): the '###' values look like redacted placeholders — fill in a
# real Accept / User-Agent before running.
HEADERS = {
    'Accept': '###',
    'User-Agent': '###'
}


def get_html(url, params='', timeout=30):
    """Fetch ``url`` with the module-level ``HEADERS`` and return the response.

    Args:
        url: Address to request.
        params: Query parameters forwarded unchanged to ``requests.get``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The ``requests.Response`` object; the HTTP status is not checked here.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    # requests applies no timeout by default, so without this a single
    # stalled connection would block a long scraping run forever.
    return requests.get(url, headers=HEADERS, params=params, timeout=timeout)


def extract_simple_table(soup):
    """Extract a header-less date/details table as a list of dicts.

    Used for the simple tables (the ordination / tonsure / elevation section
    and the awards section).

    Args:
        soup: Parsed element that contains the ``<table>``.

    Returns:
        A list with one dict per data row, keyed by "Дата" (text of the
        ``td.date`` cell) and "Описание" (text of the ``td.details`` cell).
        A cell missing from a row is reported as ``None``. Rows that have
        neither cell are skipped, and a block without a table yields ``[]``.
    """
    table = soup.find("table")
    if table is None:
        return []

    values = []
    for row in table.find_all("tr"):
        date_cell = row.find('td', class_="date")
        details_cell = row.find('td', class_="details")
        # find() returns None for an absent cell; rows with neither cell
        # (e.g. header or spacer rows) carry no data and are ignored.
        if date_cell is None and details_cell is None:
            continue
        values.append({
            "Дата": None if date_cell is None else date_cell.get_text(strip=True),
            "Описание": None if details_cell is None else details_cell.get_text(strip=True),
        })
    return values

def extract_table_with_head(soup):
    """Extract a table that has a ``<thead>`` into a list of row dicts.

    Used for tables with column titles (the education and places of service /
    positions sections).

    Args:
        soup: Parsed element that contains the table.

    Returns:
        A list with one dict per ``<tbody>`` row, mapping each column title
        to the stripped text of the cell in the same position. Cells beyond
        the last column title are ignored. If ``<thead>`` or ``<tbody>`` is
        missing, an empty list is returned.
    """
    head = soup.find('thead')
    body = soup.find('tbody')
    if head is None or body is None:
        return []

    # Column titles are the same for every row, so read them once.
    titles = [th.get_text(strip=True) for th in head.find_all('th')]

    values = []
    for row in body.find_all('tr'):
        cells = row.find_all('td')
        # zip stops at the shorter sequence, so a row with extra or missing
        # cells no longer indexes past the header list.
        values.append({
            title: cell.get_text(strip=True)
            for title, cell in zip(titles, cells)
        })
    return values

def extract_relations(soup):
    """Extract the relatives list into a ``{relation: name}`` dict.

    Each ``<li>`` is expected to hold the relation label in ``<strong>``
    followed by the person's name; items without ``<strong>`` are skipped.

    Args:
        soup: Parsed element that contains the ``<li>`` items.

    Returns:
        A dict mapping the relation label to the remaining text of the item,
        with surrounding dashes ("—") and spaces removed.
        NOTE: items sharing the same relation label overwrite each other, as
        the result is keyed by the label.
    """
    values = {}
    for item in soup.find_all('li'):
        label = item.find('strong')
        if label is None:
            continue
        relation = label.get_text(strip=True)
        if not relation:
            # An empty label cannot be used as a separator (ValueError).
            continue
        # Cut only at the FIRST occurrence of the label: the name itself may
        # contain the same word, and splitting on every occurrence would
        # truncate it. The dash is stripped from the text rather than parsed
        # from markup because the HTML structure is often broken.
        _, _, remainder = item.get_text(strip=True).partition(relation)
        values[relation] = remainder.strip('—').strip(' ')
    return values

def extract_text(soup):
    """Return the stripped text of the first ``person__text`` element.

    Used for the free-text "other information" section.
    """
    text_node = soup.find(class_="person__text")
    return text_node.get_text(strip=True)

def extract_list(soup):
    """Return the stripped text of every ``<li>`` under ``soup`` as a list.

    Used for list sections (archival sources, literature, works).
    """
    return [entry.get_text(strip=True) for entry in soup.find_all("li")]

def parse_person_block(soup):
    """Parse one section block into a ``(heading, content)`` pair.

    The heading is the stripped text of the block's ``<h2>``. The extractor
    is picked first by heading text, then by the markup found inside the
    block; when nothing matches, the content is an empty list.
    """
    heading = soup.find("h2").get_text(strip=True)

    # Sections recognised by their title.
    if heading == "Родственники":
        return (heading, extract_relations(soup))
    if heading in ("Рукоположение / Постриг / Возведение в сан", "Награды"):
        return (heading, extract_simple_table(soup))

    # Remaining sections are recognised by their markup, in priority order.
    if soup.find("thead") is not None:
        return (heading, extract_table_with_head(soup))
    if soup.find(class_="person__list") is not None:
        return (heading, extract_list(soup))
    if soup.find(class_="person__text") is not None:
        return (heading, extract_text(soup))

    return (heading, [])


def parse_all_person_blocks(soup, values):
    """Parse every ``div.person__block`` that has an ``<h2>`` into ``values``.

    ``values`` is updated in place: each block's heading becomes a key and
    its parsed content the value. Blocks without a heading are skipped.
    """
    sections = soup.find_all("div", class_="person__block")
    for section in sections:
        if section.find("h2") is not None:
            title, content = parse_person_block(section)
            values[title] = content
        
def parse_person_details(soup, values):
    """Copy the header/value pairs of the person info table into ``values``.

    Every ``<tr>`` inside the first ``person-detail__info`` element is read;
    the i-th ``<th>`` text becomes a key and the i-th ``<td>`` text its value.

    Args:
        soup: Parsed page (or element) containing the info block.
        values: Dict updated in place.

    Returns:
        None. If the info block is absent, ``values`` is left unchanged.
    """
    info = soup.find(class_='person-detail__info')
    if info is None:
        return

    for row in info.find_all("tr"):
        # Pair headers with cells positionally; zip stops at the shorter
        # side, so a header without a matching cell is skipped instead of
        # raising IndexError.
        for header, cell in zip(row.find_all("th"), row.find_all("td")):
            values[header.get_text(strip=True)] = cell.get_text(strip=True)
            
def extract_values(soup):
    """Build the full record dict for one parsed person page.

    The record starts with the "ФИО" key (text of the ``<h1>`` inside
    ``div.person-detail__info``), then receives the detail-table pairs and
    finally one entry per section block.
    """
    info_block = soup.find('div', class_='person-detail__info')
    values = {'ФИО': info_block.find('h1').get_text(strip=True)}

    parse_person_details(soup, values)
    parse_all_person_blocks(soup, values)
    return values

    
def get_content(html):
    """Parse raw HTML text with ``html.parser`` and return the extracted record."""
    parsed_page = BeautifulSoup(html, features='html.parser')
    return extract_values(parsed_page)
    

def parse_person(n):
    """Download the person page with id ``n`` and return its extracted record."""
    page_url = 'https://www.pravoslavnoe-duhovenstvo.ru/person/' + str(n)
    response = get_html(page_url)
    return get_content(response.text)

# Fetch one page and parse it into a record dict.
# NOTE(review): URL is not defined in the visible cells — confirm it is set
# elsewhere before this cell runs.
html = get_html(URL)
content = get_content(html.text)


In [None]:
# Records collected from every page that was fetched and parsed successfully.
parsed = list()

# NOTE(review): the range bounds are missing (`range(,)` does not compile) —
# fill in the first and one-past-last person ids before running.
for i in range(,):
    
    try: # prints the id first, so the output shows whether the page could be extracted
        print(i)
        parsed.append(parse_person(i))
    except Exception as e: 
        # Best effort: report the failure and move on to the next id.
        print(str(e))

# Write all collected records to a UTF-8 JSON file; ensure_ascii=False keeps
# non-ASCII text unescaped. NOTE(review): '###.json' looks like a placeholder name.
with open('###.json', 'w', encoding='utf-8') as f:
        json.dump(parsed, f, ensure_ascii=False, indent=4)