In [1]:
from collections import defaultdict
from bs4 import BeautifulSoup

# Number interpolated into the codebook path below
# (presumably the ESS round -- TODO confirm).
version = 11

# Location of the exported HTML codebook, relative to the working directory.
codebook_file = f'ESS{version}/ESS{version} codebook.html'

# Parse the codebook once; the cells below query the resulting `soup` tree.
# Text read mode is open()'s default, so only the encoding is spelled out.
with open(codebook_file, encoding='utf-8') as file:
    soup = BeautifulSoup(file, features='html.parser')

In [2]:
# Variables to export from the codebook, each tagged with a scale type.
# NOTE(review): 'u' / 'bi' presumably stand for unipolar / bipolar response
# scales -- confirm with whoever consumes codebook.json.
# Insertion order is preserved: every 'u' variable first, then the 'bi' ones.
included_variables_with_scale = {
    **dict.fromkeys(
        (
            'polintr', 'psppsgva', 'actrolga', 'psppipla', 'cptppola',
            'trstprl', 'trstlgl', 'trstplc', 'trstplt', 'trstprt',
            'trstep', 'trstun',
        ),
        'u',
    ),
    **dict.fromkeys(
        (
            'contplt', 'donprty', 'badge', 'sgnptit', 'pbldmna', 'bctprd',
            'pstplonl', 'volunfp', 'clsprty',
            'lrscale',
            'stflife', 'stfeco', 'stfgov', 'stfdem', 'stfedu', 'stfhlth',
            'gincdif', 'freehms', 'hmsfmlsh', 'hmsacld', 'euftf',
            'lrnobed', 'loylead',
            'imsmetn', 'imdfetn', 'impcntr', 'imbgeco', 'imueclt', 'imwbcnt',
        ),
        'bi',
    ),
}

In [3]:
# Build a lookup from the table under the codebook's 'cntry' heading:
# first cell text -> second cell text (presumably country code -> country
# name -- verify against the codebook). Rows that do not have exactly two
# <td> cells are ignored.
country_section = soup.find('h3', id='cntry').find_parent('div')
country_dict = {
    cells[0].text.strip(): cells[1].text.strip()
    for cells in (tr.find_all('td') for tr in country_section.find_all('tr'))
    if len(cells) == 2
}

In [4]:
def extract_question(div_element):
    """Return the stripped question text found in a variable's section.

    Looks at the <div> elements nested inside the first <p> of
    ``div_element``. When there are two or more, the second one is used;
    when there is exactly one, that one is used.

    Args:
        div_element: Parsed element (e.g. a BeautifulSoup tag) wrapping one
            variable's codebook entry.

    Returns:
        The selected <div>'s text with surrounding whitespace removed.

    Raises:
        AttributeError: If ``div_element`` contains no <p>.
        IndexError: If the <p> contains no <div>.
    """
    candidates = div_element.find('p').find_all('div')
    # Prefer the second <div> when present, otherwise fall back to the first.
    if len(candidates) > 1:
        chosen = candidates[1]
    else:
        chosen = candidates[0]
    return chosen.text.strip()

In [5]:
def extract_values(div_element):
    """Collect the numeric codes and labels from a variable's value table.

    Walks every <tr> inside the first <tbody> of ``div_element``. For each
    row the first <td> is read as the integer code and the second <td> as
    the category label.

    Rows are skipped when:
      * they have fewer than two <td> cells (same kind of guard the country
        table parsing in this notebook applies), instead of raising
        IndexError;
      * the label ends with '*' (presumably the codebook's marker for
        missing-value codes -- TODO confirm).

    Args:
        div_element: Parsed element (e.g. a BeautifulSoup tag) wrapping one
            variable's codebook entry.

    Returns:
        A ``(values, categories)`` tuple of two parallel lists: integer codes
        and their stripped labels, in table order.

    Raises:
        AttributeError: If ``div_element`` contains no <tbody>.
        ValueError: If a kept row's first cell is not an integer.
    """
    rows = div_element.find('tbody').find_all('tr')
    values = []
    categories = []
    for r in rows:
        tds = r.find_all("td")
        if len(tds) < 2:
            # Header / spacer rows carry no code-label pair.
            continue
        label = tds[1].text.strip()
        if label.endswith('*'):
            continue
        values.append(int(tds[0].text.strip()))
        categories.append(label)
    return values, categories

In [6]:
# Assemble one codebook entry per selected variable: its question text, the
# value codes with their labels, and the scale tag from the selection dict.
codebook = defaultdict(dict)
for variable, scale_type in included_variables_with_scale.items():
    # Each variable's section is the <div> enclosing the <h3> whose id is
    # the variable name.
    div_element = soup.find('h3', id=variable).find_parent('div')

    question = extract_question(div_element)
    values, categories = extract_values(div_element)

    # Keyword order fixes the key order written to the JSON output.
    codebook[variable].update(
        question=question,
        values=values,
        categories=categories,
        scale_type=scale_type,
    )
    

In [7]:
# NOTE: `country_dict` is already built from the 'cntry' codebook section in
# cell [3]. Nothing between that cell and this one modifies `soup` or
# `country_dict`, so the byte-identical parse that used to be repeated here
# was redundant and has been removed.

In [8]:
import json

# Write both lookups as pretty-printed JSON into the working directory,
# codebook first, then the country table.
for output_path, payload in (
    ("codebook.json", codebook),
    ("country.json", country_dict),
):
    with open(output_path, "w") as json_file:
        json.dump(payload, json_file, indent=4)