In [None]:
from guide_parsers_config import parsers
from IPython.display import clear_output

# Parse every configured guide and store its paragraphs under the guide id.
# To work on a single guide while testing, replace the `guides` line with e.g.:
#   guides = [parsers["guide_art_1_ron"]]
guides = parsers.values()
parsed_guides = {}

for guide in guides:
    # wait=True defers clearing until new output arrives, so the previous
    # guide's output is replaced rather than flickering away.
    clear_output(wait=True)
    print(f"\nParsing guide: {guide.guide_id}")

    guide_paragraphs = guide.parse()
    parsed_guides[guide.guide_id] = guide_paragraphs

    # For testing purposes: show the first and last 20 characters of each
    # paragraph, numbered from 1.
    for number, paragraph in enumerate(guide_paragraphs, start=1):
        head, tail = paragraph[:20], paragraph[-20:]
        print(f"{number}. {head}...{tail}")

In [None]:
# Debug helper: print one parsed paragraph, line-wrapped, with every
# occurrence of a search string shown in red.

import textwrap
from termcolor import colored  # install with: pip install termcolor

paragraphs = parsed_guides["guide_art_1_ron"]
paragraph_num = 6  # 1-based paragraph number
highlight_string = "răspunzător"

paragraph = paragraphs[paragraph_num - 1]

# Cut the paragraph at each occurrence of the search string and glue the
# pieces back together with the red-coloured version in between.
red_marker = colored(highlight_string, 'red')
segments = paragraph.split(highlight_string)
colored_text = red_marker.join(segments)

print(textwrap.fill(colored_text))

In [None]:
# Sanity check for the remove patterns: scan every parsed paragraph for text
# that should have been stripped, and print each offending paragraph with the
# matched text highlighted in red.

import re
from termcolor import colored

# Each entry pairs a regex ("keyword") with the message reported when the
# regex still matches inside a parsed paragraph.
test_patterns = [
    {
        "keyword": r"Curtea Europeană a Drepturilor Omului",
        "error_message": "Removing of footer insuccessful",
    },
    {
        # The negative lookbehind skips occurrences directly preceded by "[".
        "keyword": r"(?<!\[)Ghid privind",
        "error_message": "Removing of header insuccessful",
    },
]


for guide_id, paragraphs in parsed_guides.items():
    for number, paragraph in enumerate(paragraphs, start=1):
        for pattern in test_patterns:
            keyword = pattern["keyword"]
            if re.search(keyword, paragraph) is None:
                continue

            print(f"\nIn {guide_id} paragraph {number}. found: {pattern['error_message']}:")
            # Re-emit the paragraph with every match wrapped in red.
            print(re.sub(keyword, lambda hit: colored(hit.group(), 'red'), paragraph))

In [1]:
import pandas as pd
from collections import defaultdict
from guide_parsers_config import parsers

# Export one combined CSV per language: group the configured parsers by the
# language suffix of their key, concatenate each group's per-guide frames and
# write the result to ../data/echr_case_law_guides_<lang>.csv.
language_parsers = defaultdict(list)

for key, parser in parsers.items():
    # The last three characters of the key are used as the language code
    # (e.g. "guide_art_1_ron" -> "ron").
    lang_code = key[-3:]
    language_parsers[lang_code].append((key, parser))

for lang_code, parser_list in language_parsers.items():
    print(f"Processing {lang_code} guides...")

    # Collect the per-guide frames and concatenate once at the end. Growing a
    # DataFrame with pd.concat inside the loop re-copies all rows on every
    # iteration, and seeding it with an empty pd.DataFrame() triggers the
    # pandas >= 2.1 deprecation warning about concatenating empty entries.
    guide_frames = []
    for key, parser in parser_list:
        # NOTE(review): despite its name, to_csv() is used here as returning a
        # DataFrame (it is passed to len() and pd.concat) - confirm in the parser.
        dfg = parser.to_csv()
        guide_frames.append(dfg)
        print(key, len(dfg))

    df = pd.concat(guide_frames, ignore_index=True)
    # Drop columns whose name starts with "Unnamed"; filtering once after the
    # single concat yields the same columns as filtering on every iteration.
    df = df.loc[:, ~df.columns.str.startswith('Unnamed')]

    output_filename = f"../data/echr_case_law_guides_{lang_code}.csv"
    df.to_csv(output_filename, index=False)
    # The original ended the loop body with a bare `df`; inside a for loop
    # that expression is never displayed, so it has been removed.

Processing ron guides...
guide_art_1_ron 124
guide_art_2_ron 220
guide_art_3_ron 160
guide_art_4_ron 77
guide_art_5_ron 312
guide_art_6_civil_ron 516
guide_art_6_criminal_ron 595
guide_art_7_ron 66
guide_art_8_ron 675
guide_art_9_ron 296
guide_art_10_ron 689
guide_art_11_ron 294
guide_art_12_ron 61
guide_art_13_ron 305
guide_art_14_art_1_protocol_12_ron 241
guide_art_15_ron 49
guide_art_17_ron 165
guide_art_18_ron 184
admissibility_guide_ron 350
guide_art_46_ron 107
guide_art_1_protocol_1_ron 394
guide_art_2_protocol_1_ron 76
guide_art_3_protocol_1_ron 124
guide_art_2_protocol_4_ron 328
guide_art_3_protocol_4_ron 106
guide_art_4_protocol_4_ron 27
guide_art_1_protocol_7_ron 89
guide_art_2_protocol_7_ron 37
guide_art_4_protocol_7_ron 75
guide_data_protection_ron 392
guide_environment_ron 203
guide_immigration_ron 79
guide_mass_protests_ron 229
guide_prisoners_rights_ron 362
guide_lgbti_rights_ron 149
guide_social_rights_ron 231
guide_terrorism_ron 119
guide_rights_of_the_child_ron 309
gu