In [6]:
import requests
from datetime import datetime, timedelta
import pandas as pd

def get_r(url, timeout, retry, verbose):
    count = 0
    max_attempts = 20
    while count < max_attempts:
        try:
            r = requests.get(url, timeout=timeout)
            return r
        except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectTimeout):
            count += 1
            if verbose:
                print(f"Timeout. Retry attempt {count}.")
            if count > retry:
                if verbose:
                    print(f"Unable to connect to {url}. Skipping this batch.")
                return None
    return None

def basic_function(term, values):
    values = ['"' + i + '"' for i in values]
    main_body = list()
    cut_term = term.replace('"', '')
    for v in values:
        main_body.append(f"({cut_term}={v}) OR ({cut_term}:{v})")
    query = f"({' OR '.join(main_body)})"
    return query

def link_to_query(link):
    # Function logic remains unchanged.
    pass

def get_echr_metadata(start_id, end_id, verbose, fields, start_date, end_date, link, language):
    data = []
    if not fields:
        fields = ['itemid', 'applicability', 'appno', 'article', 'conclusion', 'docname',
                  'doctype', 'doctypebranch', 'ecli', 'importance', 'judgementdate',
                  'languageisocode', 'originatingbody', 'violation', 'nonviolation',
                  'extractedappno', 'scl', 'publishedby', 'representedby', 'respondent',
                  'separateopinion', 'sharepointid', 'externalsources', 'issue', 'referencedate',
                  'rulesofcourt', 'DocId', 'WorkId', 'Rank', 'Author', 'Size', 'Path',
                  'Description', 'Write', 'CollapsingStatus', 'HighlightedSummary',
                  'HighlightedProperties', 'contentclass', 'PictureThumbnailURL',
                  'ServerRedirectedURL', 'ServerRedirectedEmbedURL', 'ServerRedirectedPreviewURL',
                  'FileExtension', 'ContentTypeId', 'ParentLink', 'ViewsLifeTime', 'ViewsRecent',
                  'SectionNames', 'SectionIndexes', 'SiteLogo', 'SiteDescription', 'deeplinks',
                  'SiteName', 'IsDocument', 'LastModifiedTime', 'FileType', 'IsContainer',
                  'WebTemplate', 'SecondaryFileExtension', 'docaclmeta', 'OriginalPath',
                  'EditorOWSUSER', 'DisplayAuthor', 'ResultTypeIdList', 'PartitionId', 'UrlZone',
                  'AAMEnabledManagedProperties', 'ResultTypeId', 'rendertemplateid']
 
    if link:
        META_URL = link_to_query(link)
    else:
        META_URL = ('http://hudoc.echr.coe.int/app/query/results'
                    '?query=(contentsitename=ECHR) AND '
                    '(documentcollectionid2:"JUDGMENTS") AND lang_inputter'
                    '&select={select}&sort=itemid Ascending&start={start}&length={length}')

    if not link:
        language_input = basic_function('languageisocode', language)
        META_URL = META_URL.replace('lang_inputter', language_input)
        META_URL = META_URL.replace('{select}', ','.join(fields))

    META_URL = META_URL.replace(' ', '%20').replace('"', '%22')

    def get_date_ranges(start_date, end_date, days_per_batch=365):
        start_date = datetime.strptime(start_date, '%Y-%m-%d')
        end_date = datetime.strptime(end_date, '%Y-%m-%d')
        date_ranges = []
        while start_date < end_date:
            batch_end_date = start_date + timedelta(days=days_per_batch - 1)
            if batch_end_date > end_date:
                batch_end_date = end_date
            date_ranges.append((start_date.strftime('%Y-%m-%d'), batch_end_date.strftime('%Y-%m-%d')))
            start_date = batch_end_date + timedelta(days=1)
        return date_ranges

    if not end_date:
        end_date = datetime.today().strftime('%Y-%m-%d')
    if not start_date:
        start_date = '1900-01-01'

    date_ranges = get_date_ranges(start_date, end_date)

    for start_date, batch_end_date in date_ranges:
        if verbose:
            print(f"Fetching data for date range {start_date} to {batch_end_date}")

        addition = f'(kpdate>="{start_date}" AND kpdate<="{batch_end_date}")'
        url = META_URL.replace('(contentsitename=ECHR)', f'(contentsitename=ECHR) AND {addition}')

        for i in range(start_id, end_id, 500):
            if verbose:
                print(f"Fetching cases {i} to {i + 500} for date range {start_date} to {batch_end_date}.")

            batch_url = url.format(start=i, length=500)
            r = get_r(batch_url, timeout=6, retry=3, verbose=verbose)

            if r is not None:
                temp_dict = r.json().get('results', [])
                for result in temp_dict:
                    data.append(result['columns'])

    if len(data) == 0:
        print("Search results ended up empty")
        return False
    return pd.DataFrame.from_records(data)


if __name__ == "__main__":
    start_id = 0
    end_id = 10000
    verbose = True
    fields = ["itemid",
    "docname", 
    "doctype",
    "appno",
    "conclusion",
    "importance",
    "originatingbody",
    "languageisocode",
    "extractedappno", 
    "doctypebranch",
    "respondent",
    "ecli",
    "article",
    "applicability",
    "judgementdate",
    "externalsources",
    "issue",
    "representedby",
    "separateopinion",
    "violation",
    "nonviolation",
    "rank",
    "rulesofcourt",
    "referencedate",
    "publishedby",
    "scl"] 
    start_date = '1959-01-01'
    end_date = datetime.today().strftime('%Y-%m-%d')
    link = None
    language = ['ENG']  # Fetch results in English.

    print("Starting data extraction...")
    df = get_echr_metadata(start_id, end_id, verbose, fields, start_date, end_date, link, language)

    if isinstance(df, pd.DataFrame):
        print(f"Extracted {len(df)} rows of data.")
        df.to_csv("echr_metadata2.csv", index=False)
        print("Data saved to echr_metadata2.csv.")

Starting data extraction...
Fetching data for date range 1959-01-01 to 1959-12-31
Fetching cases 0 to 500 for date range 1959-01-01 to 1959-12-31.
Fetching cases 500 to 1000 for date range 1959-01-01 to 1959-12-31.
Fetching cases 1000 to 1500 for date range 1959-01-01 to 1959-12-31.
Fetching cases 1500 to 2000 for date range 1959-01-01 to 1959-12-31.
Fetching cases 2000 to 2500 for date range 1959-01-01 to 1959-12-31.
Fetching cases 2500 to 3000 for date range 1959-01-01 to 1959-12-31.
Fetching cases 3000 to 3500 for date range 1959-01-01 to 1959-12-31.
Fetching cases 3500 to 4000 for date range 1959-01-01 to 1959-12-31.
Fetching cases 4000 to 4500 for date range 1959-01-01 to 1959-12-31.
Fetching cases 4500 to 5000 for date range 1959-01-01 to 1959-12-31.
Fetching cases 5000 to 5500 for date range 1959-01-01 to 1959-12-31.
Fetching cases 5500 to 6000 for date range 1959-01-01 to 1959-12-31.
Fetching cases 6000 to 6500 for date range 1959-01-01 to 1959-12-31.
Fetching cases 6500 to 700

# Load Edges

In [2]:
import subprocess
import sys
import threading

def stream_output(pipe, prefix=''):
    """Helper function to stream output from a pipe"""
    for line in iter(pipe.readline, ''):
        if line:
            print(f"{prefix}{line.rstrip()}", flush=True)

# Run the processing script with real-time output handling
process = subprocess.Popen(
    ['python', 'process.py'],
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
    universal_newlines=True,
    bufsize=1
)

# Create threads to handle stdout and stderr streams
stdout_thread = threading.Thread(target=stream_output, args=(process.stdout,))
stderr_thread = threading.Thread(target=stream_output, args=(process.stderr, 'INFO: '))

# Start the threads
stdout_thread.start()
stderr_thread.start()

# Wait for the process to complete
process.wait()

# Wait for output threads to complete
stdout_thread.join()
stderr_thread.join()

# Check return code
if process.returncode != 0:
    print(f"Process exited with return code {process.returncode}", file=sys.stderr)

[2025-01-24 20:53:25] Starting data processing
[2025-01-24 20:53:25] Number of rows in metadata: 34090
[2025-01-24 20:53:25] Found 6289 rows with missing ECLIs
[2025-01-24 20:53:25] Merging duplicate entries...
[2025-01-24 20:53:25] Found 6288 duplicate entries
INFO: INFO:root:[2025-01-24 20:53:25] Processing 34090 cases

INFO: --- COLLECTING METADATA ---
INFO: 
INFO: INFO:root:
INFO: --- EXTRACTING NODES LIST ---
INFO: 
INFO: INFO:root:
INFO: --- EXTRACTING EDGES LIST ---
INFO: 
INFO: 
INFO:   0%|[32m          [0m| 0/34090 [00:00<?, ?it/s]
INFO:   0%|[32m          [0m| 4/34090 [00:00<1:45:34,  5.38it/s]
INFO:   0%|[32m          [0m| 10/34090 [00:01<55:33, 10.22it/s]
INFO:   0%|[32m          [0m| 12/34090 [00:01<50:50, 11.17it/s]
INFO:   0%|[32m          [0m| 14/34090 [00:01<47:42, 11.90it/s]
INFO:   0%|[32m          [0m| 16/34090 [00:01<45:34, 12.46it/s]
INFO:   0%|[32m          [0m| 19/34090 [00:01<47:23, 11.98it/s]
INFO:   0%|[32m          [0m| 21/34090 [00:02<57:18,

In [33]:
import json
import csv
import ast
from pathlib import Path
from typing import Union, Dict, List

def json_to_csv(input_nodes: Union[str, Path], input_edges: Union[str, Path], 
                output_nodes: Union[str, Path], output_edges: Union[str, Path]) -> bool:
    """
    Convert JSON nodes and edges files to CSV format while maintaining structure.
    
    Args:
        input_nodes: Path to input nodes JSON file
        input_edges: Path to input edges JSON file
        output_nodes: Path to save nodes CSV
        output_edges: Path to save edges CSV
    
    Returns:
        bool: True if successful, False otherwise
    """
    try:
        print("Starting JSON to CSV conversion...")
        
        # Define all possible node columns
        print("Defining node columns...")
        node_columns = [
            'appno', 'respondent', 'languageisocode', 'representedby', 'violation',
            'docname', 'separateopinion', 'judgementdate', 'scl', 'publishedby',
            'doctypebranch', 'applicability', 'conclusion', 'importance',
            'externalsources', 'rulesofcourt', 'referencedate', 'extractedappno',
            'itemid', 'article', 'ecli', 'issue', 'nonviolation', 'originatingbody',
            'doctype', 'rank'
        ]
        print(f"Defined {len(node_columns)} columns for nodes")

        # Process nodes
        print(f"Reading nodes from {input_nodes}...")
        with open(input_nodes, 'r', encoding='utf-8') as f:
            reader = csv.DictReader(f)
            nodes = list(reader)
            print(f"Found {len(nodes)} nodes")

        print(f"Writing nodes to {output_nodes}...")
        with open(output_nodes, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=node_columns)
            writer.writeheader()
            counter = 0
            for node in nodes:
                # Ensure all columns exist in the row
                row = {col: node.get(col, '') for col in node_columns}
                writer.writerow(row)
                counter += 1
            print(f"Successfully wrote {counter} nodes")

        # Process edges
        print(f"Reading edges from {input_edges}...")
        with open(input_edges, 'r', encoding='utf-8') as f:
            reader = csv.DictReader(f)
            edges = list(reader)
            print(f"Found {len(edges)} edges")

        # Create a dictionary to store aggregated references for each ECLI
        ecli_references = {}

        # Aggregate references for each ECLI
        for edge in edges:
            try:
                source = edge['source']
                target = edge['target']
                
                # Initialize list if ECLI not seen before
                if source not in ecli_references:
                    ecli_references[source] = []
                
                # Add target to references if not already present
                if target not in ecli_references[source]:
                    ecli_references[source].append(target)
                    
            except Exception as e:
                print(f"Error processing edge: {edge}")
                print(f"Error details: {str(e)}")
                continue

        print(f"Writing edges to {output_edges}...")
        with open(output_edges, 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(['ecli', 'references'])
            edge_counter = 0
            
            # Write aggregated edges
            for ecli, references in ecli_references.items():
                writer.writerow([ecli, references])
                edge_counter += 1
                
            print(f"Successfully wrote {edge_counter} unique ECLIs with their references")

        print("JSON to CSV conversion completed successfully")
        return True

    except Exception as e:
        print(f"Error processing files: {str(e)}")
        print(f"Failed to convert files: {input_nodes}, {input_edges}")
        return False

In [35]:
import csv
from pathlib import Path

def count_unique_eclis(input_edges: Union[str, Path]) -> int:
    """
    Count unique ECLIs from the edges CSV file.
    
    Args:
        input_edges: Path to input edges CSV file
    
    Returns:
        int: Number of unique ECLIs
    """
    try:
        # Set to store unique ECLIs
        unique_eclis = set()
        
        print(f"Reading edges from {input_edges}...")
        with open(input_edges, 'r', encoding='utf-8') as f:
            
            reader = csv.DictReader(f)
            for edge in reader:
                try:
                    source = edge['source']
                    unique_eclis.add(source)
                except Exception as e:
                    print(f"Error processing edge: {edge}")
                    print(f"Error details: {str(e)}")
                    continue
        
        count = len(unique_eclis)
        print(f"Found {count} unique ECLIs")
        return count

    except Exception as e:
        print(f"Error processing file: {str(e)}")
        return 0

# Example usage
if __name__ == "__main__":
    count_unique_eclis('../data/RAW/downloadedEdges.csv')

Reading edges from ../data/RAW/downloadedEdges.csv...
Found 3979 unique ECLIs


In [38]:
# Example usage:
json_to_csv(
    '../data/RAW/downloadedNodes.csv',
    '../data/RAW/downloadedEdges.csv',
    '../data/FULL/nodesRefactored.csv',
    '../data/FULL/edgesRefactored.csv'
)

Starting JSON to CSV conversion...
Defining node columns...
Defined 26 columns for nodes
Reading nodes from ../data/RAW/downloadedNodes.csv...
Found 27801 nodes
Writing nodes to ../data/FULL/nodesRefactored.csv...
Successfully wrote 27801 nodes
Reading edges from ../data/RAW/downloadedEdges.csv...
Found 28347 edges
Writing edges to ../data/FULL/edgesRefactored.csv...
Successfully wrote 3979 unique ECLIs with their references
JSON to CSV conversion completed successfully


True