In [57]:
import duckdb
from huggingface_hub import snapshot_download
from pathlib import Path
import json

In [2]:
REPO_NAME = "ClimatePolicyRadar/all-document-text-data-weekly"
REPO_URL = f"https://huggingface.co/datasets/{REPO_NAME}"
CACHE_DIR = "../cache"

# INTERNAL NOTE: use this commit hash until the weekly pipeline run that produces the data
# is stable.
REVISION = "bd0abf24ae34d3150bdd8ac66f36a28e47f3ee93"  # Use this to set a commit hash. Recommended!

snapshot_download(
    repo_id=REPO_NAME,
    repo_type="dataset",
    local_dir=CACHE_DIR,
    revision=REVISION,
    allow_patterns=["*.parquet"],
)

Fetching 15 files: 100%|██████████| 15/15 [00:00<00:00, 1540.70it/s]


'/Users/kalyan/Documents/CPR/open-data/cache'

In [3]:
db = duckdb.connect()

# Authenticate (needed if loading a private dataset)
# You'll need to log in using `huggingface-cli login` in your terminal first
db.execute("CREATE SECRET hf_token (TYPE HUGGINGFACE, PROVIDER credential_chain);")

# Create a view called 'open_data', and count the number of rows and distinct documents
# in the view
db.execute(
    f"CREATE VIEW open_data AS SELECT * FROM read_parquet('{CACHE_DIR}/*.parquet')"
)
db.sql("SELECT COUNT(*), COUNT(DISTINCT document_id) FROM open_data")

┌──────────────┬─────────────────────────────┐
│ count_star() │ count(DISTINCT document_id) │
│    int64     │            int64            │
├──────────────┼─────────────────────────────┤
│     21037269 │                        7795 │
└──────────────┴─────────────────────────────┘

In [237]:
# text search on document title

db.sql(
    """SELECT DISTINCT document_id, "document_metadata.geographies", "document_metadata.document_title", "document_metadata.publication_ts", "html_data.has_valid_text" FROM open_data 
    WHERE LOWER("document_metadata.document_title") LIKE '%carbon budget%' ORDER BY "document_metadata.geographies" """
).to_df()

Unnamed: 0,document_id,document_metadata.geographies,document_metadata.document_title,document_metadata.publication_ts,html_data.has_valid_text
0,CCLW.executive.9477.3766,[FRA],Decree n° 2015-1491 on the national carbon bud...,2015-11-18T00:00:00Z,
1,CCLW.document.i00001366.n0000,[GBR],Carbon Budget Delivery Plan,2023-03-30T00:00:00Z,
2,CCLW.legislative.1755.rtl_72,[GBR],The Sixth Carbon Budget,2020-12-09T00:00:00Z,
3,CCLW.legislative.2066.2052,[GBR],Carbon Budget Order 2016,2016-07-20T00:00:00Z,
4,CCLW.legislative.1755.4614,[GBR],The Sixth Carbon Budget,2020-12-09T00:00:00Z,
5,UNFCCC.non-party.1453.0,[XAA],"The University of Exeter, Global Carbon Budget...",2022-04-13T00:00:00Z,
6,UNFCCC.non-party.1607.0,[XAA],University of Exeter global stocktake submissi...,2023-03-07T00:00:00Z,
7,UNFCCC.non-party.1611.0,[XAA],University of Exeter global stocktake submissi...,2023-03-07T00:00:00Z,


In [260]:
def view_document(document_id):
    doc_df = db.sql(
        f"SELECT * FROM open_data WHERE document_id = '{document_id}'"
    ).to_df()

    cols = ["document_id"] + [
        col for col in doc_df.columns if col.startswith("text_block")
    ]

    return doc_df[cols]


def get_random_doc_id() -> str:
    return db.sql(
        """SELECT document_id FROM open_data WHERE "document_metadata.languages" = ['English'] ORDER BY RANDOM() LIMIT 1"""
    ).fetchone()[0]


def merge_successive_duplicate_titles(
    text_blocks: list[dict[str, str]],
) -> list[dict[str, str]]:
    new_text_blocks = []
    current_title = {"text": None}

    for i, block in enumerate(text_blocks):
        if block["type"] == "title":
            if block["text"] == current_title["text"]:
                continue
            else:
                current_title = block
                new_text_blocks.append(block)
        else:
            new_text_blocks.append(block)

    return new_text_blocks


def view_document_text_blocks(document_id) -> list[dict[str, str]]:
    doc_df = view_document(document_id).sort_values("text_block.index")

    text_blocks = [
        {"text": row["text_block.text"], "type": row["text_block.type"]}
        for _, row in doc_df.iterrows()
    ]

    new_text_blocks = merge_successive_duplicate_titles(text_blocks)

    return new_text_blocks


def text_block_array_to_html(data):
    json_data = json.dumps(data)
    html = f"""
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Document Viewer</title>
    <style>
        body {{
            font-family: Arial, sans-serif;
            margin: 0;
            padding: 0;
            display: flex;
            height: 100vh;
            overflow: hidden;
        }}
        #sidebar {{
            width: 20%;
            height: 100vh;
            overflow-y: auto;
            padding: 20px;
            background-color: #f8f9fa;
            box-sizing: border-box;
            font-size: 14px;
            line-height: 1.4;
            border-right: 1px solid #e9ecef;
            box-shadow: 2px 0 5px rgba(0,0,0,0.1);
        }}
        #main-content {{
            flex: 1;
            height: 100vh;
            overflow-y: auto;
            padding: 20px 30px;
            box-sizing: border-box;
        }}
        #search-box {{
            width: 100%;
            padding: 12px;
            font-size: 16px;
            margin-bottom: 20px;
            box-sizing: border-box;
            border: 1px solid #ced4da;
            border-radius: 4px;
        }}
        .sidebar-item {{
            cursor: pointer;
            margin-bottom: 5px;
            word-wrap: break-word;
            overflow-wrap: break-word;
            padding: 8px 5px;
            border-radius: 4px;
        }}
        .sidebar-item:hover {{
            background-color: #e9ecef;
        }}
        .sidebar-item.active {{
            font-weight: bold;
            color: #007bff;
            background-color: #e7f1ff;
        }}
        .sidebar-item[data-type="title"] {{
            font-size: 16px;
            font-weight: bold;
            margin-bottom: 5px;
            border-bottom: 2px solid #dee2e6;
            padding-bottom: 2px;
        }}
        .sidebar-item[data-type="sectionHeading"] {{
            font-size: 14px;
            margin-left: 10px;
        }}
        .Text, .title, .sectionHeading, .footnote, .pageHeader, .pageFooter, .pageNumber, .TableCell {{
            max-width: 800px;
            margin-left: auto;
            margin-right: auto;
        }}
        .title {{
            font-size: 28px;
            margin-bottom: 30px;
        }}
        .sectionHeading {{
            font-size: 24px;
            margin-top: 40px;
            margin-bottom: 20px;
        }}
        .Text {{
            font-size: 16px;
            line-height: 1.6;
            margin-bottom: 15px;
        }}
        .footnote, .pageHeader, .pageFooter, .pageNumber {{
            font-size: 14px;
            margin-top: 20px;
            color: #6c757d;
        }}
        .TableCell {{
            padding: 5px 10px;
            border: 1px solid #dee2e6;
        }}
        .highlight {{
            background-color: yellow;
        }}
        .hidden {{
            display: none;
        }}
    </style>
</head>
<body>
    <div id="sidebar"></div>
    <div id="main-content">
        <input type="text" id="search-box" placeholder="Search...">
        <div id="content"></div>
    </div>
    <script>
        const data = {json_data};
        const sidebar = document.getElementById('sidebar');
        const content = document.getElementById('content');
        const searchBox = document.getElementById('search-box');
        const mainContent = document.getElementById('main-content');
        
        function renderSidebar(filteredData) {{
            sidebar.innerHTML = '';
            let currentTitle = null;
            let currentSectionHeading = null;

            data.forEach((item, index) => {{
                if (item.type === 'title') {{
                    currentTitle = item;
                }} else if (item.type === 'sectionHeading') {{
                    currentSectionHeading = item;
                }}

                if ((item.type === 'title' || item.type === 'sectionHeading') && 
                    (filteredData.includes(item) || 
                    filteredData.some(d => {{
                        const dataIndex = data.indexOf(d);
                        return dataIndex > index && 
                                (item.type === 'title' || 
                                (item.type === 'sectionHeading' && dataIndex < data.findIndex((i, idx) => idx > index && i.type === 'sectionHeading')));
                    }}))) {{
                    const element = document.createElement('div');
                    element.className = 'sidebar-item';
                    element.textContent = item.text;
                    element.dataset.index = index;
                    element.dataset.type = item.type;
                    element.onclick = () => scrollToHeader(index);
                    
                    if (item.type === 'sectionHeading' && currentTitle && !sidebar.querySelector(`[data-index="${{data.indexOf(currentTitle)}}"]`)) {{
                        const titleElement = document.createElement('div');
                        titleElement.className = 'sidebar-item';
                        titleElement.textContent = currentTitle.text;
                        titleElement.dataset.index = data.indexOf(currentTitle);
                        titleElement.dataset.type = 'title';
                        titleElement.onclick = () => scrollToHeader(data.indexOf(currentTitle));
                        sidebar.appendChild(titleElement);
                    }}
                    
                    sidebar.appendChild(element);
                }}
            }});
        }}
        
        function scrollToHeader(index) {{
            const element = document.getElementById(`content-item-${{index}}`);
            if (element) {{
                const headerOffset = 60;
                const elementPosition = element.getBoundingClientRect().top;
                const offsetPosition = elementPosition + mainContent.scrollTop - headerOffset;
                
                mainContent.scrollTo({{
                    top: offsetPosition,
                    behavior: 'smooth'
                }});
            }}
        }}

        function renderContent(filteredData) {{
            content.innerHTML = '';
            let currentTitle = null;
            let currentSection = null;
            data.forEach((item, index) => {{
                const element = document.createElement('div');
                element.className = item.type || 'Text';
                element.textContent = item.text;
                element.id = `content-item-${{index}}`;
                
                if (item.type === 'title') {{
                    currentTitle = element;
                    element.classList.add('hidden');
                }} else if (item.type === 'sectionHeading') {{
                    currentSection = element;
                    element.classList.add('hidden');
                }}
                
                if (filteredData.includes(item)) {{
                    if (currentTitle && currentTitle.classList.contains('hidden')) {{
                        content.appendChild(currentTitle);
                        currentTitle.classList.remove('hidden');
                    }}
                    if (currentSection && currentSection.classList.contains('hidden')) {{
                        content.appendChild(currentSection);
                        currentSection.classList.remove('hidden');
                    }}
                    content.appendChild(element);
                }}
            }});
        }}

        function updateSidebarHighlights(filteredData) {{
            const sidebarItems = sidebar.querySelectorAll('.sidebar-item');
            sidebarItems.forEach(item => {{
                const index = parseInt(item.dataset.index);
                const hasMatchingContent = filteredData.some(dataItem => 
                    data.indexOf(dataItem) > index && 
                    (data[index].type === 'title' || data.findIndex(d => d.type === 'sectionHeading' && d !== data[index]) > data.indexOf(dataItem))
                );
                item.classList.toggle('active', hasMatchingContent);
            }});
        }}

        function debounce(func, wait) {{
            let timeout;
            return function executedFunction(...args) {{
                const later = () => {{
                    clearTimeout(timeout);
                    func(...args);
                }};
                clearTimeout(timeout);
                timeout = setTimeout(later, wait);
            }};
        }}

        function highlightText(element, searchTerm) {{
            const text = element.textContent;
            const searchWords = searchTerm.split(/\\s+/).map(word => word.replace(/[.*+?^${{}}()|[\\]\\\\]/g, '\\\\$&'));
            const regex = new RegExp(`(${{searchWords.join('|')}})`, 'gi');
            element.innerHTML = text.replace(regex, '<span class="highlight">$1</span>');
        }}
        
        const performSearch = debounce(function() {{
            const searchTerm = searchBox.value.toLowerCase();
            
            if (!searchTerm) {{    
                renderContent(data);
                renderSidebar(data);
                return;
            }}
            
            const filteredData = data.filter(item => {{
                const itemWords = item.text.toLowerCase().split(/\s+/);
                const searchWords = searchTerm.toLowerCase().trim().split(/\s+/).filter(word => word.length > 0);
                return searchWords.length > 0 && searchWords.every(searchWord => itemWords.includes(searchWord));
            }});
                        
            renderContent(filteredData);
            renderSidebar(filteredData);

            if (searchTerm) {{
                document.querySelectorAll('#content > div:not(.hidden)').forEach(element => {{
                    highlightText(element, searchTerm);
                }});
            }}
        }}, 1000);

        searchBox.addEventListener('input', performSearch);
        renderSidebar(data);
        renderContent(data);
    </script>
</body>
</html>
"""

    return html

  """


In [261]:
# doc_id = get_random_doc_id()
doc_id = "UNFCCC.party.1039.0"
doc_id

'UNFCCC.party.1039.0'

In [262]:
doc_text_blocks = view_document_text_blocks(doc_id)
html = text_block_array_to_html(doc_text_blocks)

Path(f"../experiment-htmls/{doc_id}.html").write_text(html)

236212

In [246]:
doc_text_blocks

[{'text': 'HM Government', 'type': 'pageHeader'},
 {'text': 'Carbon Budget Delivery Plan', 'type': 'title'},
 {'text': 'March 2023 HC 1269', 'type': 'Text'},
 {'text': '1', 'type': 'pageNumber'},
 {'text': '2', 'type': 'pageNumber'},
 {'text': 'HM Government', 'type': 'title'},
 {'text': 'Carbon Budget Delivery Plan', 'type': 'title'},
 {'text': 'Presented to Parliament pursuant to details of the Climate Change Act (2008) Section 14',
  'type': 'Text'},
 {'text': 'Ordered by the House of Commons to be printed 30 March 2023',
  'type': 'Text'},
 {'text': 'HC 1269', 'type': 'pageFooter'},
 {'text': '3', 'type': 'pageNumber'},
 {'text': 'OGL', 'type': 'sectionHeading'},
 {'text': 'Crown copyright 2023', 'type': 'sectionHeading'},
 {'text': 'This publication is licensed under the terms of the Open Government Licence v3.0 except where otherwise stated. To view this licence, visit nationalarchives.gov.uk/doc/open-government-licence/version/3',
  'type': 'Text'},
 {'text': 'Where we have iden