In [4]:
from IPython.display import display, HTML
from collections import Counter
import ipywidgets as widgets

_style = """
<style>
.compact-mention-output {
    font-size: 11px !important;
    line-height: 1.4 !important;
    max-height: 1100px;
    min-height: 550px;
    max-width: 1300px;
    min-width: 900px;
    width: 98vw;
    overflow-y: auto;
    background: #fcfcfc;
    padding: 24px 36px 24px 16px;
    border: 1px solid #ddd;
    border-radius: 10px;
}
</style>
"""
display(HTML(_style))

def display_highlighted_mentions(text, mentions_list):
    mention_fields = ["hyperlinks", "entity_linking", "coref", "coref_cluster"]

    # Build index from mentions list
    qid_counts = Counter()
    field_score_ranges = {field: [] for field in mention_fields}
    all_scores = []
    mention_map = {}  # (start, end) → {qid → (scores, agg, name)}

    for m in mentions_list:
        start, end = m["char_start"], m["char_end"]
        key = (start, end)
        for c in m.get("candidates", []):
            qid = c["qid"]
            name = c.get("name", "")
            scores = c.get("scores_by_source", {})
            agg = c.get("aggregated_score", 0.0)

            qid_counts[qid] += 1
            all_scores.append(agg)
            for field in mention_fields:
                if field in scores:
                    field_score_ranges[field].append(scores[field])

            mention_map.setdefault(key, {})[qid] = (scores, agg, name)

    # Sort QIDs and generate dropdown options with names
    sorted_qids = sorted(qid_counts.items(), key=lambda x: (-x[1], x[0]))
    qid_labels = []
    for qid, count in sorted_qids:
        name = None
        for m in mentions_list:
            for c in m.get("candidates", []):
                if c["qid"] == qid and c.get("name"):
                    name = c["name"]
                    break
            if name:
                break
        label = f"{name} ({qid}) [{count}]" if name else f"{qid} [{count}]"
        qid_labels.append((label, qid))

    min_score = round(min(all_scores), 2) if all_scores else 0.0
    max_score = round(max(all_scores), 2) if all_scores else 1.0

    field_sliders = {}
    for field in mention_fields:
        values = field_score_ranges[field]
        fmin = round(min(values), 2) if values else min_score
        fmax = round(max(values), 2) if values else max_score

        # Set default value based on field
        if field == "hyperlinks":
            default_val = 1.0
        elif field == "entity_linking":
            default_val = 0.57
        else:
            default_val = 0.6

        # Clamp within range
        default_val = max(fmin, min(fmax, default_val))

        field_sliders[field] = widgets.FloatSlider(
            value=default_val,
            min=fmin,
            max=fmax,
            step=0.01,
            description=f'{field} ≥',
            readout_format='.2f',
            continuous_update=True,
            layout=widgets.Layout(width='280px')
        )

    checkboxes = {field: widgets.Checkbox(value=True, description=field, indent=False)
                  for field in mention_fields}
    checkbox_widgets = list(checkboxes.values())

    dropdown = widgets.Dropdown(
        options=qid_labels,
        description='Entity:',
        value=qid_labels[0][1],
        style={'description_width': 'initial'}
    )

    html_out = widgets.HTML()

    def render_text(selected_qid, field_thresholds, active_fields):
        highlight_spans = []
        for (start, end), qid_map in mention_map.items():
            if selected_qid not in qid_map:
                continue
            scores, agg, _ = qid_map[selected_qid]
            if any(scores.get(field, 0.0) >= field_thresholds[field] for field in active_fields):
                highlight_spans.append((start, end, agg))

        highlight_spans_sorted = sorted(highlight_spans, key=lambda x: -x[2])
        span_ranks = {(start, end): rank + 1 for rank, (start, end, _) in enumerate(highlight_spans_sorted)}

        if not highlight_spans:
            return f"<div class='compact-mention-output'>{text}</div>"

        events = []
        for (start, end, score) in highlight_spans:
            rank = span_ranks[(start, end)]
            events.append((start, 'start', score, rank))
            events.append((end, 'end', None, None))
        events.sort(key=lambda x: (x[0], 0 if x[1] == 'end' else 1))

        out = []
        last_idx = 0
        highlight_stack = []

        for idx, typ, score, rank in events:
            if last_idx < idx:
                out.append(text[last_idx:idx])
            if typ == 'start':
                out.append(
                    f'<span style="background-color: #fff574; border-radius:3px; padding:2px 4px; font-size: 11px;" title="aggregated: {score}, rank: {rank}">'
                    f'<b>({rank})</b> '
                )
                highlight_stack.append('open')
            elif typ == 'end' and highlight_stack:
                out.append('</span>')
                highlight_stack.pop()
            last_idx = idx

        out.append(text[last_idx:])
        return f'<div class="compact-mention-output">{"".join(out)}</div>'

    def update_html(change=None):
        qid = dropdown.value
        active_fields = [f for f, cb in checkboxes.items() if cb.value]
        thresholds = {f: field_sliders[f].value for f in active_fields}
        html_out.value = render_text(qid, thresholds, active_fields)

    dropdown.observe(update_html, names='value')
    for cb in checkbox_widgets:
        cb.observe(update_html, names='value')
    for slider in field_sliders.values():
        slider.observe(update_html, names='value')

    update_html()

    controls = widgets.VBox([
        dropdown,
        widgets.HBox(checkbox_widgets),
        widgets.HBox([field_sliders[f] for f in mention_fields])
    ], layout=widgets.Layout(width='99%', max_width='1300px'))

    container = widgets.VBox([controls, html_out],
                             layout=widgets.Layout(width='99%', max_width='1350px'))
    display(container)


In [20]:
import csv
import json
import ast

from aggregate import stream_ndjson_with_offsets
from tqdm import tqdm
import sys

csv.field_size_limit(sys.maxsize)

def get_entity_row_by_index(csv_path, row_idx):
    headers = ['start', 'end', 'id', 'src', 'loc', 'title', 'entities', 'offsets']

    
    with open(csv_path, 'r', encoding='utf-8') as cf:
        csv_reader = csv.DictReader(cf, fieldnames=headers)
        first_row = next(csv_reader)
        src_path = first_row['src']
        print(src_path)

    # Now open both CSV and JSON together
    with open(csv_path, 'r', encoding='utf-8') as cf:
        csv_reader = csv.DictReader(cf, fieldnames=headers)
        for i, csv_row in tqdm(enumerate(csv_reader)):
            if csv_row["id"] == row_idx:
                ndjson_iter = stream_ndjson_with_offsets(src_path, [csv_row['id']])
                json_obj = next(ndjson_iter)

                assert str(csv_row['id']) == str(json_obj['id']), \
                    f"Mismatched IDs at row {row_idx}: CSV={csv_row['id']} JSON={json_obj['id']}"

                try:
                    entities = ast.literal_eval(csv_row['entities'])
                except (ValueError, SyntaxError):
                    entities = []

                return {
                    'id': str(csv_row['id']),
                    'text': json_obj['text'],
                    'entities': entities,
                    'offsets': json_obj['offsets']
                }

    raise IndexError(f"Row index {row_idx} not found.")

# doc = get_entity_row_by_index("/home/morg/students/gottesman3/knowledge-analysis-suite/dolma/python/final_tokenizations_with_offsets/no_special/part-2-00000_new.csv", 300)
# doc = get_entity_row_by_index("/home/morg/students/gottesman3/knowledge-analysis-suite/dolma/python/final_tokenizations_with_offsets/no_special/part-5-00000_new.csv", 0)
# doc = get_entity_row_by_index("/home/morg/students/gottesman3/knowledge-analysis-suite/dolma/python/final_tokenizations_with_offsets/no_special/part-2-00000_new.csv", 5000)
# doc = get_entity_row_by_index("/home/morg/students/gottesman3/knowledge-analysis-suite/dolma/python/final_tokenizations_with_offsets/no_special/part-0-00000_new.csv", 496021)
# doc = get_entity_row_by_index("/home/morg/students/gottesman3/knowledge-analysis-suite/dolma/python/final_tokenizations_with_offsets/no_special/part-0-00000_new.csv", 2512)
# doc = get_entity_row_by_index("/home/morg/students/gottesman3/knowledge-analysis-suite/dolma/python/final_tokenizations_with_offsets/no_special/part-0-00000_new.csv", 1451)
doc = get_entity_row_by_index("/home/morg/students/gottesman3/knowledge-analysis-suite/dolma/python/final_tokenizations_with_offsets/no_special/part-0-00000_new.csv", '52518974')

/home/morg/dataset/maverick/maverick_6.json


936it [00:01, 559.72it/s] 


KeyError: 'offsets'

In [18]:
len(doc["text"])

261

In [19]:
doc["offsets"]

KeyError: 'offsets'

In [14]:
doc["text"][1274:1909]

' from the lines of neighboring states. Indeed, the choice of a gauge , different from the meter gauge usually used in Africa, contributed to this effect.\nToday most of these Italian colonial railways have disappeared: those of Somalia after the British occupation in 1941–1945. The Libyan ones were suppressed in the 1960s, but in the same decade the Eritrean railway between Italian Asmara and Massawa was reactivated after long neglect of trafficking.\n\nIn 1940 there were some projects of new colonial railways in the Italian Empire: \nTwo international projects were studied for decades, but never done because of excessive financial'

In [19]:
display_highlighted_mentions(doc["text"], doc["entities"])

VBox(children=(VBox(children=(Dropdown(description='Entity:', options=(('Austro-Hungarian Navy (Q159211) [51]'…

In [1]:
def read_last_bytes(file_path, num_bytes):
    """
    Reads the last `num_bytes` bytes from the specified file.

    :param file_path: Path to the file.
    :param num_bytes: Number of bytes to read from the end of the file.
    :return: The last `num_bytes` of the file as bytes.
    """
    with open(file_path, 'rb') as f:
        # Move to the end minus the number of bytes requested
        f.seek(-num_bytes, 2)
        return f.read(num_bytes)


In [15]:
read_last_bytes("/home/morg/students/gottesman3/knowledge-analysis-suite/dolma/python/final_tokenizations_with_offsets/no_special/part-4-00000_new_2.csv", 100000)

b' (2503, 2508), (2508, 2510), (2510, 2517), (2517, 2522), (2522, 2526), (2526, 2528), (2528, 2536), (2536, 2541), (2541, 2545), (2545, 2550), (2550, 2554), (2554, 2558), (2558, 2559), (2559, 2565), (2565, 2573), (2573, 2578), (2578, 2580), (2580, 2587), (2587, 2595), (2595, 2596), (2596, 2601), (2601, 2604), (2604, 2607), (2607, 2608), (2608, 2618), (2618, 2623), (2623, 2628), (2628, 2634), (2634, 2642), (2642, 2645), (2645, 2647), (2647, 2651), (2651, 2657), (2657, 2665), (2665, 2670), (2670, 2676), (2676, 2679), (2679, 2681), (2681, 2688), (2688, 2694), (2694, 2701), (2701, 2706), (2706, 2708), (2708, 2716), (2716, 2721), (2721, 2727), (2727, 2728), (2728, 2732), (2732, 2736), (2736, 2740), (2740, 2745), (2745, 2750), (2750, 2755), (2755, 2758), (2758, 2762), (2762, 2768), (2768, 2773), (2773, 2778), (2778, 2782), (2782, 2786), (2786, 2790), (2790, 2791), (2791, 2795), (2795, 2799), (2799, 2806), (2806, 2810), (2810, 2815), (2815, 2819), (2819, 2820), (2820, 2824), (2824, 2829), (28

In [14]:
import csv
import json
from ast import literal_eval

headers = ['start', 'end', 'id', 'src', 'loc', 'title', 'entities', 'offsets']

with open('/home/morg/students/gottesman3/knowledge-analysis-suite/dolma/python/final_tokenizations_with_offsets/no_special/part-4-00000_new_2.csv', 'r', encoding='utf-8') as f:
    reader = csv.reader(f)
    first_row = next(reader)  # Read the first data row
    row = dict(zip(headers, first_row))  # Map headers to values
    
    # Parse the 'entities' field (which is a JSON string)
    entities = json.loads(row['entities'])
    offsets = literal_eval(row['offsets'])

    
    print("Parsed Row:", row)
    print("Parsed Entities:", entities)
    print("Offsets Entities:", offsets)

Parsed Row: {'start': '0', 'end': '55', 'id': '59516359', 'src': '/home/morg/dataset/maverick/maverick_7.json', 'loc': '8152', 'title': 'Tobi Pelly', 'entities': '[{"char_start":49,"char_end":54,"text_mention":"boxer","candidates":[{"qid":"Q32112","name":"boxing","scores_by_source":{"hyperlinks":1.0,"entity_linking":0.96,"coref":0.0,"coref_cluster":0.0},"aggregated_score":0.69}]},{"char_start":75,"char_end":95,"text_mention":"1984 Summer Olympics","candidates":[{"qid":"Q8456","name":"1984 Summer Olympics","scores_by_source":{"hyperlinks":1.0,"entity_linking":0.94,"coref":0.0,"coref_cluster":0.0},"aggregated_score":0.68}]},{"char_start":104,"char_end":124,"text_mention":"1988 Summer Olympics","candidates":[{"qid":"Q8470","name":"1988 Summer Olympics","scores_by_source":{"hyperlinks":1.0,"entity_linking":0.94,"coref":0.0,"coref_cluster":0.0},"aggregated_score":0.68}]},{"char_start":166,"char_end":181,"text_mention":"Paul Fitzgerald","candidates":[{"qid":"Q11813651","name":"Paul Fitzgeral

In [13]:
import csv
import json
from ast import literal_eval

headers = ['start', 'end', 'id', 'src', 'loc', 'title', 'entities', 'offsets']

file_path = '/home/morg/students/gottesman3/knowledge-analysis-suite/dolma/python/final_tokenizations_with_offsets/no_special/part-4-00000_new_2.csv'

def read_last_non_empty_line(path, encoding='utf-8'):
    with open(path, 'rb') as f:
        f.seek(0, 2)  # Seek to end of file
        end = f.tell()
        buffer = bytearray()
        pos = end - 1

        while pos >= 0:
            f.seek(pos)
            byte = f.read(1)
            if byte == b'\n' and buffer:
                break
            buffer.insert(0, byte[0])  # Prepend byte
            pos -= 1

        return buffer.decode(encoding).strip()

# Read the actual last non-empty line
last_line = read_last_non_empty_line(file_path)

# Use csv.reader to parse it safely
row = next(csv.reader([last_line]))
row_dict = dict(zip(headers, row))

# Parse structured fields
entities = json.loads(row_dict['entities'])
offsets = literal_eval(row_dict['offsets'])

# Print results
print("Last Row:", row_dict)
print("Parsed Entities:", entities)
print("Parsed Offsets:", offsets)


Last Row: {'start': '239098563', 'end': '239098611', 'id': '67189461', 'src': '/home/morg/dataset/maverick/maverick_7.json', 'loc': '540582', 'title': 'Basedowia basicollis', 'entities': '[{"char_start":46,"char_end":52,"text_mention":"beetle","candidates":[{"qid":"Q22671","name":"beetle","scores_by_source":{"hyperlinks":1.0,"entity_linking":0.0,"coref":0.0,"coref_cluster":0.0},"aggregated_score":0.4}]},{"char_start":78,"char_end":91,"text_mention":"Curculionidae","candidates":[{"qid":"Q7415384","name":"Curculionidae","scores_by_source":{"hyperlinks":1.0,"entity_linking":0.99,"coref":0.0,"coref_cluster":0.0},"aggregated_score":0.7}]},{"char_start":97,"char_end":109,"text_mention":"only species","candidates":[{"qid":"Q310890","name":"Monotypic taxon","scores_by_source":{"hyperlinks":1.0,"entity_linking":0.0,"coref":0.0,"coref_cluster":0.0},"aggregated_score":0.4}]},{"char_start":46,"char_end":53,"text_mention":"beetles","candidates":[{"qid":"Q22671","name":"beetles","scores_by_source":{