# MITRE LangExtract Mini Flow + Relationship Graph

This notebook runs the self-contained `mitre-langextract-mini` workflow on a fixed seed URL, then visualizes how claims, citations, validation, and MITRE techniques are connected.

## 1) Install dependencies

In [1]:
%pip install -q langextract requests trafilatura waybackpy pypdf networkx plotly pandas nbformat


Note: you may need to restart the kernel to use updated packages.


## 2) Imports and configuration

In [2]:
import json
import os
import sys
from pathlib import Path

import networkx as nx
import plotly.graph_objects as go

ROOT = Path.cwd()
if ROOT.name == 'notebooks':
    ROOT = ROOT.parent

PKG_SRC = ROOT / 'packages' / 'mitre-langextract-mini' / 'src'
if str(PKG_SRC) not in sys.path:
    sys.path.insert(0, str(PKG_SRC))

from mitre_langextract_mini.workflow import run_random_reference_langextract

SEED_URL = 'https://therecord.media/dutch-telecom-giant-announces-data-breach'
USE_LANGEXTRACT = bool(os.getenv('OPENAI_API_KEY'))
TIMEOUT_SECONDS = 45
MAX_WAYBACK_URLS = 8

print(f'Seed URL: {SEED_URL}')
print(f'USE_LANGEXTRACT={USE_LANGEXTRACT} (set OPENAI_API_KEY to enable model extraction)')
print(f'Package source path: {PKG_SRC}')

Seed URL: https://therecord.media/dutch-telecom-giant-announces-data-breach
USE_LANGEXTRACT=True (set OPENAI_API_KEY to enable model extraction)
Package source path: /home/gangsta/hustle/mybot/packages/mitre-langextract-mini/src


## 3) Run extraction + validation flow

In [3]:
result = run_random_reference_langextract(
    reference_url=SEED_URL,
    use_langextract=USE_LANGEXTRACT,
    prefer_wayback=True,
    max_wayback_urls=MAX_WAYBACK_URLS,
    timeout=TIMEOUT_SECONDS,
)

status = result.get('status', {})
reference = result.get('reference', {})
summary = {
    'results_source': status.get('results_source'),
    'claims_count': status.get('claims_count'),
    'falsifiable_claims_count': status.get('falsifiable_claims_count'),
    'mitre_claims_count': status.get('mitre_claims_count'),
    'mitre_linkages_count': status.get('mitre_linkages_count'),
    'download_source': reference.get('download_source'),
    'resolved_url': reference.get('resolved_url'),
}
print(json.dumps(summary, indent=2))

{
  "results_source": "langextract",
  "claims_count": 21,
  "falsifiable_claims_count": 20,
  "mitre_claims_count": 1,
  "mitre_linkages_count": 2,
  "download_source": "wayback",
  "resolved_url": "https://web.archive.org/web/20260213141303/https://therecord.media/dutch-telecom-giant-announces-data-breach"
}


## 4) Inspect linkage-level validation explanations

In [4]:
linkages = result.get('mitre_linkages') or []
print(f'Total MITRE linkages: {len(linkages)}')

for idx, linkage in enumerate(linkages[:5], start=1):
    print(f'\n[{idx}] {linkage.get("mitre_id")} {linkage.get("technique_name")}')
    print(f'Claim: {linkage.get("claim_text")}')
    validation = linkage.get('validation') or {}
    print(f'Validation: {validation.get("verdict")} score={validation.get("score")}')
    print(f'Validation explanation: {validation.get("explanation")}')
    for cidx, citation in enumerate((linkage.get('citations') or [])[:2], start=1):
        print(f'  Citation {cidx}: offsets={citation.get("start")}:{citation.get("end")}')
        print(f'    relation_to_mitre={citation.get("relation_to_mitre")}')
        print(f'    validation_explanation={citation.get("validation_explanation")}')
        print(f'    quote={citation.get("quote")!r}')

Total MITRE linkages: 2

[1] T1566 Phishing
Claim: Some may send phishing emails made to look like Odido communications.
Validation: supported score=0.75
Validation explanation: Citation includes the technique name for Phishing (T1566), grounding the MITRE linkage in source text.
  Citation 1: offsets=1343:1412
    relation_to_mitre=technique_name_match
    validation_explanation=Citation includes the technique name for Phishing (T1566), grounding the MITRE linkage in source text.
    quote='Some may send phishing emails made to look like Odido communications.'
  Citation 2: offsets=1357:1365
    relation_to_mitre=technique_name_match
    validation_explanation=Citation includes the technique name for Phishing (T1566), grounding the MITRE linkage in source text.
    quote='phishing'

[2] T1589.002 Email Addresses
Claim: In a statement about the incident, Odido CEO Søren Abildgaard said names, bank account numbers, addresses, mobile numbers, email addresses, account numbers and IDs – ra

## 5) Tabular view for non-MITRE claims


In [5]:
try:
    import pandas as pd
except ImportError as exc:
    raise RuntimeError(
        "pandas is required for tabular output in this notebook. "
        "Install in this kernel with: %pip install pandas"
    ) from exc

from IPython.display import display


def _clip(value: str, limit: int = 120) -> str:
    text = ' '.join(str(value or '').split())
    if len(text) <= limit:
        return text
    return text[: max(0, limit - 3)] + '...'


claim_assessments = result.get('claim_assessments') or []
claim_references = result.get('claim_associations') or []

if not claim_assessments:
    print('No claim_assessments found in result payload.')
else:
    non_mitre_claims = [
        claim
        for claim in claim_assessments
        if not str(claim.get('mitre_id') or '').strip()
        and not str(claim.get('technique_name') or '').strip()
    ]

    if not non_mitre_claims:
        print('No non-MITRE claims found.')
    else:
        summary_rows = []
        for claim in non_mitre_claims:
            summary_rows.append(
                {
                    'claim_id': claim.get('claim_id'),
                    'claim_state': claim.get('state'),
                    'confidence': claim.get('confidence'),
                    'validation_method': claim.get('validation_method'),
                    'claim_scope': claim.get('claim_scope'),
                    'source': claim.get('source'),
                    'references_total': (
                        int(claim.get('support_reference_count') or 0)
                        + int(claim.get('contradict_reference_count') or 0)
                        + int(claim.get('neutral_reference_count') or 0)
                    ),
                    'support_refs': int(claim.get('support_reference_count') or 0),
                    'contradict_refs': int(claim.get('contradict_reference_count') or 0),
                    'neutral_refs': int(claim.get('neutral_reference_count') or 0),
                    'claim_text': _clip(claim.get('claim_text'), 180),
                    'assessment_reason': _clip(claim.get('reason'), 180),
                }
            )

        summary_df = pd.DataFrame(summary_rows)
        summary_df = summary_df.sort_values(
            by=['claim_state', 'confidence', 'references_total', 'claim_id'],
            ascending=[True, False, False, True],
            na_position='last',
        ).reset_index(drop=True)

        print(f'Non-MITRE claims: {len(summary_df)}')
        display(summary_df)

        non_mitre_ids = set(summary_df['claim_id'].astype(str).tolist())
        refs = [
            ref
            for ref in claim_references
            if str(ref.get('claim_id') or '') in non_mitre_ids
        ]

        if refs:
            claim_state_by_id = {
                str(row.get('claim_id') or ''): row.get('claim_state')
                for row in summary_rows
            }
            refs_rows = []
            for ref in refs:
                claim_id = str(ref.get('claim_id') or '')
                refs_rows.append(
                    {
                        'claim_id': claim_id,
                        'claim_state': claim_state_by_id.get(claim_id),
                        'stance': ref.get('stance'),
                        'stance_source': ref.get('stance_source'),
                        'citation_id': ref.get('citation_id'),
                        'method': ref.get('method'),
                        'relation_to_mitre': ref.get('relation_to_mitre'),
                        'start': ref.get('start'),
                        'end': ref.get('end'),
                        'quote': _clip(ref.get('quote'), 180),
                        'context': _clip(ref.get('context'), 220),
                        'nli_label': ref.get('nli_label'),
                        'nli_confidence': ref.get('nli_confidence'),
                        'nli_reason': _clip(ref.get('nli_reason'), 140),
                        'reference_url': ref.get('reference_url'),
                    }
                )

            refs_df = pd.DataFrame(refs_rows)
            refs_df = refs_df.sort_values(
                by=['claim_id', 'stance', 'start', 'citation_id'],
                ascending=[True, True, True, True],
                na_position='last',
            ).reset_index(drop=True)

            print(f'Non-MITRE claim references/citations: {len(refs_df)}')
            display(refs_df)
        else:
            print('No reference/citation rows for non-MITRE claims.')



Non-MITRE claims: 18


Unnamed: 0,claim_id,claim_state,confidence,validation_method,claim_scope,source,references_total,support_refs,contradict_refs,neutral_refs,claim_text,validation_explanation
0,claim_7deab3579c4c,partial,0.65,self_referential,cybersecurity,langextract_claim_first_pass,1,1,0,0,They warned victims that hackers could contact...,Citation grounds the claim directly in source ...
1,claim_7f80aede0027,partial,0.65,self_referential,general,langextract_claim_first_pass,1,1,0,0,6.2 million people had information stolen.,Citation grounds the claim directly in source ...
2,claim_a070ef3cd59b,partial,0.65,self_referential,general,langextract_claim_first_pass,1,1,0,0,Customers will be contacted directly by Odido ...,Citation grounds the claim directly in source ...
3,claim_124e9174b84d,partial,0.58,self_referential,cybersecurity,langextract_claim_first_pass,1,1,0,0,The cybercriminals downloaded customer informa...,Citation grounds the claim directly in source ...
4,claim_3035f4d22b62,partial,0.58,self_referential,cybersecurity,langextract_claim_first_pass,1,1,0,0,"South Korea’s major mobile carrier, SK Telecom...",Citation grounds the claim directly in source ...
5,claim_7e2e61c3ce97,partial,0.58,self_referential,cybersecurity,langextract_claim_first_pass,1,1,0,0,Cyberattacks on large national telecoms have r...,Citation grounds the claim directly in source ...
6,claim_86597f12534f,partial,0.58,self_referential,cybersecurity,langextract_claim_first_pass,1,1,0,0,Investigators traced the attack back to a comp...,Citation grounds the claim directly in source ...
7,claim_cdb5f61c567e,partial,0.58,self_referential,cybersecurity,langextract_claim_first_pass,1,1,0,0,No cybercriminals have come forward to claim t...,Citation grounds the claim directly in source ...
8,claim_cf1c13939d3b,partial,0.58,self_referential,cybersecurity,langextract_claim_first_pass,1,1,0,0,The breach exposed the personal data of about ...,Citation grounds the claim directly in source ...
9,claim_1c53981c8b8a,supported,0.88,self_referential,cybersecurity,falsifiable_claim_extraction,1,1,0,0,Abildgaard said the incident took place on Feb...,Citation grounds the claim directly in source ...


Non-MITRE claim references/citations: 18


Unnamed: 0,claim_id,claim_state,stance,stance_source,citation_id,method,relation_to_mitre,start,end,quote,context,nli_label,nli_confidence,nli_reason,reference_url
0,claim_124e9174b84d,partial,support,heuristic,cite_280dbcb8_1,extraction_text,self_referential_source_support,773,845,The cybercriminals downloaded customer informa...,The cybercriminals downloaded customer informa...,,,,https://web.archive.org/web/20260213141303/htt...
1,claim_1c53981c8b8a,supported,support,heuristic,cite_bd71bbdf_1,extraction_span,self_referential_source_support,528,679,Abildgaard said the incident took place on Feb...,Abildgaard said the incident took place on Feb...,,,,https://web.archive.org/web/20260213141303/htt...
2,claim_3035f4d22b62,partial,support,heuristic,cite_1138473b_1,extraction_text,self_referential_source_support,1661,1872,"South Korea’s major mobile carrier, SK Telecom...","South Korea’s major mobile carrier, SK Telecom...",,,,https://web.archive.org/web/20260213141303/htt...
3,claim_399b935a13ac,supported,support,heuristic,cite_af95c206_1,extraction_span,self_referential_source_support,1413,1572,Odido has about 7 million customers and has ch...,Odido has about 7 million customers and has ch...,,,,https://web.archive.org/web/20260213141303/htt...
4,claim_3a25a33f834b,supported,support,heuristic,cite_ee6b523b_1,extraction_span,self_referential_source_support,2173,2240,Jonathan Greig is a Breaking News Reporter at ...,Jonathan Greig is a Breaking News Reporter at ...,,,,https://web.archive.org/web/20260213141303/htt...
5,claim_4d21f2fba53a,supported,support,heuristic,cite_b2fb8531_1,extraction_span,self_referential_source_support,189,280,"The company Odido, told a local news outlet th...","The company Odido, told a local news outlet th...",,,,https://web.archive.org/web/20260213141303/htt...
6,claim_4eb4ae824013,supported,support,heuristic,cite_bf766e4e_1,extraction_span,self_referential_source_support,0,188,Dutch mobile phone giant Odido announces data ...,Dutch mobile phone giant Odido announces data ...,,,,https://web.archive.org/web/20260213141303/htt...
7,claim_6490d9989e48,supported,support,heuristic,cite_3d9c2c2d_1,extraction_span,self_referential_source_support,1941,2172,"Last month, French regulators fined a French t...","Last month, French regulators fined a French t...",,,,https://web.archive.org/web/20260213141303/htt...
8,claim_7deab3579c4c,partial,support,heuristic,cite_e762a4b0_1,extraction_text,self_referential_source_support,1210,1342,They warned victims that hackers could contact...,They warned victims that hackers could contact...,,,,https://web.archive.org/web/20260213141303/htt...
9,claim_7e2e61c3ce97,partial,support,heuristic,cite_08db8ec8_1,extraction_text,self_referential_source_support,1573,1660,Cyberattacks on large national telecoms have r...,Cyberattacks on large national telecoms have r...,,,,https://web.archive.org/web/20260213141303/htt...


## 6) Build a NetworkX relationship graph


In [6]:
def _short(text: str, limit: int = 90) -> str:
    value = ' '.join(str(text or '').split())
    if len(value) <= limit:
        return value
    return value[: max(0, limit - 3)] + '...'


def build_relationship_graph(payload: dict, max_claims: int = 50, max_citations_per_linkage: int = 2) -> nx.DiGraph:
    graph = nx.DiGraph()

    reference = payload.get('reference') or {}
    status = payload.get('status') or {}
    reference_url = reference.get('resolved_url') or (reference.get('meta') or {}).get('url') or 'unknown'
    source_name = (reference.get('meta') or {}).get('source_name') or 'reference'

    reference_node = 'reference::document'
    graph.add_node(
        reference_node,
        label=f'Reference: {source_name}',
        node_type='reference',
        hover=f'URL: {reference_url}',
    )

    status_node = 'status::results_source'
    graph.add_node(
        status_node,
        label=f'results_source={status.get("results_source", "unknown")}',
        node_type='status',
        hover=json.dumps(status, indent=2),
    )
    graph.add_edge(reference_node, status_node, relation='processed_as')

    claims = list(payload.get('claims') or [])
    if max_claims:
        claims = claims[:max_claims]

    claim_node_by_id: dict[str, str] = {}
    technique_node_by_key: dict[str, str] = {}

    for index, claim in enumerate(claims, start=1):
        claim_id = str(claim.get('claim_id') or f'claim_{index}')
        claim_node = f'claim::{claim_id}'
        claim_text = str(claim.get('text') or '').strip()
        validation = claim.get('validation') or {}
        verdict = validation.get('verdict')
        score = validation.get('score')
        explanation = validation.get('explanation')

        graph.add_node(
            claim_node,
            label=f'Claim {index}: {_short(claim_text, 80)}',
            node_type='claim',
            hover=(
                f'{claim_text}\n\n'
                f'verdict={verdict} score={score}\n'
                f'validation_explanation={explanation}'
            ),
        )
        graph.add_edge(reference_node, claim_node, relation='contains_claim')
        claim_node_by_id[claim_id] = claim_node

        validation_method = str(claim.get('validation_method') or '').strip()
        if validation_method:
            vm_node = f'validation_method::{validation_method}'
            if vm_node not in graph:
                graph.add_node(
                    vm_node,
                    label=f'Validation method: {validation_method}',
                    node_type='validation_method',
                    hover=validation_method,
                )
            graph.add_edge(claim_node, vm_node, relation='validated_by')

        mitre_id = str(claim.get('mitre_id') or '').strip()
        technique_name = str(claim.get('technique_name') or '').strip()
        if mitre_id or technique_name:
            key = mitre_id or technique_name
            technique_node = f'technique::{key}'
            if technique_node not in graph:
                label = f'{mitre_id} {technique_name}'.strip() or key
                graph.add_node(
                    technique_node,
                    label=f'Technique: {_short(label, 80)}',
                    node_type='technique',
                    hover=label,
                )
            graph.add_edge(claim_node, technique_node, relation='maps_to')
            technique_node_by_key[key] = technique_node

    for linkage in payload.get('mitre_linkages') or []:
        claim_id = str(linkage.get('claim_id') or '').strip()
        claim_node = claim_node_by_id.get(claim_id)
        if not claim_node:
            continue

        mitre_id = str(linkage.get('mitre_id') or '').strip()
        technique_name = str(linkage.get('technique_name') or '').strip()
        technique_key = mitre_id or technique_name
        technique_node = technique_node_by_key.get(technique_key)

        citations = (linkage.get('citations') or [])[:max_citations_per_linkage]
        for idx, citation in enumerate(citations, start=1):
            citation_node = f'citation::{claim_id}::{idx}'
            relation = str(citation.get('relation_to_mitre') or citation.get('method') or 'supports')
            quote = str(citation.get('quote') or '')
            offsets = f'{citation.get("start")}:{citation.get("end")}'
            validation_explanation = str(citation.get('validation_explanation') or '')

            graph.add_node(
                citation_node,
                label=f'Citation {idx}: {_short(quote, 75)}',
                node_type='citation',
                hover=(
                    f'offsets={offsets}\n'
                    f'method={citation.get("method")}\n'
                    f'relation_to_mitre={relation}\n'
                    f'validation_explanation={validation_explanation}'
                ),
            )
            graph.add_edge(claim_node, citation_node, relation='supported_by')
            if technique_node:
                graph.add_edge(citation_node, technique_node, relation=relation)

    return graph

## 7) Interactive visualization (Plotly over NetworkX layout)


In [7]:
def render_interactive_graph(graph: nx.DiGraph, seed: int = 42, width: int = 1300, height: int = 900):
    if graph.number_of_nodes() == 0:
        print('Graph is empty.')
        return None

    k_value = 1.4 / max(1.0, graph.number_of_nodes() ** 0.5)
    pos = nx.spring_layout(graph, seed=seed, k=k_value)

    edge_x, edge_y = [], []
    edge_label_x, edge_label_y, edge_labels = [], [], []
    for src, dst, edge_data in graph.edges(data=True):
        x0, y0 = pos[src]
        x1, y1 = pos[dst]
        edge_x.extend([x0, x1, None])
        edge_y.extend([y0, y1, None])
        edge_label_x.append((x0 + x1) / 2)
        edge_label_y.append((y0 + y1) / 2)
        edge_labels.append(str(edge_data.get('relation') or ''))

    edge_trace = go.Scatter(
        x=edge_x,
        y=edge_y,
        mode='lines',
        line=dict(width=1, color='#9ca3af'),
        hoverinfo='none',
        showlegend=False,
    )

    edge_label_trace = go.Scatter(
        x=edge_label_x,
        y=edge_label_y,
        mode='text',
        text=edge_labels,
        textfont=dict(size=10, color='#6b7280'),
        hoverinfo='none',
        showlegend=False,
    )

    node_style = {
        'reference': {'color': '#2563eb', 'symbol': 'diamond', 'size': 24},
        'status': {'color': '#0891b2', 'symbol': 'square', 'size': 18},
        'claim': {'color': '#059669', 'symbol': 'circle', 'size': 16},
        'technique': {'color': '#dc2626', 'symbol': 'hexagon', 'size': 18},
        'citation': {'color': '#d97706', 'symbol': 'triangle-up', 'size': 14},
        'validation_method': {'color': '#7c3aed', 'symbol': 'square-dot', 'size': 14},
        'other': {'color': '#6b7280', 'symbol': 'circle', 'size': 12},
    }

    traces = [edge_trace, edge_label_trace]
    node_types = sorted({graph.nodes[node].get('node_type', 'other') for node in graph.nodes})

    for node_type in node_types:
        nodes = [node for node in graph.nodes if graph.nodes[node].get('node_type', 'other') == node_type]
        if not nodes:
            continue
        style = node_style.get(node_type, node_style['other'])
        traces.append(
            go.Scatter(
                x=[pos[node][0] for node in nodes],
                y=[pos[node][1] for node in nodes],
                mode='markers+text',
                name=node_type,
                text=[_short(graph.nodes[node].get('label', node), 46) for node in nodes],
                textposition='top center',
                hovertext=[graph.nodes[node].get('hover', '') for node in nodes],
                hoverinfo='text',
                marker=dict(
                    size=style['size'],
                    color=style['color'],
                    symbol=style['symbol'],
                    line=dict(width=1, color='#1f2937'),
                ),
            )
        )

    fig = go.Figure(
        data=traces,
        layout=go.Layout(
            title='Claim/Citation/MITRE relationship graph',
            width=width,
            height=height,
            hovermode='closest',
            showlegend=True,
            margin=dict(l=20, r=20, t=40, b=20),
            xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
            yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
            dragmode='pan',
        ),
    )
    fig.show()
    return fig

graph = build_relationship_graph(result, max_claims=50, max_citations_per_linkage=2)
print(f'nodes={graph.number_of_nodes()} edges={graph.number_of_edges()}')
render_interactive_graph(graph)

nodes=31 edges=54
