In [0]:
%sql
-- ============================================================
-- NOTEBOOK A: Research Actions (v2.1)
-- Run BEFORE Notebook B (signal detection)
--
-- v2 additions vs v1:
--   gold_research_action: DOWNLOAD_DOCUMENTS, TRANSCRIBE_DOCUMENTS,
--     RESOLVE_CONFLICTS, INVESTIGATE_OCCUPATION, INVESTIGATE_LATER_LIFE
--   ref_intent_category_weights: narrative/context at 0.25
--   ref_signal_weights: SIGNAL_LATE_LIFE_GAP, SIGNAL_EARLY_LIFE_ONLY,
--     SIGNAL_CHILD_GAPS, SIGNAL_MISSING_CENSUS_COVERAGE,
--     SIGNAL_UNCOVERED_SOURCES, SIGNAL_IMPRECISE_PLACES,
--     SIGNAL_DOCS_NOT_TRANSCRIBED, SIGNAL_FACT_CONFLICT,
--     SIGNAL_POSSIBLE_OCCUPATION, SIGNAL_HIGH_FAMILY_PAYOFF,
--     SIGNAL_TRANSCRIPT_AVAILABLE, SIGNAL_VARIED_OCCUPATIONS,
--     SIGNAL_STORY_WRITTEN (negative weight)
--   gold_research_signal_action: new OCR signal mappings,
--     new lifecycle/context signal mappings
--
-- v2.1 patch (incorporated):
--   ref_signal_weights:
--     SIGNAL_LOW_EVIDENCE_DENSITY   base_score 30 -> 15,
--                                   threshold label updated (< 1.0 not < 1.2)
--     SIGNAL_VERY_LOW_EVIDENCE_DENSITY  NEW, base_score 30, threshold < 0.3
--     SIGNAL_UNCOVERED_SOURCES      base_score 25 -> 15
--                                   (tightened: > 1 source AND proximity <= 2)
--     SIGNAL_POSSIBLE_MARRIAGE      NEW entry, context/narrative, base 15
--     SIGNAL_POSSIBLE_CHILDREN      NEW entry, context/narrative, base 15
--     SIGNAL_POSSIBLE_RESIDENCE     NEW entry, context/narrative, base 15
--   gold_research_signal_action:
--     SIGNAL_VERY_LOW_EVIDENCE_DENSITY  NEW mappings
--     SIGNAL_POSSIBLE_MARRIAGE      NEW mappings
--     SIGNAL_POSSIBLE_CHILDREN      NEW mappings
--     SIGNAL_POSSIBLE_RESIDENCE     NEW mappings
-- ============================================================

-- Catalogue of research actions that a detected signal can recommend.
-- One row per action; action_code is the value that
-- genealogy.gold_research_signal_action.action_code refers to.
CREATE OR REPLACE TABLE genealogy.gold_research_action (
  action_code     STRING,   -- unique identifier of the action
  action_label    STRING,   -- human-readable description
  action_category STRING,   -- lifecycle | evidence | family | identity | narrative
  default_effort  INT       -- 1 = low, 2 = medium, 3 = high
);

-- INSERT OVERWRITE (rather than INSERT INTO) keeps this statement idempotent:
-- re-running it on its own replaces the rows instead of appending duplicates.
-- The explicit column list binds each value by name, not by table position.
INSERT OVERWRITE genealogy.gold_research_action
  (action_code, action_label, action_category, default_effort)
VALUES
-- Evidence & identity
('CLUSTER_SOURCES',         'Find corroborating sources',                         'evidence',  2),
('REVIEW_EXISTING_SOURCES', 'Review existing sources closely',                     'evidence',  1),
('RESOLVE_NAME_VARIANTS',   'Resolve name variants / aliases',                     'identity',  2),
('RESOLVE_CONFLICTS',       'Investigate and resolve transcript vs tree conflicts', 'evidence',  2),

-- Lifecycle
('SEARCH_BIRTH_RECORDS',    'Search for birth/baptism records',                    'lifecycle', 1),
('SEARCH_DEATH_RECORDS',    'Search for death/burial records',                     'lifecycle', 1),
('SEARCH_MARRIAGE_RECORDS', 'Search for marriage records',                         'lifecycle', 1),
('SEARCH_CENSUS',           'Search census / population records',                  'lifecycle', 1),

-- Document pipeline
('DOWNLOAD_DOCUMENTS',      'Download cited documents from Ancestry/archive',      'evidence',  2),
('TRANSCRIBE_DOCUMENTS',    'Run OCR transcription pipeline on downloaded docs',   'evidence',  1),

-- Family structure
('TRACE_PARENTS',           'Identify or verify parents',                          'family',    2),
('TRACE_CHILDREN_FORWARD',  'Trace children forward',                              'family',    2),
('VERIFY_FAMILY_EVENTS',    'Source family events (marriage, children)',            'family',    1),

-- Contextual / narrative
('INVESTIGATE_MIGRATION',   'Investigate migration and movement',                  'narrative', 2),
('INVESTIGATE_MILITARY',    'Investigate military service',                        'narrative', 3),
('INVESTIGATE_OCCUPATION',  'Research occupational history and records',           'narrative', 2),
('INVESTIGATE_LATER_LIFE',  'Search for late-life events and records',             'narrative', 2);


In [0]:
%sql
-- ============================================================
-- CELL 2: ref_intent_category_weights
-- The category weights within each intent should sum to 1.0
-- ============================================================

-- Relative weight of each scoring category within an intent.
-- The weights of the categories listed under one intent should total 1.0.
CREATE OR REPLACE TABLE genealogy.ref_intent_category_weights (
  intent   STRING,   -- integrity | narrative
  category STRING,   -- evidence | completeness | texture | family | context
  weight   DOUBLE    -- this category's share of its intent
);

-- Explicit column list: values are bound by name rather than by table position.
INSERT OVERWRITE genealogy.ref_intent_category_weights
  (intent, category, weight)
VALUES
  ('integrity', 'evidence',     0.60),
  ('integrity', 'completeness', 0.40),
  ('narrative', 'texture',      0.45),
  ('narrative', 'family',       0.30),
  -- Signals scored under narrative/context:
  --   SIGNAL_POSSIBLE_OCCUPATION,
  --   SIGNAL_POSSIBLE_RESIDENCE (tightened),
  --   SIGNAL_HIGH_FAMILY_PAYOFF,
  --   SIGNAL_TRANSCRIPT_AVAILABLE,
  --   SIGNAL_VARIED_OCCUPATIONS,
  --   SIGNAL_POSSIBLE_MARRIAGE,
  --   SIGNAL_POSSIBLE_CHILDREN
  ('narrative', 'context',      0.25);


In [0]:
%sql
-- ============================================================
-- CELL 3: ref_signal_weights (v2.1)
--
-- v2 changes vs v1:
--   REMOVED: SIGNAL_HAS_MARRIAGE, SIGNAL_HAS_CHILDREN
--   ADDED:   completeness — SIGNAL_LATE_LIFE_GAP, SIGNAL_EARLY_LIFE_ONLY,
--                           SIGNAL_CHILD_GAPS, SIGNAL_MISSING_CENSUS_COVERAGE,
--                           SIGNAL_UNCOVERED_SOURCES
--   ADDED:   evidence     — SIGNAL_IMPRECISE_PLACES, SIGNAL_DOCS_NOT_TRANSCRIBED,
--                           SIGNAL_FACT_CONFLICT
--   ADDED:   context      — SIGNAL_POSSIBLE_OCCUPATION, SIGNAL_HIGH_FAMILY_PAYOFF,
--                           SIGNAL_TRANSCRIPT_AVAILABLE, SIGNAL_VARIED_OCCUPATIONS
--   ADDED:   texture      — SIGNAL_STORY_WRITTEN (negative weight)
--
-- v2.1 patch changes (incorporated here):
--   SIGNAL_LOW_EVIDENCE_DENSITY   base_score 30 -> 15; threshold now < 1.0
--   SIGNAL_VERY_LOW_EVIDENCE_DENSITY  NEW; threshold < 0.3 or NULL; base 30
--   SIGNAL_UNCOVERED_SOURCES      base_score 25 -> 15
--   SIGNAL_POSSIBLE_MARRIAGE      NEW; context/narrative; base 15
--   SIGNAL_POSSIBLE_CHILDREN      NEW; context/narrative; base 15
--   SIGNAL_POSSIBLE_RESIDENCE     NEW; context/narrative; base 15
-- ============================================================

-- Scoring reference for every research signal: which intent/category the
-- signal contributes to and how many points (base_score) it carries.
CREATE OR REPLACE TABLE genealogy.ref_signal_weights (
  signal_code  STRING,   -- unique identifier of the signal
  category     STRING,   -- evidence | completeness | texture | family | context
  intent       STRING,   -- integrity | narrative
  base_score   INT,      -- points contributed when the signal fires; may be negative
  reason_label STRING,   -- short human-readable reason
  rationale    STRING    -- optional longer explanation; NULL where none is recorded
);

-- Explicit column list: with six positional values per row, binding by name
-- protects the seed data if the table definition is ever reordered or extended.
INSERT OVERWRITE genealogy.ref_signal_weights
  (signal_code, category, intent, base_score, reason_label, rationale)
VALUES

  -- ===========================================================
  -- INTEGRITY — Evidence fragility
  -- ===========================================================
  -- v2.1: base_score 30->15; threshold tightened to < 1.0 (some unsourced facts)
  ('SIGNAL_LOW_EVIDENCE_DENSITY',       'evidence', 'integrity', 15, 'low source density (some unsourced facts)',  'avg sources per fact < 1.0 — at least one fact has no source citation'),
  -- v2.1 NEW: catches largely-unsourced profiles (~8% of tree)
  ('SIGNAL_VERY_LOW_EVIDENCE_DENSITY',  'evidence', 'integrity', 30, 'very low source density (mostly unsourced)', 'avg sources per fact < 0.3 — profile is largely unsourced, ~8% of tree'),
  ('SIGNAL_SINGLE_SOURCE_DEPENDENCE',   'evidence', 'integrity', 25, 'single-source reliance',     'catches illusion-of-certainty problem'),
  ('SIGNAL_UNSOURCED_FAMILY_EVENTS',    'evidence', 'integrity', 25, 'unsourced family events',    'marriages and child births without sources are research traps'),
  ('SIGNAL_IMPRECISE_DATES',            'evidence', 'integrity', 10, 'imprecise dates',            'post-1837 civil registration events where a certificate should exist'),
  ('SIGNAL_INCOMPLETE_NAME',            'evidence', 'integrity', 10, 'incomplete name',            'suggests exhaustive searches not done'),
  ('SIGNAL_IMPRECISE_PLACES',           'evidence', 'integrity', 10, 'imprecise places',           'post-1837 records should resolve to town/parish level'),
  ('SIGNAL_DOCS_NOT_TRANSCRIBED',       'evidence', 'integrity', 15, 'documents not transcribed',  'evidence downloaded but not yet verified via OCR'),
  ('SIGNAL_FACT_CONFLICT',              'evidence', 'integrity', 30, 'transcript conflicts tree',  'highest integrity risk — document contradicts recorded fact'),

  -- ===========================================================
  -- INTEGRITY — Structural completeness
  -- ===========================================================
  ('SIGNAL_NO_BIRTH_RECORDED',        'completeness', 'integrity', 30, 'birth not recorded',           NULL),
  ('SIGNAL_MISSING_PARENT',           'completeness', 'integrity', 25, 'missing parent',               NULL),
  ('SIGNAL_NO_DEATH_RECORDED',        'completeness', 'integrity', 20, 'death not recorded',           NULL),
  ('SIGNAL_NO_MARRIAGES',             'completeness', 'integrity', 15, 'no marriage recorded',         'age-guarded: suppressed for young deaths'),
  ('SIGNAL_NO_CHILDREN',              'completeness', 'integrity', 10, 'no children recorded',         'proximity guard: proximity <= 1 only'),
  ('SIGNAL_MISSING_CENSUS_COVERAGE',  'completeness', 'integrity', 20, 'missing expected census',      'person was alive during census year but no RESI event present'),
  -- v2.1: base_score 25->15; fires only for proximity <= 2 with > 1 uncovered source
  ('SIGNAL_UNCOVERED_SOURCES',        'completeness', 'integrity', 15, 'uncovered cited sources',      'tightened: only fires for proximity <= 2 with > 1 uncovered source'),
  ('SIGNAL_LATE_LIFE_GAP',            'completeness', 'integrity', 15, 'late-life records gap',        'no events recorded after age 40 despite expected survival'),
  ('SIGNAL_EARLY_LIFE_ONLY',          'completeness', 'integrity', 10, 'early-life records only',      'classic birth+parents-only profile, no adult records'),
  ('SIGNAL_CHILD_GAPS',               'completeness', 'integrity', 10, 'gaps between child births',    'unusually long inter-birth gap or marriage with no children'),

  -- ===========================================================
  -- NARRATIVE — Life texture
  -- ===========================================================
  ('SIGNAL_MIGRANT',         'texture', 'narrative',  30, 'evidence of migration',        NULL),
  ('SIGNAL_MILITARY',        'texture', 'narrative',  25, 'evidence of military service', NULL),
  ('SIGNAL_POSSIBLE_WWI',    'texture', 'narrative',  15, 'possible WWI service',         NULL),
  ('SIGNAL_POSSIBLE_WWII',   'texture', 'narrative',  15, 'possible WWII service',        NULL),
  ('SIGNAL_YOUNG_DEATH',     'texture', 'narrative',  20, 'young death',                  NULL),
  -- Negative score: lowers narrative priority once a story exists
  ('SIGNAL_STORY_WRITTEN',   'texture', 'narrative', -50, 'story already written',        'suppresses narrative priority once story is complete'),

  -- ===========================================================
  -- NARRATIVE — Family
  -- ===========================================================
  ('SIGNAL_MULTIPLE_SPOUSES', 'family', 'narrative', 40, 'multiple spouses', NULL),

  -- ===========================================================
  -- NARRATIVE — Context (weight 0.25)
  -- ===========================================================
  ('SIGNAL_POSSIBLE_OCCUPATION', 'context', 'narrative', 20, 'occupation records likely available',      'census-present adult — occupational records plausible'),
  ('SIGNAL_HIGH_FAMILY_PAYOFF',  'context', 'narrative', 25, 'high-leverage under-researched family',   'many family events, low source density'),
  ('SIGNAL_TRANSCRIPT_AVAILABLE','context', 'narrative', 20, 'transcribed documents available',         'positive signal: OCR transcripts exist to draw narrative from'),
  ('SIGNAL_VARIED_OCCUPATIONS',  'context', 'narrative', 15, 'varied occupational history',             'multiple distinct occupations recorded — richer life story'),
  -- v2.1 NEW: previously unscored signals now wired up
  ('SIGNAL_POSSIBLE_MARRIAGE',   'context', 'narrative', 15, 'possible unrecorded marriage',            'female in 1939 register with no marriage recorded — likely has married name'),
  ('SIGNAL_POSSIBLE_CHILDREN',   'context', 'narrative', 15, 'possible unrecorded children',            'female in 1911 census born <=1895 — 1911 recorded number of live births'),
  ('SIGNAL_POSSIBLE_RESIDENCE',  'context', 'narrative', 15, 'residence records likely findable',       'family events present but census coverage incomplete — residential records plausible');


In [0]:
%sql
-- ============================================================
-- CELL 4: gold_research_signal_action (v2.1)
--
-- v2 changes vs v1:
--   REMOVED: SIGNAL_HAS_CHILDREN, SIGNAL_HAS_MARRIAGE, SIGNAL_MULTIPLE_SPOUSES
--            mappings that were duplicated or low-value
--   ADDED:   missing action gaps identified in audit
--   ADDED:   new OCR signal -> action mappings
--   ADDED:   new signal mappings for wired-up signals
--
-- v2.1 patch changes (incorporated here):
--   ADDED:   SIGNAL_VERY_LOW_EVIDENCE_DENSITY action mappings (new signal)
--   ADDED:   SIGNAL_POSSIBLE_MARRIAGE action mappings (previously unmapped)
--   ADDED:   SIGNAL_POSSIBLE_CHILDREN action mappings (previously unmapped)
--   ADDED:   SIGNAL_POSSIBLE_RESIDENCE action mappings (previously unmapped)
-- ============================================================

-- Mapping from a research signal to the actions it recommends.
-- A signal may map to several actions; action_weight ranks them per signal.
CREATE OR REPLACE TABLE genealogy.gold_research_signal_action (
  signal_code   STRING,   -- signal that triggers the action
  action_code   STRING,   -- recommended action
  action_weight INT   -- higher = higher priority for this signal
);

-- Note: SIGNAL_TRANSCRIPT_AVAILABLE and SIGNAL_STORY_WRITTEN
--       are read/suppression signals — no research actions triggered,
--       so they intentionally have no rows below.
--
-- INSERT OVERWRITE (rather than INSERT INTO) keeps this statement idempotent:
-- re-running it on its own replaces the mappings instead of duplicating them.
-- The explicit column list binds each value by name, not by table position.
INSERT OVERWRITE genealogy.gold_research_signal_action
  (signal_code, action_code, action_weight)
VALUES

  -- Evidence quality
  ('SIGNAL_LOW_EVIDENCE_DENSITY',          'CLUSTER_SOURCES',         3),
  ('SIGNAL_LOW_EVIDENCE_DENSITY',          'SEARCH_CENSUS',           2),
  ('SIGNAL_LOW_EVIDENCE_DENSITY',          'REVIEW_EXISTING_SOURCES', 2),
  -- v2.1 NEW: same actions and action_weights as LOW; the greater severity is
  -- carried by the signal's base_score in ref_signal_weights (30 vs 15)
  ('SIGNAL_VERY_LOW_EVIDENCE_DENSITY',     'CLUSTER_SOURCES',         3),
  ('SIGNAL_VERY_LOW_EVIDENCE_DENSITY',     'SEARCH_CENSUS',           2),
  ('SIGNAL_VERY_LOW_EVIDENCE_DENSITY',     'REVIEW_EXISTING_SOURCES', 2),
  ('SIGNAL_SINGLE_SOURCE_DEPENDENCE',      'REVIEW_EXISTING_SOURCES', 3),
  ('SIGNAL_SINGLE_SOURCE_DEPENDENCE',      'CLUSTER_SOURCES',         2),
  ('SIGNAL_UNSOURCED_FAMILY_EVENTS',       'VERIFY_FAMILY_EVENTS',    3),
  ('SIGNAL_FACT_CONFLICT',                 'RESOLVE_CONFLICTS',       3),
  ('SIGNAL_DOCS_NOT_TRANSCRIBED',          'TRANSCRIBE_DOCUMENTS',    3),
  ('SIGNAL_UNCOVERED_SOURCES',             'DOWNLOAD_DOCUMENTS',      3),

  -- Identity quality
  ('SIGNAL_INCOMPLETE_NAME',   'SEARCH_MARRIAGE_RECORDS', 3),
  ('SIGNAL_INCOMPLETE_NAME',   'SEARCH_BIRTH_RECORDS',    2),
  ('SIGNAL_IMPRECISE_DATES',   'REVIEW_EXISTING_SOURCES', 3),
  ('SIGNAL_IMPRECISE_DATES',   'SEARCH_BIRTH_RECORDS',    2),
  ('SIGNAL_IMPRECISE_DATES',   'SEARCH_DEATH_RECORDS',    2),
  ('SIGNAL_IMPRECISE_PLACES',  'REVIEW_EXISTING_SOURCES', 3),
  ('SIGNAL_IMPRECISE_PLACES',  'CLUSTER_SOURCES',         2),

  -- Lifecycle gaps
  ('SIGNAL_NO_BIRTH_RECORDED',       'SEARCH_BIRTH_RECORDS',    3),
  ('SIGNAL_NO_DEATH_RECORDED',       'SEARCH_DEATH_RECORDS',    3),
  ('SIGNAL_NO_MARRIAGES',            'SEARCH_MARRIAGE_RECORDS', 3),
  ('SIGNAL_NO_CHILDREN',             'SEARCH_CENSUS',           3),
  ('SIGNAL_NO_CHILDREN',             'SEARCH_BIRTH_RECORDS',    2),
  ('SIGNAL_MISSING_CENSUS_COVERAGE', 'SEARCH_CENSUS',           3),
  ('SIGNAL_LATE_LIFE_GAP',           'INVESTIGATE_LATER_LIFE',  3),
  ('SIGNAL_LATE_LIFE_GAP',           'SEARCH_DEATH_RECORDS',    2),
  ('SIGNAL_EARLY_LIFE_ONLY',         'SEARCH_CENSUS',           3),
  ('SIGNAL_EARLY_LIFE_ONLY',         'CLUSTER_SOURCES',         2),
  ('SIGNAL_CHILD_GAPS',              'SEARCH_BIRTH_RECORDS',    3),
  ('SIGNAL_CHILD_GAPS',              'SEARCH_CENSUS',           2),

  -- Family structure
  ('SIGNAL_MISSING_PARENT',    'TRACE_PARENTS',           3),
  ('SIGNAL_MULTIPLE_SPOUSES',  'VERIFY_FAMILY_EVENTS',    3),

  -- Context & narrative
  ('SIGNAL_MIGRANT',              'INVESTIGATE_MIGRATION',   3),
  ('SIGNAL_MILITARY',             'INVESTIGATE_MILITARY',    3),
  ('SIGNAL_POSSIBLE_WWI',         'INVESTIGATE_MILITARY',    3),
  ('SIGNAL_POSSIBLE_WWII',        'INVESTIGATE_MILITARY',    3),
  ('SIGNAL_YOUNG_DEATH',          'REVIEW_EXISTING_SOURCES', 3),
  ('SIGNAL_YOUNG_DEATH',          'SEARCH_DEATH_RECORDS',    2),
  ('SIGNAL_POSSIBLE_OCCUPATION',  'INVESTIGATE_OCCUPATION',  3),
  ('SIGNAL_POSSIBLE_OCCUPATION',  'SEARCH_CENSUS',           2),
  ('SIGNAL_HIGH_FAMILY_PAYOFF',   'VERIFY_FAMILY_EVENTS',    3),
  ('SIGNAL_HIGH_FAMILY_PAYOFF',   'CLUSTER_SOURCES',         2),
  ('SIGNAL_VARIED_OCCUPATIONS',   'INVESTIGATE_OCCUPATION',  3),
  -- v2.1 NEW: previously unmapped signals
  ('SIGNAL_POSSIBLE_MARRIAGE',    'SEARCH_MARRIAGE_RECORDS', 3),
  ('SIGNAL_POSSIBLE_MARRIAGE',    'SEARCH_CENSUS',           2),
  ('SIGNAL_POSSIBLE_CHILDREN',    'SEARCH_BIRTH_RECORDS',    3),
  ('SIGNAL_POSSIBLE_CHILDREN',    'SEARCH_CENSUS',           2),
  ('SIGNAL_POSSIBLE_RESIDENCE',   'SEARCH_CENSUS',           3),
  ('SIGNAL_POSSIBLE_RESIDENCE',   'CLUSTER_SOURCES',         2);


In [0]:
%sql
-- ============================================================
-- CELL 5: Verification queries
-- Run after all cells above to confirm consistency
-- ============================================================

-- 1. Weighted signals that have no action mapping.
--    Expected result: 0 rows. SIGNAL_TRANSCRIPT_AVAILABLE and
--    SIGNAL_STORY_WRITTEN are deliberately action-free and are excluded.
SELECT
  wt.signal_code,
  wt.intent,
  wt.category,
  wt.base_score
FROM genealogy.ref_signal_weights AS wt
WHERE wt.signal_code NOT IN ('SIGNAL_TRANSCRIPT_AVAILABLE', 'SIGNAL_STORY_WRITTEN')
  -- anti-join: keep only signals with no row in the mapping table
  AND NOT EXISTS (
    SELECT 1
    FROM genealogy.gold_research_signal_action AS sa
    WHERE sa.signal_code = wt.signal_code
  )
ORDER BY wt.intent, wt.category;


In [0]:
%sql
-- 2. Signal -> action mappings whose action_code is not defined in
--    gold_research_action. Expected result: 0 rows.
SELECT
  m.signal_code,
  m.action_code
FROM genealogy.gold_research_signal_action AS m
-- anti-join: keep only mappings with no matching action definition
WHERE NOT EXISTS (
  SELECT 1
  FROM genealogy.gold_research_action AS act
  WHERE act.action_code = m.action_code
);


In [0]:
%sql
-- 3. Category weights summary — one row per intent; total_weight should be
--    1.0 for every intent (integrity as well as narrative).
--    breakdown is wrapped in SORT_ARRAY so the listing is stable across runs:
--    COLLECT_LIST on its own returns elements in a non-deterministic order.
SELECT
  intent,
  ROUND(SUM(weight), 2) AS total_weight,
  SORT_ARRAY(
    COLLECT_LIST(CONCAT(category, '=', CAST(weight AS STRING)))
  ) AS breakdown
FROM genealogy.ref_intent_category_weights
GROUP BY intent
ORDER BY intent;
