In [0]:
%sql
-- ============================================================
-- CELL 1: Update ref_signal_weights
--
-- Changes:
--   SIGNAL_LOW_EVIDENCE_DENSITY   — retargeted to < 1.0, base score reduced 30 -> 15
--   SIGNAL_VERY_LOW_EVIDENCE_DENSITY — NEW, < 0.3 threshold, base score 30
--   SIGNAL_UNCOVERED_SOURCES      — base score reduced 25 -> 15
--   SIGNAL_POSSIBLE_MARRIAGE      — NEW entry, context/narrative
--   SIGNAL_POSSIBLE_CHILDREN      — NEW entry, context/narrative
--   SIGNAL_POSSIBLE_RESIDENCE     — NEW entry, context/narrative
-- ============================================================

-- SIGNAL_LOW_EVIDENCE_DENSITY: the signal now uses the tighter < 1.0 threshold,
-- so its base score is set to 15 and the label/rationale are reworded to match.
UPDATE genealogy.ref_signal_weights
SET rationale    = 'avg sources per fact < 1.0 — at least one fact has no source citation',
    reason_label = 'low source density (some unsourced facts)',
    base_score   = 15
WHERE signal_code = 'SIGNAL_LOW_EVIDENCE_DENSITY';

-- Insert the new VERY_LOW_EVIDENCE_DENSITY signal weight.
--
-- Written as INSERT ... SELECT with a LEFT ANTI JOIN on signal_code so that
-- re-running this cell is a no-op instead of appending a duplicate weight row
-- (a duplicate would fan out any join on signal_code).
--
-- Values are still mapped to the table's columns BY POSITION, exactly as a
-- plain INSERT ... VALUES would; the alias list below only names the inline
-- row for the anti-join.
-- NOTE(review): the real names of columns 2 and 3 are not visible in this
-- notebook — signal_type / signal_class are local labels only.
INSERT INTO genealogy.ref_signal_weights
SELECT new_row.*
FROM VALUES
  ('SIGNAL_VERY_LOW_EVIDENCE_DENSITY', 'evidence', 'integrity', 30,
   'very low source density (mostly unsourced)',
   'avg sources per fact < 0.3 — profile is largely unsourced, ~8% of tree')
  AS new_row (signal_code, signal_type, signal_class, base_score, reason_label, rationale)
LEFT ANTI JOIN genealogy.ref_signal_weights AS existing
  ON existing.signal_code = new_row.signal_code;

-- SIGNAL_UNCOVERED_SOURCES: a commonly-firing signal, so it carries a lower
-- weight. Sets the base score to 15 and records the tightened firing rule.
UPDATE genealogy.ref_signal_weights
SET rationale  = 'tightened: only fires for proximity <= 2 with > 1 uncovered source',
    base_score = 15
WHERE signal_code = 'SIGNAL_UNCOVERED_SOURCES';

-- Add weight entries for three signals that previously had none.
--
-- Written as INSERT ... SELECT with a LEFT ANTI JOIN on signal_code so that
-- re-running this cell skips rows that already exist instead of appending
-- duplicates (duplicates would fan out any join on signal_code).
--
-- Values are still mapped to the table's columns BY POSITION, exactly as a
-- plain INSERT ... VALUES would; the alias list only names the inline rows
-- for the anti-join.
-- NOTE(review): the real names of columns 2 and 3 are not visible in this
-- notebook — signal_type / signal_class are local labels only.
INSERT INTO genealogy.ref_signal_weights
SELECT new_row.*
FROM VALUES
  ('SIGNAL_POSSIBLE_MARRIAGE',  'context', 'narrative', 15,
   'possible unrecorded marriage',
   'female in 1939 register with no marriage recorded — likely has married name'),
  ('SIGNAL_POSSIBLE_CHILDREN',  'context', 'narrative', 15,
   'possible unrecorded children',
   'female in 1911 census born <=1895 — 1911 recorded number of live births'),
  ('SIGNAL_POSSIBLE_RESIDENCE', 'context', 'narrative', 15,
   'residence records likely findable',
   'family events present but census coverage incomplete — residential records plausible')
  AS new_row (signal_code, signal_type, signal_class, base_score, reason_label, rationale)
LEFT ANTI JOIN genealogy.ref_signal_weights AS existing
  ON existing.signal_code = new_row.signal_code;


In [0]:
%sql
-- ============================================================
-- CELL 2: Update gold_research_signal_action
--
-- Changes:
--   Add action mappings for SIGNAL_VERY_LOW_EVIDENCE_DENSITY (new)
--   Add action mappings for SIGNAL_POSSIBLE_MARRIAGE (previously unmapped)
--   Add action mappings for SIGNAL_POSSIBLE_CHILDREN (previously unmapped)
--   Add action mappings for SIGNAL_POSSIBLE_RESIDENCE (previously unmapped)
-- ============================================================

-- Seed signal -> research-action mappings for the new / previously unmapped
-- signals. Rows are inserted by position (three columns per row).
-- NOTE(review): the alias names below are local labels for readability; the
-- target table's actual column names are not visible in this notebook.
INSERT INTO genealogy.gold_research_signal_action
SELECT mapping.*
FROM VALUES
  -- SIGNAL_VERY_LOW_EVIDENCE_DENSITY (new): same actions as
  -- LOW_EVIDENCE_DENSITY but higher weight
  ('SIGNAL_VERY_LOW_EVIDENCE_DENSITY', 'CLUSTER_SOURCES',         3),
  ('SIGNAL_VERY_LOW_EVIDENCE_DENSITY', 'SEARCH_CENSUS',           2),
  ('SIGNAL_VERY_LOW_EVIDENCE_DENSITY', 'REVIEW_EXISTING_SOURCES', 2),
  -- SIGNAL_POSSIBLE_MARRIAGE (previously unmapped)
  ('SIGNAL_POSSIBLE_MARRIAGE',  'SEARCH_MARRIAGE_RECORDS', 3),
  ('SIGNAL_POSSIBLE_MARRIAGE',  'SEARCH_CENSUS',           2),
  -- SIGNAL_POSSIBLE_CHILDREN (previously unmapped)
  ('SIGNAL_POSSIBLE_CHILDREN',  'SEARCH_BIRTH_RECORDS',    3),
  ('SIGNAL_POSSIBLE_CHILDREN',  'SEARCH_CENSUS',           2),
  -- SIGNAL_POSSIBLE_RESIDENCE (previously unmapped)
  ('SIGNAL_POSSIBLE_RESIDENCE', 'SEARCH_CENSUS',           3),
  ('SIGNAL_POSSIBLE_RESIDENCE', 'CLUSTER_SOURCES',         2)
  AS mapping (signal_code, action_code, action_weight);


In [0]:
%sql
-- ============================================================
-- CELL 3: Update gold_research_person_signals view
--
-- Five signal definition changes:
--
-- 1. SIGNAL_LOW_EVIDENCE_DENSITY
--    Was:  avg_sources_per_fact < 1.2  (99% fire rate)
--    Now:  avg_sources_per_fact < 1.0  (facts with no source)
--
-- 2. SIGNAL_VERY_LOW_EVIDENCE_DENSITY (NEW)
--    avg_sources_per_fact < 0.3 OR NULL  (~8% of tree)
--
-- 3. SIGNAL_UNCOVERED_SOURCES
--    Was:  any uncovered source (67% fire rate)
--    Now:  > 1 uncovered source AND proximity <= 2
--
-- 4. SIGNAL_IMPRECISE_DATES
--    Was:  birth or death date not day-precision (67% fire rate)
--    Now:  post-1837 AND imprecision on birth/death/marriage specifically
--          (civil registration events where a certificate should exist)
--
-- 5. SIGNAL_NO_CHILDREN
--    Was:  tree-wide with young-death guard
--    Now:  restricted to proximity <= 1 (direct ancestor or close collateral)
--
-- All other signal definitions are unchanged from Notebook B v2.
-- The full view is reproduced here so this cell is self-contained
-- and can be run independently of Notebook B.
-- ============================================================

CREATE OR REPLACE VIEW genealogy.gold_research_person_signals AS

-- timeline: one row per person who has at least one event with a parsed year.
-- Carries first/last event years, birth/death years and an estimated end of
-- life used by the census-expectation and completeness signals.
--
-- expected_end_year resolution order:
--   1. recorded death year
--   2. birth year + 80, capped at the current year
--   3. first event year + 80, capped at the current year (birth unknown)
--
-- FIX: Spark's LEAST() skips NULL arguments, so the previous
-- LEAST(birth_year + 80, year(current_date)) returned the current year when
-- birth_year was NULL and fallback 3 was never reached. The birth/first-event
-- choice is now made with COALESCE *inside* a single LEAST, which gives the
-- intended result for people with neither a birth nor a death date.
WITH timeline AS (
  SELECT
    t.person_gedcom_id,
    MAX(p.sex)                                    AS sex,
    MIN(event_year_parsed)                        AS first_event_year,
    MAX(event_year_parsed)                        AS last_event_year,
    MAX(age_years)                                AS max_event_age,
    MAX(year(p.death_date))                       AS death_year,
    MAX(year(p.birth_date))                       AS birth_year,
    COALESCE(
      MAX(year(p.death_date)),
      LEAST(
        COALESCE(MAX(year(p.birth_date)), MIN(event_year_parsed)) + 80,
        year(current_date)
      )
    )                                             AS expected_end_year,
    -- Inclusive span in years from the first dated event to expected_end_year.
    (
      COALESCE(
        MAX(year(p.death_date)),
        LEAST(
          COALESCE(MAX(year(p.birth_date)), MIN(event_year_parsed)) + 80,
          year(current_date)
        )
      ) - MIN(event_year_parsed) + 1
    )                                             AS effective_span_years,
    COUNT(*)                                      AS event_count
  FROM genealogy.gold_person_event_timeline t
  JOIN genealogy.gold_person_life p ON p.person_gedcom_id = t.person_gedcom_id
  -- Events without a parsed year are ignored, so MIN(event_year_parsed) is
  -- never NULL within a group.
  WHERE event_year_parsed IS NOT NULL
  GROUP BY t.person_gedcom_id
),

-- evidence: straight projection of the per-person evidence summary columns
-- that the signal expressions read.
evidence AS (
  SELECT
    es.person_gedcom_id,
    es.total_facts,
    es.total_sources,
    es.avg_sources_per_fact,
    es.child_event_count,
    es.marriage_event_count,
    es.family_event_count,
    es.sourced_family_event_count
  FROM genealogy.gold_person_evidence_summary AS es
),

-- proximity: exactly ONE row per person — the ancestor link with the smallest
-- ancestral proximity, plus that ancestor's generation depth.
--
-- FIX: the inner aggregate groups by (person, ancestor), so a person linked to
-- several ancestors produced several rows. MIN(...) was therefore not a
-- per-person minimum, and the LEFT JOIN in the final SELECT emitted the same
-- person more than once. The nearest link is now selected with ROW_NUMBER();
-- ties are broken by depth, then ancestor id, so the result is deterministic.
proximity AS (
  SELECT
    ranked.person_id,
    ranked.proximity,
    ranked.depth,
    ranked.nearest_ancestor_id
  FROM (
    SELECT
      link.person_id,
      link.proximity,
      link.depth,
      link.nearest_ancestor_id,
      ROW_NUMBER() OVER (
        PARTITION BY link.person_id
        -- NULLS LAST: Spark sorts NULLs first for ASC by default, which would
        -- let an unknown proximity win over a known one.
        ORDER BY link.proximity ASC NULLS LAST,
                 link.depth ASC NULLS LAST,
                 link.nearest_ancestor_id
      ) AS rn
    FROM (
      SELECT
        p.person_id,
        MIN(ancestral_proximity)    AS proximity,
        MIN(d.generation_depth)     AS depth,
        d.person_id                 AS nearest_ancestor_id
      FROM genealogy.gold_ancestral_proximity p
      -- The first element of path_to_ancestor identifies the ancestor whose
      -- generation depth is looked up.
      JOIN genealogy.gold_generation_depth d ON d.person_id = p.path_to_ancestor[0]
      GROUP BY p.person_id, d.person_id
    ) link
  ) ranked
  WHERE ranked.rn = 1
),

-- census_coverage: per person, a 1/0 flag for each census year (and the 1939
-- register) saying whether the person is expected to appear in it: born before
-- the census year and expected to be alive in it. An unknown birth year makes
-- the comparison NULL, which falls through to ELSE 0.
census_coverage AS (
  SELECT
    tl.person_gedcom_id,
    CASE WHEN tl.birth_year <= 1840 AND tl.expected_end_year >= 1841
         THEN 1 ELSE 0 END                                   AS expected_1841,
    CASE WHEN tl.birth_year <= 1850 AND tl.expected_end_year >= 1851
         THEN 1 ELSE 0 END                                   AS expected_1851,
    CASE WHEN tl.birth_year <= 1860 AND tl.expected_end_year >= 1861
         THEN 1 ELSE 0 END                                   AS expected_1861,
    CASE WHEN tl.birth_year <= 1870 AND tl.expected_end_year >= 1871
         THEN 1 ELSE 0 END                                   AS expected_1871,
    CASE WHEN tl.birth_year <= 1880 AND tl.expected_end_year >= 1881
         THEN 1 ELSE 0 END                                   AS expected_1881,
    CASE WHEN tl.birth_year <= 1890 AND tl.expected_end_year >= 1891
         THEN 1 ELSE 0 END                                   AS expected_1891,
    CASE WHEN tl.birth_year <= 1900 AND tl.expected_end_year >= 1901
         THEN 1 ELSE 0 END                                   AS expected_1901,
    CASE WHEN tl.birth_year <= 1910 AND tl.expected_end_year >= 1911
         THEN 1 ELSE 0 END                                   AS expected_1911,
    CASE WHEN tl.birth_year <= 1920 AND tl.expected_end_year >= 1921
         THEN 1 ELSE 0 END                                   AS expected_1921,
    CASE WHEN tl.birth_year <= 1938 AND tl.expected_end_year >= 1939
         THEN 1 ELSE 0 END                                   AS expected_1939
  FROM timeline AS tl
),

-- uncovered_counts: number of source-coverage rows with status 'UNCOVERED'
-- per person. People with none have no row here (callers COALESCE to 0).
uncovered_counts AS (
  SELECT
    cov.person_gedcom_id,
    COUNT(1) AS uncovered_count
  FROM genealogy.gold_source_coverage AS cov
  WHERE cov.coverage_status = 'UNCOVERED'
  GROUP BY cov.person_gedcom_id
),

-- ocr_signals: one row per person in gold_person_life with three boolean flags:
--   has_fact_conflict        — any fact comparison with status 'CONFLICT'
--   has_docs_not_transcribed — any source coverage row with status
--                              'DOCUMENT_NO_TRANSCRIPT'
--   has_transcript           — any row in gold_transcript_facts
--
-- Each lookup is reduced to a distinct list of matching person ids BEFORE the
-- join. Joining the raw fact-comparison and source-coverage tables side by
-- side multiplied rows per person (comparisons x coverage rows) only to
-- collapse them again with MAX(); the flags produced here are the same.
ocr_signals AS (
  SELECT
    people.person_gedcom_id,
    conflicts.person_gedcom_id     IS NOT NULL AS has_fact_conflict,
    untranscribed.person_gedcom_id IS NOT NULL AS has_docs_not_transcribed,
    transcripts.person_gedcom_id   IS NOT NULL AS has_transcript
  FROM (
    SELECT DISTINCT person_gedcom_id
    FROM genealogy.gold_person_life
  ) people
  LEFT JOIN (
    SELECT DISTINCT person_gedcom_id
    FROM genealogy.gold_fact_comparison
    WHERE status = 'CONFLICT'
  ) conflicts ON conflicts.person_gedcom_id = people.person_gedcom_id
  LEFT JOIN (
    SELECT DISTINCT person_gedcom_id
    FROM genealogy.gold_source_coverage
    WHERE coverage_status = 'DOCUMENT_NO_TRANSCRIPT'
  ) untranscribed ON untranscribed.person_gedcom_id = people.person_gedcom_id
  LEFT JOIN (
    SELECT DISTINCT person_gedcom_id
    FROM genealogy.gold_transcript_facts
  ) transcripts ON transcripts.person_gedcom_id = people.person_gedcom_id
),

-- story_status: per-person story_written flag from the silver story table.
story_status AS (
  SELECT
    st.person_gedcom_id,
    st.story_written
  FROM genealogy.silver_person_story_status AS st
)

-- Final projection of the view: the person id, lineage context columns, and
-- one boolean SIGNAL_* column per research signal.
--
-- Every CASE-built signal ends in ELSE FALSE, so a comparison that evaluates
-- to NULL (e.g. because a LEFT JOINed row is missing or a year is unknown)
-- yields FALSE rather than NULL.
SELECT
  p.person_gedcom_id,

  -- ── Structural / lineage context ─────────────────────────────────────────
  -- TRUE when the person is their own nearest ancestor in the proximity CTE.
  CASE WHEN p.person_gedcom_id = pr.nearest_ancestor_id THEN TRUE ELSE FALSE END
    AS is_direct_ancestor,
  pr.depth,
  pr.proximity,
  p.event_count,
  p.effective_span_years,

  CASE WHEN pr.proximity = 0 THEN TRUE ELSE FALSE END  AS SIGNAL_DIRECT_ANCESTOR,
  CASE WHEN pr.proximity = 1 THEN TRUE ELSE FALSE END  AS SIGNAL_CLOSE_COLLATERAL,

  -- ── INTEGRITY — Completeness ──────────────────────────────────────────────

  CASE WHEN p.birth_year IS NULL
    THEN TRUE ELSE FALSE END
    AS SIGNAL_NO_BIRTH_RECORDED,

  -- No death year, the estimated end of life is already in the past, and the
  -- birth year is either unknown or no later than 1930.
  CASE WHEN p.death_year IS NULL
    AND p.expected_end_year < year(current_date)
    AND (p.birth_year IS NULL OR p.birth_year <= 1930)
    THEN TRUE ELSE FALSE END
    AS SIGNAL_NO_DEATH_RECORDED,

  -- No marriages, unless the person is known to have died before age 16.
  -- With a death year but no birth year, the COALESCE substitutes
  -- death_year - 40, making the age 40 so the guard passes.
  CASE WHEN fs.num_marriages = 0
    AND (
      p.death_year IS NULL
      OR (p.death_year - COALESCE(p.birth_year, p.death_year - 40)) >= 16
    )
    THEN TRUE ELSE FALSE END
    AS SIGNAL_NO_MARRIAGES,

  -- PATCHED: restricted to proximity <= 1 (direct ancestor or close collateral)
  -- Was: tree-wide with young-death guard only (~59% fire rate)
  -- Now: only meaningful where a missing generation is a genuine research gap
  -- People with no proximity row are treated as proximity 99 and never fire.
  -- The NOT (...) clause is the same condition as SIGNAL_YOUNG_DEATH below.
  CASE WHEN fs.num_child_births = 0
    AND COALESCE(pr.proximity, 99) <= 1
    AND NOT (
      p.death_year IS NOT NULL
      AND p.effective_span_years BETWEEN 16 AND 40
    )
    THEN TRUE ELSE FALSE END
    AS SIGNAL_NO_CHILDREN,

  CASE WHEN fs.num_parents < 2
    THEN TRUE ELSE FALSE END
    AS SIGNAL_MISSING_PARENT,

  -- Fires when at least one census (or the 1939 register) in which the person
  -- is expected to appear has its fact-summary flag equal to 0.
  CASE WHEN (
      (cc.expected_1841 > 0 AND fs.has_1841_census = 0) OR
      (cc.expected_1851 > 0 AND fs.has_1851_census = 0) OR
      (cc.expected_1861 > 0 AND fs.has_1861_census = 0) OR
      (cc.expected_1871 > 0 AND fs.has_1871_census = 0) OR
      (cc.expected_1881 > 0 AND fs.has_1881_census = 0) OR
      (cc.expected_1891 > 0 AND fs.has_1891_census = 0) OR
      (cc.expected_1901 > 0 AND fs.has_1901_census = 0) OR
      (cc.expected_1911 > 0 AND fs.has_1911_census = 0) OR
      (cc.expected_1921 > 0 AND fs.has_1921_census = 0) OR
      (cc.expected_1939 > 0 AND fs.has_1939_register = 0)
    )
    THEN TRUE ELSE FALSE END
    AS SIGNAL_MISSING_CENSUS_COVERAGE,

  -- PATCHED: > 1 uncovered source AND proximity <= 2
  -- Was: any uncovered source (~68% fire rate)
  -- Now: multiple uncovered sources on people close to the direct line
  -- Missing uncovered_counts row counts as 0; missing proximity counts as 99.
  CASE WHEN COALESCE(uc.uncovered_count, 0) > 1
    AND COALESCE(pr.proximity, 99) <= 2
    THEN TRUE ELSE FALSE END
    AS SIGNAL_UNCOVERED_SOURCES,

  COALESCE(o.has_docs_not_transcribed, FALSE) AS SIGNAL_DOCS_NOT_TRANSCRIBED,

  -- No death year and the last dated event falls before birth year + 40.
  -- An unknown birth year makes the comparison NULL, so the signal is FALSE.
  CASE WHEN
    p.death_year IS NULL
    AND p.last_event_year < p.birth_year + 40
    THEN TRUE ELSE FALSE END
    AS SIGNAL_LATE_LIFE_GAP,

  CASE WHEN
    p.max_event_age <= 25
    AND e.family_event_count > 0
    THEN TRUE ELSE FALSE END
    AS SIGNAL_EARLY_LIFE_ONLY,

  -- Either a gap of more than 730 days between recorded child births, or at
  -- least one marriage with no recorded child births.
  CASE WHEN
    fs.max_days_between_child_births > 730
    OR (fs.num_marriages > 0 AND fs.num_child_births = 0)
    THEN TRUE ELSE FALSE END
    AS SIGNAL_CHILD_GAPS,

  -- ── INTEGRITY — Evidence fragility ───────────────────────────────────────

  -- PATCHED: threshold tightened from < 1.2 to < 1.0
  -- Was: 99.3% fire rate
  -- Now: fires only when at least one fact has no source citation
  -- A NULL average is treated as 0, so it also fires.
  CASE WHEN COALESCE(e.avg_sources_per_fact, 0) < 1.0
    THEN TRUE ELSE FALSE END
    AS SIGNAL_LOW_EVIDENCE_DENSITY,

  -- NEW: the very sparse end of the distribution (NULL or < 0.3)
  -- Catches profiles that are largely unsourced — ~8% of tree
  -- Any row that fires here also fires SIGNAL_LOW_EVIDENCE_DENSITY above.
  CASE WHEN e.avg_sources_per_fact IS NULL
    OR e.avg_sources_per_fact < 0.3
    THEN TRUE ELSE FALSE END
    AS SIGNAL_VERY_LOW_EVIDENCE_DENSITY,

  CASE WHEN e.total_sources = 1 AND e.total_facts >= 3
    THEN TRUE ELSE FALSE END
    AS SIGNAL_SINGLE_SOURCE_DEPENDENCE,

  CASE WHEN e.family_event_count > e.sourced_family_event_count
    THEN TRUE ELSE FALSE END
    AS SIGNAL_UNSOURCED_FAMILY_EVENTS,

  CASE WHEN fs.has_given_name = 0 OR fs.has_surname = 0
    THEN TRUE ELSE FALSE END
    AS SIGNAL_INCOMPLETE_NAME,

  -- PATCHED: restricted to post-1837 civil registration events
  -- Was: any birth or death not day-precision (~67% fire rate)
  -- Now: post-1837 birth/death/marriage where a certificate should exist
  --      and would give an exact date if found
  -- A NULL precision value makes the <> comparison NULL, so that branch
  -- does not fire.
  CASE WHEN
    (
      (p.birth_year  >= 1837 AND fs.birth_date_precision  <> 'DAY')
      OR
      (p.death_year  >= 1837 AND fs.death_date_precision  <> 'DAY')
      OR
      (fs.marriage_date_precision IS NOT NULL
        AND fs.earliest_marriage_year >= 1837
        AND fs.marriage_date_precision <> 'DAY')
    )
    THEN TRUE ELSE FALSE END
    AS SIGNAL_IMPRECISE_DATES,

  -- Only evaluated for birth years >= 1600. A place is treated as imprecise
  -- when it is NULL or contains no comma: LENGTH(x) - LENGTH(REPLACE(x, ',', ''))
  -- is the number of commas in x.
  CASE WHEN
    p.birth_year IS NOT NULL AND p.birth_year >= 1600
    AND (
      (p.birth_year IS NOT NULL AND (
        fs.birth_place IS NULL
        OR LENGTH(fs.birth_place) - LENGTH(REPLACE(fs.birth_place, ',', '')) < 1
      ))
      OR
      (p.death_year IS NOT NULL AND (
        fs.death_place IS NULL
        OR LENGTH(fs.death_place) - LENGTH(REPLACE(fs.death_place, ',', '')) < 1
      ))
    )
    THEN TRUE ELSE FALSE END
    AS SIGNAL_IMPRECISE_PLACES,

  COALESCE(o.has_fact_conflict, FALSE) AS SIGNAL_FACT_CONFLICT,

  -- ── NARRATIVE — Texture ───────────────────────────────────────────────────

  CASE WHEN fs.num_military > 0
    THEN TRUE ELSE FALSE END
    AS SIGNAL_MILITARY,

  CASE WHEN fs.num_marriages > 1
    THEN TRUE ELSE FALSE END
    AS SIGNAL_MULTIPLE_SPOUSES,

  -- Known death with an effective span (first dated event to end of life,
  -- inclusive) of 16 to 40 years.
  CASE WHEN
    p.death_year IS NOT NULL
    AND p.effective_span_years BETWEEN 16 AND 40
    THEN TRUE ELSE FALSE END
    AS SIGNAL_YOUNG_DEATH,

  CASE WHEN fs.num_migration > 0 OR fs.num_countries > 1
    THEN TRUE ELSE FALSE END
    AS SIGNAL_MIGRANT,

  -- Male, born 1867–1904, and expected to be alive in 1914 or later.
  CASE WHEN
    p.sex = 'M'
    AND (p.birth_year BETWEEN 1867 AND 1904)
    AND p.expected_end_year >= 1914
    THEN TRUE ELSE FALSE END
    AS SIGNAL_POSSIBLE_WWI,

  -- Male born 1891–1929 or female born 1911–1925, and expected to be alive
  -- in 1939 or later.
  CASE WHEN
    (
      (p.sex = 'M' AND (p.birth_year BETWEEN 1891 AND 1929))
      OR
      (p.sex = 'F' AND (p.birth_year BETWEEN 1911 AND 1925))
    )
    AND p.expected_end_year >= 1939
    THEN TRUE ELSE FALSE END
    AS SIGNAL_POSSIBLE_WWII,

  COALESCE(ss.story_written, FALSE) AS SIGNAL_STORY_WRITTEN,

  -- ── NARRATIVE — Context ───────────────────────────────────────────────────

  CASE WHEN
    (fs.has_1921_census = 1 AND p.birth_year <= 1907)
    OR fs.num_child_births > 0
    THEN TRUE ELSE FALSE END
    AS SIGNAL_POSSIBLE_OCCUPATION,

  CASE WHEN fs.num_occupations > 1
    THEN TRUE ELSE FALSE END
    AS SIGNAL_VARIED_OCCUPATIONS,

  -- Same census-gap test as SIGNAL_MISSING_CENSUS_COVERAGE, additionally
  -- requiring at least one marriage or child birth on record.
  CASE WHEN
    (fs.num_marriages > 0 OR fs.num_child_births > 0)
    AND (
      (cc.expected_1841 > 0 AND fs.has_1841_census = 0) OR
      (cc.expected_1851 > 0 AND fs.has_1851_census = 0) OR
      (cc.expected_1861 > 0 AND fs.has_1861_census = 0) OR
      (cc.expected_1871 > 0 AND fs.has_1871_census = 0) OR
      (cc.expected_1881 > 0 AND fs.has_1881_census = 0) OR
      (cc.expected_1891 > 0 AND fs.has_1891_census = 0) OR
      (cc.expected_1901 > 0 AND fs.has_1901_census = 0) OR
      (cc.expected_1911 > 0 AND fs.has_1911_census = 0) OR
      (cc.expected_1921 > 0 AND fs.has_1921_census = 0) OR
      (cc.expected_1939 > 0 AND fs.has_1939_register = 0)
    )
    THEN TRUE ELSE FALSE END
    AS SIGNAL_POSSIBLE_RESIDENCE,

  -- A NULL avg_sources_per_fact makes the comparison NULL, so this does not
  -- fire for it (unlike the two evidence-density signals above).
  CASE WHEN
    e.family_event_count >= 3
    AND e.avg_sources_per_fact < 1.5
    THEN TRUE ELSE FALSE END
    AS SIGNAL_HIGH_FAMILY_PAYOFF,

  -- Female, present in the 1939 register, with no marriage recorded.
  CASE WHEN
    p.sex = 'F'
    AND fs.has_1939_register = 1
    AND fs.num_marriages = 0
    THEN TRUE ELSE FALSE END
    AS SIGNAL_POSSIBLE_MARRIAGE,

  -- Female, present in the 1911 census, born 1895 or earlier.
  CASE WHEN
    p.sex = 'F'
    AND fs.has_1911_census = 1
    AND p.birth_year <= 1895
    THEN TRUE ELSE FALSE END
    AS SIGNAL_POSSIBLE_CHILDREN,

  COALESCE(o.has_transcript, FALSE) AS SIGNAL_TRANSCRIPT_AVAILABLE

-- timeline drives the row set. evidence and the fact summary are INNER joins,
-- so a person missing from either is excluded from the view; the remaining
-- CTEs are optional (LEFT JOIN) and contribute NULLs when absent.
FROM timeline p
JOIN  evidence e                  ON p.person_gedcom_id = e.person_gedcom_id
JOIN  genealogy.gold_person_fact_summary fs ON fs.person_gedcom_id = p.person_gedcom_id
LEFT JOIN proximity pr            ON p.person_gedcom_id = pr.person_id
LEFT JOIN census_coverage cc      ON p.person_gedcom_id = cc.person_gedcom_id
LEFT JOIN uncovered_counts uc     ON p.person_gedcom_id = uc.person_gedcom_id
LEFT JOIN ocr_signals o           ON p.person_gedcom_id = o.person_gedcom_id
LEFT JOIN story_status ss         ON p.person_gedcom_id = ss.person_gedcom_id;


In [0]:
%sql
-- ============================================================
-- CELL 4: Update unpivot map to include SIGNAL_VERY_LOW_EVIDENCE_DENSITY
-- All other entries unchanged from Notebook B Cell 2.
-- ============================================================

-- Long-format ("pivoted") companion to gold_research_person_signals: one row
-- per (person, signal_code) for every signal that is TRUE for that person.
-- The wide boolean columns are packed into a name -> flag map and exploded;
-- rows whose flag is FALSE or NULL are dropped.
CREATE OR REPLACE VIEW genealogy.gold_research_person_signals_pivoted AS
SELECT
  wide.person_gedcom_id,
  flag.signal_code
FROM genealogy.gold_research_person_signals AS wide
LATERAL VIEW explode(map(
  -- INTEGRITY — Completeness
  'SIGNAL_NO_BIRTH_RECORDED',          wide.SIGNAL_NO_BIRTH_RECORDED,
  'SIGNAL_NO_DEATH_RECORDED',          wide.SIGNAL_NO_DEATH_RECORDED,
  'SIGNAL_NO_MARRIAGES',               wide.SIGNAL_NO_MARRIAGES,
  'SIGNAL_NO_CHILDREN',                wide.SIGNAL_NO_CHILDREN,
  'SIGNAL_MISSING_PARENT',             wide.SIGNAL_MISSING_PARENT,
  'SIGNAL_MISSING_CENSUS_COVERAGE',    wide.SIGNAL_MISSING_CENSUS_COVERAGE,
  'SIGNAL_UNCOVERED_SOURCES',          wide.SIGNAL_UNCOVERED_SOURCES,
  'SIGNAL_DOCS_NOT_TRANSCRIBED',       wide.SIGNAL_DOCS_NOT_TRANSCRIBED,
  'SIGNAL_LATE_LIFE_GAP',              wide.SIGNAL_LATE_LIFE_GAP,
  'SIGNAL_EARLY_LIFE_ONLY',            wide.SIGNAL_EARLY_LIFE_ONLY,
  'SIGNAL_CHILD_GAPS',                 wide.SIGNAL_CHILD_GAPS,
  -- INTEGRITY — Evidence
  'SIGNAL_LOW_EVIDENCE_DENSITY',       wide.SIGNAL_LOW_EVIDENCE_DENSITY,
  'SIGNAL_VERY_LOW_EVIDENCE_DENSITY',  wide.SIGNAL_VERY_LOW_EVIDENCE_DENSITY,
  'SIGNAL_SINGLE_SOURCE_DEPENDENCE',   wide.SIGNAL_SINGLE_SOURCE_DEPENDENCE,
  'SIGNAL_UNSOURCED_FAMILY_EVENTS',    wide.SIGNAL_UNSOURCED_FAMILY_EVENTS,
  'SIGNAL_IMPRECISE_DATES',            wide.SIGNAL_IMPRECISE_DATES,
  'SIGNAL_INCOMPLETE_NAME',            wide.SIGNAL_INCOMPLETE_NAME,
  'SIGNAL_IMPRECISE_PLACES',           wide.SIGNAL_IMPRECISE_PLACES,
  'SIGNAL_FACT_CONFLICT',              wide.SIGNAL_FACT_CONFLICT,
  -- NARRATIVE — Texture
  'SIGNAL_MILITARY',                   wide.SIGNAL_MILITARY,
  'SIGNAL_YOUNG_DEATH',                wide.SIGNAL_YOUNG_DEATH,
  'SIGNAL_MIGRANT',                    wide.SIGNAL_MIGRANT,
  'SIGNAL_POSSIBLE_WWI',               wide.SIGNAL_POSSIBLE_WWI,
  'SIGNAL_POSSIBLE_WWII',              wide.SIGNAL_POSSIBLE_WWII,
  'SIGNAL_STORY_WRITTEN',              wide.SIGNAL_STORY_WRITTEN,
  -- NARRATIVE — Family
  'SIGNAL_MULTIPLE_SPOUSES',           wide.SIGNAL_MULTIPLE_SPOUSES,
  -- NARRATIVE — Context
  'SIGNAL_POSSIBLE_OCCUPATION',        wide.SIGNAL_POSSIBLE_OCCUPATION,
  'SIGNAL_VARIED_OCCUPATIONS',         wide.SIGNAL_VARIED_OCCUPATIONS,
  'SIGNAL_POSSIBLE_RESIDENCE',         wide.SIGNAL_POSSIBLE_RESIDENCE,
  'SIGNAL_HIGH_FAMILY_PAYOFF',         wide.SIGNAL_HIGH_FAMILY_PAYOFF,
  'SIGNAL_POSSIBLE_MARRIAGE',          wide.SIGNAL_POSSIBLE_MARRIAGE,
  'SIGNAL_POSSIBLE_CHILDREN',          wide.SIGNAL_POSSIBLE_CHILDREN,
  'SIGNAL_TRANSCRIPT_AVAILABLE',       wide.SIGNAL_TRANSCRIPT_AVAILABLE
)) flag AS signal_code, is_present
WHERE flag.is_present;


In [0]:
%sql
-- ============================================================
-- CELL 5: Verification — revised signal fire rates
-- Compare against pre-patch rates:
--   SIGNAL_LOW_EVIDENCE_DENSITY    was 99.3% — expect ~77% (everyone below 1.0)
--   SIGNAL_VERY_LOW_EVIDENCE_DENSITY  new    — expect ~8%
--   SIGNAL_UNCOVERED_SOURCES       was 67.8% — expect ~15-25%
--   SIGNAL_IMPRECISE_DATES         was 67.3% — expect meaningful reduction
--   SIGNAL_NO_CHILDREN             was 59.3% — expect <5% (proximity <= 1 only)
--   SIGNAL_POSSIBLE_MARRIAGE       was 0%    — expect small %
--   SIGNAL_POSSIBLE_CHILDREN       was 0%    — expect small %
--   SIGNAL_POSSIBLE_RESIDENCE      was 0%    — expect moderate %
-- ============================================================

-- Fire count and fire rate (% of people) for every signal that fires at
-- least once.
--
-- Numerator and denominator both count DISTINCT people. The denominator
-- already did; counting raw pivoted rows in the numerator would overstate
-- fire_pct (potentially above 100%) whenever the signals view contains more
-- than one row for the same person.
-- signal_code is added as a secondary sort key so ties are ordered
-- deterministically.
SELECT
  fired.signal_code,
  COUNT(DISTINCT fired.person_gedcom_id)            AS fire_count,
  ROUND(
    COUNT(DISTINCT fired.person_gedcom_id) * 100.0 / MAX(total.cnt),
    1
  )                                                 AS fire_pct
FROM genealogy.gold_research_person_signals_pivoted AS fired
CROSS JOIN (
  -- Total number of people covered by the signals view.
  SELECT COUNT(DISTINCT person_gedcom_id) AS cnt
  FROM genealogy.gold_research_person_signals
) total
GROUP BY fired.signal_code
ORDER BY fire_pct DESC, fired.signal_code;


In [0]:
%sql
-- Sanity check: list every signal code that fires for at least one person but
-- has no matching row in ref_signal_weights. Expected result: 0 rows.
SELECT DISTINCT fired.signal_code
FROM genealogy.gold_research_person_signals_pivoted AS fired
WHERE NOT EXISTS (
  SELECT 1
  FROM genealogy.ref_signal_weights AS weights
  WHERE weights.signal_code = fired.signal_code
)
ORDER BY fired.signal_code;
