# Validation Notebook: RSID Intersection Check

This notebook validates that rsids from the previous version (SQLite database) intersect with rsids in the annotated parquet file.

## Purpose

- Load the annotated parquet file containing VCF annotations
- Load the previous SQLite database with various annotation tables
- Extract rsids from both sources
- Validate that rsids from the SQLite database intersect with rsids in the annotated file


In [1]:
import polars as pl
import polars_bio as pb
import sqlite3
from pathlib import Path

from genobear.annotation.chromosomes import harmonize_chrom_column
from platformdirs import user_cache_dir

# Polars display settings for wide annotation tables.
# Every setter returns the Config class itself, so the calls can be chained.
(
    pl.Config
    .set_tbl_rows(-1)            # show all rows
    .set_tbl_cols(-1)            # show all columns
    .set_tbl_width_chars(1000)   # increase table width
    .set_fmt_str_lengths(1000)   # show longer string values without truncation
)


polars.config.Config

## Setup: Configure Paths

Set up paths to data directories and files.


In [2]:
# Project root: when the kernel was started inside `notebooks/`, step one level up.
_cwd = Path.cwd().resolve()
current_folder = _cwd.parent if _cwd.name == "notebooks" else _cwd

# Base directories
data_dir = current_folder / "data"
cache_dir = Path(user_cache_dir(appname="genobear"), "ensembl_variations", "splitted_variants", "SNV")

# Input files used by the validation below
annotated_genome_path = data_dir.joinpath("output", "antonkulaga_annotated.parquet")
db_path = data_dir.joinpath("input", "tests", "antonkulaga.vcf_longevity.sqlite")

## Load Annotated Parquet File

Load the annotated parquet file and examine its structure.

In [3]:
# Build a LazyFrame over the annotated parquet file; rows are materialised only on `.collect()`
annotated_genome = pl.scan_parquet(annotated_genome_path)

# Peek at the first rows to inspect columns and dtypes
annotated_genome.head().collect()

chrom,start,end,rsid,id,ref,alt,qual,filter,END,end_ensembl,qual_ensembl,filter_ensembl,cosmic_101,clinvar_202502,dbsnp_156,hgmd-public_20204,tsa,e_cited,e_multiple_observations,e_freq,e_topmed,e_hapmap,e_phenotype_or_disease,e_esp,e_gnomad,e_1000g,e_exac,clin_risk_factor,clin_protective,clin_confers_sensitivity,clin_other,clin_drug_response,clin_uncertain_significance,clin_benign,clin_likely_pathogenic,clin_pathogenic,clin_likely_benign,clin_histocompatibility,clin_not_provided,clin_association,ma,maf,mac,aa
str,u32,u32,str,str,str,str,f64,str,i32,u32,f64,str,bool,bool,bool,bool,str,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,str,f32,i32,str
"""1""",16712710,16712710,"""rs1736697""","""""","""A""","""G""",33.799999,"""PASS""",,16712710.0,,"""""",False,False,True,False,"""SNV""",False,False,True,False,False,False,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,,,,
"""1""",16725938,16725938,"""rs647390""","""""","""T""","""C""",2.2,"""RefCall""",,16725938.0,,"""""",False,False,True,False,"""SNV""",False,False,True,False,False,False,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,,,,
"""1""",16727021,16727026,,"""""","""AAAAAG""","""A""",3.8,"""PASS""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
"""1""",16727298,16727298,"""rs2419167""","""""","""A""","""G""",0.4,"""RefCall""",,16727298.0,,"""""",False,False,True,False,"""SNV""",False,False,True,False,False,False,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,,,,
"""1""",16727339,16727339,,"""""","""A""","""AC""",12.1,"""PASS""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


## Extract RSIDs from Annotated File

Extract unique rsids from the annotated parquet file.

In [4]:
# Unique, non-null rsids of the annotated genome.
# Collected once here so that later joins can reuse the in-memory frame.
annotated_rsids_df = (
    annotated_genome
    .select("rsid")
    .drop_nulls()   # single-column frame, so this only removes null rsids
    .unique()
    .collect()
)

print(f"Total unique rsids in annotated genome: {annotated_rsids_df.height}")
print("\nSample rsids:")
annotated_rsids_df.head(10)

Total unique rsids in annotated genome: 4402888

Sample rsids:


rsid
str
"""rs722517"""
"""rs906834349"""
"""rs4663444"""
"""rs34832724"""
"""rs468677"""
"""rs4267848"""
"""rs769379780"""
"""rs1930517939"""
"""rs111611697"""
"""rs7631268"""


## Load SQLite Database

Connect to the previous version SQLite database and examine its structure.


In [5]:
# sqlite3.connect() silently creates a new, empty database when the file does not exist,
# which would only surface later as a confusing "no such table" error. Fail early instead.
if not db_path.is_file():
    raise FileNotFoundError(f"SQLite database not found: {db_path}")

# Connect to SQLite database (kept open; it is closed in the summary cell at the end)
conn = sqlite3.connect(db_path)

# Get list of tables
tables_query = "SELECT name, type, sql FROM sqlite_master WHERE type='table';"
tables_df = pl.read_database(query=tables_query, connection=conn)
print("Tables in the SQLite database:")
tables_df.select("name")



Tables in the SQLite database:


name
str
"""cancer"""
"""coronary"""
"""drugs"""
"""longevitymap"""
"""prs"""


## Validate RSID Intersection: Longevitymap

This section validates that rsids from the SQLite `longevitymap` table intersect with rsids in the annotated parquet file.


### Longevitymap Table

Note: This table may use 'snp' instead of 'rsid' for the column name.


In [31]:
# Load longevitymap table
longevitymap_df = pl.read_database(query="SELECT * FROM longevitymap;", connection=conn)

# Check columns
print("Longevitymap table columns:", longevitymap_df.columns)
print(f"\nLongevitymap table shape: {longevitymap_df.shape}")

# Pick the column holding the rsids: prefer 'snp', fall back to 'rsid', else None.
# The summary cell at the end checks this variable and raises if it is None.
longevitymap_rsid_col = next(
    (name for name in ("snp", "rsid") if name in longevitymap_df.columns),
    None,
)

if not longevitymap_rsid_col:
    print("ERROR: Cannot find rsid or snp column in longevitymap table")
else:
    # Unique, non-null rsids present in the SQLite table
    longevitymap_rsids_before = (
        longevitymap_df
        .select(pl.col(longevitymap_rsid_col))
        .filter(pl.col(longevitymap_rsid_col).is_not_null())
        .unique()
    )
    print(f"Unique rsids in longevitymap table (column '{longevitymap_rsid_col}'): {len(longevitymap_rsids_before)}")

    # Reuse the unique rsids already collected from the annotated genome
    # (`annotated_rsids_df`) instead of scanning the parquet file a second time.
    annotated_rsids = annotated_rsids_df

    # Inner join with annotated dataframe: keeps only rsids present in both sources
    longevitymap_matched = longevitymap_rsids_before.join(
        annotated_rsids,
        left_on=longevitymap_rsid_col,
        right_on="rsid",
        how="inner",
    )

    # Guard the division so an empty table reports 0% instead of raising ZeroDivisionError
    match_rate = (
        len(longevitymap_matched) / len(longevitymap_rsids_before) * 100
        if len(longevitymap_rsids_before)
        else 0.0
    )

    print(f"Unique rsids from longevitymap found in annotated: {len(longevitymap_matched)}")
    print(f"Match rate: {match_rate:.2f}%")

    # Show some matched rsids
    print("\nSample matched rsids:")
    print(longevitymap_matched.head(10))


Longevitymap table columns: ['id', 'weight', 'weightcolor', 'population', 'snp', 'gene', 'conflicted_rows', 'description', 'coding', 'ref', 'alt', 'cdnachange', 'deseases', 'zegot', 'alelfreq', 'nucleotides', 'priority', 'ncbidesc']

Longevitymap table shape: (322, 18)
Unique rsids in longevitymap table (column 'snp'): 322
Unique rsids from longevitymap found in annotated: 244
Match rate: 75.78%

Sample matched rsids:
shape: (10, 1)
┌───────────┐
│ snp       │
│ ---       │
│ str       │
╞═══════════╡
│ rs2247549 │
│ rs6443429 │
│ rs2590504 │
│ rs697887  │
│ rs2485662 │
│ rs5771675 │
│ rs473268  │
│ rs1834461 │
│ rs4646    │
│ rs2764264 │
└───────────┘


In [14]:
# Load the "full" Longevitymap SQLite database (modules/just_longevitymap)
# Source: notebooks/work_with_sqlite.ipynb
# (`sqlite3` is already imported in the first cell of the notebook.)

longevitymap_modules_db_path = data_dir / "modules" / "just_longevitymap" / "longevitymap.sqlite"

# sqlite3.connect() would silently create an empty database for a missing file,
# and the `allele_weights` query below would then fail. Fail early instead.
if not longevitymap_modules_db_path.is_file():
    raise FileNotFoundError(f"SQLite database not found: {longevitymap_modules_db_path}")

modules_conn = sqlite3.connect(longevitymap_modules_db_path)

modules_tables = pl.read_database(
    query="SELECT name FROM sqlite_master WHERE type='table' ORDER BY name;",
    connection=modules_conn,
)
print("Tables in modules longevitymap DB:")
print(modules_tables)

# The main table with rsids/weights in this DB is `allele_weights`.
longevitymap_allele_weights = pl.read_database(
    query="SELECT * FROM allele_weights;",
    connection=modules_conn,
)

print(f"\nallele_weights shape: {longevitymap_allele_weights.shape}")
longevitymap_allele_weights.head(10)

# NOTE: keep `modules_conn` open; it will be closed at the very end of the notebook.



Tables in modules longevitymap DB:
shape: (5, 1)
┌────────────────┐
│ name           │
│ ---            │
│ str            │
╞════════════════╡
│ allele_weights │
│ categories     │
│ gene           │
│ population     │
│ variant        │
└────────────────┘

allele_weights shape: (1043, 8)


id,allele,state,zygosity,weight,rsid,priority,category_id
i64,str,str,str,f64,str,str,i64
0,"""T""","""alt""","""het""",0.5,"""rs7412""","""1.0""",1
1,"""T""","""alt""","""hom""",1.0,"""rs7412""","""1.0""",1
2,"""C""","""alt""","""het""",-0.5,"""rs429358""","""1.0""",1
3,"""C""","""alt""","""hom""",-1.0,"""rs429358""","""1.0""",1
4,"""G""","""ref""","""hom""",0.97,"""rs5882""","""0.97""",1
5,"""T""","""alt""","""het""",0.475,"""rs2802292""","""0.95""",2
6,"""G""","""ref""","""hom""",0.95,"""rs2802292""","""0.95""",2
7,"""G""","""alt""","""het""",0.94,"""rs9536314""","""0.94""",2
8,"""A""","""alt""","""het""",0.94,"""rs9536314""","""0.94""",2
9,"""C""","""alt""","""het""",0.47,"""rs9527025""","""0.94""",2


### Compare how many variants actually intersect when matching on both `rsid` and `alt` allele

In [23]:
# Copy the scored allele into a column named `alt` so the table can be joined with the
# annotated genome on (rsid, alt); made lazy because it is used in lazy joins below.
# NOTE(review): rows with state == "ref" are copied into `alt` as well — confirm that
# matching those against the VCF alt allele is intended.
longevitymap_weights = longevitymap_allele_weights.lazy().with_columns(alt=pl.col("allele"))
longevitymap_weights.head(10).collect()

id,allele,state,zygosity,weight,rsid,priority,category_id,alt
i64,str,str,str,f64,str,str,i64,str
0,"""T""","""alt""","""het""",0.5,"""rs7412""","""1.0""",1,"""T"""
1,"""T""","""alt""","""hom""",1.0,"""rs7412""","""1.0""",1,"""T"""
2,"""C""","""alt""","""het""",-0.5,"""rs429358""","""1.0""",1,"""C"""
3,"""C""","""alt""","""hom""",-1.0,"""rs429358""","""1.0""",1,"""C"""
4,"""G""","""ref""","""hom""",0.97,"""rs5882""","""0.97""",1,"""G"""
5,"""T""","""alt""","""het""",0.475,"""rs2802292""","""0.95""",2,"""T"""
6,"""G""","""ref""","""hom""",0.95,"""rs2802292""","""0.95""",2,"""G"""
7,"""G""","""alt""","""het""",0.94,"""rs9536314""","""0.94""",2,"""G"""
8,"""A""","""alt""","""het""",0.94,"""rs9536314""","""0.94""",2,"""A"""
9,"""C""","""alt""","""het""",0.47,"""rs9527025""","""0.94""",2,"""C"""


In [26]:
# Inner-join the annotated genome with the allele weights on (rsid, alt): every result row is
# an annotated variant whose rsid and alt allele both have an entry in `allele_weights`.
# A variant can appear more than once, one row per matching weight entry
# (e.g. separate "het" and "hom" rows for the same rsid).
intersections_weights = annotated_genome.join(longevitymap_weights, on=["rsid", "alt"], how="inner")
intersections_weights.head(10).collect()

chrom,start,end,rsid,id,ref,alt,qual,filter,END,end_ensembl,qual_ensembl,filter_ensembl,cosmic_101,clinvar_202502,dbsnp_156,hgmd-public_20204,tsa,e_cited,e_multiple_observations,e_freq,e_topmed,e_hapmap,e_phenotype_or_disease,e_esp,e_gnomad,e_1000g,e_exac,clin_risk_factor,clin_protective,clin_confers_sensitivity,clin_other,clin_drug_response,clin_uncertain_significance,clin_benign,clin_likely_pathogenic,clin_pathogenic,clin_likely_benign,clin_histocompatibility,clin_not_provided,clin_association,ma,maf,mac,aa,id_right,allele,state,zygosity,weight,priority,category_id
str,u32,u32,str,str,str,str,f64,str,i32,u32,f64,str,bool,bool,bool,bool,str,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,str,f32,i32,str,i64,str,str,str,f64,str,i64
"""1""",24934895,24934895,"""rs4648884""","""""","""T""","""C""",68.300003,"""PASS""",,24934895,,"""""",False,False,True,False,"""SNV""",True,False,True,True,False,True,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,"""T""",0.456044,2324,"""C""",996,"""C""","""alt""","""het""",0.08,"""0.16""",10
"""1""",44809879,44809879,"""rs11211037""","""""","""A""","""C""",63.700001,"""PASS""",,44809879,,"""""",False,False,True,False,"""SNV""",False,False,True,True,False,False,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,"""C""",0.485086,2472,"""A""",505,"""C""","""alt""","""het""",0.07,"""0.14""",0
"""1""",118773640,118773640,"""rs10923673""","""""","""T""","""G""",31.0,"""PASS""",,118773640,,"""""",False,False,True,False,"""SNV""",False,False,True,True,False,False,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,"""G""",0.283556,1445,"""T""",936,"""G""","""alt""","""het""",0.07,"""0.14""",0
"""1""",12513771,12513771,"""rs12129750""","""""","""A""","""G""",64.400002,"""PASS""",,12513771,,"""""",False,False,True,False,"""SNV""",False,False,True,True,False,False,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,"""A""",0.152669,778,"""G""",916,"""G""","""alt""","""het""",0.07,"""0.14""",0
"""1""",12513771,12513771,"""rs12129750""","""""","""A""","""G""",64.400002,"""PASS""",,12513771,,"""""",False,False,True,False,"""SNV""",False,False,True,True,False,False,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,"""A""",0.152669,778,"""G""",917,"""G""","""alt""","""hom""",0.14,"""0.14""",0
"""1""",11340352,11340352,"""rs3120819""","""""","""A""","""C""",62.700001,"""PASS""",,11340352,,"""""",False,False,True,False,"""SNV""",True,False,True,True,False,False,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,"""C""",0.452119,2304,"""A""",133,"""C""","""alt""","""het""",0.24,"""0.48""",7
"""1""",11340352,11340352,"""rs3120819""","""""","""A""","""C""",62.700001,"""PASS""",,11340352,,"""""",False,False,True,False,"""SNV""",True,False,True,True,False,False,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,"""C""",0.452119,2304,"""A""",134,"""C""","""alt""","""hom""",0.48,"""0.48""",7
"""1""",156113677,156113677,"""rs2485662""","""""","""T""","""C""",65.599998,"""PASS""",,156113677,,"""""",False,False,True,False,"""SNV""",True,False,True,True,False,True,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,"""T""",0.309458,1577,"""C""",315,"""C""","""alt""","""het""",-0.135,"""0.27""",11
"""1""",156113677,156113677,"""rs2485662""","""""","""T""","""C""",65.599998,"""PASS""",,156113677,,"""""",False,False,True,False,"""SNV""",True,False,True,True,False,True,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,"""T""",0.309458,1577,"""C""",316,"""C""","""alt""","""hom""",-0.27,"""0.27""",11
"""1""",156146697,156146697,"""rs1468772""","""""","""T""","""G""",58.799999,"""PASS""",,156146697,,"""""",False,False,True,False,"""SNV""",True,False,True,True,False,False,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,"""G""",0.199961,1019,"""T""",1018,"""G""","""alt""","""het""",0.05,"""0.1""",0


In [27]:
# Non-null count per column of the joined frame; fully populated columns
# (e.g. `rsid`) give the number of joined rows.
intersections_weights.count().collect()

chrom,start,end,rsid,id,ref,alt,qual,filter,END,end_ensembl,qual_ensembl,filter_ensembl,cosmic_101,clinvar_202502,dbsnp_156,hgmd-public_20204,tsa,e_cited,e_multiple_observations,e_freq,e_topmed,e_hapmap,e_phenotype_or_disease,e_esp,e_gnomad,e_1000g,e_exac,clin_risk_factor,clin_protective,clin_confers_sensitivity,clin_other,clin_drug_response,clin_uncertain_significance,clin_benign,clin_likely_pathogenic,clin_pathogenic,clin_likely_benign,clin_histocompatibility,clin_not_provided,clin_association,ma,maf,mac,aa,id_right,allele,state,zygosity,weight,priority,category_id
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
410,410,410,410,410,410,410,410,410,0,410,0,410,410,410,410,410,410,410,410,410,410,410,410,410,410,410,410,410,410,410,410,410,410,410,410,410,410,410,410,410,332,332,332,408,410,410,410,410,410,410,410


### And now the same comparison with `longevitymap_df` (the old SQLite `longevitymap` table)

In [47]:
# Expose the rsids of the old longevitymap table under the name `rsid` so it can be joined
# on (rsid, alt) like the allele weights. Use the column detected when the table was loaded
# (`longevitymap_rsid_col`, 'snp' or 'rsid') instead of hard-coding 'snp'.
if not longevitymap_rsid_col:
    raise ValueError("Cannot find rsid/snp column in longevitymap table")

longevitymap_old = longevitymap_df.with_columns(pl.col(longevitymap_rsid_col).alias("rsid")).lazy()

# Non-null count per column
longevitymap_old.count().collect()

id,weight,weightcolor,population,snp,gene,conflicted_rows,description,coding,ref,alt,cdnachange,deseases,zegot,alelfreq,nucleotides,priority,ncbidesc,rsid
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
322,322,322,322,322,322,322,322,146,322,322,237,156,322,321,322,322,242,322


In [51]:
# Non-null count per column of the allele-weights table, for comparison with the join sizes
longevitymap_weights.count().collect()

id,allele,state,zygosity,weight,rsid,priority,category_id,alt
u32,u32,u32,u32,u32,u32,u32,u32,u32
1043,1043,1043,1043,1043,1043,1043,1043,1043


In [50]:
# Cross-check the two longevitymap sources against each other: inner-join the old table with
# the allele weights on (rsid, alt). Columns present in both inputs get a `_right` suffix.
longevitymap_old_with_weights = longevitymap_old.join(longevitymap_weights, on=["rsid", "alt"], how="inner")
# Non-null count per column of the joined frame
longevitymap_old_with_weights.count().collect()

id,weight,weightcolor,population,snp,gene,conflicted_rows,description,coding,ref,alt,cdnachange,deseases,zegot,alelfreq,nucleotides,priority,ncbidesc,rsid,id_right,allele,state,zygosity,weight_right,priority_right,category_id
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
448,448,448,448,448,448,448,448,152,448,448,303,169,448,446,448,448,312,448,448,448,448,448,448,448,448


In [38]:
# Same (rsid, alt) inner join as for the allele weights, this time between the annotated
# genome and the old longevitymap table.
intersections_old = annotated_genome.join(longevitymap_old, on=["rsid", "alt"], how="inner")
intersections_old.head(3).collect()

chrom,start,end,rsid,id,ref,alt,qual,filter,END,end_ensembl,qual_ensembl,filter_ensembl,cosmic_101,clinvar_202502,dbsnp_156,hgmd-public_20204,tsa,e_cited,e_multiple_observations,e_freq,e_topmed,e_hapmap,e_phenotype_or_disease,e_esp,e_gnomad,e_1000g,e_exac,clin_risk_factor,clin_protective,clin_confers_sensitivity,clin_other,clin_drug_response,clin_uncertain_significance,clin_benign,clin_likely_pathogenic,clin_pathogenic,clin_likely_benign,clin_histocompatibility,clin_not_provided,clin_association,ma,maf,mac,aa,id_right,weight,weightcolor,population,snp,gene,conflicted_rows,description,coding,ref_right,cdnachange,deseases,zegot,alelfreq,nucleotides,priority,ncbidesc
str,u32,u32,str,str,str,str,f64,str,i32,u32,f64,str,bool,bool,bool,bool,str,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,str,f32,i32,str,i64,f64,str,str,str,str,str,str,str,str,str,str,str,str,str,f64,str
"""1""",24934895,24934895,"""rs4648884""","""""","""T""","""C""",68.300003,"""PASS""",,24934895,,"""""",False,False,True,False,"""SNV""",True,False,True,True,False,True,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,"""T""",0.456044,2324,"""C""",5,0.08,"""e0ffe0""","""American (Caucasian)""","""rs4648884""","""RUNX3""","""[]""","""[{""pubmedid"": ""22533364"", ""study_design"": ""Genome-wide association study using 1,471 genotyped participants from the Framingham Heart Study, of which 1,173 individuals had known lifespans, plus 517 individuals from the Offspring cohort for validating connections between longevity and genetic variants"", ""conclusions"": ""A total of 27 SNPs, including in CDH4, SVEP1, CACNA1C, CARS, STK24, C7orf50, PARVG, NCAM2, PPP2R2C, NLRC5, BTBD9, RAC2, TGFA, KIAA0649, ABCC4, CLSTN2, FAM19A5 and RUNX3, were identified at the intersection of various statistical procedures""}]""",,"""T""","""c.59-5043A>G""",,"""het""","""0.514773599386""","""C/T""",0.16,"""This gene encodes a member of the runt domain-containing family of transcription factors. A heterodimer of this protein and a beta subunit forms a complex that binds to the core DNA sequence 5&apos;-PYGPYGGT-3&apos; found in a number of enhancers and promoters, and can either activate or suppress transcription. It also interacts with other transcription factors. It functions as a tumor suppressor, and the gene is frequently deleted or transcriptionally silenced in cancer. Alternative splicing results in multiple transcript variants. [provided by RefSeq, Mar 2016]"""
"""1""",44809879,44809879,"""rs11211037""","""""","""A""","""C""",63.700001,"""PASS""",,44809879,,"""""",False,False,True,False,"""SNV""",False,False,True,True,False,False,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,"""C""",0.485086,2472,"""A""",6,0.07,"""e4ffe4""","""American (Caucasian)""","""rs11211037""","""BTBD19""","""[]""","""[{""pubmedid"": ""22279548"", ""study_design"": ""Genome-wide association study in 801 centenarians and 914 healthy controls"", ""conclusions"": ""281 SNPs were found to discriminate between cases and controls""}]""",,"""A""","""c.87-334A>C""",,"""het""","""0.452681992337""","""C/A""",0.14,
"""1""",118773640,118773640,"""rs10923673""","""""","""T""","""G""",31.0,"""PASS""",,118773640,,"""""",False,False,True,False,"""SNV""",False,False,True,True,False,False,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,"""G""",0.283556,1445,"""T""",7,0.07,"""e4ffe4""","""American (Caucasian)""","""rs10923673""","""""","""[]""","""[{""pubmedid"": ""22279548"", ""study_design"": ""Genome-wide association study in 801 centenarians and 914 healthy controls"", ""conclusions"": ""281 SNPs were found to discriminate between cases and controls""}]""",,"""T""",,,"""het""","""0.211086261981""","""G/T""",0.14,


In [39]:
# Non-null count per column; fully populated columns (e.g. `rsid`) give the number of
# annotated variants matched by the old longevitymap table on (rsid, alt).
intersections_old.count().collect()

chrom,start,end,rsid,id,ref,alt,qual,filter,END,end_ensembl,qual_ensembl,filter_ensembl,cosmic_101,clinvar_202502,dbsnp_156,hgmd-public_20204,tsa,e_cited,e_multiple_observations,e_freq,e_topmed,e_hapmap,e_phenotype_or_disease,e_esp,e_gnomad,e_1000g,e_exac,clin_risk_factor,clin_protective,clin_confers_sensitivity,clin_other,clin_drug_response,clin_uncertain_significance,clin_benign,clin_likely_pathogenic,clin_pathogenic,clin_likely_benign,clin_histocompatibility,clin_not_provided,clin_association,ma,maf,mac,aa,id_right,weight,weightcolor,population,snp,gene,conflicted_rows,description,coding,ref_right,cdnachange,deseases,zegot,alelfreq,nucleotides,priority,ncbidesc
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
186,186,186,186,186,186,186,186,186,0,186,0,186,186,186,186,186,186,186,186,186,186,186,186,186,186,186,186,186,186,186,186,186,186,186,186,186,186,186,186,186,162,162,162,185,186,186,186,186,186,186,186,186,10,186,101,20,186,185,186,186,106


## Summary

Overall validation results for the SQLite `longevitymap` table.


In [None]:
# Summary of validation results (longevitymap only)
#
# Requires the longevitymap validation cell above to have been run in the same kernel session:
# it defines `longevitymap_rsid_col`, `longevitymap_rsids_before` and `longevitymap_matched`.
# Running this cell on a fresh kernel fails with NameError.
print("=" * 80)
print("VALIDATION SUMMARY (longevitymap)")
print("=" * 80)

if not longevitymap_rsid_col:
    raise ValueError("Cannot find rsid/snp column in longevitymap table")

before = len(longevitymap_rsids_before)
matched = len(longevitymap_matched)

# Compute the rate once; an empty table reports 0% instead of raising ZeroDivisionError
match_rate = matched / before * 100 if before else 0.0

summary_df = pl.DataFrame(
    [{"table": "longevitymap", "before": before, "matched": matched, "match_rate_%": match_rate}]
)

print(summary_df)

print(f"\n{'=' * 80}")
print(f"Overall: {matched}/{before} ({match_rate:.2f}%) rsids matched")

# Close database connections (kept open for interactive exploration above)
modules_conn.close()
conn.close()


VALIDATION SUMMARY (longevitymap)


NameError: name 'longevitymap_rsid_col' is not defined