# Clinvar exploration notebook



In [35]:
import polars as pl
import polars_bio as pb
from pathlib import Path
from pycomfort import files

In [32]:
# Configure Polars to show more rows and columns
pl.Config.set_tbl_rows(-1)  # Show all rows (or use a large number like 1000)
pl.Config.set_tbl_cols(-1)  # Show all columns (or use a large number like 50)
pl.Config.set_tbl_width_chars(1000)  # Increase table width to prevent column truncation
pl.Config.set_fmt_str_lengths(1000)  # Show longer string values without truncation

polars.config.Config

In [33]:

from pathlib import Path

genobear_folder = (Path.home() / "genobear").absolute().resolve()
current_folder = Path.cwd().absolute().resolve()
if current_folder.name == "notebooks":
    current_folder = current_folder.parent

print(current_folder, genobear_folder)

/home/antonkulaga/sources/genobear /home/antonkulaga/genobear


In [34]:
import genobear as gb


Clinvar descriptions
====================

In [28]:
clinvar_folder= genobear_folder / "databases" / "clinvar"
clinvar_vcf_gz = clinvar_folder / "hg38" / "clinvar.vcf.gz"
clinvar_vcf = clinvar_folder / "hg38" / "clinvar.vcf"

assert clinvar_vcf.exists(), f"Clinvar VCF file {clinvar_vcf} does not exist"

clinvar_parquet = clinvar_folder / "clinvar_hg38.parquet"

In [29]:
pb.describe_vcf(str(clinvar_vcf))

name,type,description
str,str,str
"""AF_ESP""","""Float""","""allele frequencies from GO-ESP"""
"""AF_EXAC""","""Float""","""allele frequencies from ExAC"""
"""AF_TGP""","""Float""","""allele frequencies from TGP"""
"""ALLELEID""","""Integer""","""the ClinVar Allele ID"""
"""CLNDN""","""String""","""ClinVar's preferred disease name for the concept specified by disease identifiers in CLNDISDB"""
"""CLNDNINCL""","""String""","""For included Variant : ClinVar's preferred disease name for the concept specified by disease identifiers in CLNDISDB"""
"""CLNDISDB""","""String""","""Tag-value pairs of disease database name and identifier submitted for germline classifications, e.g. OMIM:NNNNNN"""
"""CLNDISDBINCL""","""String""","""For included Variant: Tag-value pairs of disease database name and identifier for germline classifications, e.g. OMIM:NNNNNN"""
"""CLNHGVS""","""String""","""Top-level (primary assembly, alt, or patch) HGVS expression."""
"""CLNREVSTAT""","""String""","""ClinVar review status of germline classification for the Variation ID"""


In [30]:
gb.get_info_fields(str(clinvar_vcf))

AttributeError: module 'genobear' has no attribute 'get_info_fields'

In [None]:
gb.get_info_fields(str(clinvar_vcf))

AttributeError: module 'genobear' has no attribute 'get_info_fields'

In [None]:
gb.get_info_fields(str(clinvar_vcf))

AttributeError: module 'genobear' has no attribute 'get_info_fields'

In [6]:
from genobear.io import read_vcf_file
vcf = clinvar_folder / "hg38" / "clinvar.vcf"
clinvar = read_vcf_file(vcf, to_parquet=False, streaming=False)
clinvar.head().collect()

chrom,start,end,id,ref,alt,qual,filter,af_esp,af_exac,af_tgp,alleleid,clndn,clndnincl,clndisdb,clndisdbincl,clnhgvs,clnrevstat,clnsig,clnsigconf,clnsigincl,clnsigscv,clnvc,clnvcso,clnvi,dbvarid,geneinfo,mc,oncdn,oncdnincl,oncdisdb,oncdisdbincl,onc,oncincl,oncrevstat,oncscv,oncconf,origin,rs,scidn,scidnincl,scidisdb,scidisdbincl,scirevstat,sci,sciincl,sciscv
str,u32,u32,str,str,str,f64,str,f32,f32,f32,i32,list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],str,str,list[str],list[str],str,list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str]
"""1""",66926,66927,"""3385321""","""AG""","""A""",0.0,"""""",,,,3544463,"[""Retinitis_pigmentosa""]",,"[""Human_Phenotype_Ontology:HP:0000547"", ""MONDO:MONDO:0019200"", … ""Orphanet:791""]",,"[""NC_000001.11:g.66927del""]","[""criteria_provided"", ""_single_submitter""]","[""Uncertain_significance""]",,,"[""SCV005419006""]","""Deletion""","""SO:0000159""",,,"""OR4F5:79501""","[""SO:0001627|intron_variant""]",,,,,,,,,,"[""0""]",,,,,,,,,
"""1""",69134,69134,"""2205837""","""A""","""G""",0.0,"""""",,,,2193183,"[""not_specified""]",,"[""MedGen:CN169374""]",,"[""NC_000001.11:g.69134A>G""]","[""criteria_provided"", ""_single_submitter""]","[""Likely_benign""]",,,"[""SCV003526545""]","""single_nucleotide_variant""","""SO:0001483""","[""ClinGen:CA502008""]",,"""OR4F5:79501""","[""SO:0001583|missense_variant""]",,,,,,,,,,"[""1""]","[""781394307""]",,,,,,,,
"""1""",69308,69308,"""3925305""","""A""","""G""",0.0,"""""",,,,4039319,"[""not_specified""]",,"[""MedGen:CN169374""]",,"[""NC_000001.11:g.69308A>G""]","[""criteria_provided"", ""_single_submitter""]","[""Uncertain_significance""]",,,"[""SCV006120601""]","""single_nucleotide_variant""","""SO:0001483""",,,"""OR4F5:79501""","[""SO:0001583|missense_variant""]",,,,,,,,,,"[""1""]",,,,,,,,,
"""1""",69314,69314,"""3205580""","""T""","""G""",0.0,"""""",,,,3374047,"[""not_specified""]",,"[""MedGen:CN169374""]",,"[""NC_000001.11:g.69314T>G""]","[""criteria_provided"", ""_single_submitter""]","[""Uncertain_significance""]",,,"[""SCV004995495""]","""single_nucleotide_variant""","""SO:0001483""","[""ClinGen:CA338197388""]",,"""OR4F5:79501""","[""SO:0001583|missense_variant""]",,,,,,,,,,"[""1""]","[""2521653848""]",,,,,,,,
"""1""",69404,69404,"""3925306""","""T""","""C""",0.0,"""""",,,,4039320,"[""not_specified""]",,"[""MedGen:CN169374""]",,"[""NC_000001.11:g.69404T>C""]","[""criteria_provided"", ""_single_submitter""]","[""Uncertain_significance""]",,,"[""SCV006120602""]","""single_nucleotide_variant""","""SO:0001483""",,,"""OR4F5:79501""","[""SO:0001583|missense_variant""]",,,,,,,,,,"[""1""]",,,,,,,,,


Load Genomic Sample
===================

In [None]:
from genobear.io import read_vcf_file
from pathlib import Path

data_folder = current_folder / "data"
test_folder = data_folder / "tests"

assert test_folder.exists(), f"Test folder {test_folder} does not exist"


In [15]:
sample_vcf = test_folder / "antonkulaga.vcf"
sample = read_vcf_file(sample_vcf, info_fields=[], streaming=False)
sample.head().collect()

chrom,start,end,id,ref,alt,qual,filter
str,u32,u32,str,str,str,f64,str
"""1""",10009,10009,"""""","""A""","""AC""",0.0,"""RefCall"""
"""1""",10015,10015,"""""","""A""","""G""",0.0,"""RefCall"""
"""1""",10021,10021,"""""","""A""","""G""",0.0,"""RefCall"""
"""1""",10027,10027,"""""","""A""","""G""",0.0,"""RefCall"""
"""1""",10033,10033,"""""","""A""","""G""",0.0,"""RefCall"""


## Annotation with clinvar

In [23]:
sample.columns

  sample.columns


['chrom', 'start', 'end', 'id', 'ref', 'alt', 'qual', 'filter']

In [None]:
sample.join(clinvar, on=["chrom", "start", "end"], how="left").collect()

In [21]:
pb.describe_vcf(str(clinvar_vcf))

name,type,description
str,str,str
"""AF_ESP""","""Float""","""allele frequencies from GO-ESP"""
"""AF_EXAC""","""Float""","""allele frequencies from ExAC"""
"""AF_TGP""","""Float""","""allele frequencies from TGP"""
"""ALLELEID""","""Integer""","""the ClinVar Allele ID"""
"""CLNDN""","""String""","""ClinVar's preferred disease name for the concept specified by disease identifiers in CLNDISDB"""
"""CLNDNINCL""","""String""","""For included Variant : ClinVar's preferred disease name for the concept specified by disease identifiers in CLNDISDB"""
"""CLNDISDB""","""String""","""Tag-value pairs of disease database name and identifier submitted for germline classifications, e.g. OMIM:NNNNNN"""
"""CLNDISDBINCL""","""String""","""For included Variant: Tag-value pairs of disease database name and identifier for germline classifications, e.g. OMIM:NNNNNN"""
"""CLNHGVS""","""String""","""Top-level (primary assembly, alt, or patch) HGVS expression."""
"""CLNREVSTAT""","""String""","""ClinVar review status of germline classification for the Variation ID"""


In [22]:
# Test the function with both regular and gzipped VCF files
sample_small = read_vcf_file(sample_small_vcf)

NameError: name 'sample_small_vcf' is not defined

In [None]:
data_folder = Path("../data").absolute().resolve()
test_folder = data_folder / "test"

files.tprint(test_folder)


test
	longevity_snps_1000genom.vcf.gz
	antonkulaga.vcf
	antku_small.vcf
	longevity_snps_1000genom.vcf


## Samples

Loading samples

In [None]:
sample_antku = pb.read_vcf(str(test_folder / "antku_small.vcf"))
sample_antonkulaga = pb.read_vcf(str(test_folder / "antonkulaga.vcf"))
sample_antku.head().collect()

chrom,start,end,id,ref,alt,qual,filter
str,u32,u32,str,str,str,f64,str
"""1""",10009,10009,"""""","""A""","""AC""",0.0,"""RefCall"""
"""1""",10015,10015,"""""","""A""","""G""",0.0,"""RefCall"""
"""1""",10021,10021,"""""","""A""","""G""",0.0,"""RefCall"""
"""1""",10027,10027,"""""","""A""","""G""",0.0,"""RefCall"""
"""1""",10033,10033,"""""","""A""","""G""",0.0,"""RefCall"""


In [None]:
sample_antku.count().collect()

793rows [00:00, 750300.72rows/s]


chrom,start,end,id,ref,alt,qual,filter
u32,u32,u32,u32,u32,u32,u32,u32
793,793,793,793,793,793,793,793


In [None]:
sample_antonkulaga.count().collect()

6079744rows [00:04, 1414880.55rows/s]


chrom,start,end,id,ref,alt,qual,filter
u32,u32,u32,u32,u32,u32,u32,u32
6079744,6079744,6079744,6079744,6079744,6079744,6079744,6079744
