# Clinvar exploration notebook



In [1]:
import os
import polars as pl
import polars_bio as pb
from pathlib import Path
from pycomfort import files

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Configure Polars to show more rows and columns
pl.Config.set_tbl_rows(-1)  # Show all rows (or use a large number like 1000)
pl.Config.set_tbl_cols(-1)  # Show all columns (or use a large number like 50)
pl.Config.set_tbl_width_chars(1000)  # Increase table width to prevent column truncation
pl.Config.set_fmt_str_lengths(1000)  # Show longer string values without truncation

polars.config.Config

In [4]:

from pathlib import Path

genobear_folder = (Path.home() / "genobear").absolute().resolve()
current_folder = Path.cwd().absolute().resolve()
if current_folder.name == "notebooks":
    current_folder = current_folder.parent

print(current_folder, genobear_folder)

/home/antonkulaga/sources/genobear /home/antonkulaga/genobear


In [5]:
import genobear as gb
from genobear.io import vcf_to_parquet

Ensembl test
============

In [None]:
from pycomfort import files
variations = Path("/home/antonkulaga/.cache/ensembl_variation")
files.tprint(variations)

ensembl_variation
	homo_sapiens-chr1.vcf.gz.csi
	homo_sapiens-chr1.vcf.gz
	homo_sapiens-chr21.vcf.gz
	homo_sapiens-chr21.vcf
	homo_sapiens-chr22.vcf.gz.csi
	homo_sapiens-chr22.parquet
	homo_sapiens-chr22.vcf.gz
	homo_sapiens-chr22.vcf
	homo_sapiens-chr21.vcf.gz.csi
	homo_sapiens-chr1.vcf
	test
		homo_sapiens-chr1-cleaned.vcf
		homo_sapiens-chr21-cleaned.vcf
		homo_sapiens-chr21-cleaned.parquet
		homo_sapiens-chr21.vcf
		homo_sapiens-chr1.parquet
		homo_sapiens-chr22.vcf
		homo_sapiens-chr1.vcf


In [None]:
def check_semicolons(vcf_file: Path | str = Path("/home/antonkulaga/.cache/ensembl_variation/test/homo_sapiens-chr21.vcf")):
    vcf_file = Path(vcf_file)
    lines = []
    try:
        # Read file and search for ";;" pattern
        with open(vcf_file, 'r') as f:
            for line in f:
                if ";;" in line:
                    line = line.strip()
                    print(line)
                    lines.append(line)
    except FileNotFoundError:
        print(f"File not found: {vcf_file}")
    except Exception as e:
        print(f"Error reading file: {e}")
    return lines

check_semicolons()







21	33248751	rs549962048	A	C,G	.	.	dbSNP_156;TSA=SNV;E_Freq;E_Phenotype_or_Disease;E_ExAC;E_TOPMed;E_gnomAD;CLIN_uncertain_significance;;AA=A


['21\t33248751\trs549962048\tA\tC,G\t.\t.\tdbSNP_156;TSA=SNV;E_Freq;E_Phenotype_or_Disease;E_ExAC;E_TOPMed;E_gnomAD;CLIN_uncertain_significance;;AA=A']

In [17]:
from genobear.io import clean_extra_semicolons

clean_extra_semicolons("/home/antonkulaga/.cache/ensembl_variation/test/homo_sapiens-chr21.vcf", 
                       "/home/antonkulaga/.cache/ensembl_variation/test/homo_sapiens-chr21-cleaned.vcf")







PosixPath('/home/antonkulaga/.cache/ensembl_variation/test/homo_sapiens-chr21-cleaned.vcf')

In [18]:
check_semicolons("/home/antonkulaga/.cache/ensembl_variation/test/homo_sapiens-chr21-cleaned.vcf")

[]

In [19]:
import genobear as gb

vcf_path = "/home/antonkulaga/.cache/ensembl_variation/test/homo_sapiens-chr21-cleaned.vcf"

df_21 = gb.read_vcf_file(vcf_path)

df_21.head(100).collect()

14431660rows [04:09, 57851.75rows/s]


chrom,start,end,id,ref,alt,qual,filter,cosmic_100,dbsnp_156,hgmd-public_20204,clinvar_202409,tsa,e_cited,e_multiple_observations,e_freq,e_topmed,e_hapmap,e_phenotype_or_disease,e_esp,e_gnomad,e_1000g,e_exac,clin_risk_factor,clin_protective,clin_confers_sensitivity,clin_other,clin_drug_response,clin_uncertain_significance,clin_benign,clin_likely_pathogenic,clin_pathogenic,clin_likely_benign,clin_histocompatibility,clin_not_provided,clin_association,ma,maf,mac,aa
str,u32,u32,str,str,str,f64,str,bool,bool,bool,bool,str,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,str,f32,i32,str
"""21""",5025532,5025532,"""rs1879593094""","""G""","""C""",0.0,"""""",False,True,False,False,"""SNV""",False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,,,,"""G"""
"""21""",5029766,5029766,"""rs2123010272""","""T""","""C""",0.0,"""""",False,True,False,False,"""SNV""",False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,,,,"""T"""
"""21""",5029945,5029946,"""rs2123010281""","""GG""","""G""",0.0,"""""",False,True,False,False,"""indel""",False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,,,,"""GG"""
"""21""",5029992,5029992,"""rs2123010290""","""G""","""A""",0.0,"""""",False,True,False,False,"""SNV""",False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,,,,"""G"""
"""21""",5030031,5030031,"""rs2123010295""","""C""","""T""",0.0,"""""",False,True,False,False,"""SNV""",False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,,,,"""C"""
"""21""",5030088,5030088,"""rs1455320509""","""C""","""T""",0.0,"""""",False,True,False,False,"""SNV""",False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,,,,"""C"""
"""21""",5030105,5030105,"""rs1173141359""","""C""","""A""",0.0,"""""",False,True,False,False,"""SNV""",False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,,,,"""C"""
"""21""",5030126,5030126,"""rs2123010324""","""G""","""A""",0.0,"""""",False,True,False,False,"""SNV""",False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,,,,"""G"""
"""21""",5030137,5030137,"""rs2123010330""","""C""","""T""",0.0,"""""",False,True,False,False,"""SNV""",False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,,,,"""C"""
"""21""",5030138,5030138,"""rs2123010341""","""G""","""C""",0.0,"""""",False,True,False,False,"""SNV""",False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,,,,"""G"""


In [20]:
import genobear as gb
from genobear.io import clean_extra_semicolons

chr1 = "/home/antonkulaga/.cache/ensembl_variation/test/homo_sapiens-chr1.vcf"
check_semicolons(chr1)


[]

In [21]:
clean_extra_semicolons(chr1)
check_semicolons(chr1)

[]

In [22]:
import genobear as gb

df_1 = gb.read_vcf_file(chr1)
df_1.head(100).collect()

0rows [00:00, ?rows/s]

86805311rows [25:08, 57533.63rows/s] 


chrom,start,end,id,ref,alt,qual,filter,cosmic_100,dbsnp_156,hgmd-public_20204,clinvar_202409,tsa,e_cited,e_multiple_observations,e_freq,e_topmed,e_hapmap,e_phenotype_or_disease,e_esp,e_gnomad,e_1000g,e_exac,clin_risk_factor,clin_protective,clin_confers_sensitivity,clin_other,clin_drug_response,clin_uncertain_significance,clin_benign,clin_likely_pathogenic,clin_pathogenic,clin_likely_benign,clin_histocompatibility,clin_not_provided,clin_association,ma,maf,mac,aa
str,u32,u32,str,str,str,f64,str,bool,bool,bool,bool,str,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,str,f32,i32,str
"""1""",10001,10001,"""rs1570391677""","""T""","""A|C""",0.0,"""""",False,True,False,False,"""SNV""",False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,,,,
"""1""",10002,10002,"""rs1570391692""","""A""","""C""",0.0,"""""",False,True,False,False,"""SNV""",False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,,,,
"""1""",10003,10003,"""rs1570391694""","""A""","""C""",0.0,"""""",False,True,False,False,"""SNV""",False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,,,,
"""1""",10007,10007,"""rs1639538116""","""T""","""C|G""",0.0,"""""",False,True,False,False,"""SNV""",False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,,,,
"""1""",10008,10008,"""rs1570391698""","""A""","""C|G|T""",0.0,"""""",False,True,False,False,"""SNV""",False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,,,,
"""1""",10009,10009,"""rs1570391702""","""A""","""C|G""",0.0,"""""",False,True,False,False,"""SNV""",False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,,,,
"""1""",10013,10013,"""rs1639538192""","""T""","""C|G""",0.0,"""""",False,True,False,False,"""SNV""",False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,,,,
"""1""",10014,10014,"""rs1639538207""","""A""","""C|G|T""",0.0,"""""",False,True,False,False,"""SNV""",False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,,,,
"""1""",10014,10015,"""rs1639538231""","""AA""","""A""",0.0,"""""",False,True,False,False,"""indel""",False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,,,,
"""1""",10015,10015,"""rs1570391706""","""A""","""C|G|T""",0.0,"""""",False,True,False,False,"""SNV""",False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,,,,


In [23]:
merged = pl.concat([df_21, df_1])
merged.head(10).collect()

chrom,start,end,id,ref,alt,qual,filter,cosmic_100,dbsnp_156,hgmd-public_20204,clinvar_202409,tsa,e_cited,e_multiple_observations,e_freq,e_topmed,e_hapmap,e_phenotype_or_disease,e_esp,e_gnomad,e_1000g,e_exac,clin_risk_factor,clin_protective,clin_confers_sensitivity,clin_other,clin_drug_response,clin_uncertain_significance,clin_benign,clin_likely_pathogenic,clin_pathogenic,clin_likely_benign,clin_histocompatibility,clin_not_provided,clin_association,ma,maf,mac,aa
str,u32,u32,str,str,str,f64,str,bool,bool,bool,bool,str,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,str,f32,i32,str
"""21""",5025532,5025532,"""rs1879593094""","""G""","""C""",0.0,"""""",False,True,False,False,"""SNV""",False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,,,,"""G"""
"""21""",5029766,5029766,"""rs2123010272""","""T""","""C""",0.0,"""""",False,True,False,False,"""SNV""",False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,,,,"""T"""
"""21""",5029945,5029946,"""rs2123010281""","""GG""","""G""",0.0,"""""",False,True,False,False,"""indel""",False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,,,,"""GG"""
"""21""",5029992,5029992,"""rs2123010290""","""G""","""A""",0.0,"""""",False,True,False,False,"""SNV""",False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,,,,"""G"""
"""21""",5030031,5030031,"""rs2123010295""","""C""","""T""",0.0,"""""",False,True,False,False,"""SNV""",False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,,,,"""C"""
"""21""",5030088,5030088,"""rs1455320509""","""C""","""T""",0.0,"""""",False,True,False,False,"""SNV""",False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,,,,"""C"""
"""21""",5030105,5030105,"""rs1173141359""","""C""","""A""",0.0,"""""",False,True,False,False,"""SNV""",False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,,,,"""C"""
"""21""",5030126,5030126,"""rs2123010324""","""G""","""A""",0.0,"""""",False,True,False,False,"""SNV""",False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,,,,"""G"""
"""21""",5030137,5030137,"""rs2123010330""","""C""","""T""",0.0,"""""",False,True,False,False,"""SNV""",False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,,,,"""C"""
"""21""",5030138,5030138,"""rs2123010341""","""G""","""C""",0.0,"""""",False,True,False,False,"""SNV""",False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,,,,"""G"""


In [None]:
merged.count().collect()

In [24]:
downloader = gb.EnsemblDownloader.for_chromosomes(["21", "1", "22"], clean_semicolons=True)
results = downloader.download_all()

86805311rows [21:19, 67826.22rows/s] 


In [None]:
downloader.lazy_frames['chr21'].head().collect()

chrom,start,end,id,ref,alt,qual,filter,cosmic_100,dbsnp_156,hgmd-public_20204,clinvar_202409,tsa,e_cited,e_multiple_observations,e_freq,e_topmed,e_hapmap,e_phenotype_or_disease,e_esp,e_gnomad,e_1000g,e_exac,clin_risk_factor,clin_protective,clin_confers_sensitivity,clin_other,clin_drug_response,clin_uncertain_significance,clin_benign,clin_likely_pathogenic,clin_pathogenic,clin_likely_benign,clin_histocompatibility,clin_not_provided,clin_association,ma,maf,mac,aa
str,u32,u32,str,str,str,f64,str,bool,bool,bool,bool,str,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,str,f32,i32,str
"""21""",5025532,5025532,"""rs1879593094""","""G""","""C""",0.0,"""""",False,True,False,False,"""SNV""",False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,,,,"""G"""
"""21""",5029766,5029766,"""rs2123010272""","""T""","""C""",0.0,"""""",False,True,False,False,"""SNV""",False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,,,,"""T"""
"""21""",5029945,5029946,"""rs2123010281""","""GG""","""G""",0.0,"""""",False,True,False,False,"""indel""",False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,,,,"""GG"""
"""21""",5029992,5029992,"""rs2123010290""","""G""","""A""",0.0,"""""",False,True,False,False,"""SNV""",False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,,,,"""G"""
"""21""",5030031,5030031,"""rs2123010295""","""C""","""T""",0.0,"""""",False,True,False,False,"""SNV""",False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,,,,"""C"""


In [None]:
downloader.lazy_frames['chr1'].head().collect()

chromosome,position,id,reference,alternate,quality,filter,info
i64,i64,str,str,str,str,str,str
1,10001,"""rs1570391677""","""T""","""A,C""",,,"""dbSNP_156;TSA=SNV;E_Freq"""
1,10002,"""rs1570391692""","""A""","""C""",,,"""dbSNP_156;TSA=SNV;E_Freq"""
1,10003,"""rs1570391694""","""A""","""C""",,,"""dbSNP_156;TSA=SNV;E_Freq"""
1,10007,"""rs1639538116""","""T""","""C,G""",,,"""dbSNP_156;TSA=SNV;E_Freq"""
1,10008,"""rs1570391698""","""A""","""C,G,T""",,,"""dbSNP_156;TSA=SNV;E_Freq"""


In [None]:
print(downloader.download_results['chr21'])

vcf=None index=None parquet=PosixPath('/home/antonkulaga/.cache/ensembl_variation/homo_sapiens-chr21.parquet') lazy_frame=<LazyFrame at 0x7C1E20FD9A90>


In [None]:
pb.describe_vcf(downloader.download_results['chr22'].vcf)

KeyError: 'chr22'

In [None]:
pb.describe_vcf(str(downloader.download_results['chr21'].vcf))


thread '<unnamed>' panicked at /root/.cargo/git/checkouts/datafusion-bio-formats-f4a7f32bff6627c2/c972aaa/datafusion/bio-format-vcf/src/storage.rs:275:68:
called `Result::unwrap()` on an `Err` value: Os { code: 2, kind: NotFound, message: "No such file or directory" }
note: run with `RUST_BACKTRACE=1` environment variable to display a backtrace


PanicException: called `Result::unwrap()` on an `Err` value: Os { code: 2, kind: NotFound, message: "No such file or directory" }

In [None]:

db = downloader.read_merged_parquet()
db.head().collect()

ComputeError: schema lengths differ

Resolved plan until failure:

	---> FAILED HERE RESOLVING 'slice' <---
Parquet SCAN [/home/antonkulaga/.cache/ensembl_variation/homo_sapiens-chr21.parquet]
PROJECT */40 COLUMNS

In [None]:
test = pb.read_vcf("/home/antonkulaga/.cache/ensembl_variation/homo_sapiens-chr21.vcf", streaming=True)

In [None]:
test.head().collect()


thread '<unnamed>' panicked at src/streaming.rs:23:9:
not implemented


PanicException: not implemented

In [None]:
import genobear as gb
from pathlib import Path
chr21  = gb.read_vcf_file(Path("/home/antonkulaga/data/ensembl/homo_sapiens-chr21.vcf"), info_fields=[])
chr21

  from .autonotebook import tqdm as notebook_tqdm
9355264rows [01:10, 129406.38rows/s]
thread '<unnamed>' panicked at /root/.cargo/git/checkouts/datafusion-bio-formats-f4a7f32bff6627c2/c972aaa/datafusion/bio-format-vcf/src/physical_exec.rs:174:36:
called `Result::unwrap()` on an `Err` value: Kind(UnexpectedEof)
note: run with `RUST_BACKTRACE=1` environment variable to display a backtrace
9363456rows [01:10, 132306.63rows/s]


ComputeError: caught exception during execution of a Python source, exception: PanicException: called `Result::unwrap()` on an `Err` value: Kind(UnexpectedEof)

In [None]:
import biobear as bb
sessions = bb.new_session()
df =sessions.read_vcf_file("/home/antonkulaga/.cache/ensembl_variation/homo_sapiens-chr21.vcf").to_polars()
    

ArrowInvalid: C Data interface error: External error: Arrow error: External error: Io error: unexpected end of file

In [None]:
df

AttributeError: 'builtins.ExecutionResult' object has no attribute 'head'

Clinvar descriptions
====================

In [28]:
clinvar_folder= genobear_folder / "databases" / "clinvar"
clinvar_vcf_gz = clinvar_folder / "hg38" / "clinvar.vcf.gz"
clinvar_vcf = clinvar_folder / "hg38" / "clinvar.vcf"

assert clinvar_vcf.exists(), f"Clinvar VCF file {clinvar_vcf} does not exist"

clinvar_parquet = clinvar_folder / "clinvar_hg38.parquet"

In [29]:
pb.describe_vcf(str(clinvar_vcf))

name,type,description
str,str,str
"""AF_ESP""","""Float""","""allele frequencies from GO-ESP"""
"""AF_EXAC""","""Float""","""allele frequencies from ExAC"""
"""AF_TGP""","""Float""","""allele frequencies from TGP"""
"""ALLELEID""","""Integer""","""the ClinVar Allele ID"""
"""CLNDN""","""String""","""ClinVar's preferred disease name for the concept specified by disease identifiers in CLNDISDB"""
"""CLNDNINCL""","""String""","""For included Variant : ClinVar's preferred disease name for the concept specified by disease identifiers in CLNDISDB"""
"""CLNDISDB""","""String""","""Tag-value pairs of disease database name and identifier submitted for germline classifications, e.g. OMIM:NNNNNN"""
"""CLNDISDBINCL""","""String""","""For included Variant: Tag-value pairs of disease database name and identifier for germline classifications, e.g. OMIM:NNNNNN"""
"""CLNHGVS""","""String""","""Top-level (primary assembly, alt, or patch) HGVS expression."""
"""CLNREVSTAT""","""String""","""ClinVar review status of germline classification for the Variation ID"""


In [30]:
gb.get_info_fields(str(clinvar_vcf))

AttributeError: module 'genobear' has no attribute 'get_info_fields'

In [None]:
gb.get_info_fields(str(clinvar_vcf))

AttributeError: module 'genobear' has no attribute 'get_info_fields'

In [None]:
gb.get_info_fields(str(clinvar_vcf))

AttributeError: module 'genobear' has no attribute 'get_info_fields'

In [6]:
from genobear.io import read_vcf_file
vcf = clinvar_folder / "hg38" / "clinvar.vcf"
clinvar = read_vcf_file(vcf, to_parquet=False, streaming=False)
clinvar.head().collect()

chrom,start,end,id,ref,alt,qual,filter,af_esp,af_exac,af_tgp,alleleid,clndn,clndnincl,clndisdb,clndisdbincl,clnhgvs,clnrevstat,clnsig,clnsigconf,clnsigincl,clnsigscv,clnvc,clnvcso,clnvi,dbvarid,geneinfo,mc,oncdn,oncdnincl,oncdisdb,oncdisdbincl,onc,oncincl,oncrevstat,oncscv,oncconf,origin,rs,scidn,scidnincl,scidisdb,scidisdbincl,scirevstat,sci,sciincl,sciscv
str,u32,u32,str,str,str,f64,str,f32,f32,f32,i32,list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],str,str,list[str],list[str],str,list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str]
"""1""",66926,66927,"""3385321""","""AG""","""A""",0.0,"""""",,,,3544463,"[""Retinitis_pigmentosa""]",,"[""Human_Phenotype_Ontology:HP:0000547"", ""MONDO:MONDO:0019200"", … ""Orphanet:791""]",,"[""NC_000001.11:g.66927del""]","[""criteria_provided"", ""_single_submitter""]","[""Uncertain_significance""]",,,"[""SCV005419006""]","""Deletion""","""SO:0000159""",,,"""OR4F5:79501""","[""SO:0001627|intron_variant""]",,,,,,,,,,"[""0""]",,,,,,,,,
"""1""",69134,69134,"""2205837""","""A""","""G""",0.0,"""""",,,,2193183,"[""not_specified""]",,"[""MedGen:CN169374""]",,"[""NC_000001.11:g.69134A>G""]","[""criteria_provided"", ""_single_submitter""]","[""Likely_benign""]",,,"[""SCV003526545""]","""single_nucleotide_variant""","""SO:0001483""","[""ClinGen:CA502008""]",,"""OR4F5:79501""","[""SO:0001583|missense_variant""]",,,,,,,,,,"[""1""]","[""781394307""]",,,,,,,,
"""1""",69308,69308,"""3925305""","""A""","""G""",0.0,"""""",,,,4039319,"[""not_specified""]",,"[""MedGen:CN169374""]",,"[""NC_000001.11:g.69308A>G""]","[""criteria_provided"", ""_single_submitter""]","[""Uncertain_significance""]",,,"[""SCV006120601""]","""single_nucleotide_variant""","""SO:0001483""",,,"""OR4F5:79501""","[""SO:0001583|missense_variant""]",,,,,,,,,,"[""1""]",,,,,,,,,
"""1""",69314,69314,"""3205580""","""T""","""G""",0.0,"""""",,,,3374047,"[""not_specified""]",,"[""MedGen:CN169374""]",,"[""NC_000001.11:g.69314T>G""]","[""criteria_provided"", ""_single_submitter""]","[""Uncertain_significance""]",,,"[""SCV004995495""]","""single_nucleotide_variant""","""SO:0001483""","[""ClinGen:CA338197388""]",,"""OR4F5:79501""","[""SO:0001583|missense_variant""]",,,,,,,,,,"[""1""]","[""2521653848""]",,,,,,,,
"""1""",69404,69404,"""3925306""","""T""","""C""",0.0,"""""",,,,4039320,"[""not_specified""]",,"[""MedGen:CN169374""]",,"[""NC_000001.11:g.69404T>C""]","[""criteria_provided"", ""_single_submitter""]","[""Uncertain_significance""]",,,"[""SCV006120602""]","""single_nucleotide_variant""","""SO:0001483""",,,"""OR4F5:79501""","[""SO:0001583|missense_variant""]",,,,,,,,,,"[""1""]",,,,,,,,,


Load Genomic Sample
===================

In [None]:
from genobear.io import read_vcf_file
from pathlib import Path

data_folder = current_folder / "data"
test_folder = data_folder / "tests"

assert test_folder.exists(), f"Test folder {test_folder} does not exist"


In [15]:
sample_vcf = test_folder / "antonkulaga.vcf"
sample = read_vcf_file(sample_vcf, info_fields=[], streaming=False)
sample.head().collect()

chrom,start,end,id,ref,alt,qual,filter
str,u32,u32,str,str,str,f64,str
"""1""",10009,10009,"""""","""A""","""AC""",0.0,"""RefCall"""
"""1""",10015,10015,"""""","""A""","""G""",0.0,"""RefCall"""
"""1""",10021,10021,"""""","""A""","""G""",0.0,"""RefCall"""
"""1""",10027,10027,"""""","""A""","""G""",0.0,"""RefCall"""
"""1""",10033,10033,"""""","""A""","""G""",0.0,"""RefCall"""


## Annotation with clinvar

In [23]:
sample.columns

  sample.columns


['chrom', 'start', 'end', 'id', 'ref', 'alt', 'qual', 'filter']

In [None]:
sample.join(clinvar, on=["chrom", "start", "end"], how="left").collect()

In [21]:
pb.describe_vcf(str(clinvar_vcf))

name,type,description
str,str,str
"""AF_ESP""","""Float""","""allele frequencies from GO-ESP"""
"""AF_EXAC""","""Float""","""allele frequencies from ExAC"""
"""AF_TGP""","""Float""","""allele frequencies from TGP"""
"""ALLELEID""","""Integer""","""the ClinVar Allele ID"""
"""CLNDN""","""String""","""ClinVar's preferred disease name for the concept specified by disease identifiers in CLNDISDB"""
"""CLNDNINCL""","""String""","""For included Variant : ClinVar's preferred disease name for the concept specified by disease identifiers in CLNDISDB"""
"""CLNDISDB""","""String""","""Tag-value pairs of disease database name and identifier submitted for germline classifications, e.g. OMIM:NNNNNN"""
"""CLNDISDBINCL""","""String""","""For included Variant: Tag-value pairs of disease database name and identifier for germline classifications, e.g. OMIM:NNNNNN"""
"""CLNHGVS""","""String""","""Top-level (primary assembly, alt, or patch) HGVS expression."""
"""CLNREVSTAT""","""String""","""ClinVar review status of germline classification for the Variation ID"""


In [22]:
# Test the function with both regular and gzipped VCF files
sample_small = read_vcf_file(sample_small_vcf)

NameError: name 'sample_small_vcf' is not defined

In [None]:
data_folder = Path("../data").absolute().resolve()
test_folder = data_folder / "test"

files.tprint(test_folder)


test
	longevity_snps_1000genom.vcf.gz
	antonkulaga.vcf
	antku_small.vcf
	longevity_snps_1000genom.vcf


## Samples

Loading samples

In [None]:
sample_antku = pb.read_vcf(str(test_folder / "antku_small.vcf"))
sample_antonkulaga = pb.read_vcf(str(test_folder / "antonkulaga.vcf"))
sample_antku.head().collect()

chrom,start,end,id,ref,alt,qual,filter
str,u32,u32,str,str,str,f64,str
"""1""",10009,10009,"""""","""A""","""AC""",0.0,"""RefCall"""
"""1""",10015,10015,"""""","""A""","""G""",0.0,"""RefCall"""
"""1""",10021,10021,"""""","""A""","""G""",0.0,"""RefCall"""
"""1""",10027,10027,"""""","""A""","""G""",0.0,"""RefCall"""
"""1""",10033,10033,"""""","""A""","""G""",0.0,"""RefCall"""


In [None]:
sample_antku.count().collect()

793rows [00:00, 750300.72rows/s]


chrom,start,end,id,ref,alt,qual,filter
u32,u32,u32,u32,u32,u32,u32,u32
793,793,793,793,793,793,793,793


In [None]:
sample_antonkulaga.count().collect()

6079744rows [00:04, 1414880.55rows/s]


chrom,start,end,id,ref,alt,qual,filter
u32,u32,u32,u32,u32,u32,u32,u32
6079744,6079744,6079744,6079744,6079744,6079744,6079744,6079744
