# In silico mutation effects on Alzheimer's disease risk

In [1]:
# Essential imports
import sys
from pathlib import Path
import warnings

warnings.filterwarnings("ignore")

sys.path.append(str(Path.cwd().parent))

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

from processors.variantprocessor import VariantProcessor

## Set up VariantProcessor to generate embeddings
**model_class**: `v4_ag` (all-genes model, trained on both protein-coding and non-coding genes) or `v4_pcg`

In [2]:
# Model class to load: 'v4_ag' or 'v4_pcg'.
# The AG model is the all-genes model trained on both protein-coding and non-coding genes.
model_class = 'v4_pcg'
# model_class = 'v4_ag'  # Uncomment to use the AG (all-genes) model instead
# The processor is built once here and reused by the prediction cells below.
vep = VariantProcessor(model_class=model_class)

In [3]:
# Example VCF integration parameters
# The VCF is resolved relative to the parent of the current working directory;
# change the path as needed. It is kept as a plain string for vep.predict.
vcf_path = str(Path.cwd().parent / "_artifacts" / "HG00096.vcf.gz")
sample_name = "HG00096"  # European sample from 1000 Genomes Project

# Variant for in silico analysis. The alt allele will be placed in the sample
# genotype context at that specific position. Ref is from the reference genome.
vcf_variant_data = {
    "chr": ["chr19"],
    "pos": [44908684],
    "ref": ["T"],  # reference allele in hg38
    "alt": ["C"],  # in silico edited allele
    "tissue": ["brain - cortex"],
    "gene_id": ["ENSG00000130203.9"],
}

# One row per (variant, tissue, gene) combination to score.
vcf_variant_df = pd.DataFrame(vcf_variant_data)
print("\n🧬 Analyzing the same variant with individual genotype data:")
print(vcf_variant_df.to_string(index=False))


🧬 Analyzing the same variant with individual genotype data:
  chr      pos ref alt         tissue           gene_id
chr19 44908684   T   C brain - cortex ENSG00000130203.9


In [7]:
# Run VariantFormer with VCF integration
print("\n🔬 Running VariantFormer analysis with VCF integration...")
print("⏳ Processing individual genotype data...")

# Score the variant table using the genotype of `sample_name` read from the VCF.
# The returned table is kept for the AD-risk cells further down.
vcf_predictions = vep.predict(
    var_df=vcf_variant_df,
    output_dir="/tmp/vep_output_vcf",
    vcf_path=vcf_path,
    sample_name=sample_name,
)

print("✅ VCF-based predictions completed!")

2025-11-02 21:31:28 - processors.variantprocessor - INFO - Initializing Variant Processor...
2025-11-02 21:31:28 - processors.multi_datasets_loader - INFO - Loading gene annotations...
2025-11-02 21:31:28 - processors.multi_datasets_loader - INFO - Loading CRE annotations...



🔬 Running VariantFormer analysis with VCF integration...
⏳ Processing individual genotype data...


2025-11-02 21:31:29 - processors.variantprocessor - INFO - Loading variants...
2025-11-02 21:31:29 - processors.multi_datasets_loader - INFO - Loaded 1 variants
2025-11-02 21:31:29 - processors.variantprocessor - INFO - Loaded 1 variants for processing
2025-11-02 21:31:29 - processors.variantprocessor - INFO - Mapped 1 gene-variant pairs
2025-11-02 21:31:29 - processors.variantprocessor - INFO - Loading BPE encoder...
2025-11-02 21:31:29 - processors.variantprocessor - INFO - Loading model...
2025-11-02 21:31:29 - processors.model_manager - INFO - Loading Seq2Reg model...


Loaded BPE vocabulary from /work/vocabs/bpe_vocabulary_500_using_huggingface.json


2025-11-02 21:31:29 - processors.model_manager - INFO - Loading Seq2Reg gene model...
2025-11-02 21:31:29 - processors.model_manager - INFO - Creating Seq2Gene model...
2025-11-02 21:31:35 - processors.model_manager - INFO - Model class: <class 'seq2gene.model_combined_modulator.Seq2GenePredictorCombinedModulator'>
2025-11-02 21:31:35 - processors.model_manager - INFO - Model architecture:
2025-11-02 21:31:35 - processors.model_manager - INFO - Model: Seq2GenePredictorCombinedModulator
2025-11-02 21:31:35 - processors.model_manager - INFO -   start_tkn: 96,768 params
2025-11-02 21:31:35 - processors.model_manager - INFO -   cre_tokenizer: 31,826,153 params
2025-11-02 21:31:35 - processors.model_manager - INFO -   gene_tokenizer: 31,826,153 params
2025-11-02 21:31:35 - processors.model_manager - INFO -   gene_map: 787,968 params
2025-11-02 21:31:35 - processors.model_manager - INFO -   cre_map: 787,968 params
2025-11-02 21:31:35 - processors.model_manager - INFO -   combined_modulator: 

Predicting: |          | 0/? [00:00<?, ?it/s]

2025-11-02 21:31:41 - utils.assets - INFO - Downloading from S3: s3://czi-variantformer/model/common/cres_all_genes_manifest.parquet
2025-11-02 21:31:41 - utils.assets - INFO - Loading parquet file: /tmp/tmp1tbt58o_/model/common/reference_genomes/cres_seqs_manifest.parquet
2025-11-02 21:31:41 - utils.assets - INFO - Validated schema - found columns: {'population', 'file_path', '__index_level_0__', 'chromosome'}
2025-11-02 21:31:41 - utils.assets - INFO - Downloading from S3: s3://czi-variantformer/model/common/reference_genomes/data_split/hg38/cres/data_split/hg38_chr19.pkl.gz
2025-11-02 21:31:42 - utils.assets - INFO - Loading parquet file: /tmp/tmp6bcf8u3k/model/common/cres_all_genes_manifest.parquet
2025-11-02 21:31:42 - utils.assets - INFO - Validated schema - found columns: {'file_path', 'gene_id'}
2025-11-02 21:31:42 - utils.assets - INFO - Downloading from S3: s3://czi-variantformer/model/common/cres_all_genes/ENSG00000130203.9/gene_vocab.csv
2025-11-02 21:31:43 - utils.assets -

✅ VCF-based predictions completed!


In [8]:
# Display the prediction table returned by vep.predict for this variant.
vcf_predictions

Unnamed: 0,chrom,pos,ref,alt,genes,tissues,variant_type,population,sample_name,zygosity,gene_exp,gene_emb,gene_token_embedding,cre_token_embedding
0,chr19,44908684,T,C,ENSG00000130203.9,brain - cortex,Gene and CRE overlap,SAMPLE,HG00096,2,6.4079,"[7.6875, 1.859375, 3.765625, 5.96875, 0.009765...","[3.0, -1.5546875, -3.65625, 5.1875, -2.921875,...","[-0.095703125, -0.26367188, 0.13671875, -0.015..."
1,chr19,44908684,T,C,ENSG00000130203.9,brain - cortex,Gene and CRE overlap,SAMPLE,HG00096,1,6.439099,"[7.9375, 1.890625, 3.8125, 6.03125, -0.1767578...","[3.0625, -1.6328125, -3.703125, 5.125, -2.7343...","[-0.14453125, -0.30859375, 0.047851562, 0.0688..."
2,chr19,44908684,T,C,ENSG00000130203.9,brain - cortex,Gene and CRE overlap,SAMPLE,HG00096,0,6.4079,"[7.6875, 1.8046875, 3.765625, 6.03125, -0.0214...","[3.03125, -1.5546875, -3.609375, 5.21875, -2.8...","[-0.1171875, -0.18457031, 0.12451172, 0.008911..."
3,chr19,44908684,T,C,ENSG00000130203.9,brain - cortex,Gene and CRE overlap,REF_HG38,hg38,2,6.439099,"[7.65625, 1.8984375, 3.390625, 5.84375, -0.535...","[2.96875, -1.21875, -4.28125, 4.5, -2.875, -1....","[-0.115234375, -0.24609375, 0.115722656, -0.00..."
4,chr19,44908684,T,C,ENSG00000130203.9,brain - cortex,Gene and CRE overlap,REF_HG38,hg38,1,6.439099,"[7.9375, 1.9609375, 3.46875, 5.8125, -0.6875, ...","[3.046875, -1.171875, -4.1875, 4.40625, -2.859...","[-0.15625, -0.31054688, 0.045898438, 0.0678710..."
5,chr19,44908684,T,C,ENSG00000130203.9,brain - cortex,Gene and CRE overlap,REF_HG38,hg38,0,6.4079,"[7.78125, 1.875, 3.421875, 5.8125, -0.55859375...","[3.015625, -1.171875, -4.3125, 4.46875, -2.906...","[-0.12792969, -0.16796875, 0.10546875, 0.01928..."


**Prediction Output Schema**
- `chrom`, `pos`, `ref`, `alt`: Variant coordinates and alleles
- `genes`: Ensembl gene ID
- `tissues`: Tissue name
- `population`: Population code (REF_HG38, EAS, EUR, AFR, SAS, AMR), or `SAMPLE` for rows predicted in the genotype context of the sample read from the VCF
- `sample_name`: Sample identifier (e.g. the 1000 Genomes ID `HG00096`), or `hg38` for the reference-genome rows
- `zygosity`: 0 (homozygous ref), 1 (heterozygous), 2 (homozygous alt)
- `gene_exp`: Predicted gene expression level for the gene in the specified tissue and sample
- `variant_type`: Whether the variant overlaps the gene, a regulatory element (CRE), or both (e.g. `Gene and CRE overlap`)
- `gene_emb`: Embedding of the gene conditioned on tissue and regulatory context from the last transformer layer
- `gene_token_embedding`: Embedding of the token that overlaps with the variant position from the last transformer layer
- `cre_token_embedding`: Embeddings of regulatory elements overlapping with the variant position from the last transformer layer


## Predict AD risk with and without in silico edits

In [9]:
from processors import ad_risk

### Select the rows that correspond to each edit

In [10]:
# Select the prediction rows for the individual sample (not the hg38 reference rows)
# at the two zygosity levels of interest: 0 = no edit, 2 = homozygous edit.
# Zygosity is compared as text so the filter matches whether the column holds
# strings ('0', '2') or integers (0, 2); comparing an integer column directly
# against '0' would silently select nothing.
zygosity_label = vcf_predictions['zygosity'].astype(str)
is_sample_row = vcf_predictions['sample_name'] == sample_name

gene_embedding_without_insilico_edit = vcf_predictions[(zygosity_label == '0') & is_sample_row]
gene_embedding_with_homozygous_insilico_edit = vcf_predictions[(zygosity_label == '2') & is_sample_row]

In [11]:
# Load the tissue vocabulary exposed by the VariantProcessor. It is indexed by
# tissue name in the cells below to obtain the tissue_id passed to the AD-risk model.
tissue_vocab = vep.tissue_vocab

In [12]:
# Baseline: AD risk for the sample rows selected without the in silico edit.
for row in gene_embedding_without_insilico_edit.itertuples():
    gene_id = row.genes
    tissue_id = tissue_vocab[row.tissues]  # tissue name -> id expected by ADrisk
    embd = row.gene_emb.reshape(1, -1)  # reshape to a single-row 2-D input
    # Build the predictor for this (model class, gene, tissue), then score the embedding.
    risk_predictor = ad_risk.ADrisk(model_class=model_class, gene_id=gene_id, tissue_id=tissue_id)
    risk = risk_predictor(embd)
    print(f"Gene: {gene_id}, Tissue: {row.tissues}, Risk without insilico edit: {risk[0]:.4f}")


2025-11-02 21:32:01 - utils.assets - INFO - Downloading from S3: s3://czi-variantformer/alzheimer_disease/v4_pcg/manifest.parquet
2025-11-02 21:32:03 - utils.assets - INFO - Loading parquet file: /tmp/tmpf496b595/alzheimer_disease/v4_pcg/manifest.parquet
2025-11-02 21:32:03 - utils.assets - INFO - Validated schema - found columns: {'file_path', 'gene_id', 'tissue_id'}
2025-11-02 21:32:03 - utils.assets - INFO - Downloading from S3: s3://czi-variantformer/alzheimer_disease/v4_pcg/ad/ENSG00000130203.9/model_tissue_20.tl


Gene: ENSG00000130203.9, Tissue: brain - cortex, Risk without insilico edit: 0.6713


In [13]:
# Edited: AD risk for the sample rows selected with the homozygous in silico edit.
for row in gene_embedding_with_homozygous_insilico_edit.itertuples():
    gene_id = row.genes
    tissue_id = tissue_vocab[row.tissues]  # tissue name -> id expected by ADrisk
    embd = row.gene_emb.reshape(1, -1)  # reshape to a single-row 2-D input
    # Build the predictor for this (model class, gene, tissue), then score the embedding.
    risk_predictor = ad_risk.ADrisk(model_class=model_class, gene_id=gene_id, tissue_id=tissue_id)
    risk = risk_predictor(embd)
    print(f"Gene: {gene_id}, Tissue: {row.tissues}, Risk with homozygous insilico edit: {risk[0]:.4f}")

2025-11-02 21:32:03 - utils.assets - INFO - Downloading from S3: s3://czi-variantformer/alzheimer_disease/v4_pcg/manifest.parquet
2025-11-02 21:32:05 - utils.assets - INFO - Loading parquet file: /tmp/tmp4i1f29t9/alzheimer_disease/v4_pcg/manifest.parquet
2025-11-02 21:32:05 - utils.assets - INFO - Validated schema - found columns: {'file_path', 'gene_id', 'tissue_id'}
2025-11-02 21:32:05 - utils.assets - INFO - Downloading from S3: s3://czi-variantformer/alzheimer_disease/v4_pcg/ad/ENSG00000130203.9/model_tissue_20.tl


Gene: ENSG00000130203.9, Tissue: brain - cortex, Risk with homozygous insilico edit: 0.7085
