# In silico mutation effect on Alzheimer's disease risk

In [None]:
# Essential imports
import sys
from pathlib import Path
import warnings

# Suppress all Python warnings for the rest of the session.
warnings.filterwarnings("ignore")

import ipynbname

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

from processors.variantprocessor import VariantProcessor
# Two directory levels above the path reported by ipynbname.path();
# presumably the repository root (confirm against the repo layout).
# Used later to locate the example VCF under "_artifacts/".
REPO_PATH = ipynbname.path().parent.parent

## Set up VariantProcessor to generate embeddings
**model_class** : `v4_ag` or `v4_pcg`

In [None]:
# The same model_class value is passed again to ad_risk.ADrisk further down.
model_class = 'v4_pcg' # model class can be 'v4_ag', 'v4_pcg'. AG model is all-genes model trained on both protein-coding and non-coding genes.
# model_class = 'v4_ag' # Uncomment to use the AG (all-genes) model instead of the PCG model
vep = VariantProcessor(model_class=model_class)

In [None]:
# Example VCF integration parameters
vcf_path = os.path.join(REPO_PATH, "_artifacts/HG00096.vcf.gz")  # Change path as needed
sample_name = "HG00096"  # European sample from 1000 Genomes Project

# Variant used for the in silico analysis. The alt allele is placed into the
# sample's genotype context at this position; the ref allele comes from the
# reference genome.
# NOTE(review): these coordinates look like the APOE rs429358 variant - confirm.
vcf_variant_data = {
    "chr": ["chr19"],
    "pos": [44908684],
    "ref": ["T"],  # reference allele in hg38
    "alt": ["C"],  # in silico edited allele
    "tissue": ["brain - cortex"],
    "gene_id": ["ENSG00000130203.9"],
}

# One row per (variant, tissue, gene) combination to score.
vcf_variant_df = pd.DataFrame(vcf_variant_data)
# The emoji below was previously stored as mojibake ("üß¨"); restored to the
# intended character so the message renders correctly.
print("\n🧬 Analyzing the same variant with individual genotype data:")
print(vcf_variant_df.to_string(index=False))

In [None]:
# Run VariantFormer with VCF integration.
# The status messages previously contained mojibake ("üî¨", "‚è≥", "‚úÖ");
# the intended emoji are restored so the output renders correctly.
print("\n🔬 Running VariantFormer analysis with VCF integration...")
print("⏳ Processing individual genotype data...")
output_dir = "/tmp/variant_risk_output"  # Change output directory as needed

# Score the variant table in the genotype context of `sample_name` read from
# `vcf_path`; the returned predictions are inspected and filtered below.
vcf_predictions = vep.predict(
    var_df=vcf_variant_df,
    output_dir=output_dir,
    vcf_path=vcf_path,
    sample_name=sample_name,
)

print("✅ VCF-based predictions completed!")

In [None]:
# Display the predictions returned by vep.predict for inspection.
vcf_predictions

**Prediction Output Schema**
- `chrom`, `pos`, `ref`, `alt`: Variant coordinates and alleles
- `genes`: Ensembl Gene ID
- `tissue`: Tissue name
- `population`: Population code (REF_HG38, EAS, EUR, AFR, SAS, AMR)
- `sample_name`: 1000 Genomes sample identifier
- `zygosity`: 0 (homozygous ref), 1 (heterozygous), 2 (homozygous alt)
- `gene_exp`: Predicted gene expression level for the gene in the specified tissue and sample
- `variant_type`: Gene Overlap / CRE Overlap
- `gene_emb`: Embedding of the gene conditioned on tissue and regulatory context from the last transformer layer
- `gene_token_embedding`:  Embedding of the token that overlaps with the variant position from the last transformer layer
- `cre_token_embedding`: Embeddings of regulatory elements overlapping with the variant position from the last transformer layer


## Predict AD risk with and without in silico edits

In [None]:
from processors import ad_risk

### Select the rows that correspond to each edit

In [None]:
# Compare zygosity as strings so the selection works whether the column holds
# integers (0/1/2, as the schema lists them) or their string forms. Comparing
# an integer column against '0' directly would silently select no rows.
zygosity = vcf_predictions['zygosity'].astype(str)
is_target_sample = vcf_predictions['sample_name'] == sample_name

# Rows for the sample without the edit (homozygous reference at the variant).
gene_embedding_without_insilico_edit = vcf_predictions[(zygosity == '0') & is_target_sample]
# Rows for the sample carrying the edit on both alleles (homozygous alt).
gene_embedding_with_homozygous_insilico_edit = vcf_predictions[(zygosity == '2') & is_target_sample]

In [None]:
# Load tissue vocab from the VariantProcessor. It is indexed by tissue name
# below to obtain the tissue_id passed to ad_risk.ADrisk.
tissue_vocab = vep.tissue_vocab

In [None]:
# Score every row selected for the unedited genotype and print its AD risk.
# NOTE(review): the schema text lists a `tissue` column but this loop reads
# `tissues` - confirm the actual column name in the predictions table.
for row in gene_embedding_without_insilico_edit.itertuples():
    gene_id, tissue_name = row.genes, row.tissues
    tissue_id = tissue_vocab[tissue_name]
    # Reshape the gene embedding to a single-row 2-D array for the predictor.
    embd = row.gene_emb.reshape(1, -1)
    # A predictor is built per row from the model class, gene and tissue.
    predictor = ad_risk.ADrisk(model_class=model_class, gene_id=gene_id, tissue_id=tissue_id)
    risk = predictor(embd)
    print(f"Gene: {gene_id}, Tissue: {tissue_name}, Risk without insilico edit: {risk[0]:.4f}")


In [None]:
# Score every row selected for the homozygous edited genotype and print its
# AD risk.
# NOTE(review): the schema text lists a `tissue` column but this loop reads
# `tissues` - confirm the actual column name in the predictions table.
for row in gene_embedding_with_homozygous_insilico_edit.itertuples():
    gene_id, tissue_name = row.genes, row.tissues
    tissue_id = tissue_vocab[tissue_name]
    # Reshape the gene embedding to a single-row 2-D array for the predictor.
    embd = row.gene_emb.reshape(1, -1)
    # A predictor is built per row from the model class, gene and tissue.
    predictor = ad_risk.ADrisk(model_class=model_class, gene_id=gene_id, tissue_id=tissue_id)
    risk = predictor(embd)
    print(f"Gene: {gene_id}, Tissue: {tissue_name}, Risk with homozygous insilico edit: {risk[0]:.4f}")