**Set environment**

In [12]:
import numpy  as np
import pandas as pd
import os
import re

In [22]:
from alphagenome.data   import genome
from alphagenome.models import dna_client, variant_scorers

**Load model**

In [5]:
### get api key
from config_params import ALPHAGENOME_API_KEY
os.environ["ALPHAGENOME_API_KEY"] = ALPHAGENOME_API_KEY

### load model
dna_model = dna_client.create(os.environ["ALPHAGENOME_API_KEY"])
print(dna_model)

<alphagenome.models.dna_client.DnaClient object at 0x7f9bf0a00d90>


In [56]:
# Initialize an empty dictionary to serve as a variant effect prediction cache.
_prediction_cache = {}

## Define interval from a variant

In [29]:
organism = dna_client.Organism.HOMO_SAPIENS

In [43]:
txt_region = "chr11:5227021-5227208"
txt_chrom_name, txt_chrom_start, txt_chrom_end = re.split("[:\-]", txt_region)

num_chrom_start = int(txt_chrom_start)
num_chrom_end   = int(txt_chrom_end)
num_length = num_chrom_end - num_chrom_start

In [48]:
num_variant_position_0base = 5227021
txt_variant_reference_base = "G"
txt_variant_alternate_base = "A"

num_variant_position_1base = num_variant_position_0base + 1

In [49]:
variant = genome.Variant(
    chromosome = txt_chrom_name,
    position   = num_variant_position_1base,
    reference_bases = txt_variant_reference_base,
    alternate_bases = txt_variant_alternate_base,
)
print(variant)

chr11:5227022:G>A


In [50]:
dna_client.SUPPORTED_SEQUENCE_LENGTHS

{'SEQUENCE_LENGTH_2KB': 2048,
 'SEQUENCE_LENGTH_16KB': 16384,
 'SEQUENCE_LENGTH_100KB': 131072,
 'SEQUENCE_LENGTH_500KB': 524288,
 'SEQUENCE_LENGTH_1MB': 1048576}

In [51]:
### Specify length of sequence around variant to predict:
txt_sequence_length = '2KB'  # ["2KB", "16KB", "100KB", "500KB", "1MB"]
num_sequence_length = dna_client.SUPPORTED_SEQUENCE_LENGTHS[
    f'SEQUENCE_LENGTH_{txt_sequence_length}'
]
print(num_sequence_length)

2048


In [52]:
variant.reference_interval

Interval(chromosome='chr11', start=5227021, end=5227022, strand='.', name='')

In [100]:
interval = genome.Interval(
    chromosome = txt_chrom_name, 
    start      = num_chrom_start, 
    end        = num_chrom_start + num_sequence_length)
print(interval)

chr11:5227021-5229069:.


In [54]:
# The input interval is derived from the variant (centered on it).
interval = variant.reference_interval.resize(sequence_length)
print(interval)

chr11:5225998-5228046:.


## Genomic region

[FAQ — AlphaGenome](https://www.alphagenomedocs.com/faqs.html)

Using the genome.Interval class, which is initialized with a chromosome, a start, and an end position.

> AlphaGenome classes such as genome.Interval uses 0-based indexing, consistent with the underlying Python implementations.
> 
> This means an genome.Interval includes the base pair at the start position up to the base pair at the end-1 position.
For example, to specify the first base pair of chromosome 1, use genome.Interval('chr1', 0, 1). This interval has a width of 1, and contains only the base pair at the first position of chromosome 1.
> 
> To interpret interval overlaps, remember that 0-based indexing excludes the base pair at the end position itself, such that genome.Interval('chr1', 0, 1).overlaps(genome.Interval('chr1', 1, 2)) returns False.

## Genomic variant

[Essential commands — AlphaGenome](https://www.alphagenomedocs.com/colabs/essential_commands.html?utm_source=chatgpt.com)

> This variant changes the base A to a C at position 10_000 on chromosome 3. Note that the position attribute is 1-based to maintain compatibility with common public variant formats (see FAQ for more info.)

In [62]:
variant_scorers.RECOMMENDED_VARIANT_SCORERS["ATAC"]

CenterMaskScorer(requested_output=ATAC, width=501, aggregation_type=DIFF_LOG2_SUM)

In [101]:
###
variant_scores = dna_model.score_variant(
    interval = interval,
    variant  = variant,
    variant_scorers = [variant_scorers.RECOMMENDED_VARIANT_SCORERS["ATAC"]]
    #variant_scorers = list(variant_scorers.RECOMMENDED_VARIANT_SCORERS.values()),
)

dat_scores = variant_scorers.tidy_scores(variant_scores)
dat_scores.head()

Unnamed: 0,variant_id,scored_interval,gene_id,gene_name,gene_type,gene_strand,junction_Start,junction_End,output_type,variant_scorer,track_name,track_strand,Assay title,ontology_curie,biosample_name,biosample_type,raw_score,quantile_score
0,chr11:5227022:G>A,chr11:5227021-5229069:.,,,,,,,ATAC,"CenterMaskScorer(requested_output=ATAC, width=...",CL:0000084 ATAC-seq,.,ATAC-seq,CL:0000084,T-cell,primary_cell,-0.001603,-0.137554
1,chr11:5227022:G>A,chr11:5227021-5229069:.,,,,,,,ATAC,"CenterMaskScorer(requested_output=ATAC, width=...",CL:0000100 ATAC-seq,.,ATAC-seq,CL:0000100,motor neuron,in_vitro_differentiated_cells,0.04325,0.674555
2,chr11:5227022:G>A,chr11:5227021-5229069:.,,,,,,,ATAC,"CenterMaskScorer(requested_output=ATAC, width=...",CL:0000236 ATAC-seq,.,ATAC-seq,CL:0000236,B cell,primary_cell,-0.000247,-0.080577
3,chr11:5227022:G>A,chr11:5227021-5229069:.,,,,,,,ATAC,"CenterMaskScorer(requested_output=ATAC, width=...",CL:0000623 ATAC-seq,.,ATAC-seq,CL:0000623,natural killer cell,primary_cell,-0.000209,-0.080577
4,chr11:5227022:G>A,chr11:5227021-5229069:.,,,,,,,ATAC,"CenterMaskScorer(requested_output=ATAC, width=...",CL:0000624 ATAC-seq,.,ATAC-seq,CL:0000624,"CD4-positive, alpha-beta T cell",primary_cell,0.005339,0.160114


In [102]:
dat = dat_scores.loc[
    dat_scores["biosample_name"] == "K562",
    [
        "variant_id",
        "scored_interval",
        "output_type",
        "variant_scorer",
        "biosample_name",
        "raw_score",
        "quantile_score",
    ]
]
dat

Unnamed: 0,variant_id,scored_interval,output_type,variant_scorer,biosample_name,raw_score,quantile_score
60,chr11:5227022:G>A,chr11:5227021-5229069:.,ATAC,"CenterMaskScorer(requested_output=ATAC, width=...",K562,0.033792,0.721776


In [58]:
def _predict_variant_cached(
    interval, variant, organism, requested_outputs, ontology_terms
):
    """Cache wrapper of dna_model.predict_variant."""
    # Create a unique key from the function arguments.
    cache_key = (
        str(interval),
        str(variant),
        str(organism),
        tuple(requested_outputs),
        tuple(ontology_terms),
    )
    
    # Check if the result is already in the cache.
    if cache_key in _prediction_cache:
        return _prediction_cache[cache_key]
    
    # If not, compute the prediction and store it in the cache.
    result = dna_model.predict_variant(
        interval=interval,
        variant=variant,
        organism=organism,
        requested_outputs=requested_outputs,
        ontology_terms=ontology_terms,
    )
    _prediction_cache[cache_key] = result
    
    return result

```
EFO:0001187 → HeLa cell line
EFO:0002067 → K562 cell line
EFO:0002784 → GM12878 lymphoblastoid cell line
```

In [103]:
list(dna_client.OutputType)

[ATAC,
 CAGE,
 DNASE,
 RNA_SEQ,
 CHIP_HISTONE,
 CHIP_TF,
 SPLICE_SITES,
 SPLICE_SITE_USAGE,
 SPLICE_JUNCTIONS,
 CONTACT_MAPS,
 PROCAP]

In [104]:
[*dna_client.OutputType][0]

ATAC

In [105]:
output = _predict_variant_cached(
    interval=interval,
    variant=variant,
    organism=organism,
    requested_outputs=[[*dna_client.OutputType][0]],
    ontology_terms=["EFO:0002067"],
)

In [106]:
scorer = variant_scorers.RECOMMENDED_VARIANT_SCORERS["ATAC"]
mask_width = scorer.width
print(mask_width)

501


In [112]:
scorer.aggregation_type

DIFF_LOG2_SUM

In [107]:
ref = output.reference
alt = output.alternate

In [108]:
ref.atac.values.shape

(2048, 1)

In [109]:
track_idx = 0
ref_signal = ref.atac.values[:, track_idx]
alt_signal = alt.atac.values[:, track_idx]

ref_signal

array([0.00854492, 0.0390625 , 0.00335693, ..., 0.01928711, 0.0168457 ,
       0.01794434], dtype=float32)

In [111]:
scorer = variant_scorers.RECOMMENDED_VARIANT_SCORERS["ATAC"]
mask_width = scorer.width   # bp

# Get center index in the array
center_idx = len(ref_signal) // 2
half_mask = mask_width // 2
mask_slice = slice(center_idx - half_mask, center_idx + half_mask)

# Now calculate in the masked region
raw_score = alt_signal[mask_slice].sum() - ref_signal[mask_slice].sum()
raw_score

0.17733383

In [114]:
eps = 1e-6  # small pseudocount; AlphaGenome uses a tiny epsilon internally
S_ref = float(ref_signal[mask_slice].sum())
S_alt = float(alt_signal[mask_slice].sum())
raw_score_manual = np.log2(S_alt + eps) - np.log2(S_ref + eps)
raw_score_manual

0.02855767090331529