**Set environment**

In [1]:
import numpy  as np
import pandas as pd
import os
import re

In [2]:
from alphagenome.data   import genome
from alphagenome.models import dna_client, variant_scorers



**Load model**

In [3]:
### API key: read from the local config_params module and exported to the
### process environment so it is never written into the notebook itself.
from config_params import ALPHAGENOME_API_KEY
os.environ.update({"ALPHAGENOME_API_KEY": ALPHAGENOME_API_KEY})

### Model client: created from the key that was just exported.
dna_model = dna_client.create(os.environ["ALPHAGENOME_API_KEY"])
print(dna_model)

<alphagenome.models.dna_client.DnaClient object at 0x7f917f8c9b90>


In [4]:
# Initialize an empty dictionary to serve as a variant effect prediction cache.
# NOTE(review): no cell visible in this notebook reads or writes this cache yet.
_prediction_cache = {}

In [5]:
# Organism selector from the AlphaGenome client (human).
# NOTE(review): not passed to any call in the visible cells — confirm whether it is needed.
organism = dna_client.Organism.HOMO_SAPIENS

## Define a genomic variant

[Essential commands — AlphaGenome](https://www.alphagenomedocs.com/colabs/essential_commands.html)
```
variant = genome.Variant(
    chromosome='chr3', position=10_000, reference_bases='A', alternate_bases='C'
)
```
> This variant changes the base A to a C at position 10_000 on chromosome 3. Note that the position attribute is 1-based to maintain compatibility with common public variant formats (see FAQ for more info.)

In [6]:
### Parse a "chrom:start-end" region string into its components.
txt_region = "chr11:5227021-5227208"

# Raw string literal: "\-" inside a normal string is an invalid escape sequence
# (SyntaxWarning on recent Python versions). The regex itself is unchanged.
txt_chrom_name, txt_chrom_start, txt_chrom_end = re.split(r"[:\-]", txt_region)

# Numeric coordinates and the span between them (end - start).
num_chrom_start = int(txt_chrom_start)
num_chrom_end   = int(txt_chrom_end)
num_length = num_chrom_end - num_chrom_start

In [7]:
# Variant definition: 0-based position plus reference / alternate alleles.
(
    num_variant_position_0base,
    txt_variant_reference_base,
    txt_variant_alternate_base,
) = (5227021, "G", "A")

# genome.Variant expects a 1-based position, so shift the 0-based coordinate by one.
num_variant_position_1base = 1 + num_variant_position_0base

In [8]:
# Build the variant object from the parsed chromosome name and the
# 1-based position / alleles defined in the previous cell.
variant = genome.Variant(
    chromosome=txt_chrom_name,
    position=num_variant_position_1base,
    reference_bases=txt_variant_reference_base,
    alternate_bases=txt_variant_alternate_base,
)
print(variant)

chr11:5227022:G>A


## Set genomic region based on the variant

[FAQ — AlphaGenome](https://www.alphagenomedocs.com/faqs.html)

Genomic regions are specified using the genome.Interval class, which is initialized with a chromosome, a start, and an end position.

> AlphaGenome classes such as genome.Interval use 0-based indexing, consistent with the underlying Python implementations.
> 
> This means a genome.Interval includes the base pair at the start position up to the base pair at the end-1 position.
For example, to specify the first base pair of chromosome 1, use genome.Interval('chr1', 0, 1). This interval has a width of 1, and contains only the base pair at the first position of chromosome 1.
> 
> To interpret interval overlaps, remember that 0-based indexing excludes the base pair at the end position itself, such that genome.Interval('chr1', 0, 1).overlaps(genome.Interval('chr1', 1, 2)) returns False.

In [9]:
# Display the sequence-length presets exposed by the client (preset name -> integer length).
dna_client.SUPPORTED_SEQUENCE_LENGTHS

{'SEQUENCE_LENGTH_2KB': 2048,
 'SEQUENCE_LENGTH_16KB': 16384,
 'SEQUENCE_LENGTH_100KB': 131072,
 'SEQUENCE_LENGTH_500KB': 524288,
 'SEQUENCE_LENGTH_1MB': 1048576}

In [10]:
### Choose how much sequence around the variant is sent to the model.
### Allowed presets: "2KB", "16KB", "100KB", "500KB", "1MB".
txt_sequence_length = '2KB'

# Translate the preset label into its numeric length via the client's lookup table.
num_sequence_length = dna_client.SUPPORTED_SEQUENCE_LENGTHS[f'SEQUENCE_LENGTH_{txt_sequence_length}']
print(num_sequence_length)

2048


In [11]:
# Inspect the interval covered by the variant's reference allele.
variant.reference_interval

Interval(chromosome='chr11', start=5227021, end=5227022, strand='.', name='')

```
### Manually set the interval
interval = genome.Interval(
    chromosome = txt_chrom_name, 
    start      = num_chrom_start, 
    end        = num_chrom_start + num_sequence_length)
print(interval)
```

In [12]:
# The input interval is derived from the variant (centered on it):
# the variant's reference interval is resized to the chosen sequence length.
interval = variant.reference_interval.resize(num_sequence_length)
print(interval)

chr11:5225998-5228046:.


## Perform prediction

**Predict score types**

In [13]:
# Display the mapping of output-type name -> recommended variant scorer.
variant_scorers.RECOMMENDED_VARIANT_SCORERS

immutabledict({'ATAC': CenterMaskScorer(requested_output=ATAC, width=501, aggregation_type=DIFF_LOG2_SUM), 'CONTACT_MAPS': ContactMapScorer(), 'DNASE': CenterMaskScorer(requested_output=DNASE, width=501, aggregation_type=DIFF_LOG2_SUM), 'CHIP_TF': CenterMaskScorer(requested_output=CHIP_TF, width=501, aggregation_type=DIFF_LOG2_SUM), 'CHIP_HISTONE': CenterMaskScorer(requested_output=CHIP_HISTONE, width=2001, aggregation_type=DIFF_LOG2_SUM), 'CAGE': CenterMaskScorer(requested_output=CAGE, width=501, aggregation_type=DIFF_LOG2_SUM), 'PROCAP': CenterMaskScorer(requested_output=PROCAP, width=501, aggregation_type=DIFF_LOG2_SUM), 'RNA_SEQ': GeneMaskLFCScorer(requested_output=RNA_SEQ), 'RNA_SEQ_ACTIVE': GeneMaskActiveScorer(requested_output=RNA_SEQ), 'SPLICE_SITES': GeneMaskSplicingScorer(requested_output=SPLICE_SITES, width=None), 'SPLICE_SITE_USAGE': GeneMaskSplicingScorer(requested_output=SPLICE_SITE_USAGE, width=None), 'SPLICE_JUNCTIONS': SpliceJunctionScorer(), 'POLYADENYLATION': Polyade

In [14]:
# Look up the recommended scorer for the ATAC output type.
variant_scorers.RECOMMENDED_VARIANT_SCORERS["ATAC"]

CenterMaskScorer(requested_output=ATAC, width=501, aggregation_type=DIFF_LOG2_SUM)

In [15]:
# Look up the recommended scorer for the RNA_SEQ output type (used for scoring below).
variant_scorers.RECOMMENDED_VARIANT_SCORERS["RNA_SEQ"]

GeneMaskLFCScorer(requested_output=RNA_SEQ)

**Perform prediction**

In [16]:
### Score the variant inside the chosen interval.
# Only the recommended RNA_SEQ scorer is requested here. To run every
# recommended scorer instead, pass
# list(variant_scorers.RECOMMENDED_VARIANT_SCORERS.values()).
variant_scores = dna_model.score_variant(
    interval=interval,
    variant=variant,
    variant_scorers=[variant_scorers.RECOMMENDED_VARIANT_SCORERS["RNA_SEQ"]],
)

# Convert the raw scorer output into a tidy table and preview the first rows.
dat_scores = variant_scorers.tidy_scores(variant_scores)
dat_scores.head()

Unnamed: 0,variant_id,scored_interval,gene_id,gene_name,gene_type,gene_strand,junction_Start,junction_End,output_type,variant_scorer,track_name,track_strand,Assay title,ontology_curie,biosample_name,biosample_type,gtex_tissue,raw_score,quantile_score
0,chr11:5227022:G>A,chr11:5225998-5228046:.,ENSG00000244734,HBB,protein_coding,-,,,RNA_SEQ,GeneMaskLFCScorer(requested_output=RNA_SEQ),CL:0000047 polyA plus RNA-seq,-,polyA plus RNA-seq,CL:0000047,neuronal stem cell,in_vitro_differentiated_cells,,-0.00336,-0.918669
1,chr11:5227022:G>A,chr11:5225998-5228046:.,ENSG00000244734,HBB,protein_coding,-,,,RNA_SEQ,GeneMaskLFCScorer(requested_output=RNA_SEQ),CL:0000062 total RNA-seq,-,total RNA-seq,CL:0000062,osteoblast,primary_cell,,-0.053278,-0.999837
2,chr11:5227022:G>A,chr11:5225998-5228046:.,ENSG00000244734,HBB,protein_coding,-,,,RNA_SEQ,GeneMaskLFCScorer(requested_output=RNA_SEQ),CL:0000084 polyA plus RNA-seq,-,polyA plus RNA-seq,CL:0000084,T-cell,primary_cell,,-0.095105,-0.999899
3,chr11:5227022:G>A,chr11:5225998-5228046:.,ENSG00000244734,HBB,protein_coding,-,,,RNA_SEQ,GeneMaskLFCScorer(requested_output=RNA_SEQ),CL:0000084 total RNA-seq,-,total RNA-seq,CL:0000084,T-cell,primary_cell,,-0.10872,-0.999918
4,chr11:5227022:G>A,chr11:5225998-5228046:.,ENSG00000244734,HBB,protein_coding,-,,,RNA_SEQ,GeneMaskLFCScorer(requested_output=RNA_SEQ),CL:0000115 total RNA-seq,-,total RNA-seq,CL:0000115,endothelial cell,in_vitro_differentiated_cells,,-0.00468,-0.976447


In [17]:
# Keep only the score rows whose biosample is the K562 cell line.
dat = dat_scores[dat_scores["biosample_name"].eq("K562")]
dat

Unnamed: 0,variant_id,scored_interval,gene_id,gene_name,gene_type,gene_strand,junction_Start,junction_End,output_type,variant_scorer,track_name,track_strand,Assay title,ontology_curie,biosample_name,biosample_type,gtex_tissue,raw_score,quantile_score
119,chr11:5227022:G>A,chr11:5225998-5228046:.,ENSG00000244734,HBB,protein_coding,-,,,RNA_SEQ,GeneMaskLFCScorer(requested_output=RNA_SEQ),EFO:0002067 polyA plus RNA-seq,-,polyA plus RNA-seq,EFO:0002067,K562,cell_line,,-0.027582,-0.99916
120,chr11:5227022:G>A,chr11:5225998-5228046:.,ENSG00000244734,HBB,protein_coding,-,,,RNA_SEQ,GeneMaskLFCScorer(requested_output=RNA_SEQ),EFO:0002067 total RNA-seq,-,total RNA-seq,EFO:0002067,K562,cell_line,,-0.005818,-0.974781
300,chr11:5227022:G>A,chr11:5225998-5228046:.,ENSG00000244734,HBB,protein_coding,-,,,RNA_SEQ,GeneMaskLFCScorer(requested_output=RNA_SEQ),EFO:0002067 polyA plus RNA-seq,.,polyA plus RNA-seq,EFO:0002067,K562,cell_line,,-0.010384,-0.992335


In [22]:
# K562 rows of the score table, annotated with the query that produced them
# (region, 0-based variant position, alleles, sequence length, scorer label).
dat = (
    dat_scores
    .loc[dat_scores["biosample_name"] == "K562"]
    .assign(
        Region=txt_region,
        Position=num_variant_position_0base,
        Ref=txt_variant_reference_base,
        Alt=txt_variant_alternate_base,
        Length=num_sequence_length,
        Scorer="RNA_SEQ",
    )
)
dat

Unnamed: 0,variant_id,scored_interval,gene_id,gene_name,gene_type,gene_strand,junction_Start,junction_End,output_type,variant_scorer,...,biosample_type,gtex_tissue,raw_score,quantile_score,Region,Position,Ref,Alt,Length,Scorer
119,chr11:5227022:G>A,chr11:5225998-5228046:.,ENSG00000244734,HBB,protein_coding,-,,,RNA_SEQ,GeneMaskLFCScorer(requested_output=RNA_SEQ),...,cell_line,,-0.027582,-0.99916,chr11:5227021-5227208,5227021,G,A,2048,RNA_SEQ
120,chr11:5227022:G>A,chr11:5225998-5228046:.,ENSG00000244734,HBB,protein_coding,-,,,RNA_SEQ,GeneMaskLFCScorer(requested_output=RNA_SEQ),...,cell_line,,-0.005818,-0.974781,chr11:5227021-5227208,5227021,G,A,2048,RNA_SEQ
300,chr11:5227022:G>A,chr11:5225998-5228046:.,ENSG00000244734,HBB,protein_coding,-,,,RNA_SEQ,GeneMaskLFCScorer(requested_output=RNA_SEQ),...,cell_line,,-0.010384,-0.992335,chr11:5227021-5227208,5227021,G,A,2048,RNA_SEQ


In [23]:
# Compact view of the K562 rows: identifiers, scorer, and the two score columns.
dat = dat_scores.loc[
    dat_scores["biosample_name"].eq("K562"),
    ["variant_id", "scored_interval", "output_type", "variant_scorer",
     "biosample_name", "raw_score", "quantile_score"],
]
dat

Unnamed: 0,variant_id,scored_interval,output_type,variant_scorer,biosample_name,raw_score,quantile_score
119,chr11:5227022:G>A,chr11:5225998-5228046:.,RNA_SEQ,GeneMaskLFCScorer(requested_output=RNA_SEQ),K562,-0.027582,-0.99916
120,chr11:5227022:G>A,chr11:5225998-5228046:.,RNA_SEQ,GeneMaskLFCScorer(requested_output=RNA_SEQ),K562,-0.005818,-0.974781
300,chr11:5227022:G>A,chr11:5225998-5228046:.,RNA_SEQ,GeneMaskLFCScorer(requested_output=RNA_SEQ),K562,-0.010384,-0.992335
