## Reproduce Figure 5b

In [None]:
import pandas as pd
import ipynbname
# Resolve the directory two levels above this notebook file (the parent of the
# folder that holds it); later cells join artifact paths onto REPO_PATH.
notebook_dir = ipynbname.path().parent
REPO_PATH = notebook_dir.parent

## Launch predictions using VariantFormer

In [None]:
# Essential imports
import sys
from pathlib import Path
import warnings

warnings.filterwarnings("ignore")

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

from processors.variantprocessor import VariantProcessor

### Load data

In [None]:
# Read the variant table from the repository's artifact folder.
variants_path = Path(REPO_PATH, '_artifacts/b23c9b69.pq')
variants = pd.read_parquet(variants_path)
# Display the number of rows per tissue label as stored in the file.
variants['tissue'].value_counts()

### Predict for first 10 variants

In [None]:
# Tissue map to align tissue names between the eQTL catalog data and VF.
vf_tissue_map = {
    'skin': 'skin - sun exposed (lower leg)',
    'blood': 'whole blood',
    'adipose': 'adipose - subcutaneous',
    'brain - frontal cortex (ba9)': 'brain - frontal cortex (ba9)',
    'brain - putamen (basal ganglia)': 'brain - putamen (basal ganglia)',
    'brain - substantia nigra': 'brain - substantia nigra',
}
# Look each label up with a fallback to itself. A plain `.map(vf_tissue_map)`
# turns every label that is not a key into NaN, so running this cell a second
# time (when the column already holds the VF names) would blank out the
# skin/blood/adipose rows. With the fallback the cell is safe to re-run.
variants['tissue'] = variants['tissue'].map(
    lambda tissue_name: vf_tissue_map.get(tissue_name, tissue_name)
)

In [None]:

# Take an explicit copy: `head(10)` hands back a slice of `variants`, and adding
# columns to that slice is what triggers pandas' SettingWithCopyWarning
# (silenced in this notebook by the global warnings filter).
first10_variants = variants.head(10).copy()

# variant_id fields are '_'-separated; the first four are used as chromosome,
# position, reference allele and alternate allele. Split once and reuse.
variant_id_fields = first10_variants['variant_id'].str.split('_', expand=True)
first10_variants['chr'] = variant_id_fields[0]
first10_variants['pos'] = variant_id_fields[1].astype(int)
first10_variants['ref'] = variant_id_fields[2]
first10_variants['alt'] = variant_id_fields[3]


In [None]:
# Row count per tissue label in the current `variants` frame.
tissue_counts = variants['tissue'].value_counts()
tissue_counts

In [None]:
# Initialize VariantFormer.
# The status messages below previously contained mojibake (UTF-8 emoji bytes
# decoded as cp1252); they are restored to the intended emoji.
print("🚀 Initializing VariantFormer Variant Processor...")
# model_class can be 'v4_ag' or 'v4_pcg'. The AG model is the all-genes model
# trained on both protein-coding and non-coding genes.
model_class = 'v4_ag'
# model_class = 'v4_pcg' # Uncomment to use the PCG model
vep = VariantProcessor(model_class=model_class)

# Run variant predictions.
print("🔬 Running VariantFormer variant analysis...")
output_dir = "/tmp/vep_eqtl_output"
# Predict expression effects for the 10-variant subset prepared above.
raw_predictions = vep.predict(var_df=first10_variants, output_dir=output_dir)

# Each step announces itself once; the formatting message no longer also
# claims the eQTL step that is reported separately below.
print("Formatting VariantFormer scores...")
formatted_scores = vep.format_scores(raw_predictions)
print("Computing eQTL statistics...")
final_results = vep.eqtl_scores(formatted_scores)

In [None]:
# Preview the first five rows of the VariantFormer eQTL results.
final_results.head(5)

## Launch predictions using AlphaGenome

### Load data

In [None]:
# Reload the variant table so the tissue column holds the labels stored in the file.
variants_path = Path(REPO_PATH, '_artifacts/b23c9b69.pq')
variants = pd.read_parquet(variants_path)

In [None]:
# Tissue map to align the tissue names in the variant table with the labels
# that are compared against AlphaGenome's `gtex_tissue` column further down.
alphagenome_tissue_map = {
    'skin': 'Skin_Sun_Exposed_Lower_leg',
    'blood': 'Whole_Blood',
    'adipose': 'Adipose_Subcutaneous',
    'brain - frontal cortex (ba9)': 'Brain_Frontal_Cortex_BA9',
    'brain - putamen (basal ganglia)': 'Brain_Putamen_basal_ganglia',
    'brain - substantia nigra': 'Brain_Substantia_nigra',
}
# Look each label up with a fallback to itself. A plain
# `.map(alphagenome_tissue_map)` turns every label that is not a key into NaN,
# so running this cell twice would blank out the whole tissue column. With the
# fallback the cell is safe to re-run.
variants['tissue'] = variants['tissue'].map(
    lambda tissue_name: alphagenome_tissue_map.get(tissue_name, tissue_name)
)

In [None]:
# Take an explicit copy: `head(10)` hands back a slice of `variants`, and adding
# columns to that slice is what triggers pandas' SettingWithCopyWarning
# (silenced in this notebook by the global warnings filter).
first10_variants = variants.head(10).copy()

# variant_id fields are '_'-separated; the first four are used as chromosome,
# position, reference allele and alternate allele. Split once and reuse.
variant_id_fields = first10_variants['variant_id'].str.split('_', expand=True)
first10_variants['CHROM'] = variant_id_fields[0]
first10_variants['POS'] = variant_id_fields[1].astype(int)
first10_variants['REF'] = variant_id_fields[2]
first10_variants['ALT'] = variant_id_fields[3]
# Keep only the part of gene_id before the first '.' (presumably dropping a
# version suffix so the id matches AlphaGenome's gene_id column; TODO confirm).
first10_variants['gene_id'] = first10_variants['gene_id'].str.split('.').str[0]

In [None]:
# Install the alphagenome package via the `uv` command-line tool (must be on
# PATH). No version is pinned, so the installed release can differ between runs.
!uv pip install alphagenome
from io import StringIO
from alphagenome import colab_utils
from alphagenome.data import genome
from alphagenome.models import dna_client, variant_scorers
import numpy as np

In [None]:
# Load the model.
# Read the API key from the ALPHA_GENOME_API_KEY environment variable so a real
# key never has to be typed into (and committed with) the notebook. The original
# placeholder is kept as the fallback, so behaviour is unchanged when the
# variable is not set.
API_KEY = os.environ.get("ALPHA_GENOME_API_KEY", "your_api_key_here")
dna_model = dna_client.create(API_KEY)

In [None]:
# AlphaGenome scoring configuration.
score_rna_seq = True
download_predictions = False

# Organism: translate the readable label into the client's Organism member.
organism_map = {
    'human': dna_client.Organism.HOMO_SAPIENS,
    'mouse': dna_client.Organism.MUS_MUSCULUS,
}
organism_label = 'human'
organism = organism_map[organism_label]

# Length of sequence around each variant to predict, looked up by name in the
# client's table of supported sequence lengths.
sequence_length_label = '1MB'
sequence_length = dna_client.SUPPORTED_SEQUENCE_LENGTHS[
    'SEQUENCE_LENGTH_' + sequence_length_label
]

# Scorers: keep each recommended scorer whose lower-cased key is switched on
# in `scorer_selections`.
scorer_selections = {'rna_seq': score_rna_seq}
all_scorers = variant_scorers.RECOMMENDED_VARIANT_SCORERS
selected_scorers = []
for scorer_name in all_scorers:
    if scorer_selections.get(scorer_name.lower(), False):
        selected_scorers.append(all_scorers[scorer_name])


In [None]:
# Score every selected variant with AlphaGenome and collect one row per variant.
results = []
for _, vcf_row in first10_variants.iterrows():
    # Build the variant object from the parsed variant_id fields.
    variant = genome.Variant(
        chromosome=str(vcf_row.CHROM),
        position=int(vcf_row.POS),
        reference_bases=vcf_row.REF,
        alternate_bases=vcf_row.ALT,
        name=vcf_row.variant_id,
    )
    # Resize the variant's reference interval to the configured sequence length.
    interval = variant.reference_interval.resize(sequence_length)

    variant_scores = dna_model.score_variant(
        interval=interval,
        variant=variant,
        variant_scorers=selected_scorers,
        organism=organism,
    )
    df_scores = variant_scorers.tidy_scores([variant_scores])

    # Keep the raw scores reported for this row's gene in this row's tissue.
    same_gene = df_scores['gene_id'] == vcf_row.gene_id
    same_tissue = df_scores['gtex_tissue'] == vcf_row.tissue
    matched_scores = df_scores.loc[same_gene & same_tissue, 'raw_score']

    if matched_scores.empty:
        # No score for this gene/tissue pair.
        alphagenome_score = np.nan
    else:
        # NOTE(review): only the first matching value is used; `.mean()` is
        # applied to that single element, not across all matches. Confirm
        # that this is the intended aggregation.
        alphagenome_score = matched_scores.values[0].mean()

    vcf_row['alphagenome_score_new'] = alphagenome_score
    results.append(vcf_row)

In [None]:
# Assemble the per-variant rows collected in `results` into a single DataFrame.
df = pd.DataFrame(data=results)

**Note: freshly computed AlphaGenome scores may deviate slightly from the precalculated ones, because we believe the underlying model is still evolving.**

## Visualize all the precalculated scores

In [None]:
# Reload the full variant table, which the cells below read the `slope`,
# `VF-agg-log2fc-weighted_ag` and `alphagenome_score` columns from.
variants_path = Path(REPO_PATH, '_artifacts/b23c9b69.pq')
variants = pd.read_parquet(variants_path)

In [None]:
# Preview the first five rows of the variant table.
variants.head(5)

### VF results

In [None]:
# Tissue-specific Spearman correlation between the `slope` column and the
# VariantFormer score column.
def _vf_slope_spearman(tissue_rows):
    """Return the Spearman correlation of slope vs. VF score for one tissue group."""
    vf_scores = tissue_rows['VF-agg-log2fc-weighted_ag']
    return tissue_rows['slope'].corr(vf_scores, method='spearman')


vf_by_tissue = variants.groupby('tissue').apply(_vf_slope_spearman)
vf_spearman = vf_by_tissue.reset_index()
vf_spearman.columns = ['tissue', 'spearman_correlation']
vf_spearman


### AlphaGenome results

In [None]:
# Tissue-specific Spearman correlation between the `slope` column and the
# precalculated `alphagenome_score` column.
def _alphagenome_slope_spearman(tissue_rows):
    """Return the Spearman correlation of slope vs. AlphaGenome score for one tissue group."""
    alphagenome_scores = tissue_rows['alphagenome_score']
    return tissue_rows['slope'].corr(alphagenome_scores, method='spearman')


alphagenome_by_tissue = variants.groupby('tissue').apply(_alphagenome_slope_spearman)
alphagenome_spearman = alphagenome_by_tissue.reset_index()
alphagenome_spearman.columns = ['tissue', 'spearman_correlation']
alphagenome_spearman