# Run Enformer on variants around the TSS

## Imports

In [1]:
%load_ext autoreload
%autoreload 2
from kipoi_enformer.veff.dataloader import VCF_Enformer_DL, get_tss_from_genome_annotation
import tensorflow as tf
from kipoi_enformer.veff.utils import Enformer
from pathlib import Path
from kipoi_enformer.logger import logger
import logging
import os
import pyarrow.parquet as pq
import numpy as np

2024-04-02 17:41:22.973279: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-02 17:41:22.973337: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-02 17:41:22.974295: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-04-02 17:41:22.980948: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Config

In [2]:
# Show kipoi_enformer log messages in the notebook: basicConfig() attaches a
# default handler to the root logger, and the package logger is set to DEBUG
# so that its debug messages are emitted.
logging.basicConfig()
logger.setLevel(logging.DEBUG)

In [3]:
# Check whether TensorFlow can see a GPU; an empty list means that no GPU is
# visible to TensorFlow.
tf.config.list_physical_devices('GPU')

2024-04-02 17:41:28.317164: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355


[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

2024-04-02 17:41:28.353150: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-04-02 17:41:28.353520: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355


In [4]:
# Show the working directory: the relative paths used below
# ("../example_files/chr22", "../output") are resolved against it.
os.getcwd()

'/data/nasif12/home_if12/tsi/projects/kipoi_enformer/notebooks'

In [5]:
# Example input files, all located in the same directory (given relative to
# the working directory).
base = Path("../example_files/chr22")
files = {
    key: base / filename
    for key, filename in (
        ('fasta', "seq.chr22.fa"),
        ('gtf', "annot.chr22.gtf"),
        ('vcf', "promoter_variants.chr22.vcf"),
    )
}

## Run Enformer and save results

In [6]:
# `size` is passed to the dataloader and is also the number of results
# expected back; predictions are requested in batches of `batch_size`.
batch_size = 6
size = 10

dl = VCF_Enformer_DL(
    # input files
    vcf_file=files['vcf'],
    gtf_file=files['gtf'],
    fasta_file=files['fasta'],
    # window around the TSS (presumably in bp -- confirm against the dataloader docs)
    upstream_tss=500,
    downstream_tss=500,
    # sequence settings; the saved columns are suffixed -43 / 0 / 43, which
    # appears to correspond to this shift -- confirm
    seq_length=393_216,
    shift=43,
    is_onehot=True,
    size=size,
)

enformer = Enformer()
results = enformer.predict(dl, batch_size=batch_size)

# Sanity check: exactly `size` results came back.
assert len(results) == size

  return {k: v for k, v in df.groupby(grpby_key)}
2024-04-02 17:41:30.548672: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-04-02 17:41:30.549028: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-04-02 17:41:30.549281: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/

In [7]:
# Write the predictions to a parquet file, creating the output directory
# (and any missing parents) first.
output_dir = Path('../output')
output_dir.mkdir(parents=True, exist_ok=True)
Enformer.to_parquet(results, output_dir / 'enformer_test.parquet')

DEBUG:kipoi_enformer:Converting results to pyarrow table
DEBUG:kipoi_enformer:Constructing pyarrow table
DEBUG:kipoi_enformer:Writing pyarrow table to ../output/enformer_test.parquet


## Read results file

In [8]:
# Read the parquet file written in the previous section back as a pyarrow table.
results_table = pq.read_table(output_dir / 'enformer_test.parquet')

In [9]:
# Expect `size` rows and 19 columns: 6 prediction columns (ref/alt for each of
# the suffixes -43, 0 and 43) plus 13 metadata columns, matching the
# `column_names` listing in the next cell.
assert results_table.shape == (size, 6 + 13)
results_table.shape

(10, 19)

In [10]:
# List the columns stored in the results table.
results_table.column_names

['ref_-43',
 'alt_-43',
 'ref_0',
 'alt_0',
 'ref_43',
 'alt_43',
 'enformer_start',
 'enformer_stop',
 'landmark_pos',
 'chr',
 'strand',
 'gene_id',
 'transcript_id',
 'transcript_start',
 'transcript_end',
 'variant_start',
 'variant_stop',
 'ref',
 'alt']

In [11]:
# Convert the 'ref_0' column (presumably the reference-allele predictions
# without shift -- confirm) into a NumPy array via nested Python lists.
X = np.array(results_table['ref_0'].to_pylist())

In [12]:
# Each of the `size` rows must hold an 896 x 5313 array.
assert X.shape == (size, 896, 5313)
X.shape

(10, 896, 5313)