# Run Enformer on variants around the transcription start site (TSS)

## Imports

In [1]:
%load_ext autoreload
%autoreload 2
from kipoi_enformer.veff.dataloader import VCFEnformerDL, get_tss_from_genome_annotation
import tensorflow as tf
from kipoi_enformer.veff.utils import Enformer
from pathlib import Path
from kipoi_enformer.logger import logger
import logging
import os
import pyarrow.parquet as pq
import numpy as np

2024-04-03 14:18:27.167303: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-03 14:18:27.167379: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-03 14:18:27.425760: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-04-03 14:18:27.743813: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Config

In [2]:
# Show the package's log messages in the notebook output.
logging.basicConfig()  # attach a default handler to the root logger
logger.setLevel(logging.DEBUG)  # let the kipoi_enformer logger emit DEBUG-level messages

In [3]:
# List the GPU devices TensorFlow can see; an empty list means no GPU was detected.
tf.config.list_physical_devices('GPU')

2024-04-03 14:18:37.862788: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-04-03 14:18:39.268555: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-04-03 14:18:39.268878: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [4]:
# Show the working directory: the relative paths used in this notebook are resolved against it.
os.getcwd()

'/data/nasif12/home_if12/tsi/projects/kipoi_enformer/notebooks'

In [5]:
# Example chr22 inputs (reference sequence, annotation, variants), given relative
# to the notebook's working directory.
base = Path("../example_files/chr22")
files = dict(
    fasta=base / "seq.chr22.fa",
    gtf=base / "annot.chr22.gtf",
    vcf=base / "promoter_variants.chr22.vcf",
)

## Run Enformer and save results

In [6]:
# Value passed to the dataloader as `size`; the results table is later asserted
# to contain exactly this many rows.
size = 100

In [7]:
def get_dataloader(_size=None, _seq_length=393_216, _shift=43, _tss_ext=500):
    """Build a ``VCFEnformerDL`` over the example chr22 files in ``files``.

    Args:
        _size: Forwarded as ``size``. Presumably caps the number of entries the
            dataloader yields (in this notebook ``_size=100`` yields 100 items);
            confirm against the dataloader's documentation.
        _seq_length: Forwarded as ``seq_length``; defaults to 393,216.
        _shift: Forwarded as ``shift``; defaults to 43.
        _tss_ext: Forwarded as both ``upstream_tss`` and ``downstream_tss``, so
            the same extension is used on each side of the TSS; defaults to 500.

    Returns:
        The constructed ``VCFEnformerDL`` instance.
    """
    # Use the class name that is actually imported at the top of the notebook;
    # the previous spelling (VCF_Enformer_DL) is not defined on a fresh kernel.
    return VCFEnformerDL(
        fasta_file=files['fasta'],
        gtf_file=files['gtf'],
        vcf_file=files['vcf'],
        downstream_tss=_tss_ext,
        upstream_tss=_tss_ext,
        shift=_shift,
        seq_length=_seq_length,
        size=_size
    )

In [8]:
# Count the 2-tuples yielded for a tiny sequence window.
sum(1 for _, _ in get_dataloader(_seq_length=21, _shift=1, _tss_ext=10, _size=size))

  return {k: v for k, v in df.groupby(grpby_key)}
[W::vcf_parse] Contig 'chr22' is not defined in the header. (Quick workaround: index the file with tabix.)
  return {k: v for k, v in df.groupby(grpby_key)}


100

In [None]:
# Predictions go to ../output/enformer_test; only the parent directory is created here.
output_dir = Path('../output') / 'enformer_test'
output_dir.parent.mkdir(exist_ok=True, parents=True)

In [10]:
# Number of dataloader entries handed to the model per batch.
batch_size = 2
# Dataloader with the default window settings, restricted via `size`.
dl = get_dataloader(_size=size)

enformer = Enformer()
# Run the model over the dataloader; results are written under `output_dir`
# (presumably as parquet, since they are read back with pyarrow.parquet later - confirm).
enformer.predict(dl, batch_size=batch_size, output_dir=output_dir)

  return {k: v for k, v in df.groupby(grpby_key)}
DEBUG:kipoi_enformer:Predicting on dataloader
[W::vcf_parse] Contig 'chr22' is not defined in the header. (Quick workaround: index the file with tabix.)
  return {k: v for k, v in df.groupby(grpby_key)}
DEBUG:kipoi_enformer:Processing batch 1
2024-04-03 13:53:39.478271: W tensorflow/core/kernels/gpu_utils.cc:54] Failed to allocate memory for convolution redzone checking; skipping this check. This is benign and only means that we won't check cudnn for out-of-bounds reads and writes. This message will only be printed once.
2024-04-03 13:53:39.484629: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8907
DEBUG:kipoi_enformer:Converting results to pyarrow
DEBUG:kipoi_enformer:Constructing pyarrow record batch
DEBUG:kipoi_enformer:Processing batch 2
DEBUG:kipoi_enformer:Converting results to pyarrow
DEBUG:kipoi_enformer:Constructing pyarrow record batch
DEBUG:kipoi_enformer:Processing batch 3
DEBUG:kipoi_en

## Read results file

In [10]:
# Read back only the metadata columns; the stored table has further columns
# (e.g. 'ref_0', used later in this notebook) that are not loaded here.
metadata_columns = [
    'enformer_start', 'enformer_end', 'landmark_pos', 'chr', 'strand',
    'gene_id', 'transcript_id', 'transcript_start', 'transcript_end',
    'variant_start', 'variant_end', 'ref', 'alt',
]
results_table = pq.read_table(output_dir, columns=metadata_columns)

In [11]:
# One row per requested dataloader entry, one column per metadata field that was read.
assert (results_table.num_rows, results_table.num_columns) == (size, 13)
results_table.shape

(100, 13)

In [13]:
# Inspect the column names and Arrow types of the loaded metadata columns.
results_table.schema

enformer_start: int64
enformer_end: int64
landmark_pos: int64
chr: string
strand: string
gene_id: string
transcript_id: string
transcript_start: int64
transcript_end: int64
variant_start: int64
variant_end: int64
ref: string
alt: string

In [21]:
# Look up the transcript id of the first row (displayed as a pyarrow scalar).
results_table['transcript_id'][0]

<pyarrow.StringScalar: 'ENST00000424770.1'>

In [23]:
# Re-read the results with all columns, keeping only the rows of a single transcript.
transcript_id = 'ENST00000424770.1'
filtered_tbl = pq.read_table(output_dir, filters=[("transcript_id", "=", transcript_id)])

In [24]:
# (rows, columns) of the filtered table; no column subset was requested for this read.
filtered_tbl.shape

(5, 19)

In [26]:
# Convert the 'ref_0' column to Python lists and stack them into one NumPy array.
X = np.array(filtered_tbl['ref_0'].to_pylist())

In [28]:
# The leading axis is whatever the filter returned; only the two trailing axes are pinned.
# NOTE(review): 896 x 5313 presumably matches Enformer's output bins x tracks - confirm.
expected_shape = (X.shape[0], 896, 5313)
assert X.shape == expected_shape
X.shape

(5, 896, 5313)