In [None]:
import tensorflow as tf

In [None]:
# Print the installed TensorFlow version.
# Make sure TensorFlow is below 2.16! Otherwise the model loaded later in this
# notebook won't be able to load.
print(tf.__version__)

In [None]:
# GPU info: run the `nvidia-smi` shell command (IPython `!` syntax) and show
# its output in the notebook.
!nvidia-smi

In [None]:
# Assign GPU to use:
# Only the device with this ID is exposed to CUDA (and therefore to TensorFlow)
# through the CUDA_VISIBLE_DEVICES environment variable; the value must be a
# string.
GPU_id = '7'
import os
os.environ["CUDA_VISIBLE_DEVICES"] = GPU_id

# check:
# Uses the stable `tf.config.list_physical_devices` API rather than the older
# `experimental` alias, matching the check done in the next cell.
import tensorflow as tf
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

In [None]:
# check:
# Open a TF1-compatibility session, then report whether TensorFlow sees at
# least one physical GPU.
sess = tf.compat.v1.Session()
gpu_devices = tf.config.list_physical_devices('GPU')
print("GPU working" if gpu_devices else "GPU not working")

In [None]:
# check:
# Print the name and the device type of every GPU found in `gpu_devices`.
for device in gpu_devices:
    for label, value in (("device name:", device.name),
                         ("device type:", device.device_type)):
        print(label, value)

In [None]:
# Input SNP seq. Make sure to add 44 extra bases upstream and downstream to the SNP:
# (the SNP is therefore the central base, at 0-based index 44).
ref = 'GACACGAACCTCAGTTAGCCTACATCCTACCAGAGGTCTGTGCCCCCGGTGGTGAGAAGTGCGGATTTCGTATTTGCAGCTCGTCAGTA'
alt = 'GACACGAACCTCAGTTAGCCTACATCCTACCAGAGGTCTGTGCCGCCGGTGGTGAGAAGTGCGGATTTCGTATTTGCAGCTCGTCAGTA'
# Both lengths should be 89 nt (44 + 1 + 44). Display both so that the alt
# sequence is checked as well, not only the ref:
len(ref), len(alt)

In [None]:
# function to split a string into k-mers:
def kmerize(string, k):
    """Return every overlapping substring of length ``k`` found in ``string``.

    The window slides one character at a time, so a string of length ``n``
    yields ``n - k + 1`` k-mers, in left-to-right order.

    Args:
        string: Sequence to split (e.g. a DNA string).
        k: Window length; must be at least 1.

    Returns:
        A list of k-mers. The list is empty when ``string`` is shorter than
        ``k``.

    Raises:
        ValueError: If ``k`` is smaller than 1. Without this check the slicing
            below would silently return empty or ragged substrings.
    """
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k!r}")
    return [string[i:i+k] for i in range(len(string)-k+1)]

# Slide a 45-nt window over the ref and alt sequences (in that order) and
# display the alt windows.
ref_N45s, alt_N45s = (kmerize(allele, 45) for allele in (ref, alt))
alt_N45s

In [None]:
# Convert DNA to array:

# Function to convert a DNA sequence to vector:
# Each token's integer code is its position in `vocab` ('pad' is 0, 'N' is 1, ...).
vocab = ['pad','N','A','T','C','G']
char2idx = {u:i for i, u in enumerate(vocab)}
def vectorize_dna_seq(dna_seq):
    """Convert a DNA sequence into a list of integer codes.

    Every element of ``dna_seq`` is looked up in ``char2idx``. Elements that
    are not found as-is are retried in upper case, so lower-case bases
    ('a', 't', 'c', 'g', 'n') get the same code as their upper-case form.

    Args:
        dna_seq: A DNA string (or any iterable of single tokens).

    Returns:
        A list of ints, one per element of ``dna_seq``.

    Raises:
        KeyError: If an element is not in the vocabulary in either case.
    """
    vectorized_dna_seq = [
        # Exact match first so every previously valid token keeps its code.
        char2idx[char] if char in char2idx else char2idx[char.upper()]
        for char in dna_seq
    ]
    return vectorized_dna_seq

# Function to convert a list of DNA into x array for ANN inputs:
from tensorflow.keras.preprocessing.sequence import pad_sequences
def prepare_x(dna_list, x_lenth):
    """Encode a list of DNA sequences and pad them to a common length.

    Args:
        dna_list: Iterable of DNA sequences; each one is encoded with
            ``vectorize_dna_seq``.
        x_lenth: Value passed to ``pad_sequences`` as ``maxlen``.

    Returns:
        The result of ``pad_sequences`` called with ``padding='post'``
        (padding is appended after each encoded sequence).
    """
    encoded = [vectorize_dna_seq(seq) for seq in dna_list]
    return pad_sequences(encoded, maxlen=x_lenth, padding='post')

# Encode the ref and alt 45-mer lists (in that order) with a padded length of
# 46, then display the alt array.
ref_N45s_array, alt_N45s_array = (
    prepare_x(kmers, 46) for kmers in (ref_N45s, alt_N45s)
)
alt_N45s_array

In [None]:
# Predict:
# `load_model` is imported from `tensorflow.keras`, the same namespace the
# `pad_sequences` import uses, so the Keras bundled with the installed
# TensorFlow is the one that loads the model (not a separately installed
# `keras` package of a possibly different version).
from tensorflow.keras.models import load_model
# Path to the saved HDF5 model file:
model = '/rd4/users/liangn/L5-220528_em5-LSTM64x32x0.5-64x0.5-rep4.hdf5'
Model = load_model(model)
# Predict every 45-mer window of the ref and of the alt sequence:
ref_N45s_predicts = Model.predict(ref_N45s_array)
alt_N45s_predicts = Model.predict(alt_N45s_array)
alt_N45s_predicts

In [None]:
# Calculate Delta log2(Cyt/DNA):
# Element-wise difference (alt minus ref) of column 1 of the two prediction
# arrays, i.e. one delta per 45-mer window.
# NOTE(review): column 1 is presumably the model's log2(Cyt/DNA) output —
# confirm against the model's output layout.
delta_log2_cyt = alt_N45s_predicts[:, 1] - ref_N45s_predicts[:, 1]
delta_log2_cyt

In [None]:
# Take median:
# Summarize the per-window deltas as a single value for this SNP.
import numpy as np
np.median(delta_log2_cyt)

In [None]:
# Predict random SNPs:

# Function to generate a random DNA:
import random
def generate_random_dna(length=89):
    """Build a random DNA string.

    Args:
        length: Number of bases to generate (default 89).

    Returns:
        A string of ``length`` characters, each drawn independently with
        ``random.choice`` from 'ATGC'.
    """
    bases = []
    for _ in range(length):
        bases.append(random.choice('ATGC'))
    return ''.join(bases)

# Example: generate one random 89-nt sequence and display it.
random_dna = generate_random_dna(89)
random_dna

In [None]:
# Function to randomly mutate one base (used here on the central base):
def mutate_dna(dna, mutation_position):
    """Return a copy of ``dna`` with one base replaced by a different base.

    The replacement is drawn with ``random.choice`` from the bases in 'ATGC'
    that differ from the original base. The comparison ignores case, so a
    lower-case base is never "mutated" into its own upper-case form.

    Args:
        dna: DNA string to mutate.
        mutation_position: Index of the base to replace. Negative indices
            count from the end of the string, as in normal Python indexing.

    Returns:
        A new string of the same length as ``dna`` that differs from it only
        at ``mutation_position``.

    Raises:
        IndexError: If ``mutation_position`` is outside the sequence.
    """
    seq_length = len(dna)
    if not -seq_length <= mutation_position < seq_length:
        raise IndexError(
            f"mutation_position {mutation_position} is out of range for a "
            f"sequence of length {seq_length}"
        )
    # Normalize negative indices; slicing with a raw negative position would
    # otherwise splice the wrong pieces together and lengthen the sequence.
    position = mutation_position % seq_length
    original_base = dna[position].upper()
    bases = 'ATGC'
    mutated_base = random.choice([base for base in bases if base != original_base])
    return dna[:position] + mutated_base + dna[position + 1:]

# Mutate the base at 0-based index 44 (the central base of an 89-nt sequence)
# of the random DNA generated earlier, and display the result.
mutation_position = 44
mutated_dna = mutate_dna(random_dna, mutation_position)
mutated_dna

In [None]:
# Don't show any progress:
# Sets the TF_CPP_MIN_LOG_LEVEL environment variable to '3' and raises the
# level of TensorFlow's Python logger to ERROR.
# NOTE(review): TensorFlow has already been imported by earlier cells; the
# environment variable presumably needs to be set before the first import to
# take effect, and these settings are not known to hide the progress bars
# printed by `Model.predict` — confirm.
import os
import tensorflow as tf
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
tf.get_logger().setLevel('ERROR')

In [None]:
# Predict random SNPs:
def random_delta_log2_cyt():
    """Simulate one random SNP and return its predicted median effect.

    Steps:
      1. Generate a random 89-nt reference sequence and mutate the base at
         index 44 to obtain the alternative sequence.
      2. Split both sequences into 45-mers and encode them with
         ``prepare_x`` (padded length 46).
      3. Predict both sets of windows with the global ``Model``.
      4. Take the per-window difference (alt minus ref) of prediction
         column 1 and return its median.

    ``verbose=0`` is passed to ``Model.predict`` so that calling this function
    thousands of times in a loop does not print a progress bar per call.

    Returns:
        The median delta, as returned by ``np.median``.
    """
    ref = generate_random_dna(89)
    alt = mutate_dna(ref, mutation_position=44)
    ref_N45s_array = prepare_x(kmerize(ref, 45), 46)
    alt_N45s_array = prepare_x(kmerize(alt, 45), 46)
    ref_N45s_predicts = Model.predict(ref_N45s_array, verbose=0)
    alt_N45s_predicts = Model.predict(alt_N45s_array, verbose=0)
    delta_log2_cyt = alt_N45s_predicts[:, 1] - ref_N45s_predicts[:, 1]
    return np.median(delta_log2_cyt)

# Simulate 10000 random SNPs, collecting one median delta per SNP.
results = [random_delta_log2_cyt() for _ in range(10000)]

In [None]:
# Preview only the first few values instead of dumping the whole list into the
# notebook output.
results[:10]

In [None]:
# Number of collected simulation results.
len(results)

In [None]:
# Plot:
# Histogram of the simulated SNP effects, drawn through the explicit
# figure/axes interface and restricted to the [-0.2, 0.2] range on the x axis.
import matplotlib.pyplot as plt
fig, ax = plt.subplots()
ax.hist(results, bins=2048, edgecolor='black', alpha=0.7)
ax.set_xlim(-0.2, 0.2)
ax.set_title("Random SNP effects")
ax.set_xlabel("Delta log2(Cyt/DNA)")
ax.set_ylabel("Frequency")
plt.show()