In [1]:
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import os

import tensorflow as tf
from tensorflow import keras
from keras import datasets, layers, models, Input, Model, activations
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report, f1_score, confusion_matrix, roc_auc_score, precision_recall_curve, PrecisionRecallDisplay, average_precision_score
from src.tf_tools import cnn_classifier

from src.mpra_tools.fasta_utils import *

2024-02-14 11:12:25.626766: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-02-14 11:12:26.718020: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-02-14 11:12:27.989241: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-14 11:12:27.989290: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-14 11:12:27.992869: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to

In [69]:
def one_hot_seqs(seqs) -> np.ndarray:
    """One-hot encode a collection of equal-length nucleotide sequences.

    Each base becomes a length-4 indicator vector in A, C, G, T order, and
    upper- and lower-case letters are treated identically. Any other
    character (e.g. ``N``) is replaced by a base drawn with ``random.choice``
    from A/C/G/T, so those positions depend on the state of Python's global
    ``random`` generator.

    Parameters
    ----------
    seqs : iterable of str
        Sequences to encode. They must all have the same length.

    Returns
    -------
    np.ndarray
        Integer array of shape ``(n_seqs, seq_len, 4)``. An empty ``seqs``
        yields an array of shape ``(0, 0, 4)``.

    Raises
    ------
    ValueError
        If the sequences do not all share the same length.
    """
    static_1hotmap = {
        'A': np.array([1, 0, 0, 0]),
        'a': np.array([1, 0, 0, 0]),
        'C': np.array([0, 1, 0, 0]),
        'c': np.array([0, 1, 0, 0]),
        'G': np.array([0, 0, 1, 0]),
        'g': np.array([0, 0, 1, 0]),
        'T': np.array([0, 0, 0, 1]),
        't': np.array([0, 0, 0, 1]),
    }
    fallback_bases = ['A', 'C', 'G', 'T']

    onehot_seqs = []
    for seq in seqs:
        onehot_seqs.append(
            [
                static_1hotmap[base]
                if base in static_1hotmap
                # Ambiguous base: impute one uniformly at random.
                else static_1hotmap[random.choice(fallback_bases)]
                for base in seq
            ]
        )

    # Fail early with a readable message instead of an opaque stacking error.
    lengths = {len(encoded) for encoded in onehot_seqs}
    if len(lengths) > 1:
        raise ValueError(
            "one_hot_seqs requires sequences of equal length; got lengths "
            + str(sorted(lengths))
        )
    seq_len = lengths.pop() if lengths else 0

    # The explicit reshape keeps the trailing channel axis even when there are
    # no sequences (or only zero-length ones) to infer it from.
    return np.array(onehot_seqs, dtype=int).reshape(len(onehot_seqs), seq_len, 4)


In [70]:
# Notebook configuration: values a reader may want to tune.
LABEL_KEY = 'open'        # column of the data table used as the class label
FEATURE_KEY = 'sequence'  # column of the data table that gets one-hot encoded
data_file = "Data/ATAC/labeled_data_164.csv"
fold = 1                  # rows with this fold value (outside TEST) form the validation split

# Training hyperparameters read by the model-fitting cell. They were not
# assigned in any cell, so a fresh kernel raised NameError at `model.fit`.
# NOTE(review): placeholder values chosen during review -- confirm or tune.
epochs = 100
batch_size = 32

In [None]:

# Load the labelled table and carve it into train / validation / test frames.
data_df = pd.read_csv(data_file, index_col=0)

# Rows flagged TEST are held out entirely; of the rest, the rows whose fold
# equals `fold` are used for validation and all others for training.
is_test = data_df['set'] == 'TEST'
in_val_fold = data_df['fold'] == fold

test_df = data_df[is_test]
validation_df = data_df[in_val_fold & ~is_test]
train_df = data_df[~in_val_fold & ~is_test]

for split_df, caption in (
    (train_df, ": training points"),
    (validation_df, ": validation points"),
    (test_df, ": reserved testing points"),
):
    print(len(split_df), caption, flush=True)

#############################################################
# Prepare data for fitting

# Features: one-hot encode the sequence column of each split.
x_train = one_hot_seqs(train_df[FEATURE_KEY])
x_validation = one_hot_seqs(validation_df[FEATURE_KEY])
x_test = one_hot_seqs(test_df[FEATURE_KEY])

# Targets: the encoder is fitted on the label column of the full table, so
# every split shares one label -> integer mapping.
encoder = LabelEncoder().fit(data_df[LABEL_KEY])
classes = encoder.classes_
num_classes = len(classes)

# Integer-encode each split's labels, then expand to one-hot rows.
y_train = keras.utils.to_categorical(
    encoder.transform(train_df[LABEL_KEY]), num_classes
)
y_validation = keras.utils.to_categorical(
    encoder.transform(validation_df[LABEL_KEY]), num_classes
)
y_test = keras.utils.to_categorical(
    encoder.transform(test_df[LABEL_KEY]), num_classes
)


In [None]:
# Build the classifier, attach callbacks, and train it.
#
# NOTE(review): `epochs` and `batch_size` are read below but are not assigned
# in this cell; they must already exist in the kernel namespace.

# Tensorboard setup (currently disabled)
# tensor_logs = os.path.join(output_dir, "tb_logs")
# os.makedirs(tensor_logs, exist_ok=True)
# tensorboard_cb = keras.callbacks.TensorBoard(tensor_logs, histogram_freq=1)

# Early stopping: monitor validation loss with a patience of 10 epochs.
earlystop_cb = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)

# The model is sized from the per-sequence length of the encoded training
# data and the number of label classes.
model = cnn_classifier.getClassCNN(x_train.shape[1], num_classes)

# verbose=0 silences per-epoch output; the value returned by fit is kept in
# `history`.
history = model.fit(
    x_train,
    y_train,
    validation_data=(x_validation, y_validation),
    epochs=epochs,
    batch_size=batch_size,
    callbacks=[earlystop_cb],
    verbose=0,
)