In [34]:
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import os

from scipy.stats import linregress, pearsonr
from scipy.io import loadmat

import tensorflow as tf
from tensorflow import keras
from keras import datasets, layers, models, Input, Model, activations
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report, f1_score, confusion_matrix, roc_auc_score, precision_recall_curve, PrecisionRecallDisplay, average_precision_score

from src.mpra_tools.fasta_utils import *
from src.tf_tools.cnn_regression import MCDropout

In [35]:
def one_hot_seqs(seqs) -> np.ndarray:
    """One-hot encode DNA sequences as (A, C, G, T) indicator rows.

    Parameters
    ----------
    seqs : iterable of str
        Sequences to encode. Upper- and lower-case bases are treated alike.
        All sequences must have the same length, because the per-sequence
        encodings are combined with ``np.stack``.

    Returns
    -------
    np.ndarray
        Integer array of shape ``(n_seqs, seq_len, 4)``. An empty ``seqs``
        yields an empty array of shape ``(0, 0, 4)`` instead of raising.

    Notes
    -----
    Any character other than A/C/G/T (either case), e.g. ``N``, is encoded as
    a base drawn with ``random.choice``; seed ``random`` beforehand if the
    encoding has to be reproducible.
    """
    bases = ['A', 'C', 'G', 'T']
    rows = np.eye(len(bases), dtype=int)

    # Map both cases of every base onto the same indicator row.
    lookup = {}
    for idx, base in enumerate(bases):
        lookup[base] = rows[idx]
        lookup[base.lower()] = rows[idx]

    encoded = [
        [lookup[ch] if ch in lookup else lookup[random.choice(bases)] for ch in seq]
        for seq in seqs
    ]

    # np.stack refuses an empty list, so handle "no sequences" explicitly.
    if not encoded:
        return np.empty((0, 0, 4), dtype=int)
    return np.stack(encoded)


In [36]:
# Name of the column that holds the regression target.
LABEL_KEY = 'expression_log2'

# Load both tables and one-hot encode the retinopathy sequences.
activity_df = pd.read_csv("Data/activity.csv")
retinopathy_df = pd.read_csv("Data/retinopathy.csv")
x_ret = one_hot_seqs(retinopathy_df['sequence'])

# Test split: rows of the activity table selected by its `test_set` column.
test_df = activity_df[activity_df['test_set']]
x_test = one_hot_seqs(test_df['sequence'])

# Read the target through LABEL_KEY so the column name is defined in one place.
y_test = test_df[LABEL_KEY].values

# Disabled label-encoding path (categorical targets), kept for reference.
# NOTE(review): it refers to `data_df`, which this cell does not define.
# encoder = LabelEncoder()
# encoder.fit(data_df[LABEL_KEY])
# classes = encoder.classes_
# num_classes = len(classes)
# y_test = encoder.transform(test_df[LABEL_KEY])
# y_test = keras.utils.to_categorical(y_test, num_classes)


In [37]:
# Settings for the "Reg2_MCD" run.
model_name = "cnn_model.keras"   # file name of the saved model
folder = "Reg2_MCD_11597680"     # run directory
folds = np.arange(1, 8)          # fold ids 1..7
nsamples = range(20)             # 20-step range

In [38]:
# NOTE(review): this whole cell is commented out, although the
# "Done with fold N" output shown under it matches its print statement.
# `ret_predictions`, which a later cell reads, is assigned only here, so a
# fresh top-to-bottom run fails with NameError until this cell is restored
# (or its results are cached and reloaded).
#
# What the disabled code does, per fold: load the saved model from
# folder/<fold>/model_name, evaluate it on (x_test, y_test), then call
# predict() once per element of `nsamples` on both x_test and x_ret and stack
# the flattened passes into one array per fold.
# results = []
# predictions = []
# ret_predictions = []
# for fold in folds:
#     model = keras.models.load_model(os.path.join(folder,str(fold),model_name), custom_objects={'MCDropout':MCDropout})
#     results.append(model.evaluate(x_test,y_test,batch_size=64, verbose=1))
#     drop = []
#     ret_drop = []
#     for _ in nsamples:
#         drop.append(model.predict(x_test, verbose=0).flatten())
#         ret_drop.append(model.predict(x_ret, verbose=0).flatten())
#     predictions.append(np.stack(drop))
#     ret_predictions.append(np.stack(ret_drop))
#     print(f"Done with fold {fold}")

Done with fold 1
Done with fold 2
Done with fold 3
Done with fold 4
Done with fold 5
Done with fold 6
Done with fold 7


In [80]:
# Disabled alias that would bind the current `predictions` list to a second name.
# NOTE(review): presumably meant to keep the MC-dropout results reachable after
# `predictions` is rebuilt from the .mat files — confirm before re-enabling.
# MCD_predictions = predictions

In [87]:

# Disabled: the same statistics computed from `predictions`.
# mean_preds = [np.mean(preds, axis=0) for preds in predictions]
# std_preds = [np.std(preds,axis=0) for preds in predictions]
# mean_rvals = [linregress(preds, predictions[i][1]).rvalue for i, preds in enumerate(mean_preds)]

# For every entry of `ret_predictions`: mean and standard deviation along
# axis 0, then the linregress rvalue of that mean against the measured
# retinopathy expression values.
ret_mean_preds = []
ret_std_preds = []
ret_mean_rvals = []
for fold_samples in ret_predictions:
    fold_mean = np.mean(fold_samples, axis=0)
    ret_mean_preds.append(fold_mean)
    ret_std_preds.append(np.std(fold_samples, axis=0))
for fold_mean in ret_mean_preds:
    fit = linregress(fold_mean, retinopathy_df['expression_log2'])
    ret_mean_rvals.append(fit.rvalue)

In [88]:
# Display the linregress rvalues (one per entry of `ret_predictions`) between
# the mean retinopathy predictions and the measured `expression_log2` values.
ret_mean_rvals

[0.4139677064395499,
 0.4311395237414155,
 0.3840810766690879,
 0.43808107750881625,
 0.40739761586247675,
 0.4559503706595654,
 0.4367598479834121]

In [89]:
# NOTE(review): the only live assignment of `mean_rvals` is in a later cell
# (the one that evaluates the .mat predictions); the earlier assignment is
# commented out. On a fresh top-to-bottom run this line raises NameError, and
# the ten values shown below are identical to that later cell's output.
mean_rvals

[0.408320916260335,
 0.41134238842927384,
 0.4552449643612125,
 0.43610947811315776,
 0.3853631649160763,
 0.4412920642424504,
 0.41730414653867126,
 0.43411587005808133,
 0.42887860614961015,
 0.43178759697686364]

In [81]:
# Switch to the "Reg2_Rand" run. Note that `folder` and `folds` are rebound
# here, replacing the values set for the "Reg2_MCD" run earlier on.
folder = "Reg2_Rand_11600107"
folds = np.arange(1, 11)   # fold ids 1..10

# One (preds, truths) tuple per fold, read from <folder>/<fold>/test_preds.mat.
predictions = []
for fold in folds:
    mat_path = os.path.join(folder, str(fold), 'test_preds.mat')
    pmat = loadmat(mat_path)
    predictions += [(pmat['preds'], pmat['truths'])]
    
    

In [82]:
# Each entry of `predictions` is a (preds, truths) pair. Reduce preds along
# axis 0 to a mean and a standard deviation, then take the linregress rvalue
# of the mean against the truths stored with the same fold.
mean_preds = [np.mean(fold_preds, axis=0) for fold_preds, _ in predictions]
std_preds = [np.std(fold_preds, axis=0) for fold_preds, _ in predictions]
mean_rvals = [
    linregress(fold_mean, fold_truths).rvalue
    for fold_mean, (_, fold_truths) in zip(mean_preds, predictions)
]

In [83]:
# Display the per-fold linregress rvalues between the mean of the stored
# predictions and the truths stored alongside them.
mean_rvals

[0.408320916260335,
 0.41134238842927384,
 0.4552449643612125,
 0.43610947811315776,
 0.3853631649160763,
 0.4412920642424504,
 0.41730414653867126,
 0.43411587005808133,
 0.42887860614961015,
 0.43178759697686364]

In [84]:
# Load the saved model of fold '3' from the current `folder`. MCDropout is
# passed as a custom object so Keras can rebuild that layer on load.
best_model = keras.models.load_model(
    os.path.join(folder, '3', model_name),
    custom_objects={'MCDropout': MCDropout},
)

In [85]:
# Run 20 forward passes of the selected model over the retinopathy sequences
# (presumably MC-dropout samples) and keep each flattened prediction vector.
# `best_model` is the model loaded in the preceding cell; the name `model`
# used here before is not assigned by any live cell of this notebook, so the
# loop either failed with NameError or silently used a stale model left in
# the kernel.
q_preds = []

for _ in range(20):
    q_preds.append(best_model.predict(x_ret, verbose=1).flatten())

