# Reconstruction Error vs all GT folds

## 1. Imports, functions and constants

In [1]:
import os

from CNN import *
from Utils import *

from sklearn import preprocessing

In [2]:
"""Constants"""
# Maximum length shared by all of the sequences (798 positions).
SEQUENCE_LENGTH = 798

# Number of sequences per batch.
BATCH_SIZE = 16

# Three-element encoding for each sequence token; '-' maps to all zeros.
# NOTE(review): C/H/E are presumably the Q3 secondary-structure states and
# '-' the padding/gap symbol -- confirm against the preprocessing code.
vocab = {
    'C': [0, 0, 1],
    'H': [0, 1, 0],
    'E': [1, 0, 0],
    '-': [0, 0, 0],
}

# Flag passed to the reconstruction-error helper; False here.
# NOTE(review): presumably selects CPU vs. CUDA execution -- confirm.
cuda_gpu = False

# LabelEncoders used to turn the string labels (fold and family) into
# integers. Two independent encoders are created, fold first, then family.
# NOTE(review): neither encoder is fitted in this notebook; presumably the
# reconstruction-error helper fits them on the data it receives -- confirm.
le_fold, le_fam = preprocessing.LabelEncoder(), preprocessing.LabelEncoder()

In [3]:
# Show the current working directory: every "../..." path used below is
# resolved relative to it. os.getcwd() is used instead of the bare `pwd`
# automagic so the cell also works when automagic is disabled or when the
# notebook is run as a plain Python script.
os.getcwd()

'/home/esbg-lab/Dropbox (ESBG LAB)/GT_strML/Github_Folder/GT-CNN/Codes'

## 2. Reading saved datasets and model

In [4]:
# Loss used to score reconstructions: squared error summed (not averaged)
# over all elements.
criterion = nn.MSELoss(reduction="sum")

# Read the pretrained all-GT autoencoder. The file is opened in a `with`
# block so the handle is closed as soon as the model has been loaded.
# NOTE: unpickling can execute arbitrary code -- only load model files that
# come from a trusted source.
with open("../PretrainedModels/Autoencoder_gtAll.pickle", "rb") as f:
    model = CPU_Unpickler(f).load()

# Reconstruction error for the training data (saved in this file from model training)
df_rerr_all = pd.read_csv("../Datasets/RE_data/rerr_gtAll_training.csv")

## 3. For any new set of sequences

In [5]:
# Load the processed test sequences, then display the table's
# (rows, columns) shape as the cell output.
df_test = pd.read_csv("../Datasets/gtu/gtu.processed.csv")
df_test.shape

(4072, 8)

In [6]:
# Display the column names of the test table.
df_test.columns

Index(['Name', 'fold', 'family', 'q3seq', 'rawseq', 'q3seqTokens',
       'rawseqTokens', 'paddings'],
      dtype='object')

In [7]:
# Compute the reconstruction error (RE) of the test sequences under the
# all-GT autoencoder loaded above. The result is later grouped by its
# 'Family' column and sorted by its 'Err' column.
rerr_test = reconstruction_error_calculation(
    model,
    df_test,
    le_fam,
    le_fold,
    cuda_gpu,
    criterion,
    vocab,
)



In [8]:
# Median reconstruction error per family, sorted from lowest to highest.
# Only the 'Err' column is aggregated: taking the median of the whole frame
# raises a TypeError on pandas >= 2.0 if any non-numeric column is present,
# whereas selecting 'Err' explicitly yields the same Family/Err table on
# every pandas version.
df = (
    rerr_test.groupby('Family')['Err']
    .median()
    .reset_index()
    .sort_values('Err')
)
df

Unnamed: 0,Family,Err
16,GT53-u,0.084841
4,GT105-u,0.08537
10,GT110-u,0.087367
22,GT89-u,0.091285
21,GT76-u,0.091351
5,GT106-u,0.093037
8,GT109-u,0.096032
12,GT29-u,0.098119
3,GT103-u,0.102328
6,GT107-u,0.102565


In [9]:
# Write the per-sequence RE table for the test dataset to CSV; the row
# index is not written.
rerr_test.to_csv(
    "../ExampleOutputs/gtu.RE_all.csv",
    index=False,
)

In [10]:
# Write the per-family median RE table to CSV; the row index is not written.
df.to_csv(
    "../ExampleOutputs/gtu.medianRE_all.csv",
    index=False,
)

# Reconstruction Error vs GT clusters

This section tries to run all subclusters at once.<br>
If you run out of memory, run each subcluster separately, clearing memory before running the next one.

In [11]:
# Names of the GT sub-clusters; each name is used below to build the path
# of that cluster's pretrained model and of its output file.
grp = [
    'gta0', 'gta1',
    'gtb0', 'gtb1', 'gtb2',
    'gtc0', 'gtc1', 'gtc2',
    'gtlyso',
]

## 1. Reading saved datasets

In [12]:
# Loss used to score reconstructions: squared error summed (not averaged)
# over all elements.
criterion = nn.MSELoss(reduction="sum")

# Read one pretrained autoencoder per sub-cluster, keyed by cluster name.
# Each file is opened in a `with` block so its handle is closed right after
# loading instead of staying open for every cluster.
# NOTE: unpickling can execute arbitrary code -- only load model files that
# come from a trusted source.
model = {}
for i in grp:
    with open("../PretrainedModels/Autoencoder_" + i + ".pickle", "rb") as f:
        model[i] = CPU_Unpickler(f).load()

## 2. For any new set of sequences

In [13]:
# Load the processed test sequences for the per-cluster analysis, then
# display the table's (rows, columns) shape as the cell output.
df_test = pd.read_csv("../Datasets/gtu/gtu.processed.csv")
df_test.shape

(4072, 8)

In [14]:
# Compute the RE of the test dataset against each sub-cluster's model and
# store the result under the cluster name. A plain loop is used so results
# already computed remain in `rerr_test` if a later cluster fails.
rerr_test = {}
for cluster in grp:
    rerr_test[cluster] = reconstruction_error_calculation(
        model[cluster],
        df_test,
        le_fam,
        le_fold,
        cuda_gpu,
        criterion,
        vocab,
    )



In [15]:
# Write each cluster's RE table to its own CSV file, named after the
# cluster; the row index is not written.
for cluster in grp:
    rerr_test[cluster].to_csv(
        "../ExampleOutputs/gtu.RE_" + cluster + ".csv",
        index=False,
    )