# Reconstruction Error vs GT clusters

Tries running all subclusters at once.<br>
If that runs out of memory, run each subcluster separately, clearing memory after each one before running the others.

## Imports, functions and constants

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn import init
from torch.utils.data import Dataset, DataLoader

from CNN import *
# from GradCAM import *
# from GradCAMUtils import *
from Utils import *

import numpy as np
import pandas as pd
import seaborn as sns

import os

from fastai import *
from fastai.text import *
from fastai.vision import *
from fastai.imports import *

from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
import umap

from matplotlib.widgets import Button
from matplotlib.widgets import TextBox
from matplotlib.ticker import MultipleLocator
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab

from scipy.stats import norm
from scipy.stats import genextreme

In [3]:
"""Constants"""
# Maximum sequence length shared by all sequences.
# NOTE(review): the original note mentions "626/798" -- presumably 626 was an
# alternative value used for another dataset; confirm.
SEQUENCE_LENGTH = 798

BATCH_SIZE = 256

# Three-element 0/1 encoding per symbol; the gap symbol '-' maps to all zeros.
# Presumably C/H/E are Q3 secondary-structure states (the datasets carry a
# 'q3seq' column) -- TODO confirm.
vocab = {'C': [0,0,1], 'H': [0,1,0], 'E': [1,0,0], '-':[0,0,0]}

# Default figure size for every seaborn/matplotlib plot in this notebook.
sns.set(rc={'figure.figsize':(15,15)})

model_path = Path("./Models/")
path = Path("./Datasets/")

# Display all columns
pd.set_option('display.max_columns', None)
# Display all rows
pd.set_option('display.max_rows', None)

# Label encoders (string -> integer) for the fold and family labels
le_fold = preprocessing.LabelEncoder()
le_fam = preprocessing.LabelEncoder()

# torch.cuda.set_device()
# Expose only the CUDA device with index 1 to this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

cuda_gpu = False   # whether a usable GPU is available (hard-coded to False here)

torch.cuda.empty_cache()

In [4]:
# NOTE: this whole cell is one string literal, so none of the definitions inside
# it are executed here. Later cells still call reconstruction_error_calculation
# and CPU_Unpickler; presumably the live versions come from the wildcard
# imports at the top of the notebook (e.g. `from Utils import *`) -- TODO confirm.
"""def reconstruction_error_calculation(model, df, le_fam, le_fold, cuda_gpu, criterion):
    gt_ds, gt_dl = Dataset_Loader(df, le_fam, le_fold, vocab, BATCH_SIZE=1, cuda = cuda_gpu)
    reconstruction_err = []
    for i, data in enumerate(gt_dl, 0):
        model.eval()
        xb, yb, p = data
        output = model(xb)
        xb = xb.float()
        loss = criterion(output, xb)/(p.sum())
        reconstruction_err.append([df.iloc[i].Name,df.iloc[i].fold,df.iloc[i].family, loss.item()])
    return pd.DataFrame(reconstruction_err, columns=["Name","Fold","Family","Err"])

def Plot_Len_Dis_Extreme(r_err, GT_val, interval1 = 0.95, interval2 = 0.99, bins_val=100):
    covMat = np.array(r_err["Err"], dtype=float)
    median = np.median(covMat)
    c, loc, scale = genextreme.fit(covMat, floc=median)

    min_extreme1,max_extreme1 = genextreme.interval(interval1,c,loc,scale)
    min_extreme2,max_extreme2 = genextreme.interval(interval2,c,loc,scale)
    
    x = np.linspace(min(covMat),max(covMat),2000)

    fig,ax = plt.subplots(1, 1)
    plt.xlim(0,0.4)
    plt.plot(x, genextreme.pdf(x, *genextreme.fit(covMat)))
    plt.hist(np.array(r_err["Err"], dtype=float),bins=100,alpha=0.7, density=True)
    plt.hist(np.asarray(GT_val["Err"]), edgecolor='k', alpha=0.35, bins=bins_val, density=True) 
    plt.xlabel('Lengths Counts')
    plt.ylabel('Probability')
    plt.title(r'max_extreme1=%.3f,max_extreme2=%.3f' %(max_extreme1, max_extreme2))
    plt.annotate('Max Extreme Value 1',xy=(max_extreme1,0),xytext=(max_extreme1,1),arrowprops=dict(arrowstyle="->",connectionstyle="arc3",color="black"),color="black")
    plt.annotate('Max Extreme Value 2',xy=(max_extreme2,0),xytext=(max_extreme2,1),arrowprops=dict(arrowstyle="->",connectionstyle="arc3",color="black"),color="black")
    plt.grid(True)
    median = GT_val.median()
    print("95% CI upper bound:",max_extreme1)
    print("99% CI upper bound:",max_extreme2)
    print("Median RE:",median.values)
    return max_extreme1, max_extreme2, median

class CPU_Unpickler(pickle.Unpickler):
    def find_class(self, module, name):
        if module == 'torch.storage' and name == '_load_from_bytes':
            return lambda b: torch.load(io.BytesIO(b), map_location='cpu')
        else: return super().find_class(module, name)"""""

In [5]:
# Subcluster identifiers; one saved autoencoder is loaded per entry.
# (An earlier revision of this list contained 'gta2' and had no 'gtc2'.)
grp = [
    'gta0', 'gta1',
    'gtb0', 'gtb1', 'gtb2',
    'gtc0', 'gtc1', 'gtc2',
    'gtlyso',
]

In [5]:
pwd

'/Users/rtaujale/Dropbox (Edison_Lab@UGA)/Projects/GT_informatics/GT/GT_strML/finalized_v1/code'

## Reading saved datasets

In [6]:
# Define the loss criterion and load the saved autoencoder of every subcluster.
# reduction="sum" makes the criterion return the summed (not averaged) squared error.
criterion = nn.MSELoss(reduction="sum")
model = dict()
for i in grp:
    # On a machine with a GPU the models were loaded directly instead:
    #     name = "../models/Autoencoder_" + i + ".pickle"
    #     model[i] = pickle.load(open(name, 'rb')).cpu()
    #
    # On a CPU-only machine, go through CPU_Unpickler. An earlier run read the
    # files from "../models/" rather than "../revise1/models/".
    #
    # SECURITY NOTE: unpickling executes arbitrary code from the file; only
    # load model files from a trusted source.
    #
    # The context manager closes each file handle as soon as its model is
    # loaded, instead of leaking one open handle per subcluster.
    with open("../revise1/models/Autoencoder_" + i + ".pickle", "rb") as f:
        model[i] = CPU_Unpickler(f).load()

In [9]:
df_all['fold'].value_counts()

NameError: name 'df_all' is not defined

## For any new set of sequences

In [7]:
# Read in the test dataset.
# Datasets used in earlier runs are kept below for reference; only the last,
# uncommented read_csv is active.
# df_test = pd.read_csv("../output/gt105_preprocessed.csv")
# df_test = pd.read_csv("../output/allgtu_newdomain_RegularCut.csv")
# df_test = pd.read_csv("../output/allgtu_newdomain_DomainOnlyCut.e1.csv")
# df_test = pd.read_csv("../../GT_deepLearning_code/run_12/GTU_GTD.csv")
# df_test = pd.read_csv("../../GT_deepLearning_code/run_12/GT_cluster.csv")

#df_test = pd.read_csv("../Data/gtu_domainOnly/allgtu_DomainOnlyCut_processed.csv")

df_test = pd.read_csv("../Data/gtu_107_114/newgtus107-114_DomainOnlyCut_processed.csv")

# Show (rows, columns) as a quick sanity check of the loaded table.
df_test.shape

(679, 8)

In [8]:
df_test

Unnamed: 0,Name,fold,family,q3seq,rawseq,q3seqTokens,rawseqTokens,paddings
0,GT107-B|AUL15450|B.bronchisepticaA310_,u,GT107-u,----------------------------------------------...,----------------------------------------------...,"['-', '-', '-', '-', '-', '-', '-', '-', '-', ...","['-', '-', '-', '-', '-', '-', '-', '-', '-', ...",56
1,GT107-B|ABM49925|B.malleiSAVP1_,u,GT107-u,----------------------------------------------...,----------------------------------------------...,"['-', '-', '-', '-', '-', '-', '-', '-', '-', ...","['-', '-', '-', '-', '-', '-', '-', '-', '-', ...",136
2,GT107-B|AMD49973|B.holmesiiF627_,u,GT107-u,----------------------------------------------...,----------------------------------------------...,"['-', '-', '-', '-', '-', '-', '-', '-', '-', ...","['-', '-', '-', '-', '-', '-', '-', '-', '-', ...",61
3,GT107-B|AHK74629|C.coliRM5611_,u,GT107-u,----------------------------------------------...,----------------------------------------------...,"['-', '-', '-', '-', '-', '-', '-', '-', '-', ...","['-', '-', '-', '-', '-', '-', '-', '-', '-', ...",61
4,GT107-B|AHI64692|B.thailandensisH0587_,u,GT107-u,----------------------------------------------...,----------------------------------------------...,"['-', '-', '-', '-', '-', '-', '-', '-', '-', ...","['-', '-', '-', '-', '-', '-', '-', '-', '-', ...",61
5,GT107-B|AOY03532|C.jejuniBCW_6920_,u,GT107-u,----------------------------------------------...,----------------------------------------------...,"['-', '-', '-', '-', '-', '-', '-', '-', '-', ...","['-', '-', '-', '-', '-', '-', '-', '-', '-', ...",54
6,GT107-B|QDJ50963|B.hinziiTR-1212_,u,GT107-u,----------------------------------------------...,----------------------------------------------...,"['-', '-', '-', '-', '-', '-', '-', '-', '-', ...","['-', '-', '-', '-', '-', '-', '-', '-', '-', ...",61
7,GT107-B|SQF65496|A.pleuropneumoniaeNCTC11384_,u,GT107-u,----------------------------------------------...,----------------------------------------------...,"['-', '-', '-', '-', '-', '-', '-', '-', '-', ...","['-', '-', '-', '-', '-', '-', '-', '-', '-', ...",54
8,GT107-B|AJX30636|B.oklahomensisC6786_,u,GT107-u,----------------------------------------------...,----------------------------------------------...,"['-', '-', '-', '-', '-', '-', '-', '-', '-', ...","['-', '-', '-', '-', '-', '-', '-', '-', '-', ...",131
9,GT107-B|BAG42586|B.multivoransATCC17616_,u,GT107-u,----------------------------------------------...,----------------------------------------------...,"['-', '-', '-', '-', '-', '-', '-', '-', '-', ...","['-', '-', '-', '-', '-', '-', '-', '-', '-', ...",64


In [9]:
# This bare expression is not the last statement of the cell, so the notebook
# does not display it.
df_test.columns
# Replace every row's 'Name' with its 'family' label.
# NOTE(review): this discards the original per-sequence identifiers in
# df_test, so anything downstream that reports 'Name' will presumably show
# the family instead -- confirm this is intended.
df_test['Name']=df_test['family']

In [10]:
df_test.columns

Index(['Name', 'fold', 'family', 'q3seq', 'rawseq', 'q3seqTokens',
       'rawseqTokens', 'paddings'],
      dtype='object')

In [66]:
# Keep only the rows whose family label is exactly 'GT103-u'.
df_test_103 = df_test.loc[df_test['family'].eq('GT103-u')]

In [11]:
# Calculate the reconstruction error (RE) of the test dataset against the
# model of every subcluster. Results are stored per subcluster id in
# rerr_test; each entry is later written out with .to_csv.
rerr_test=dict()
for i in grp:
    # Same test data, label encoders and criterion each time; only the model changes.
    rerr_test[i] = reconstruction_error_calculation(model[i], df_test, le_fam, le_fold, cuda_gpu, criterion)



In [12]:
# Write one CSV of reconstruction errors per subcluster (row index omitted).
# Output prefixes used by earlier runs:
#     ../output/results_allgtu_olddomain_RegularCut_
#     ../revise1/output/temp3MAC
for i in grp:
    outfile = f"../revise1/output/temp5MAC_newgtu_{i}.csv"
    rerr_test[i].to_csv(outfile, index=False)

## For single cluster

In [10]:
pwd

'/Users/rtaujale/Dropbox (Edison_Lab@UGA)/Projects/GT_informatics/GT/GT_strML/finalized_v1/code'

In [11]:
# Define the loss criterion and load the model of a single subcluster.
# reduction="sum" makes the criterion return the summed (not averaged) squared error.
criterion = nn.MSELoss(reduction="sum")
model = dict()
i = 'gtb1'
# SECURITY NOTE: unpickling executes arbitrary code from the file; only load
# model files from a trusted source.
# The context manager closes the file handle once the model is loaded; the
# original left it open.
with open("../../GT_deepLearning_code/run_13/B_subcluster_1.pickle", "rb") as f:
    model[i] = CPU_Unpickler(f).load()

In [12]:
# Read in the test dataset used for the single-cluster run.
df_test = pd.read_csv("../Data/gtu_domainOnly/allgtu_DomainOnlyCut_processed.csv")
# Show (rows, columns) as a quick sanity check of the loaded table.
df_test.shape

(3945, 8)

In [13]:
# Calculate the reconstruction error (RE) of the test dataset against the
# single cluster selected by `i` (set in the model-loading cell of this section).
rerr_test=dict()
rerr_test[i] = reconstruction_error_calculation(model[i], df_test, le_fam, le_fold, cuda_gpu, criterion)

In [14]:
# Write the reconstruction errors of the single selected cluster `i` to its
# own CSV (row index omitted).
outfile = f"../revise1/output/only2_{i}.csv"
rerr_test[i].to_csv(outfile, index=False)

## Additional analysis of results

Can be done in a separate notebook, "RE_analysis.ipynb" (section Clusters).<br>
It is split this way so that the parts of the workflow that require a GPU stay isolated in this notebook.

In [None]:
def histogram(x, **kwargs):
    """Histogram ``x`` and return bin centres together with the bin values.

    Args:
        x: Data passed straight to ``np.histogram``.
        **kwargs: Forwarded unchanged to ``np.histogram`` (e.g. ``bins``,
            ``range``).

    Returns:
        A ``(centres, values)`` tuple. ``centres`` is the midpoint of each
        pair of neighbouring bin edges and ``values`` is the first array
        returned by ``np.histogram``.
    """
    values, edges = np.histogram(x, **kwargs)
    # Midpoint of every bin: average of its left and right edge.
    centres = (edges[1:] + edges[:-1]) / 2
    return centres, values

def Intergral(X1, X2, bins=300):
    """Plot and return the overlap of two normalised 'Err' histograms.

    Both inputs are histogrammed over the fixed range [0, 3] with the same
    number of bins. Each histogram is scaled so its bins sum to 1, and the
    overlap is the sum over bins of the smaller of the two fractions.

    (The misspelled function name is kept because callers use it.)

    Args:
        X1: Table-like object with an ``'Err'`` column exposing ``.values``.
        X2: Second table-like object with the same ``'Err'`` column.
        bins: Number of histogram bins.

    Returns:
        The summed per-bin minimum of the two normalised histograms. If
        either input has no values inside [0, 3], its histogram is treated
        as all zeros and the result is 0.0 (the original divided by zero and
        produced NaN in that case).

    Side effects:
        Creates a matplotlib figure with both histograms and the overlap
        curve, and prints the overlap value.
    """
    def _as_fractions(counts):
        # Compute the total once rather than once per bin.
        total = counts.sum()
        if total == 0:
            # Empty histogram: avoid 0/0 and report no mass in any bin.
            return np.zeros(len(counts), dtype=float)
        return counts / total

    kwargs = {'bins': bins, 'range': [0, 3]}
    x1, y1 = histogram(X1['Err'].values, **kwargs)
    x2, y2 = histogram(X2['Err'].values, **kwargs)
    y1 = _as_fractions(y1)
    y2 = _as_fractions(y2)

    # Element-wise minimum of the two normalised histograms.
    overlap = np.minimum(y1, y2)
    total_overlap = overlap.sum()

    plt.figure(figsize=(15,5))
    plt.plot(x1, y1)
    plt.plot(x2, y2)
    plt.plot(x1, overlap)
    print(total_overlap)

    return total_overlap

In [None]:
def overlap_score(df_train, df_test):
    """Compute the histogram overlap of ``df_train`` with every fold of ``df_test``.

    ``df_test`` is grouped by its ``'fold'`` column, and each group is compared
    against ``df_train`` with ``Intergral`` using 100 bins.

    Args:
        df_train: Reference table passed as the first argument to ``Intergral``.
        df_test: Table with a ``'fold'`` column to group by.

    Returns:
        A ``pd.DataFrame`` with one row per fold: the fold name in the first
        column and its overlap value in the second (default integer column
        labels).
    """
    rows = [
        [fold_name, Intergral(df_train, fold_rows, bins=100)]
        for fold_name, fold_rows in df_test.groupby('fold')
    ]
    return pd.DataFrame(rows)

In [None]:
def softmax(x):
    """Return the softmax of the scores in ``x``.

    The maximum of ``x`` is subtracted before exponentiating, so large scores
    do not overflow; this does not change the result.
    """
    shifted = x - np.max(x)
    weights = np.exp(shifted)
    return weights / weights.sum()

In [None]:
# NOTE(review): overlap_table is not defined anywhere in the visible part of
# this notebook; it is presumably built from overlap_score results -- confirm
# before running this cell.
# Sum groups of positional columns into one value per fold class (A, B, C).
# NOTE(review): the slices cover 3, 2 and 2 columns respectively, which does
# not obviously match the subcluster counts in `grp` -- verify the column layout.
overlap_table['A']= overlap_table.iloc[:, 1:4].sum(axis=1)
overlap_table['B']= overlap_table.iloc[:, 4:6].sum(axis=1)
overlap_table['C']= overlap_table.iloc[:, 6:8].sum(axis=1)

# Row-wise softmax over positional columns 10..12, stored as three score columns.
# NOTE(review): presumably columns 10..12 are the 'A', 'B', 'C' columns added
# above, which only holds if overlap_table had exactly 10 columns beforehand.
overlap_table[["A_score","B_score","C_score"]] = overlap_table.iloc[:, 10:13].apply(softmax, axis=1) # applied per row (axis=1)