In [None]:
import pandas as pd

In [None]:
# Negative class (non-cancer): per the file name, solid-tissue-normal
# samples from the head and neck region.
normal_csv = 'primarysite_HeadandNeckregion_sampletype_SolidTissueNormal.csv'
df_normal = pd.read_csv(normal_csv)
# Last expression of the cell: display (rows, columns) of the raw table.
df_normal.shape

In [None]:
# Swap rows and columns of the negative-class table.
df_normal = df_normal.transpose()
print('before insert label at last column: ', df_normal.shape)

# Append the class label as the last column; 0 marks the negative class.
df_normal['label'] = 0
print('after insert label at last column: ', df_normal.shape)

In [None]:
# Positive class (cancer): per the file name, primary-tumor samples from
# the head and neck region.
abnormal_csv = 'primarysite_HeadandNeckregion_sampletype_PrimaryTumor.csv'
df_abnormal = pd.read_csv(abnormal_csv)
print(df_abnormal.shape)

# Swap rows and columns, mirroring what is done for the negative class.
df_abnormal = df_abnormal.transpose()
print('before insert label at last column: ', df_abnormal.shape)

# Append the class label as the last column; 1 marks the positive class.
df_abnormal['label'] = 1
print('after insert label at last column: ', df_abnormal.shape)

In [None]:
# Find the attributes (columns) whose value is zero for every sample in
# BOTH classes. The result, `comun`, is the array of those column labels.

import numpy as np

# Row 0 of each transposed table is skipped (iloc[1:]) before testing for
# all-zero columns.
aux_abnormal = df_abnormal.iloc[1:]
aux_abnormal_cols = aux_abnormal.columns[(aux_abnormal == 0).all()]
print(aux_abnormal_cols)

aux_normal = df_normal.iloc[1:]
aux_normal_cols = aux_normal.columns[(aux_normal == 0).all()]
print(aux_normal_cols)

# 'label' is constantly 0 for the negative class, so it appears among the
# all-zero columns without being a real attribute. Remove it by name: a
# hard-coded position (previously 18742) only fits one exact dataset and
# would drop a genuine attribute, or raise, as soon as the data changes.
aux_normal_cols = aux_normal_cols.drop('label', errors='ignore')

comun = np.intersect1d(aux_abnormal_cols, aux_normal_cols)
print(comun, comun.size)

In [None]:
from sklearn.ensemble import RandomForestRegressor
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# For both classes: skip row 0 (it only carries identifiers that are not
# useful here) and remove the columns listed in `comun`.
df_normal = df_normal.iloc[1:].drop(columns=comun)
df_abnormal = df_abnormal.iloc[1:].drop(columns=comun)

In [None]:
# Stack both classes into one table and shuffle all rows with a fixed seed.
data = pd.concat([df_normal, df_abnormal], ignore_index=True)
data = data.sample(n=len(data), random_state=2)

# Number of trees for Random Forest
ntrees = 100

# Features are every column except the last one; the target is 'label'.
X = np.asarray(data.iloc[:, :-1])
Y = np.asarray(data['label'])

# Hold out the trailing 30% (rounded down) of the shuffled rows as test set.
test_size = int(np.floor(0.30 * X.shape[0]))
split_at = -test_size
trainX, testX = X[:split_at], X[split_at:]
trainY, testY = Y[:split_at], Y[split_at:]
print(trainY.shape, testY.shape)

In [None]:
# Normalize train and test set to the [0, 1] range of the TRAINING data.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
# Learn the per-feature min/max on the training rows only ...
trainX = scaler.fit_transform(trainX)
# ... and reuse those statistics for the test rows. Re-fitting on the test
# set would put train and test on different scales and leak test statistics
# into preprocessing. Held-out values may fall outside [0, 1] when they
# exceed the range seen during training.
testX = scaler.transform(testX)

In [None]:
# First Random Forest, trained on every remaining attribute.
from sklearn import preprocessing

rf_params = dict(n_estimators=ntrees, random_state=50)
clf = RandomForestRegressor(**rf_params)
clf.fit(trainX, trainY)

# Last expression of the cell: the regressor's score on the held-out rows.
clf.score(testX, testY)

In [None]:
# Histogram of the Random Forest scores on the test set, one step curve
# per true class.
pred = clf.predict(testX)
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(6, 6))  # 6,6
plt.figure(1)
plt.style.use('seaborn-deep')

# (true label, console message, curve colour, legend template)
class_specs = (
    (0, "test 0s number:  ", 'darkorange', "normal (%d cases in testset) "),
    (1, "test 1s number:  ", 'blue', "abnormal (%d cases in testset) "),
)
for class_value, message, colour, legend_template in class_specs:
    rows = np.where(testY == class_value)[0]
    print(message, len(rows))
    plt.hist(pred[rows], 50, histtype='step', color=colour,
             label=legend_template % (len(rows)))

plt.ylabel('Numbers of events')
plt.xlabel('Predicted score')
plt.yscale("log")
title = "normal versus abnormal as Primary Site"
plt.title(title)
plt.legend(loc='upper center')

In [None]:
# Column positions sorted by decreasing feature importance, truncated to
# the first 5000 entries.
indexes = np.argsort(-clf.feature_importances_)[:5000]

In [None]:
# Keep only the attributes selected in `indexes` for each class and
# re-attach the class label as the last column.
# An explicit .copy() makes each reduced table an independent frame, so
# adding the 'label' column does not trigger pandas'
# SettingWithCopyWarning (assignment on a slice of another frame).
new_normal = df_normal.iloc[:, indexes].copy()
new_normal['label'] = 0

new_abnormal = df_abnormal.iloc[:, indexes].copy()
new_abnormal['label'] = 1

In [None]:
# Stack the reduced tables of both classes and shuffle with a fixed seed.
new_data = pd.concat([new_normal, new_abnormal], ignore_index=True)
new_data = new_data.sample(n=len(new_data), random_state=2)

# Some parameters
ntrees = 100

# Features are every column except the last one; the target is 'label'.
new_X = np.asarray(new_data.iloc[:, :-1])
new_Y = np.asarray(new_data['label'])

# Hold out the trailing 30% (rounded down) of the shuffled rows as test set.
test_size = int(np.floor(0.30 * new_X.shape[0]))
split_at = -test_size
new_trainX, new_testX = new_X[:split_at], new_X[split_at:]
new_trainY, new_testY = new_Y[:split_at], new_Y[split_at:]
print(new_trainY.shape, new_testY.shape)

In [None]:
# Normalize the new train/test sets.
# The scaler is (re-)fitted on the reduced training rows only; the test
# rows are transformed with those same training statistics. Fitting again
# on the test set would scale train and test differently and leak test
# statistics into preprocessing. Held-out values may fall outside [0, 1]
# when they exceed the range seen during training.
new_trainX = scaler.fit_transform(new_trainX)
new_testX = scaler.transform(new_testX)

In [None]:
# Second Random Forest, trained on the reduced dataset.
# (random_state=50 was noted as an alternative seed in the original cell.)
new_rf_params = dict(n_estimators=ntrees, random_state=8)
new_clf = RandomForestRegressor(**new_rf_params)
new_clf.fit(new_trainX, new_trainY)

# Last expression of the cell: the regressor's score on the held-out rows.
new_clf.score(new_testX, new_testY)

In [None]:
# Histogram of the reduced-model scores on the reduced test set, one step
# curve per true class.
pred = new_clf.predict(new_testX)
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(6, 6))  # 6,6
plt.figure(1)
plt.style.use('seaborn-deep')

# The class masks must come from new_testY, the labels that belong to
# new_testX. Using testY from the first split only works by coincidence
# (same seed and row count) and silently breaks if either split changes.
# (true label, console message, curve colour, legend template)
class_specs = (
    (0, "test 0s number:  ", 'darkorange', "normal (%d cases in testset) "),
    (1, "test 1s number:  ", 'blue', "abnormal (%d cases in testset) "),
)
for class_value, message, colour, legend_template in class_specs:
    rows = np.where(new_testY == class_value)[0]
    print(message, len(rows))
    plt.hist(pred[rows], 50, histtype='step', color=colour,
             label=legend_template % (len(rows)))

plt.ylabel('Numbers of events')
plt.xlabel('Predicted score')
plt.yscale("log")
title = "normal versus abnormal as Primary Site"
plt.title(title)
plt.legend(loc='upper center')

**VAE**

In [None]:
from tensorflow import keras
from keras.layers.merge import concatenate as concat
from tensorflow.keras.layers import Lambda, Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.datasets import mnist
from tensorflow.keras.losses import mse, binary_crossentropy, BinaryFocalCrossentropy
from tensorflow.keras.metrics import binary_focal_crossentropy
from tensorflow.keras.utils import plot_model
from tensorflow.keras import backend as K

In [None]:
def sampling(args):
    """Reparameterization trick: sample a latent vector.

    The result is mean + exp(0.5 * log_var) * epsilon, where epsilon is
    drawn with K.random_normal (mean=0 and std=1.0 by default).

    # Arguments:
        args (tensor): mean and log of variance of Q(z|X)
    # Returns:
        z (tensor): sampled latent vector
    """
    mean, log_var = args
    # K is the keras backend; the first axis size is read dynamically, the
    # second one statically.
    n_rows = K.shape(mean)[0]
    n_dims = K.int_shape(mean)[1]
    noise = K.random_normal(shape=(n_rows, n_dims))
    std = K.exp(0.5 * log_var)
    return mean + std * noise

In [None]:
# build encoder model: 5000 inputs -> 4000 -> 3000 -> two parallel
# 2000-unit heads (z_mean and z_log_var).
# `shape` is given as a tuple, the form Keras documents for Input and the
# one the decoder cell already uses; a bare integer is not a shape tuple
# and is rejected by newer Keras releases.
inputs = Input(shape=(5000,), name='encoder_input')
x = Dense(4000, activation='relu')(inputs)
x = Dense(3000, activation='relu')(x)
z_mean = Dense(2000, name='z_mean')(x)
z_log_var = Dense(2000, name='z_log_var')(x)

# use reparameterization trick to push the sampling out as input
# note that "output_shape" isn't necessary
# with the TensorFlow backend
z = Lambda(sampling,
           output_shape=(2000,),
           name='z')([z_mean, z_log_var])

# instantiate encoder model; it outputs [z_mean, z_log_var, z]
encoder = Model(inputs, [z_mean, z_log_var, z], name='encoder')
encoder.summary()

In [None]:
# Decoder: 2000 latent inputs -> 3000 -> 4000 (both relu) -> 5000 sigmoid
# outputs.
latent_inputs = Input(shape=(2000,), name='z_sampling')

x = latent_inputs
for hidden_units in (3000, 4000):
    x = Dense(hidden_units, activation='relu')(x)
outputs = Dense(5000, activation='sigmoid')(x)

# Wrap the layers into a standalone model and print its structure.
decoder = Model(latent_inputs, outputs, name='decoder')
decoder.summary()

In [None]:
# Chain encoder and decoder into the full VAE: the decoder consumes the
# third encoder output (the sampled z).
latent_sample = encoder(inputs)[2]
outputs = decoder(latent_sample)
vae = Model(inputs, outputs, name='vae_mlp')

# Reconstruction term: binary cross-entropy between input and output,
# weighted by a factor of 10000.
# (A BinaryFocalCrossentropy(gamma=10) variant weighted by 5000 was tried
# as an alternative in the original cell and left disabled.)
reconstruction_loss = binary_crossentropy(inputs, outputs) * 10000

# KL term: -0.5 * sum(1 + log_var - mean^2 - exp(log_var)) over the last
# axis.
kl_terms = 1 + z_log_var - K.square(z_mean) - K.exp(z_log_var)
kl_loss = K.sum(kl_terms, axis=-1) * -0.5

# The total loss is the mean of both terms; it is attached via add_loss,
# so compile() only receives the optimizer.
vae_loss = K.mean(reconstruction_loss + kl_loss)
vae.add_loss(vae_loss)
vae.compile(keras.optimizers.Adam(learning_rate=0.0001))
vae.summary()

In [None]:
# Stop training once 'val_loss' has not improved for 10 epochs.
loss_callback = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)
# (A ModelCheckpoint callback writing 'training_2d.ckpt' was left disabled
# in the original cell.)

# No targets are passed: the loss was attached to the model with add_loss.
vae.fit(
    new_trainX,
    validation_data=(new_testX, None),
    batch_size=12,
    epochs=1000,
    callbacks=[loss_callback],
)

In [None]:
# Reconstruct the test rows with the VAE and print them next to the
# real inputs.
separator = "----------------------------------------------"
valores = vae.predict(new_testX)
print("**VALORES PREDICHOS**")
print(valores)
print(separator)
print("**VALORES REALES**")
print(new_testX)
print(separator)

# Run the reduced Random Forest on the reconstructions: print its score
# against the true test labels, then its raw predictions.
pred = new_clf.predict(valores)
print(new_clf.score(valores, new_testY))
print(separator)
print(pred)

In [None]:
# Histogram of the scores stored in `pred`, one step curve per true class
# of the reduced test set.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(6, 6))  # 6,6
plt.figure(1)
plt.style.use('seaborn-deep')

# (true label, console message, curve colour, legend template)
class_specs = (
    (0, "test 0s number:  ", 'darkorange', "normal (%d cases in testset) "),
    (1, "test 1s number:  ", 'blue', "abnormal (%d cases in testset) "),
)
for class_value, message, colour, legend_template in class_specs:
    rows = np.where(new_testY == class_value)[0]
    print(message, len(rows))
    plt.hist(pred[rows], 50, histtype='step', color=colour,
             label=legend_template % (len(rows)))

plt.ylabel('Numbers of events')
plt.xlabel('Predicted score')
plt.yscale("log")
title = "normal versus abnormal as Primary Site"
plt.title(title)
plt.legend(loc='upper center')

In [None]:
# z_mean_train, _, _ = encoder.predict(new_trainX)

# plt.figure(figsize=(12, 10))
# zero_index = np.where(new_trainY == 0)
# one_index = np.where(new_trainY == 1)
# plt.scatter(z_mean_train[one_index, 0], z_mean_train[one_index, 1], c='yellow')
# plt.scatter(z_mean_train[zero_index, 0], z_mean_train[zero_index, 1], c='purple')
# plt.colorbar()
# plt.xlabel("z[0]")
# plt.ylabel("z[1]")
# plt.show()

In [None]:
# z_mean_test, _, _ = encoder.predict(new_testX)
# plt.figure(figsize=(12, 10))
# zero_index = np.where(new_testY == 0)
# one_index = np.where(new_testY == 1)
# plt.scatter(z_mean_test[one_index, 0], z_mean_test[one_index, 1], c='yellow')
# plt.scatter(z_mean_test[zero_index, 0], z_mean_test[zero_index, 1], c='purple')
# plt.colorbar()
# plt.xlabel("z[0]")
# plt.ylabel("z[1]")
# plt.show()

In [None]:
# newer_trainY = np.expand_dims(new_trainY,axis=1)
# train_set = np.concatenate((z_mean_train,newer_trainY), axis=1)
# newer_testY = np.expand_dims(new_testY,axis=1)
# test_set = np.concatenate((z_mean_test,newer_testY), axis=1)
# dataset_2d = np.concatenate((train_set,test_set))
# np.savetxt("dataset_2d.csv", dataset_2d, delimiter=',')

In [None]:
# Export the reduced dataset (features plus label) twice: as is, and after
# min-max scaling every column. Note that this re-fits the shared `scaler`.
csv_options = {'delimiter': ','}
np.savetxt("dataset1.csv", new_data, **csv_options)
new_data2 = scaler.fit_transform(new_data)
np.savetxt("dataset2.csv", new_data2, **csv_options)



In [None]:
# Encode every sample with the trained encoder and export the latent means
# together with the class label (last column).

# The VAE was trained on min-max scaled features, while new_X still holds
# the raw values, so encoding it directly yields latent codes for inputs
# on the wrong scale. Scale all rows with statistics learned from the
# training rows only (everything before the trailing `test_size` rows).
feature_scaler = MinMaxScaler().fit(new_X[:-test_size])
new_X_scaled = feature_scaler.transform(new_X)

# Only the first encoder output (z_mean) is kept.
z_mean_new, _, _ = encoder.predict(new_X_scaled)
newer_Y = np.expand_dims(new_Y, axis=1)
new_set = np.concatenate((z_mean_new, newer_Y), axis=1)
np.savetxt("new_dataset.csv", new_set, delimiter=',')

# Second export: the same table with every column min-max scaled. Note
# that this re-fits the shared `scaler`.
new_set_normalized = scaler.fit_transform(new_set)
np.savetxt("new_dataset_normalized.csv", new_set_normalized, delimiter=',')