# Deep Convolutional Autoencoders for Clustering genes

In [11]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.layers import Conv1D, MaxPooling1D, UpSampling1D
from keras import backend as K
from keras.callbacks import TensorBoard

from IPython.display import Image, SVG
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from time import time

In [12]:
df = pd.read_csv('arcene_train.data', sep=' ', header = None)
df.drop([10000], axis=1, inplace=True)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9990,9991,9992,9993,9994,9995,9996,9997,9998,9999
0,0,71,0,95,0,538,404,20,0,0,...,255,570,86,0,36,0,80,0,0,524
1,0,41,82,165,60,554,379,0,71,0,...,213,605,69,7,473,0,57,0,284,423
2,0,0,1,40,0,451,402,0,0,0,...,235,593,28,0,24,0,90,0,34,508
3,0,56,44,275,14,511,470,0,0,0,...,91,600,0,26,86,0,102,0,0,469
4,105,0,141,348,0,268,329,0,0,1,...,813,0,0,0,0,190,301,0,0,354


In [13]:
df['labels'] = pd.read_csv('arcene_train.labels')

In [36]:
df.labels.isnull().sum()

1

In [14]:
X = df.drop(['labels'], axis=1)
y = df['labels']

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
#This is necessary for the way Keras handles inputs, it doesn't change the data at all
X_train = np.expand_dims(X_train, axis=2)

X_test = np.expand_dims(X_test, axis=2)

In [17]:
X_train.shape

(80, 10000, 1)

In [18]:
model = Sequential()
 
#1st convolution layer
model.add(Conv1D(64,3, activation='relu', padding='same', input_shape = (X_train.shape[1], 1)))
model.add(MaxPooling1D(2))
 
#2nd convolution layer
model.add(Conv1D(32,3, activation='relu', padding='same')) 
model.add(MaxPooling1D(2))
 
#here compressed version
 
#3rd convolution layer
model.add(Conv1D(32, 3, activation='relu', padding='same'))
model.add(UpSampling1D((2)))
 
#4th convolution layer
model.add(Conv1D(64, 3, activation='relu', padding='same'))
model.add(UpSampling1D((2)))
 
model.add(Conv1D(1, 3, padding='same'))
model.add(Activation('sigmoid'))

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_1 (Conv1D)            (None, 10000, 64)         256       
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 5000, 64)          0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 5000, 32)          6176      
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 2500, 32)          0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 2500, 32)          3104      
_________________________________________________________________
up_sampling1d_1 (UpSampling1 (None, 5000, 32)          0         
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 5000, 64)          6208      
__________

In [19]:
tb_callback = TensorBoard(log_dir='./logs/{}'.format(time()), histogram_freq=0, write_graph=True, write_images=True)

model.compile(optimizer='adadelta', loss='binary_crossentropy')
model.fit(X_train, X_train, 
          epochs=3,
          validation_data=(X_test, X_test),
          callbacks = [tb_callback])

Train on 80 samples, validate on 20 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x1ffcacc37b8>

In [20]:
restored_representations = model.predict(X_test)

In [21]:
#layer[7] is activation_3 (Activation), it is compressed representation
get_3rd_layer_output = K.function([model.layers[0].input], [model.layers[7].output])
compressed = get_3rd_layer_output([X_test])[0]

In [22]:
compressed.shape

(20, 10000, 64)

In [23]:
#layer[7] is size of (None, 10000, 16).  We will flatten this tensor
compressed = compressed.reshape(20,10000*64)

In [24]:
#Training K-Means
from tensorflow.contrib.factorization.python.ops import clustering_ops
import tensorflow as tf
 
def train_input_fn():
    data = tf.constant(compressed, tf.float32)
    return (data, None)
 
unsupervised_model = tf.contrib.learn.KMeansClustering(
2 #num of clusters
, distance_metric = clustering_ops.SQUARED_EUCLIDEAN_DISTANCE
, initial_clusters=tf.contrib.learn.KMeansClustering.RANDOM_INIT
)
 
unsupervised_model.fit(input_fn=train_input_fn, steps=1000)

Instructions for updating:
Please use tf.contrib.factorization.KMeansClustering instead of tf.contrib.learn.KMeansClustering. It has a similar interface, but uses the tf.estimator.Estimator API instead of tf.contrib.learn.Estimator.
Instructions for updating:
Please replace uses of any Estimator from tf.contrib.learn with an Estimator from tf.estimator.*
Instructions for updating:
When switching to tf.estimator.Estimator, use tf.estimator.RunConfig instead.
INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_keep_checkpoint_max': 5, '_save_checkpoints_steps': None, '_is_chief': True, '_keep_checkpoint_every_n_hours': 10000, '_num_worker_replicas': 0, '_session_config': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x000001FFDB4C6630>, '_model_dir': 'C:\\Users\\SYMPHO~1\\AppData\\Local\\Temp\\tmpa3ts3ygw', '_environment': 'local', '_log_step_count_steps': 100, '_task_id': 0, '_save_summary_steps': 100, '_save_checkpoints_secs': 60

KMeansClustering(params={'relative_tolerance': None, 'distance_metric': 'squared_euclidean', 'training_initial_clusters': 'random', 'random_seed': 0, 'kmeans_plus_plus_num_retries': 2, 'num_clusters': 2, 'use_mini_batch': True, 'mini_batch_steps_per_iteration': 1})

In [25]:
#Getting clusters for data points
clusters = unsupervised_model.predict(input_fn=train_input_fn)
predicted = []
 
index = 0
for i in clusters:
    current_cluster = i['cluster_idx']
    predicted.append(current_cluster)
    features = X_test[index]
    index = index + 1

INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from C:\Users\SYMPHO~1\AppData\Local\Temp\tmpa3ts3ygw\model.ckpt-1000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


In [26]:
predicted

[1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1]

In [27]:
#Changing from 1vs0 classes to 1vs-1
predicted = [x if x==1 else -1 for x in predicted]

predicted = np.asarray(predicted)

In [28]:
predicted

array([ 1, -1,  1, -1,  1,  1,  1,  1,  1,  1, -1, -1, -1, -1, -1, -1, -1,
        1,  1,  1], dtype=int64)

In [29]:
y_test.values

array([ 1.,  1.,  1., -1., -1.,  1.,  1.,  1.,  1., -1., -1.,  1.,  1.,
       -1., -1., -1.,  1., -1.,  1.,  1.])

In [35]:
score = 0

for i in range(len(predicted)):
    if predicted[i] == y_test.values[i]:
        score += 1
        
percent = (score / len(predicted))*100

print("Model is {}% accurate".format(percent))

Model is 70.0% accurate
