In [9]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# project files stored there can be reached with ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import os
# Work from the project folder on the mounted Drive: the relative data paths
# used later (e.g. 'prostate_cancer_train_data/...') are resolved against it.
# Presumably the local spliceai / utils / multi_gpu modules live here as
# well -- TODO confirm.
os.chdir('/content/drive/My Drive/514model/514proj3')

In [0]:
import numpy as np
import sys
import time
import h5py
import keras.backend as kb
import tensorflow as tf
from spliceai import *
from utils import *
from multi_gpu import *

# One-hot lookup tables. Every table is an identity matrix: row k is the
# one-hot vector for index k.
#
# The label tables stay plain nested lists of ints (2, 5 and 8 classes);
# transform_Y_one_hot_set picks one of them and indexes it with the label.
OUT_MAP0 = np.eye(2, dtype=int).tolist()
OUT_MAP1 = np.eye(5, dtype=int).tolist()
OUT_MAP2 = np.eye(8, dtype=int).tolist()

# The input table is a 4x4 integer numpy array; transform_X_to_one_hot
# indexes it with (code - 1).
IN_MAP = np.eye(4, dtype=int)

def read_txt_file(file_path):
    """Parse a whitespace-separated text file of integers.

    Each line of the file becomes one list of ints, in file order. A line
    containing only whitespace yields an empty list. A token that is not a
    valid integer makes int() raise ValueError.

    Returns a list of lists of ints.
    """
    with open(file_path, 'r') as handle:
        return [[int(token) for token in row.split()] for row in handle]

def transform_X_to_one_hot(X):
    """One-hot encode a single sequence of integer codes.

    Every code c in X is replaced by the row IN_MAP[c - 1], so codes are
    treated as 1-based. Note that a code of 0 indexes IN_MAP[-1], which is
    the last row of the table rather than an error.

    Returns a list with one IN_MAP row per element of X.
    """
    return [IN_MAP[code - 1] for code in X]

def transform_x_one_hot_set(x_all):
    """One-hot encode every sequence in x_all.

    Each element of x_all is passed through transform_X_to_one_hot and the
    results are stacked with np.asarray.

    Returns a numpy array built from the encoded sequences.
    """
    encoded = [transform_X_to_one_hot(sequence) for sequence in x_all]
    return np.asarray(encoded)

def transform_Y_one_hot_set(Y,type):
    """One-hot encode the labels in Y using the table selected by `type`.

    type == 0 selects OUT_MAP0, type == 1 selects OUT_MAP1, and any other
    value selects OUT_MAP2. Only the first element of each entry of Y is
    used: it is the index into the selected table.

    Returns a list with one table row per entry of Y. The rows are the
    table's own list objects, not copies.
    """
    if type == 0:
        table = OUT_MAP0
    elif type == 1:
        table = OUT_MAP1
    else:
        table = OUT_MAP2
    return [table[entry[0]] for entry in Y]

def clip_datapoints(X, Y, CL, N_GPUS):
  """Drop trailing datapoints so the count is a multiple of N_GPUS.

  The original author's note: each time model_m.fit is called the number
  of datapoints must be a multiple of N_GPUS, otherwise it often crashes.

  X is clipped along its first axis. Only Y[0] is used, and it is clipped
  the same way. CL is accepted but not used here.

  Returns (X_clipped, [Y0_clipped]); the second item is a one-element list.
  When nothing has to be dropped, X and Y[0] are returned as they are.
  """
  leftover = X.shape[0] % N_GPUS
  if leftover == 0:
    return X, [Y[0]]
  return X[:-leftover, :], [Y[0][:-leftover]]

def SpliceAImodel(x_all,y_all,x_pd,nt):
  """Train a SpliceAI network on (x_all, y_all), then predict on x_pd.

  x_all, y_all: the input data generated by creat_data_file.py, as
      integer-coded rows.
  x_pd: the inputs to predict on, encoded like x_all.
  nt: selects the network configuration; one of 80, 400, 1200, 2000, 10000.

  The labels are one-hot encoded with the five-class table (type 1 of
  transform_Y_one_hot_set). Trailing rows are dropped from both the training
  set and the prediction inputs so that each row count is a multiple of
  N_GPUS.

  Returns (Yp, Yc): the model's predictions for the clipped x_pd, and the
  one-hot training targets that were actually fitted.
  Raises Exception("Invalid nt!") for any other value of nt.
  """
  L = 32
  N_GPUS = 2

  # One row per supported nt: (nt, W values, AR values, batch size per GPU).
  presets = (
      (80,    [11]*4,                   [1]*4,                         18),
      (400,   [11]*8,                   [1]*4 + [4]*4,                 18),
      (1200,  [11]*12,                  [1]*4 + [4]*4 + [10]*4,        12),
      (2000,  [11]*8 + [21]*4,          [1]*4 + [4]*4 + [10]*4,        12),
      (10000, [11]*8 + [21]*4 + [41]*4, [1]*4 + [4]*4 + [10]*4 + [25]*4, 6),
  )
  for size, w_values, ar_values, per_gpu in presets:
    if nt == size:
      W = np.asarray(w_values)
      AR = np.asarray(ar_values)
      BATCH_SIZE = per_gpu*N_GPUS
      break
  else:
    # No preset matched.
    raise Exception("Invalid nt!")

  CL = 2 * np.sum(AR*(W-1))

  # Build the network, wrap it for N_GPUS devices and compile it.
  net = SpliceAI(L, W, AR)
  parallel_net = make_parallel(net, N_GPUS)
  parallel_net.compile(loss="categorical_crossentropy", optimizer='adam')

  # Encode the training set. The targets get two extra axes so that
  # clip_datapoints can treat them as Y[0]; they are flattened back to one
  # row per datapoint afterwards.
  train_inputs = transform_x_one_hot_set(x_all)
  train_targets = np.array(transform_Y_one_hot_set(y_all,1))[np.newaxis,:,np.newaxis,:]
  Xc, clipped_targets = clip_datapoints(train_inputs, train_targets, CL, N_GPUS)
  Yc = np.reshape(clipped_targets,(len(clipped_targets[0]),-1))
  parallel_net.fit(Xc, Yc, batch_size=BATCH_SIZE,epochs=30, verbose=1)

  # Encode the prediction inputs and clip them the same way as the training set.
  Xp = transform_x_one_hot_set(x_pd)
  extra = Xp.shape[0]%N_GPUS
  Xpc = Xp[:-extra,:] if extra != 0 else Xp
  Yp = parallel_net.predict(Xpc, batch_size=BATCH_SIZE)
  return Yp,Yc

In [12]:
# Load the integer-coded inputs and labels; read_txt_file returns one list of
# ints per line of the file.
x_all = read_txt_file('prostate_cancer_train_data/total_x.txt')
y_all = read_txt_file('prostate_cancer_train_data/total_y.txt')
# Select the nt == 80 configuration inside SpliceAImodel.
nt = 80
# NOTE: x_all is passed both as the training input and as the prediction
# input, so Yp holds predictions for the training samples themselves, not for
# held-out data. Yc holds the one-hot training targets that were fitted.
Yp,Yc = SpliceAImodel(x_all,y_all,x_all,nt)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [0]:
# Collapse each row to the index of its largest entry. Yc rows are the one-hot
# targets returned by SpliceAImodel; Yp rows are the model's predictions.
# NOTE: the name `y_ture` (sic) is the one the confusion-matrix cell reads, so
# the spelling is kept as-is.
y_ture=[np.argmax(t) for t in Yc]
y_pre=[np.argmax(t) for t in Yp]

In [14]:
from sklearn.metrics import confusion_matrix
# Rows of the matrix follow the labels in y_ture, columns the predictions in
# y_pre. print(...) with a single parenthesised argument produces the same
# output under Python 2 and is valid syntax under Python 3, unlike the bare
# `print x` statement.
print(confusion_matrix(y_ture,y_pre))

[[   0   74    0    0    0]
 [   0 3370    0    0    0]
 [   0  274    0    0    0]
 [   0  141    0    0    0]
 [   0  101    0    0    0]]
