#### Import Data

In [119]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer

In [120]:
#Folder that holds the tab-separated input files.
#NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
working_directory = '/Users/a.su/Documents/MultiClassCancer_RNAseq_CNV_lncRNA/'

def _read_tsv(file_name, header):
    """Read one tab-separated file located in working_directory into a DataFrame."""
    return pd.read_table(working_directory + file_name, sep = '\t', header = header)

#Import Data
#Labels file is read without a header row, so its single column is named 0
y_multi = _read_tsv('CancerTypes_y_multiClass.txt', None)
#Feature tables: the first row of each file is used as the header
x_cnv = _read_tsv('CNV_processed_multiClass.txt', 0)
x_rna = _read_tsv('RNAseq_processed_multiClass.txt', 0)
x_lnc = _read_tsv('lncRNA_processed_multiClass.txt', 0)

  interactivity=interactivity, compiler=compiler, result=result)


#### Process X data

In [150]:
def print_dropped_columns(df, df_dropped, df_name):
    """Print how many columns df_dropped has lost relative to df.

    df         -- DataFrame before filtering
    df_dropped -- DataFrame after filtering
    df_name    -- label used for the data set in the message
    """
    n_removed = df.shape[1] - df_dropped.shape[1]
    message = 'Dropped {0} Columns from {1}'.format(n_removed, df_name)
    print(message)
    
def count_all_zeros (df, axis = 0):
    """Count the columns (axis = 0) or rows (axis = 1) of df that contain only zeros.

    Every value is compared against 0 instead of summing along the axis, so a
    column such as [1, -1] (sum 0, but not all zeros) is not counted. The count
    also follows the requested axis, so axis = 1 reports rows correctly.
    A column/row containing NaN is never counted, because NaN != 0.

    df   -- DataFrame to inspect
    axis -- 0 to test each column, 1 to test each row

    Returns the count as a plain int.
    """
    all_zero_mask = (df == 0).all(axis = axis)
    return int(all_zero_mask.sum())

def preprocess_x(df, df_name, drop_threshold = 0.5):
    """Clean, impute and standardise one feature table, printing progress along the way.

    Steps:
      1. drop the 'GeneID' column and transpose the table
      2. drop columns whose fraction of NaN values exceeds drop_threshold
      3. replace the remaining NaN values with the column median
      4. remove columns that contain only zeros
      5. z-score every column (subtract the mean, divide by the standard deviation)

    df             -- DataFrame with a 'GeneID' column; its other columns become
                      the rows of the result after the transpose
    df_name        -- label used in the printed messages
    drop_threshold -- largest tolerated fraction (0-1) of NaN values in a column

    Returns a DataFrame of z-scored values with a fresh 0..n-1 row index. Column
    labels are the integer positions of the surviving columns in the imputed table.
    """
    def report_nan(frame):
        #Same status line after every stage, so the log shows where NaN values appear
        print('Are there NaN values in {0}? {1}'.format(df_name, np.isnan(frame).any().any()))

    df = df.drop('GeneID', axis = 1)    #Remove GeneID column
    df = df.transpose()    #Transpose
    print('{0} shape is {1}'.format(df_name, df.shape))

    #Drop columns with more than drop_threshold NaN values.
    #dropna's thresh is the minimum number of non-NaN entries needed to keep a column,
    #so the NaN fraction is converted into a non-NaN count over the transposed rows.
    #round() guards against floating point noise pushing ceil up by one.
    n_rows = len(df.index)
    min_non_nan = int(np.ceil(round((1 - drop_threshold)*n_rows, 9)))
    df_dropped = df.dropna(axis = 1, thresh = min_non_nan)
    print_dropped_columns(df, df_dropped, df_name)

    #Impute
    impute_median = SimpleImputer(strategy = 'median')
    #impute_median = Imputer(strategy = 'median') #Use for delta
    df_imputed = pd.DataFrame(impute_median.fit_transform(df_dropped))
    report_nan(df_imputed)

    #Count columns with all zeros
    print('{0} has {1} column(s) with all zeros'.format(df_name, count_all_zeros(df_imputed, 0)))
    #Delete columns with all zeros
    df_nozero = df_imputed.loc[:, (df_imputed != 0).any(axis = 0)]
    print_dropped_columns(df_imputed, df_nozero, df_name)
    print('{0} has {1} column(s) with all zeros'.format(df_name, count_all_zeros(df_nozero, 0)))

    #Scale data
    #NOTE: a constant non-zero column has std 0 and turns into NaN here;
    #the NaN report printed right after scaling will flag it.
    zscore = lambda x: (x-x.mean())/x.std()
    df_processed = df_nozero.transform(zscore)
    report_nan(df_processed)

    #Visualise: summary statistics of up to 5 randomly chosen columns
    n_preview = min(5, len(df_processed.columns))
    if n_preview > 0:
        rand_columns = np.random.choice(df_processed.columns.values, size = n_preview, replace = False)
        print(df_processed[rand_columns].describe())
    report_nan(df_processed)

    #Reset Index
    df_processed = df_processed.reset_index(drop = True)
    report_nan(df_processed)

    return df_processed

In [151]:
#Run the same preprocessing (drop threshold 0.5) on each feature table, in order
x_cnv_processed, x_rna_processed, x_lnc_processed = (
    preprocess_x(raw_table, table_name, 0.5)
    for raw_table, table_name in ((x_cnv, 'x_cnv'), (x_rna, 'x_rna'), (x_lnc, 'x_lnc'))
)

x_cnv shape is (668, 26374)
Dropped 0 Columns from x_cnv
Are there NaN values in x_cnv? False
x_cnv has 0 column(s) with all zeros
Dropped 0 Columns from x_cnv
x_cnv has 0 column(s) with all zeros
Are there NaN values in x_cnv? False
              16544         17800         22725         14603         12714
count  6.680000e+02  6.680000e+02  6.680000e+02  6.680000e+02  6.680000e+02
mean  -3.300753e-16  1.475865e-16  1.221993e-16 -3.464461e-16 -5.941688e-16
std    1.000000e+00  1.000000e+00  1.000000e+00  1.000000e+00  1.000000e+00
min   -2.494738e+00 -3.273517e+00 -3.594973e+00 -3.166957e+00 -2.581381e+00
25%   -6.303693e-01 -6.006478e-01 -5.713506e-01 -5.965231e-01 -6.184890e-01
50%    1.234626e-02 -7.009156e-02 -8.915400e-02 -1.725662e-01  5.907442e-02
75%    4.724636e-01  6.005396e-01  5.806892e-01  5.542270e-01  5.404153e-01
max    9.795283e+00  4.034152e+00  7.047240e+00  4.297736e+00  4.798223e+00
Are there NaN values in x_cnv? False
Are there NaN values in x_cnv? False
x_rna sh

In [152]:
#Combine all data: place the three processed tables side by side (axis = 1)
#NOTE(review): each table appears to carry its own integer column labels, so the
#labels of x_all are probably not unique - confirm before selecting columns by label
processed_tables = [x_cnv_processed, x_rna_processed, x_lnc_processed]
x_all = pd.concat(processed_tables, axis = 1)
x_all.shape

(668, 73753)

In [144]:
#Show the first rows of the processed CNV table
x_cnv_processed.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,26364,26365,26366,26367,26368,26369,26370,26371,26372,26373
0,-0.103358,-0.103358,-0.103358,-0.103358,-0.103358,-0.103358,-0.103358,-0.103358,-0.103358,-0.103358,...,-0.701119,-0.701119,-0.701119,-0.701119,-0.701119,-0.701119,-0.701119,-0.701119,-0.701119,-0.701119
1,-0.056543,-0.056543,-0.056543,-0.056543,-0.056543,-0.056543,-0.056543,-0.056543,-0.056543,-0.056543,...,-0.528272,-0.528272,-0.528272,-0.528272,-0.528272,-0.528272,-0.528272,-0.528272,-0.528272,-0.528272
2,-0.126789,-0.126789,-0.126789,-0.126789,-0.126789,-0.126789,-0.126789,-0.126789,-0.126789,-0.126789,...,-0.563395,-0.563395,-0.563395,-0.563395,-0.563395,-0.563395,-0.563395,-0.563395,-0.563395,-0.563395
3,-0.330441,-0.330441,-0.330441,-0.330441,-0.330441,-0.330441,-0.330441,-0.330441,-0.330441,-0.330441,...,-0.553628,-0.553628,-0.553628,-0.553628,-0.553628,-0.553628,-0.553628,-0.553628,-0.553628,-0.553628
4,0.15392,0.15392,0.15392,0.15392,0.15392,0.15392,0.15392,0.15392,0.15392,0.15392,...,-0.61536,-0.61536,-0.61536,-0.61536,-0.61536,-0.61536,-0.61536,-0.61536,-0.61536,-0.61536


In [153]:
#Sanity check: True if any value of the combined table is NaN
np.isnan(x_all).any().any()

False

#### Process Y data

In [58]:
#Work on a copy: plain assignment would only alias y_multi, so the raw labels
#would be overwritten and could not be recovered without reloading the file
y_condensed = y_multi.copy()
#Number of instances of each row's cancer type, aligned row by row with the labels
instances = y_condensed[0].map(y_condensed[0].value_counts())
#Replace cancer type with 'Other' if that cancer type occurs less than 30 times.
#Assigning the result back avoids an in-place write through a column selection,
#which may not reach the parent frame, and needs no temporary 'Instances' column.
y_condensed[0] = y_condensed[0].where(instances >= 30, 'Other')

In [59]:
#Show the first rows of the condensed labels
y_condensed.head()

Unnamed: 0,0
0,Carcinoma Non-Small Cell
1,Other
2,Carcinoma Non-Small Cell
3,Carcinoma Non-Small Cell
4,Carcinoma Squamous Cell


In [60]:
#Number of samples per label after condensing
y_condensed[0].value_counts()

Other                       238
Adenocarcinoma              146
Carcinoma                    98
Melanoma                     48
Carcinoma Non-Small Cell     47
Adenocarcinoma Ductal        31
Carcinoma Squamous Cell      30
Carcinoma Small Cell         30
Name: 0, dtype: int64

In [61]:
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical

In [62]:
#Encode Cancer Types as Integers
le = LabelEncoder()
#Pass the label column itself (1-D). The 2-D y_condensed.values array makes
#scikit-learn emit its column_or_1d conversion warning on every run.
y_integers = le.fit_transform(y_condensed[0])
#One hot encoding
y_encoded = to_categorical(y_integers)
#Show the first five encoded rows
y_encoded[0:5]

  y = column_or_1d(y, warn=True)


array([[0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0.]], dtype=float32)

### Neural Network

In [63]:
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.wrappers.scikit_learn import KerasClassifier
from keras import regularizers

In [154]:
#Hold out 20% of the rows as a test set; random_state fixes the shuffle so the split is repeatable
#NOTE(review): the split is not stratified; consider stratify= given the uneven label counts - confirm
x_train, x_test, y_train, y_test = train_test_split(x_all, y_encoded, test_size = 0.20, random_state = 0)

In [163]:
#Fully connected classifier: 32 units (relu, L1+L2 kernel penalty of 0.01 each)
#-> 128 units (relu) -> 8 softmax outputs, one per encoded label
hidden_regularised = Dense(32, activation = 'relu', kernel_regularizer = regularizers.l1_l2(0.01, 0.01))
hidden_wide = Dense(128, activation = 'relu')
output_layer = Dense(8, activation = 'softmax')
model = Sequential([hidden_regularised, hidden_wide, output_layer])
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

In [164]:
#Train on the training split for 50 epochs in batches of 64; x_train is passed as a NumPy array
model.fit(x_train.values, y_train, epochs = 50, batch_size = 64, verbose = 1)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x1a35bb50f0>

In [165]:
#Evaluate on the held-out test split; the output is [loss, accuracy] as configured in compile()
model.evaluate(x_test, y_test)



[7.5404360543436075, 0.5746268656716418]

In [69]:
from sklearn.metrics import confusion_matrix

In [166]:
#Predicted output of the softmax layer for every test row (8 values per row)
y_prediction = model.predict(x_test)

In [167]:
#Inspect the predicted values of one test row (index 8)
y_prediction[8]

array([0.36374235, 0.04263556, 0.42822048, 0.0232671 , 0.00977596,
       0.03012019, 0.00496911, 0.09726927], dtype=float32)

In [168]:
#Predicted class per row = index of the largest value in that row
y_pred_int = np.argmax(y_prediction, axis = 1)

In [169]:
#Show the predicted class indices for the test rows
y_pred_int

array([7, 2, 7, 3, 7, 4, 6, 0, 2, 0, 5, 7, 0, 0, 0, 1, 0, 6, 2, 7, 0, 4,
       0, 7, 5, 4, 7, 7, 0, 0, 7, 2, 0, 7, 3, 2, 7, 7, 2, 2, 0, 7, 3, 7,
       0, 7, 0, 2, 0, 3, 7, 0, 7, 3, 3, 7, 2, 7, 0, 0, 0, 7, 4, 7, 2, 6,
       2, 6, 0, 7, 2, 7, 0, 7, 2, 6, 7, 6, 3, 2, 7, 6, 0, 2, 0, 7, 0, 7,
       2, 7, 3, 2, 0, 3, 7, 0, 7, 5, 0, 0, 7, 4, 7, 2, 0, 5, 7, 0, 3, 0,
       0, 0, 0, 3, 0, 7, 2, 7, 0, 6, 0, 6, 0, 1, 0, 7, 7, 0, 7, 7, 3, 3,
       0, 3])

#### Cross Validation

In [86]:
from keras.optimizers import Adam

In [87]:
#adam = Adam(lr = 0.5)

In [49]:
#128, 32 gives 20% accuracy 10% std
#1024, 1024 gives 23.5% accuracy 7.6% std

def create_model():
    """Build and compile a fresh network; used as the build function for KerasClassifier.

    Architecture: Dense(32, sigmoid, L1 kernel penalty 0.01) -> Dense(32, relu)
    -> Dense(8, softmax), compiled with categorical cross-entropy, the 'adam'
    optimizer and accuracy as the reported metric.

    Returns the compiled Sequential model.
    """
    layer_stack = [
        Dense(32, activation = 'sigmoid', kernel_regularizer = regularizers.l1(0.01)),
        Dense(32, activation = 'relu'),
        Dense(8, activation = 'softmax'),
    ]
    network = Sequential(layer_stack)
    network.compile(loss = 'categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return network

#Wrap create_model in a scikit-learn style estimator; each fit trains for 100 epochs with batch size 64
estimator = KerasClassifier(build_fn = create_model, epochs = 100, batch_size = 64)

In [50]:
#5-fold splitter with shuffling; random_state pins the shuffle so the fold assignment
#is the same on every run (same seed as the train/test split)
kfold = KFold(n_splits=5, shuffle = True, random_state = 0)

In [51]:
#Cross-validate the estimator on the training split only, using the kfold splitter;
#results holds one score per fold
results = cross_val_score(estimator, x_train.values, y_train, cv = kfold)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [53]:
#Report the mean and (in brackets) the standard deviation of the fold scores, as percentages
print('Accuracy: {0}% ({1}%)'.format(results.mean()*100, results.std()*100))

Accuracy: 34.45071458974186% (5.615438121359801%)
