# Use of Machine Learning Methods with Keras for predicting cancer on non-carcinogenic cell line with SNP data

**Terminal Setup**

Enter this line on the Terminal and restart the .ipynb file.

jupyter notebook --NotebookApp.iopub_data_rate_limit=1.0e10



In [55]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import time
from pprint import pprint

In [56]:
df = pd.read_csv('functionalElementDataARBindingProstate.csv')

### Main informations about the data

In [57]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 457933 entries, 0 to 457932
Data columns (total 13 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   RS_ID               457933 non-null  object 
 1   chr                 457933 non-null  object 
 2   pos                 457933 non-null  int64  
 3   ref                 457933 non-null  object 
 4   alt                 457933 non-null  object 
 5   scoreA              379835 non-null  float64
 6   scoreB              392889 non-null  float64
 7   functional_element  457933 non-null  object 
 8   n_experiment        457933 non-null  int64  
 9   file_type           457933 non-null  object 
 10  cell_line           457933 non-null  object 
 11  cancer_type         285609 non-null  object 
 12  cell_line_cancer    457933 non-null  object 
dtypes: float64(2), int64(2), object(9)
memory usage: 45.4+ MB


In [58]:
df.head()

Unnamed: 0,RS_ID,chr,pos,ref,alt,scoreA,scoreB,functional_element,n_experiment,file_type,cell_line,cancer_type,cell_line_cancer
0,rs10201930,2,189874460,G,A,0.0719,0.06594,H3K36me3-human,1,narrow,prostate,,normal
1,rs10178969,2,190315723,A,C,0.0657,0.068314,EZH2-human,1,narrow,PC-3,prostate,cancer
2,rs10178969,2,190315723,A,C,0.0657,0.068314,H3K36me3-human,1,narrow,PC-3,prostate,cancer
3,rs1037532,18,50038602,T,C,0.028,0.034459,H3K27me3-human,1,narrow,PC-3,prostate,cancer
4,rs10039204,5,101180077,T,C,,,EZH2-human,1,narrow,PC-3,prostate,cancer


In [59]:
df.isnull().sum() # check for null value

RS_ID                      0
chr                        0
pos                        0
ref                        0
alt                        0
scoreA                 78098
scoreB                 65044
functional_element         0
n_experiment               0
file_type                  0
cell_line                  0
cancer_type           172324
cell_line_cancer           0
dtype: int64

In [60]:
list_name_functional_element=list(df['functional_element'].unique() )
pprint(list_name_functional_element)

['H3K36me3-human',
 'EZH2-human',
 'H3K27me3-human',
 'POLR2A-human',
 'EP300-human',
 'CTCF-human',
 'H3K27ac-human',
 'H3K4me3-human',
 'H3K4me1-human',
 'H3K4me2-human',
 'H3K79me2-human',
 'H3K9ac-human',
 'H4K20me1-human',
 'POLR2AphosphoS5-human',
 'ZFX-human',
 'H2AFZ-human',
 'H3F3A-human',
 'H3K9me3-human',
 'EZH2phosphoT487-human',
 'H3K9me2-human']


In [1]:
list_name_functional_element=list(df['cell_line'].unique() )
pprint(list_name_functional_element)

NameError: name 'df' is not defined

### Functions

In [62]:
# MAIN FUNCTIONS for PIPELINE

def start_pipeline(dataf):
    return dataf.copy()

def clean_colums_name(dataf):
    dataf.columns = [c.replace(" ","") for c in dataf]
    return dataf

def transform_cancer_type_into_integer(dataf):
    """ associate  empty space value  = 2  # normal
        and        'prostate' value   = 1  # prostate
    """
    dataf['cancer_type']=dataf['cancer_type'].replace(r'^\s*$', np.nan, regex=True)
    dataf['cancer_type']=dataf['cancer_type'].fillna(1) 
    dataf['cancer_type']=dataf['cancer_type'].replace('prostate',0, regex=True) 
    return dataf


def adjust_score_1(dataf):     
    """ I have 2 score, I want to to the mean of these two
        and use only this latter value for the DataFrame
        I try to use every maf possible
            
            if each maf score miss, drop the row
            if just one miss, use as scoreC the other
            if both are present use the mean for calculate scoreC
    """
    list_=[]
    for row in dataf.itertuples(index=True, name='Pandas'):
        A,B = getattr(row, "scoreA"), getattr(row, "scoreB")
        
        if A and B:
            C=(A+B)/2
        if A and not B:
            C = A
        if B and not A:
            C = 0
            
        list_.append(C)
    dataf['scoreC']=list_

    return dataf

def adjust_score_2(dataf):
    " drop values for A and B, and drop null values for C"
    dataf=dataf.drop(['scoreA'], axis=1) # drop values
    dataf=dataf.drop(['scoreB'], axis=1) # drop values
    dataf=dataf.drop(dataf.loc[dataf['scoreC'].isnull()].index) # drop null values
    return dataf

def drop_file_type(dataf):
    "I eliminate file_type, because 100% of file_type=='narrow'"
    dataf=dataf.drop(['file_type'], axis=1)
    return dataf

def chr_into_integer(dataf):
    # if X = 23 , if Y = 24
    dataf['chr'] = dataf['chr'].astype(str)
    dataf['chr'] = dataf['chr'].replace(to_replace=r'^X$', value='23', regex=True)
    dataf['chr'] = dataf['chr'].replace(to_replace=r'^Y$', value='24', regex=True)
    dataf['chr'] = dataf['chr'].astype(int)
    return dataf

def drop_n_experiment(dataf):
    """
     Number of experiment with 1 replicate =  0.93045 %
     Number of experiment with 2 replicate =  0.06955 %
     
     I drop it, because probably this feature 
     it will be useless to predict prostate cancer
     due to the majority of 1 replicate.
     """
    dataf=dataf.drop(['n_experiment'], axis=1)
    return dataf

def drop_chr(dataf):
    dataf=dataf.drop(['chr'], axis=1)
    return dataf

def drop_pos(dataf):
    dataf=dataf.drop(['pos'], axis=1)
    return dataf

def convert_RS_ID_into_integer(dataf):
    """replace 'rs' for each col with empty 
        and transform ID into integer
    """
    dataf['RS_ID'] = dataf['RS_ID'].str.replace("rs","") # replace rs with nothing
    dataf.RS_ID = pd.to_numeric(dataf.RS_ID, errors='coerce') # convert rs_id to integer
    return dataf

def find_number_cell_lines(dataf):
    return len(dataf['cell_line'].unique()) # different cell lines

def find_number_functional_elements(dataf):
    return len(dataf['functional_element'].unique()) # different cell lines

def transform_cell_line_cancer_into_integer(dataf):
    """ normal = 2 ; cancer = 1
    """
    dataf['cell_line_cancer'] = dataf['cell_line_cancer'].str.replace("cancer","0")
    dataf['cell_line_cancer'] = dataf['cell_line_cancer'].str.replace("normal","1")
    dataf.cell_line_cancer = pd.to_numeric(dataf.cell_line_cancer, errors='coerce')
    return dataf

def tranform_nucleotide_into_integer(dataf):
    # REFERENCE
    dataf['ref'] = dataf['ref'].str.replace("T","1") # T=1
    dataf['ref'] = dataf['ref'].str.replace("C","2") # C=2
    dataf['ref'] = dataf['ref'].str.replace("A","3") # A=3
    dataf['ref'] = dataf['ref'].str.replace("G","4") # G=4

    # ALTERNATIVE (mutated)
    dataf['alt'] = dataf['alt'].str.replace("T","1") # T=1
    dataf['alt'] = dataf['alt'].str.replace("C","2") # C=2
    dataf['alt'] = dataf['alt'].str.replace("A","3") # A=3
    dataf['alt'] = dataf['alt'].str.replace("G","4") # G=4

    # transform Nucleotide label into integer
    dataf.alt = pd.to_numeric(dataf.alt, errors='coerce')
    dataf.ref = pd.to_numeric(dataf.ref, errors='coerce')
    
    return dataf

def transform_cell_lines_into_integer(dataf):
    list_name_cell_line=list(dataf['cell_line'].unique() )
    dataf['cell_line']=dataf['cell_line'].replace(to_replace=list_name_cell_line ,
                                              value=range(len(list_name_cell_line)))
    return dataf

def transform_functional_elements_into_integer(dataf):
    list_name_functional_element=list(dataf['functional_element'].unique())
    dataf['functional_element']=dataf['functional_element'].replace(to_replace=list_name_functional_element,
                                                            value=range(len(list_name_functional_element))) 
    return dataf

In [63]:
# General Utilities PIPE FUNCTION

def drop_column(dataf, col_name):
    try:
        if type(colname)== str:
            dataf = dataf.drop(col_name, axis=1) # drop values
    except:
            print('Error input column name')
            
    return dataf

def remove_nan_from_column(dataf, colname):
    try:
        if type(colname)== str:
            dataf = dataf[dataf[colname].notna()]
    except:
            print(' colname input is not a string ')
    return dataf.copy()

def check_null(dataf):
    for _ in dataf.isnull().sum():
        if _ is not 0:
            print(' Re-check for null value!')

In [64]:
# Machine Learning Data Preparation Functions

def num_features(dataf):
    return len(dataf.iloc[0]) 

In [65]:
df2=(df
 .pipe(start_pipeline)
 .pipe(clean_colums_name)
 .pipe(transform_cancer_type_into_integer)
 .pipe(transform_cell_line_cancer_into_integer)
 .pipe(transform_cell_lines_into_integer)
 .pipe(tranform_nucleotide_into_integer)
 .pipe(transform_functional_elements_into_integer)
 .pipe(adjust_score_1)
 .pipe(adjust_score_2)
 .pipe(drop_file_type)
 .pipe(drop_n_experiment)
 .pipe(convert_RS_ID_into_integer)
 .pipe(chr_into_integer)

) # wait at least 5 seconds

In [66]:
df2.isnull().sum() # FINAL CHECK for NULL VALUES 

RS_ID                 0
chr                   0
pos                   0
ref                   0
alt                   0
functional_element    0
cell_line             0
cancer_type           0
cell_line_cancer      0
scoreC                0
dtype: int64

### Data gathering functions for ML model

In [67]:
def X_train(dataf, p_train ):
    # Generate TRAIN X
    LEN= len(dataf)
    df_train_data = dataf.loc[:round(LEN*p_train)]
    x_train = df_train_data.to_numpy()
    return x_train

def Y_train(dataf, p_train):
    # Generate TRAIN Y
    LEN= len(dataf)
    df_train_data = dataf.loc[:round(LEN*p_train)]
    y_train = df_train_data.to_numpy()
    return y_train

def X_test(dataf, p_train ):
    # Generate TEST X
    LEN= len(dataf)
    df_test_data = dataf.loc[round(LEN*p_train):]
    x_test = df_test_data.to_numpy()
    return x_test

def Y_test(dataf,p_train):
    # Generate TEST Y
    LEN= len(dataf)
    df_test_label = dataf.loc[round(LEN*p_train):]
    y_test = df_test_label.to_numpy()
    return y_test

### Decide the features for the Prediction Model

In [68]:
pprint(df2.columns)

Index(['RS_ID', 'chr', 'pos', 'ref', 'alt', 'functional_element', 'cell_line',
       'cancer_type', 'cell_line_cancer', 'scoreC'],
      dtype='object')


### INPUT | Data input for the machine learning model

In [69]:
df_data = df2[['RS_ID','ref', 'alt','functional_element','cell_line','cell_line_cancer', 'scoreC']]
# with 8 | 16 | 32 | 16 | 1 neurons on the layer
# ['RS_ID', 'chr', 'pos', 'ref', 'alt', 'functional_element', 'cell_line',
#           'cancer_type', 'cell_line_cancer', 'scoreC']

### Past models features used

df_data = df2[['RS_ID','ref','alt','functional_element','cell_line','scoreC']]

df_data = df2[['chr','pos', 'functional_element','cell_line_cancer', 'scoreC']]

In [70]:
df_data.dtypes # all must be Int or Float

RS_ID                   int64
ref                     int64
alt                     int64
functional_element      int64
cell_line               int64
cell_line_cancer        int64
scoreC                float64
dtype: object

In [71]:
df_data.head()

Unnamed: 0,RS_ID,ref,alt,functional_element,cell_line,cell_line_cancer,scoreC
0,10201930,4,3,0,0,2,0.06892
1,10178969,3,2,1,1,1,0.067007
2,10178969,3,2,0,1,1,0.067007
3,1037532,1,2,2,1,1,0.03123
5,1002223,3,2,1,1,1,0.301696


### LABEL | Data for target prediction

In [72]:
label_target = 'cancer_type'
df_target = df2[[label_target]] # Example ; df_label = df2[['cancer_type']]
df_data.head()

Unnamed: 0,RS_ID,ref,alt,functional_element,cell_line,cell_line_cancer,scoreC
0,10201930,4,3,0,0,2,0.06892
1,10178969,3,2,1,1,1,0.067007
2,10178969,3,2,0,1,1,0.067007
3,1037532,1,2,2,1,1,0.03123
5,1002223,3,2,1,1,1,0.301696


### Data variables for ML model

In [73]:
EPOCHS = 3

BATCH_SIZE = 16

LEN = len(df_data) # tot of rows

NUM_features = num_features(df_data) # tot of features to INPUT

P_train = 0.9 # percenteage of train = (1-P_test)

In [74]:
x_train = X_train(df_data , P_train)
y_train = Y_train(df_target ,P_train )

x_test = X_test(df_data , P_train)
y_test = Y_test(df_target ,P_train )

## Build Model of Deep Learning
#### for the prediction of Cancer based on SNPs, Functional Elements, Cell line and Maf Score


In [75]:
# MACHINE LEARNING | KERAS SEQUENTIAL MODEL 
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# BUILD MODEL of Deep Learning for the prediction of Cancer based on SNPs, Functional Elements, Cell line and Maf
model = Sequential()
model.add(Dense(8, input_dim=NUM_features, activation='relu')) # Input layer 1
model.add(Dropout(0.5))
model.add(Dense(16, activation='relu')) # layer 2
model.add(Dropout(0.5))
model.add(Dense(32, activation='relu')) # layer 3
model.add(Dropout(0.5))
model.add(Dense(16, activation='relu')) # layer 4
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid')) # Output layer

# COMPILE MODEL
model.compile(loss='binary_crossentropy',
              optimizer='RMSprop',
              metrics=['accuracy'])
# FIT MODEL
model.fit(x_train, y_train,
          epochs=EPOCHS,
          batch_size=BATCH_SIZE)


Train on 279856 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f0d06e4e150>

In [22]:
# possible Error : IOPub data rate exceeded.
# Solution below to enter into the terminal 
# >>> jupyter notebook --NotebookApp.iopub_data_rate_limit=1.0e10
# and restart the file

score = model.evaluate(x_test, y_test, batch_size = BATCH_SIZE)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



#### Accuracy
Tested: Final accuracy ~ 62% of the model



### Extract weights from the Neural Network
Must be understand Neural Nets concept to use it into another model. Or simply we can use model.predict_class(X_unknow) or model.predict_proba(X_unknow) for the class prediction of new datapoints.


In [23]:
# with  # 0, 2, 4, 6 , 8

w1 = model.layers[0].get_weights()

w2 = model.layers[2].get_weights() 

w3 = model.layers[4].get_weights() 

w4 = model.layers[6].get_weights() 

w5 = model.layers[8].get_weights() 


weights =[w1,w2,w3,w4,w5]

for w in weights:
    print(len(w[0]))

for w in weights:
    print('-'*80,'\n')
    print(' numbers of neurons : ',len(w[0]),'\n')
    print(' each with number of weights : ',len(w[0][0]),'\n')
    print(' _ '*20,'\n')
    print(w[0])
    
for len_w in weights:
    print(len(len_w[0]))
    
#print(len(w1[1]))
#print(len(weight[0]))
#print (model.compile)
#weight = model.layers[0].get_weights()
#weight = model.layers[0].get_weights()
#weight = model.layers[0].get_weights()



4
8
16
32
16
-------------------------------------------------------------------------------- 

 numbers of neurons :  4 

 each with number of weights :  8 

 _  _  _  _  _  _  _  _  _  _  _  _  _  _  _  _  _  _  _  _  

[[-5.6616485e-01 -5.7319260e-01 -3.3723986e-01 -4.7237009e-01
   3.7233593e+01  3.7604187e+01  3.6875500e+01  3.7107483e+01]
 [-2.4341464e-02 -1.1576694e-01 -1.6179132e-01  8.5654557e-02
   4.2790577e+01  4.2789604e+01  4.2661270e+01  4.3327187e+01]
 [-5.7013869e-02  2.9027998e-01 -4.4085130e-01  2.7851528e-01
   4.4873726e+01  4.4807583e+01  4.4887554e+01  4.4579979e+01]
 [ 2.1595287e-01  1.8564844e-01  1.2289530e-01  6.3997608e-01
   3.9417873e+01  3.9404499e+01  3.8391563e+01  3.8488064e+01]]
-------------------------------------------------------------------------------- 

 numbers of neurons :  8 

 each with number of weights :  16 

 _  _  _  _  _  _  _  _  _  _  _  _  _  _  _  _  _  _  _  _  

[[-3.0159724e-01 -9.6341848e-02  2.0576024e-01 -4.8422861e-01
  -2.

### Prediction

In [140]:
# new instance where we do not know the answer
import random as rnd
num_random_row = rnd.randrange(len(df_data)) # give random datapoint to predict
Xnew = np.array(df_data.take([num_random_row]))
 # TRY NEW DATA POINT

num_features_has_to_be = len(df_data.iloc[-1])

num_features_given = len(Xnew)

print(' Data point to predict ',Xnew,'\n\n')


# PREDICT CLASS
try:
    ynew = model.predict_classes(Xnew)
    print("Predicted=%s" % (ynew[0]),'\n\n') # show the inputs and predicted outputs


except ValueError:
    print('''Data shape not correct for predicting class, 
need input array shape = {0} instead of shape = {1} 
          '''.format(num_features_has_to_be,num_features_given))
    

# PREDICT PROBABILITY
try:
    ynew = model.predict_proba(Xnew)
    print("Predicted=%s" % (ynew[0])) # probability prediction

except ValueError:
    print('''Data shape not correct for predicting probability
need input array shape = {0} instead of shape = {1} 
          '''.format(num_features_has_to_be,num_features_given))

 Data point to predict  [[7.7486104e+07 4.0000000e+00 3.0000000e+00 6.0000000e+00 8.0000000e+00
  2.0000000e+00 7.9051150e-02]] 


Predicted=[1] 


Predicted=[1.]
