# Use of Machine Learning Methods with Keras for predicting cancer on non-carcinogenic cell line with SNP data
**Terminal Setup**

Enter this line on the Terminal and restart the .ipynb file.

jupyter notebook --NotebookApp.iopub_data_rate_limit=1.0e10



In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import time
from pprint import pprint

In [2]:
df_input = pd.read_csv('functionalElementDataARBindingProstate.csv')
df_input.head()

Unnamed: 0,RS_ID,chr,pos,ref,alt,scoreA,scoreB,functional_element,n_experiment,file_type,cell_line,cancer_type,cell_line_cancer
0,rs10201930,2,189874460,G,A,0.0719,0.06594,H3K36me3-human,1,narrow,prostate,,normal
1,rs10178969,2,190315723,A,C,0.0657,0.068314,EZH2-human,1,narrow,PC-3,prostate,cancer
2,rs10178969,2,190315723,A,C,0.0657,0.068314,H3K36me3-human,1,narrow,PC-3,prostate,cancer
3,rs1037532,18,50038602,T,C,0.028,0.034459,H3K27me3-human,1,narrow,PC-3,prostate,cancer
4,rs10039204,5,101180077,T,C,,,EZH2-human,1,narrow,PC-3,prostate,cancer


In [4]:
def gen_dict_FE(dataf):
    list_name_functional_element=list(dataf['functional_element'].unique() )
    dict_FE ={}
    for k in range(len(list_name_functional_element)):
        dict_FE[list_name_functional_element[k]]=k
    pprint(dict_FE)
    return(dict_FE)
functional_element_dict = gen_dict_FE(df_input)
print(len(functional_element_dict))

{'CTCF-human': 5,
 'EP300-human': 4,
 'EZH2-human': 1,
 'EZH2phosphoT487-human': 18,
 'H2AFZ-human': 15,
 'H3F3A-human': 16,
 'H3K27ac-human': 6,
 'H3K27me3-human': 2,
 'H3K36me3-human': 0,
 'H3K4me1-human': 8,
 'H3K4me2-human': 9,
 'H3K4me3-human': 7,
 'H3K79me2-human': 10,
 'H3K9ac-human': 11,
 'H3K9me2-human': 19,
 'H3K9me3-human': 17,
 'H4K20me1-human': 12,
 'POLR2A-human': 3,
 'POLR2AphosphoS5-human': 13,
 'ZFX-human': 14}
20


In [5]:
def gen_dict_cell_lines(dataf):
    list_name_cell_lines=list(dataf['cell_line'].unique() )
    dict_cell_lines ={}
    for k in range(len(list_name_cell_lines)):
        dict_cell_lines[list_name_cell_lines[k]]=k
    pprint(dict_cell_lines)
    return(dict_cell_lines)

cellular_lines_dict = gen_dict_cell_lines(df_input)
print(len(cellular_lines_dict))

{'22Rv1': 3,
 'C4-2B': 4,
 'LNCAP': 9,
 'LNCaP clone FGC': 6,
 'PC-3': 1,
 'RWPE1': 8,
 'RWPE2': 10,
 'VCaP': 7,
 'epithelial cell of prostate': 5,
 'prostate': 0,
 'prostate gland': 2}
11


In [6]:
nt_dict={'T':1,'C':2,'A':3,'G':4}

In [7]:
# MAIN FUNCTIONS for PIPELINE

def start_pipeline(dataf):
    return dataf.copy()

def clean_colums_name(dataf):
    dataf.columns = [c.replace(" ","") for c in dataf]
    return dataf

def transform_cancer_type_into_integer(dataf):
    """ associate  empty space value  = 2  # normal
        and        'prostate' value   = 1  # prostate
    """
    dataf['cancer_type']=dataf['cancer_type'].replace(r'^\s*$', np.nan, regex=True)
    dataf['cancer_type']=dataf['cancer_type'].fillna(1) 
    dataf['cancer_type']=dataf['cancer_type'].replace('prostate',0, regex=True) 
    return dataf


def adjust_score_1(dataf):     
    """ I have 2 score, I want to to the mean of these two
        and use only this latter value for the DataFrame
        I try to use every maf possible
            
            if each maf score miss, drop the row
            if just one miss, use as scoreC the other
            if both are present use the mean for calculate scoreC
    """
    list_=[]
    for row in dataf.itertuples(index=True, name='Pandas'):
        A,B = getattr(row, "scoreA"), getattr(row, "scoreB")
        
        if A and B:
            C=(A+B)/2
        if A and not B:
            C = A
        if B and not A:
            C = 0
            
        list_.append(C)
    dataf['scoreC']=list_

    return dataf

def adjust_score_2(dataf):
    " drop values for A and B, and drop null values for C"
    dataf=dataf.drop(['scoreA'], axis=1) # drop values
    dataf=dataf.drop(['scoreB'], axis=1) # drop values
    dataf=dataf.drop(dataf.loc[dataf['scoreC'].isnull()].index) # drop null values
    return dataf

def drop_file_type(dataf):
    "I eliminate file_type, because 100% of file_type=='narrow'"
    dataf=dataf.drop(['file_type'], axis=1)
    return dataf

def chr_into_integer(dataf):
    # if X = 23 , if Y = 24
    dataf['chr'] = dataf['chr'].astype(str)
    dataf['chr'] = dataf['chr'].replace(to_replace=r'^X$', value='23', regex=True)
    dataf['chr'] = dataf['chr'].replace(to_replace=r'^Y$', value='24', regex=True)
    dataf['chr'] = dataf['chr'].astype(int)
    return dataf

def drop_n_experiment(dataf):
    """
     Number of experiment with 1 replicate =  0.93045 %
     Number of experiment with 2 replicate =  0.06955 %
     
     I drop it, because probably this feature 
     it will be useless to predict prostate cancer
     due to the majority of 1 replicate.
     """
    dataf=dataf.drop(['n_experiment'], axis=1)
    return dataf

def drop_chr(dataf):
    dataf=dataf.drop(['chr'], axis=1)
    return dataf

def drop_pos(dataf):
    dataf=dataf.drop(['pos'], axis=1)
    return dataf

def convert_RS_ID_into_integer(dataf):
    """replace 'rs' for each col with empty 
        and transform ID into integer
    """
    dataf['RS_ID'] = dataf['RS_ID'].str.replace("rs","") # replace rs with nothing
    dataf.RS_ID = pd.to_numeric(dataf.RS_ID, errors='coerce') # convert rs_id to integer
    return dataf

def find_number_cell_lines(dataf):
    return len(dataf['cell_line'].unique()) # different cell lines

def find_number_functional_elements(dataf):
    return len(dataf['functional_element'].unique()) # different cell lines

def transform_cell_line_cancer_into_integer(dataf):
    """ normal = 2 ; cancer = 1
    """
    dataf['cell_line_cancer'] = dataf['cell_line_cancer'].str.replace("cancer","0")
    dataf['cell_line_cancer'] = dataf['cell_line_cancer'].str.replace("normal","1")
    dataf.cell_line_cancer = pd.to_numeric(dataf.cell_line_cancer, errors='coerce')
    return dataf

def tranform_nucleotide_into_integer(dataf,nt_dict):
    newdf0 = dataf.replace({"ref": nt_dict}) # REFERENCE 
    newdf1 = newdf0.replace({"alt": nt_dict}) # ALTERNATIVE
    return newdf1

def transform_cell_lines_into_integer(dataf,dict_cellular_lines):
    newdf = dataf.replace({"cell_line": dict_cellular_lines})
    return newdf

def transform_functional_elements_into_integer(dataf,dict_fe):
    newdf = dataf.replace({"functional_element": functional_element_dict})
    return newdf


In [8]:
# Machine Learning Data Preparation Functions

def num_features(dataf):
    return len(dataf.iloc[0]) 

In [9]:
# WorkFlow PIPELINE for Data preparation
# wait at least 5 seconds
df=(df_input
 .pipe(start_pipeline)
 .pipe(convert_RS_ID_into_integer)
 .pipe(adjust_score_1)
 .pipe(adjust_score_2)
 .pipe(drop_file_type) 
 .pipe(chr_into_integer)
 .pipe(clean_colums_name)
 .pipe(drop_n_experiment)
 .pipe(tranform_nucleotide_into_integer, nt_dict)
 .pipe(transform_cancer_type_into_integer)
 .pipe(transform_cell_line_cancer_into_integer)
 .pipe(transform_functional_elements_into_integer, functional_element_dict)
 .pipe(transform_cell_lines_into_integer, cellular_lines_dict)
) 
    

In [10]:
def X_train(dataf, p_train ):
    # Generate TRAIN X
    LEN= len(dataf)
    df_train_data = dataf.loc[:round(LEN*p_train)]
    x_train = df_train_data.to_numpy()
    return x_train

def Y_train(dataf, p_train):
    # Generate TRAIN Y
    LEN= len(dataf)
    df_train_data = dataf.loc[:round(LEN*p_train)]
    y_train = df_train_data.to_numpy()
    return y_train

def X_test(dataf, p_train ):
    # Generate TEST X
    LEN= len(dataf)
    df_test_data = dataf.loc[round(LEN*p_train):]
    x_test = df_test_data.to_numpy()
    return x_test

def Y_test(dataf,p_train):
    # Generate TEST Y
    LEN= len(dataf)
    df_test_label = dataf.loc[round(LEN*p_train):]
    y_test = df_test_label.to_numpy()
    return y_test

In [11]:
# MACHINE LEARNING | KERAS SEQUENTIAL MODEL 
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold

Using TensorFlow backend.


In [12]:
df.head()

Unnamed: 0,RS_ID,chr,pos,ref,alt,functional_element,cell_line,cancer_type,cell_line_cancer,scoreC
0,10201930,2,189874460,4,3,0,0,1,1,0.06892
1,10178969,2,190315723,3,2,1,1,0,0,0.067007
2,10178969,2,190315723,3,2,0,1,0,0,0.067007
3,1037532,18,50038602,1,2,2,1,0,0,0.03123
5,1002223,7,21172073,3,2,1,1,0,0,0.301696


### Feature to use for model the predictions

In [13]:
df_data = df[['ref', 'alt','functional_element','cell_line_cancer', 'scoreC']]

In [14]:
    # ALTERNATIVE MODEL 
# df_data = df[['RS_ID','ref', 'alt','functional_element','cell_line','cell_line_cancer', 'scoreC']]

### Target feature to predict

In [15]:
label_target = 'cancer_type'
df_target = df[[label_target]] # Example ; df_label = df2[['cancer_type']]

In [16]:
EPOCHS = 3

LEN = len(df_data) # tot of rows

n_features = num_features(df_data) # tot of features to INPUT

P_train = 0.7 # percenteage of train = (1-P_test)

In [17]:
idx=round(P_train*len(df_data))
train_x = df_data[:idx]
train_y = df_target[:idx]

# The output variable is string values. We must convert them into integer values 0 and 1.
# split into input (X) and output (Y) variables

Encode class values as integers

In [18]:
len(train_x)
train_x.head()

Unnamed: 0,ref,alt,functional_element,cell_line_cancer,scoreC
0,4,3,0,1,0.06892
1,3,2,1,0,0.067007
2,3,2,0,0,0.067007
3,1,2,2,0,0.03123
5,3,2,1,0,0.301696


### Neural Network model

In [19]:
    # create simple model 
model = Sequential()
model.add(Dense(n_features, input_dim=n_features, activation='relu'))
model.add(Dense(16, activation='relu')) # layer 2
model.add(Dropout(0.5))
model.add(Dense(32, activation='relu')) # layer 3
model.add(Dropout(0.5))
model.add(Dense(16, activation='relu')) # layer 4
model.add(Dense(1, activation='sigmoid'))
    # Compile model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


Evaluate model with standardized dataset

In [20]:
model.fit(train_x, train_y, validation_split=0.8, epochs=EPOCHS, verbose=1, shuffle=True)

Train on 52961 samples, validate on 211848 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f930a89a990>

In [22]:
# idx = round(len(df_data)*P_train) # the testing sample
test_x = df_data[idx:]
test_y = df_target[idx:]

In [26]:
score = model.evaluate(test_x, test_y, verbose=0)



In [27]:
print('Test loss:', score[0])
print('Test accuracy:', score[1])

Test loss: 0.223526207814748
Test accuracy: 0.8865


In [28]:
# print out how many 1s and 0s are on the test target DataFrame
zeroes=ones=0
for i in df_target['cancer_type'][idx:]:
    if i==1:
        ones+=1
    if i==0:
        zeroes+=1

tot = df_target.sum()

print('1',ones)
print('0',zeroes)


1 41950
0 71539


## Make prediction

In [30]:
predictions = model.predict_classes(test_x)



In [31]:
ones_prediction = 0
zeroes_prediction = 0
for i in predictions:
    if i == 1: 
        ones_prediction+=1
    if i == 0: 
        zeroes_prediction+=1
print('0 predicted :',ones_prediction)
print('1 predicted :',zeroes_prediction)

0 predicted : 54831
1 predicted : 58658


In [41]:
num_test = 1000

x=test_x
y=test_y['cancer_type'].tolist()

### Test if prediction are correct

In [33]:
predictions = model.predict_classes(x)

correct=0

for i in range(num_test):
    if predictions[i] == y[i]: 
        correct+=1
        
print('*'*70)

print(' correct classifications :',round(correct/num_test, 2), ' % ' )

**********************************************************************
 correct classifications : 0.87  % 


### Print out expected vs predicted

In [61]:
for i in range(num_test):
    print('{0} {1} (expected {2})'.format(x[i:i+1],predictions[i], y[i] ) )
    print(' ')

        ref  alt  functional_element  cell_line_cancer    scoreC
323243    3    4                   3                 1  0.032799 [1] (expected 1)
 
        ref  alt  functional_element  cell_line_cancer    scoreC
323244    3    4                   6                 1  0.032799 [1] (expected 0)
 
        ref  alt  functional_element  cell_line_cancer    scoreC
323245    3    4                   6                 0  0.032799 [0] (expected 0)
 
        ref  alt  functional_element  cell_line_cancer    scoreC
323246    1    2                   8                 0  0.084214 [0] (expected 0)
 
        ref  alt  functional_element  cell_line_cancer    scoreC
323248    3    4                  17                 1  0.190381 [1] (expected 1)
 
        ref  alt  functional_element  cell_line_cancer    scoreC
323249    1    3                   8                 0  0.121507 [0] (expected 0)
 
        ref  alt  functional_element  cell_line_cancer    scoreC
323251    2    1                   1     

323386    2    1                   5                 0  0.184757 [0] (expected 0)
 
        ref  alt  functional_element  cell_line_cancer    scoreC
323387    2    1                  18                 0  0.184757 [0] (expected 0)
 
        ref  alt  functional_element  cell_line_cancer    scoreC
323388    3    1                   5                 1  0.016136 [1] (expected 1)
 
        ref  alt  functional_element  cell_line_cancer    scoreC
323389    1    2                   1                 0  0.080523 [0] (expected 0)
 
        ref  alt  functional_element  cell_line_cancer    scoreC
323390    2    3                   2                 0  0.037854 [0] (expected 0)
 
        ref  alt  functional_element  cell_line_cancer    scoreC
323391    3    4                  15                 0  0.059807 [0] (expected 0)
 
        ref  alt  functional_element  cell_line_cancer   scoreC
323392    3    4                   6                 0  0.06731 [0] (expected 0)
 
        ref  alt  functi

323510    1    2                   6                 0  0.119544 [0] (expected 0)
 
        ref  alt  functional_element  cell_line_cancer    scoreC
323511    2    1                   4                 1  0.038039 [1] (expected 1)
 
        ref  alt  functional_element  cell_line_cancer    scoreC
323512    2    1                   5                 0  0.104855 [0] (expected 0)
 
        ref  alt  functional_element  cell_line_cancer    scoreC
323513    2    1                   5                 1  0.104855 [1] (expected 1)
 
        ref  alt  functional_element  cell_line_cancer    scoreC
323514    2    1                   5                 1  0.104855 [1] (expected 0)
 
        ref  alt  functional_element  cell_line_cancer    scoreC
323515    2    1                   5                 1  0.104855 [1] (expected 0)
 
        ref  alt  functional_element  cell_line_cancer    scoreC
323516    2    1                   5                 0  0.104855 [0] (expected 0)
 
        ref  alt  func

 
        ref  alt  functional_element  cell_line_cancer    scoreC
323659    1    4                   7                 0  0.098406 [0] (expected 0)
 
        ref  alt  functional_element  cell_line_cancer    scoreC
323660    1    4                  15                 0  0.098406 [0] (expected 0)
 
        ref  alt  functional_element  cell_line_cancer    scoreC
323661    1    4                   6                 0  0.098406 [0] (expected 0)
 
        ref  alt  functional_element  cell_line_cancer    scoreC
323662    1    4                   9                 0  0.098406 [0] (expected 0)
 
        ref  alt  functional_element  cell_line_cancer    scoreC
323663    1    4                   7                 0  0.098406 [0] (expected 0)
 
        ref  alt  functional_element  cell_line_cancer    scoreC
323664    1    4                  10                 0  0.098406 [0] (expected 0)
 
        ref  alt  functional_element  cell_line_cancer    scoreC
323665    1    4                  11   

323800    4    3                   5                 1  0.025663 [1] (expected 1)
 
        ref  alt  functional_element  cell_line_cancer    scoreC
323801    4    3                   3                 1  0.025663 [1] (expected 1)
 
        ref  alt  functional_element  cell_line_cancer    scoreC
323802    4    3                  13                 1  0.025663 [1] (expected 1)
 
        ref  alt  functional_element  cell_line_cancer    scoreC
323803    4    3                   6                 1  0.025663 [1] (expected 0)
 
        ref  alt  functional_element  cell_line_cancer    scoreC
323804    4    3                   5                 1  0.025663 [1] (expected 0)
 
        ref  alt  functional_element  cell_line_cancer    scoreC
323805    4    3                   6                 1  0.025663 [1] (expected 0)
 
        ref  alt  functional_element  cell_line_cancer    scoreC
323806    3    4                   5                 0  0.090008 [0] (expected 0)
 
        ref  alt  func

323952    3    4                   6                 1  0.032893 [1] (expected 1)
 
        ref  alt  functional_element  cell_line_cancer    scoreC
323953    3    4                   6                 1  0.032893 [1] (expected 0)
 
        ref  alt  functional_element  cell_line_cancer    scoreC
323954    3    4                   6                 1  0.032893 [1] (expected 0)
 
        ref  alt  functional_element  cell_line_cancer    scoreC
323955    3    4                   8                 0  0.032893 [0] (expected 0)
 
        ref  alt  functional_element  cell_line_cancer   scoreC
323961    4    3                   5                 0  0.16262 [0] (expected 0)
 
        ref  alt  functional_element  cell_line_cancer    scoreC
323962    1    2                   6                 0  0.118436 [0] (expected 0)
 
        ref  alt  functional_element  cell_line_cancer    scoreC
323963    1    2                   6                 0  0.118436 [0] (expected 0)
 
        ref  alt  functi

324085    1    2                   6                 0  0.020529 [0] (expected 0)
 
        ref  alt  functional_element  cell_line_cancer    scoreC
324086    1    2                   6                 1  0.020529 [1] (expected 1)
 
        ref  alt  functional_element  cell_line_cancer    scoreC
324087    1    2                   6                 0  0.020529 [0] (expected 0)
 
        ref  alt  functional_element  cell_line_cancer    scoreC
324088    1    2                  13                 1  0.020529 [1] (expected 1)
 
        ref  alt  functional_element  cell_line_cancer    scoreC
324089    3    1                   4                 1  0.121459 [1] (expected 1)
 
        ref  alt  functional_element  cell_line_cancer    scoreC
324091    2    3                   4                 1  0.110996 [1] (expected 1)
 
        ref  alt  functional_element  cell_line_cancer    scoreC
324092    2    1                   6                 1  0.006977 [1] (expected 1)
 
        ref  alt  func

324212    3    2                  13                 1  0.061647 [1] (expected 1)
 
        ref  alt  functional_element  cell_line_cancer    scoreC
324213    1    2                   6                 0  0.043756 [0] (expected 0)
 
        ref  alt  functional_element  cell_line_cancer    scoreC
324214    1    2                   6                 1  0.043756 [1] (expected 0)
 
        ref  alt  functional_element  cell_line_cancer    scoreC
324215    1    2                   6                 1  0.043756 [1] (expected 0)
 
        ref  alt  functional_element  cell_line_cancer    scoreC
324216    1    2                   8                 0  0.043756 [0] (expected 0)
 
        ref  alt  functional_element  cell_line_cancer    scoreC
324217    4    1                   6                 0  0.191238 [0] (expected 0)
 
        ref  alt  functional_element  cell_line_cancer    scoreC
324218    2    4                   0                 0  0.075594 [0] (expected 0)
 
        ref  alt  func

 
        ref  alt  functional_element  cell_line_cancer    scoreC
324356    4    1                   1                 0  0.031698 [0] (expected 0)
 
        ref  alt  functional_element  cell_line_cancer    scoreC
324357    4    1                  14                 0  0.031698 [0] (expected 0)
 
        ref  alt  functional_element  cell_line_cancer    scoreC
324358    3    4                  14                 0  0.029433 [0] (expected 0)
 
        ref  alt  functional_element  cell_line_cancer    scoreC
324359    1    4                   1                 0  0.015691 [0] (expected 0)
 
        ref  alt  functional_element  cell_line_cancer    scoreC
324360    1    4                  18                 0  0.015691 [0] (expected 0)
 
        ref  alt  functional_element  cell_line_cancer    scoreC
324361    2    1                   8                 0  0.155471 [0] (expected 0)
 
        ref  alt  functional_element  cell_line_cancer    scoreC
324362    3    4                   6   


The model is very flexible with different data set based on SNP.

Each feature must be firstly prepared for convert: obj, strings and other data types into Integer or Float values.

The input features for prediction can be easly changed (in 'Feature to use for model the predictions') as well the target label, but the latter can be only binary.

Other type for prediction could include:
    
    Subsetting the SNPs based on 
        > cellular lines
        > functional element 
        > limit the positions of SNPs on the chr
