In [31]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.utils import shuffle

# Random Forest

In [17]:
def Random_Forest_Class(dataset, cols, file):
    x = dataset[cols[1:]].values  # data
    y = np.array(dataset[cols[0]])  # targets

    kfold = KFold(5)  # split data into n parts. shuffle is req. for sorted data
    valueList = []  # storing the measurements as tn, fp, fn, tp

    n_estimators = [100, 110, 120, 130, 140]
    criterion = ['gini', 'entropy']
    param_grid = {'n_estimators': n_estimators, 'criterion': criterion}
    grid_search = GridSearchCV(RandomForestClassifier(), param_grid)
    grid_search.fit(x,y)

    n_estimator = grid_search.best_params_['n_estimators']
    criteria = grid_search.best_params_['criterion']
    
    print("Number of estimators " + str(n_estimator))
    print("Criteria " + str(criteria))
    
    for train_index, test_index in kfold.split(x):
        model = RandomForestClassifier(n_estimators=n_estimator, criterion=criteria)
        x_train, x_test = x[train_index], x[test_index]  # Spliting the training and test data based on indices
        y_train, y_test = y[train_index], y[test_index]  # Spliting the training and test labels based on indices
        model.fit(x_train, y_train)  # Fit the model

        # Evaluation
        y_pred = model.predict(x_test)  # get predicted labels of test data
        confuse = confusion_matrix(y_test, y_pred, labels=[1, 0])
        valueList.append(confuse.ravel())

    #Measures such as TP , FP , TN and FN
    measures = np.array(valueList)  # 2d array with each iterations tn, fp, fn, tp
    tn, fp, fn, tp = measures.sum(axis=0)

    #Calculate overall accuracy etc.
    accuracy = (tp + tn) / len(dataset)
    specificty = tn / (tn + fp)
    sensitivity = tp / (tp + fn)
    # ppv = tp / (tp + fp)
    # npv = tn / (tn + fn)

    print("Measurements for " + file + " dataset classification:")
    print("Accuracy %.2f\nSpecificty %.2f\nSensitivity %.2f\n" %
          (accuracy * 100, specificty * 100, sensitivity * 100))
    print()

In [18]:
# data = pd.read_csv('datasets/cross validation/clinical_cna_norm_cv.csv')
# data = pd.read_csv('datasets/cross validation/clinical_cna_linear_cv.csv')
# data = pd.read_csv('datasets/cross validation/clinical_gene_exp_norm_cv.csv')
# data = pd.read_csv('datasets/cross validation/clinical_gene_exp_cv.csv')
data = pd.read_csv('datasets/cross_validation/clinical_dna_meth_cv.csv')

data = shuffle(data)
data = data.drop('case_id',axis=1) #drop cases

In [19]:
Random_Forest_Class(data,data.columns.values.tolist(), 'DFS status')

Number of estimators 100
Criteria gini
Measurements for DFS status dataset classification:
Accuracy 81.00
Specificty 80.00
Sensitivity 82.00




# SVM (RBF)

In [26]:
def SVM(dataset, cols, file):
    x = dataset[cols[1:]].values  # data
    y = np.array(dataset[cols[0]])  # targets

    kfold = KFold(5)  # split data into n parts. shuffle is req. for sorted data
    valueList = []  # storing the measurements as tn, fp, fn, tp

    Cs = [0.001, 0.01, 0.1, 1, 10]
    gammas = [0.001, 0.01, 0.1, 1]
    param_grid = {'C': Cs, 'gamma': gammas}
    grid_search = GridSearchCV(svm.SVC(kernel='rbf'), param_grid)
    grid_search.fit(x,y)

    C = grid_search.best_params_['C']
    gamma = grid_search.best_params_['gamma']

    print("C value " + str(C))
    print("gamm value " + str(gamma))
    
    for train_index, test_index in kfold.split(x):
        model = svm.SVC(kernel='rbf', gamma=gamma, C=C)                             # Creating a SVM RBF
        x_train, x_test = x[train_index], x[test_index]  # Spliting the training and test data based on indices
        y_train, y_test = y[train_index], y[test_index]  # Spliting the training and test labels based on indices
        model.fit(x_train, y_train)  # Fit the model

        # Evaluation
        y_pred = model.predict(x_test)  # get predicted labels of test data
        confuse = confusion_matrix(y_test, y_pred, labels=[1, 0])
        valueList.append(confuse.ravel())

    #Measures such as TP , FP , TN and FN
    measures = np.array(valueList)  # 2d array with each iterations tn, fp, fn, tp
    tn, fp, fn, tp = measures.sum(axis=0)

    #Calculate overall accuracy etc.
    accuracy = (tp + tn) / len(dataset)
    specificty = tn / (tn + fp)
    sensitivity = tp / (tp + fn)
    # ppv = tp / (tp + fp)
    # npv = tn / (tn + fn)

    print("Measurements for " + file + " dataset classification:")
    print("Accuracy %.2f\nSpecificty %.2f\nSensitivity %.2f\n" %
          (accuracy * 100, specificty * 100, sensitivity * 100))
    print()

In [28]:
# data = pd.read_csv('datasets/cross validation/clinical_cna_norm_cv.csv')
# data = pd.read_csv('datasets/cross validation/clinical_cna_linear_cv.csv')
# data = pd.read_csv('datasets/cross validation/clinical_gene_exp_norm_cv.csv')
# data = pd.read_csv('datasets/cross validation/clinical_gene_exp_cv.csv')
data = pd.read_csv('datasets/cross_validation/clinical_dna_meth_cv.csv')

data = shuffle(data)
data = data.drop('case_id',axis=1) #drop cases

In [29]:
SVM(data,data.columns.values.tolist(), 'DFS status')

C value 10
gamm value 0.001
Measurements for DFS status dataset classification:
Accuracy 79.00
Specificty 76.00
Sensitivity 82.00




# Naive Baise

In [30]:
def NBayesClass(dataset, cols, file):
    x = dataset[cols[1:]].values  # data
    y = np.array(dataset[cols[0]])  # targets

    kfold = KFold(5)  # split data into n parts. shuffle is req. for sorted data
    valueList = []  # storing the measurements as tn, fp, fn, tp

    for train_index, test_index in kfold.split(x):
        model = GaussianNB()                             # Creating a Naive Bayes model
        x_train, x_test = x[train_index], x[test_index]  # Spliting the training and test data based on indices
        y_train, y_test = y[train_index], y[test_index]  # Spliting the training and test labels based on indices
        model.fit(x_train, y_train)  # Fit the model

        # Evaluation
        y_pred = model.predict(x_test)  # get predicted labels of test data
        confuse = confusion_matrix(y_test, y_pred, labels=[1, 0])
        valueList.append(confuse.ravel())

    #Measures such as TP , FP , TN and FN
    measures = np.array(valueList)  # 2d array with each iterations tn, fp, fn, tp
    tn, fp, fn, tp = measures.sum(axis=0)

    #Calculate overall accuracy etc.
    accuracy = (tp + tn) / len(dataset)
    specificty = tn / (tn + fp)
    sensitivity = tp / (tp + fn)
    # ppv = tp / (tp + fp)
    # npv = tn / (tn + fn)

    print("Measurements for " + file + " dataset classification:")
    print("Accuracy %.2f\nSpecificty %.2f\nSensitivity %.2f\n" %
          (accuracy * 100, specificty * 100, sensitivity * 100))
    print()

In [32]:
# data = pd.read_csv('datasets/cross validation/clinical_cna_norm_cv.csv')
# data = pd.read_csv('datasets/cross validation/clinical_cna_linear_cv.csv')
# data = pd.read_csv('datasets/cross validation/clinical_gene_exp_norm_cv.csv')
# data = pd.read_csv('datasets/cross validation/clinical_gene_exp_cv.csv')
data = pd.read_csv('datasets/cross_validation/clinical_dna_meth_cv.csv')

data = shuffle(data)
data = data.drop('case_id',axis=1) #drop cases

In [33]:
NBayesClass(data,data.columns.values.tolist(), 'DFS status')

Measurements for DFS status dataset classification:
Accuracy 73.00
Specificty 82.00
Sensitivity 64.00


