In [1]:
# load data and class label
import numpy as np
import pandas as pd
from pandas import DataFrame


dflabels = pd.read_csv('project_class_labels_300.csv',index_col='Unnamed: 0')
processeddf = pd.read_csv('project_data_down_300.csv',index_col='Unnamed: 0')

In [2]:
# check to see if there is any feature (i.e. column) has all zero values so we will delete them
removedAllZeroColdf = processeddf.loc[:, (processeddf != 0).any(axis=0)]
removedAllZeroColdf.shape

(9900, 20317)

In [3]:
# Feature Selection - Variance Threshold 

from sklearn.feature_selection import VarianceThreshold
sel = VarianceThreshold(threshold=500000)
reduced = sel.fit_transform(removedAllZeroColdf)
reduceddf = DataFrame(reduced)
reduced.shape

(9900, 6729)

In [4]:
# data scaling
# method 1 : standardization
from sklearn.preprocessing import StandardScaler

stdscaler = StandardScaler()
stdscalerfit = stdscaler.fit_transform(removedAllZeroColdf)

stddf = DataFrame(stdscalerfit)
stddf.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20307,20308,20309,20310,20311,20312,20313,20314,20315,20316
0,-0.17872,0.007263,0.576432,-0.617724,-0.950087,-0.039408,-0.013745,-0.070064,-0.101265,-0.014152,...,2.335216,1.065582,0.338502,0.070045,0.864523,-0.715678,0.841293,2.689865,0.27871,-0.162154
1,-0.17872,-0.714527,-0.329412,-0.506457,0.849333,-0.039408,2.125386,-0.274288,0.225202,-0.014152,...,-0.530446,1.033751,-0.352114,0.247356,-0.955815,-0.685151,-1.069078,-0.775234,-0.0103,-0.162154
2,-0.17872,-0.069212,-0.38774,0.769218,-0.082723,-0.039408,0.248331,1.092781,-0.101265,-0.014152,...,0.143998,0.673286,-0.056255,-0.017657,-0.96617,0.255371,0.474685,-0.847764,-0.286717,-0.162154
3,-0.17872,-0.70003,-0.768504,-0.22905,1.431617,-0.039408,-0.168322,-0.615445,-0.101265,-0.014152,...,-1.284596,-1.034881,-1.108975,-0.07958,-0.815166,-0.252349,-0.685219,-0.976411,-0.288684,-0.162154
4,-0.17872,-0.56957,-0.539865,0.665249,0.309515,-0.039408,-0.206394,0.088578,-0.101265,-0.014152,...,-0.143753,-0.262679,-0.464457,-0.177959,-0.243623,0.55068,-0.192411,-0.904487,-0.261637,-0.162154


In [5]:
# split 80% training set; 20% testing set
from sklearn.model_selection import train_test_split

trainData, testData, trainLabel, testLabel = train_test_split(reduceddf, dflabels, test_size=0.20)

In [6]:
# svm model - Polynomial kernel function
from sklearn import svm
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

# measure training time
import time


poly_svm_clf = svm.SVC(kernel='poly')
start = time.time()
poly_svm_clf.fit(trainData, trainLabel.values.ravel())
end = time.time()
print("ploy SVM training time: ", end - start)

pred = poly_svm_clf.predict(testData)
accuracy = accuracy_score(testLabel, pred)
print("Accuracy Score (polynomial kernel):", accuracy)

# confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(testLabel, pred)
print(confusion_matrix)
# classficiation report
from sklearn.metrics import classification_report
print(classification_report(testLabel, pred))

ploy SVM training time:  90.77363085746765
Accuracy Score (polynomial kernel): 0.936363636364
[[59  0  0 ...,  0  0  0]
 [ 0 48  0 ...,  0  0  0]
 [ 0  0 51 ...,  0  0  0]
 ..., 
 [ 0  0  0 ..., 47  4  0]
 [ 0  0  0 ...,  0 55  0]
 [ 0  0  0 ...,  0  0 56]]
             precision    recall  f1-score   support

        ACC       1.00      1.00      1.00        59
       BLCA       0.83      0.89      0.86        54
       BRCA       1.00      0.94      0.97        54
       CESC       0.76      0.87      0.81        54
       CHOL       0.96      1.00      0.98        69
       COAD       0.80      0.83      0.82        64
       DLBC       0.98      1.00      0.99        58
       ESCA       0.94      0.92      0.93        74
        GBM       0.98      1.00      0.99        48
       HNSC       0.82      0.77      0.79        60
       KICH       0.94      1.00      0.97        58
       KIRC       0.94      0.93      0.93        54
       KIRP       0.96      0.89      0.92        61

In [7]:
# 5-fold cross validation
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import KFold
from sklearn import cross_validation

k_fold = KFold(len(reduceddf), n_folds=5, shuffle=True, random_state=0)

clf = svm.SVC(kernel='poly')

scores = cross_val_score(clf, reduceddf, dflabels.values.ravel(), cv=k_fold, n_jobs=1)
print(np.mean(scores))




0.940303030303
