In [2]:
import os
import pandas as pd
import numpy as np
import scipy
from sklearn.cross_validation import StratifiedKFold
from sklearn.cross_validation import cross_val_predict
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn import preprocessing
from sklearn.utils import shuffle
from sklearn import metrics
import time



In [6]:
sparse_matrix = scipy.sparse.load_npz('../data/TCGA/npz/TCGA_nofiltering.npz')
feature_array = np.array(pd.read_csv("../data/TCGA/npz/features.txt",header=None)[0].tolist())
print(sparse_matrix.shape)
feature_index = np.where(sparse_matrix.sum(axis=0)>4)[1]
sparse_matrix_temp = sparse_matrix[:,feature_index]
print(sparse_matrix_temp.shape)
sample_index = np.where(sparse_matrix_temp.sum(axis=1)!=0)[0]
sparse_matrix_temp = sparse_matrix_temp[sample_index,:]
print(sparse_matrix_temp.shape)


X = sparse_matrix_temp

feature_array = feature_array[feature_index]
print(feature_array.shape)



label_array = np.array(pd.read_csv("../data/TCGA/npz/labels.txt",header=None)[0].tolist())
label_array = label_array[sample_index]
le = preprocessing.LabelEncoder()
le.fit(label_array)
y = le.transform(label_array)
print(y.shape)

X, y = shuffle(X, y, random_state=0)

(9822, 1700753)
(9822, 5696)
(8342, 5696)
(5696L,)
(8342L,)


In [131]:
clf_list = [KNeighborsClassifier(n_neighbors =5,algorithm='auto'),
            LogisticRegression(penalty='l2',multi_class='multinomial',solver='lbfgs'),
            LinearSVC(penalty='l2', multi_class='crammer_singer',dual=False),
            RandomForestClassifier(n_estimators=500),
            GradientBoostingClassifier(n_estimators=500),
            ExtraTreesClassifier(n_estimators=500),
            MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(10, 20), random_state=1)]


In [134]:
for clf in clf_list:
    print(clf)
    start_t = time.time()

    cv_preds = cross_val_predict(clf, X, y, cv=10)
    print("Accuracy : ",metrics.accuracy_score(cv_preds,y))
    print("F1-Micro : ",metrics.f1_score(cv_preds,y,average='micro'))
    print("F1-Macro : ",metrics.f1_score(cv_preds,y,average='macro'))
    print("F1-Weighted : ",metrics.f1_score(cv_preds,y,average='weighted'))

    end_t = time.time()
    print(str((end_t-start_t)/60.0)+" minutes elapsed")

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=1, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)




Accuracy :  0.46877637130801686
F1-Micro :  0.46877637130801686
F1-Macro :  0.28106630388765824
F1-Weighted :  0.514000618469542
0.1253614862759908 minutes elapsed


  'recall', 'true', average, warn_for)


### Multi-class Feature Selection

In [83]:
skf = StratifiedKFold(y,n_folds=10,shuffle=True)
topk_list = [100,152,200,280,300,400,500,600,700,800,900,1000,1100,1200]

cv_preds_list =[]
cv_y_list =[]
for topk in topk_list:
    print(topk)
    cv_y = []
    cv_preds = []
    for train_index, test_index in skf:
        X_train, X_test = X[train_index,:], X[test_index,:]
        y_train, y_test = y[train_index], y[test_index]

        clf=LinearSVC(penalty='l2', multi_class='crammer_singer',dual=False) 
        clf.fit(X_train,y_train)
        coef_ = clf.coef_
        coef_ = np.abs(coef_)
        coef_ = coef_.mean(axis=0)
        fis = pd.Series(coef_,index=feature_array)

                
        fis = fis[fis>0]    
        fis = fis.sort_values(ascending=False)
        fis_topk_list = fis.index.tolist()[0:topk]
        fis_topk_list_index = np.isin(feature_array,fis_topk_list)
        

        clf = LogisticRegression(penalty='l2',multi_class='multinomial',solver='lbfgs')

        X_train = X_train[:,fis_topk_list_index]
        X_test = X_test[:, fis_topk_list_index]        
        
        clf.fit(X_train,y_train)
        pred = clf.predict(X_test).tolist()
        cv_preds.extend(pred)
        cv_y.extend(y_test.tolist())
    
    cv_preds_list.append(cv_preds)
    cv_y_list.append(cv_y)
        

100
('X_train shape', (7492, 100))
('X_test shape', (850, 100))
('X_train shape', (7496, 100))
('X_test shape', (846, 100))
('X_train shape', (7497, 100))
('X_test shape', (845, 100))
('X_train shape', (7506, 100))
('X_test shape', (836, 100))
('X_train shape', (7508, 100))
('X_test shape', (834, 100))
('X_train shape', (7510, 100))
('X_test shape', (832, 100))
('X_train shape', (7514, 100))
('X_test shape', (828, 100))
('X_train shape', (7516, 100))
('X_test shape', (826, 100))
('X_train shape', (7518, 100))
('X_test shape', (824, 100))
('X_train shape', (7521, 100))
('X_test shape', (821, 100))
152
('X_train shape', (7492, 152))
('X_test shape', (850, 152))
('X_train shape', (7496, 152))
('X_test shape', (846, 152))
('X_train shape', (7497, 152))
('X_test shape', (845, 152))
('X_train shape', (7506, 152))
('X_test shape', (836, 152))
('X_train shape', (7508, 152))
('X_test shape', (834, 152))
('X_train shape', (7510, 152))
('X_test shape', (832, 152))
('X_train shape', (7514, 152))
(