# 14 Cancer Microarray
**PAGE 654.** Ramaswamy et al. (2001) present a difficult microarray classification problem, involving a training set of 144 patients with 14 different types of cancer, and a test set of 54 patients. Gene expression measurements were available for 16,063 genes.

**DATA INFO.** One gene per row, one sample per column. Cancer classes are labelled as follows:

|     |           |     |        |     |        |
|----:|:----------|----:|:-------|----:|:-------|
|1.   |breast     |2.   |prostate|3.   |lung    |
|4.   |collerectal|5.   |lymphoma|6.   |bladder |
|7.   |melanoma   |8.   |uterus  |9.   |leukemia|
|10.  |renal      |11.  |pancreas|12.  |ovary   |
|13.  |meso       |14.  |cns     |

In [11]:
import numpy as np
from matplotlib import pyplot as plt
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder, Normalizer
from sklearn.model_selection import PredefinedSplit
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.neighbors.nearest_centroid import NearestCentroid
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier

%matplotlib inline

# define commonly used colors
GRAY1, GRAY4, PURPLE = '#231F20', '#646369', '#A020F0'
BLUE, ORANGE, BLUE1 = '#57B5E8', '#E69E00', '#174A7E'
# configure plot font family to Arial
plt.rcParams['font.family'] = 'Arial'
plt.rcParams['axes.linewidth'] = 0.5

## Load and Prepare Data

In [2]:
data = np.load('../data/14cancer.npy.npz')['data']
# last column contains 'is test' flag
is_test = data[:,-1].astype(int)
data_test = data[is_test == 1, :]
data_train = data[is_test == 0, :]
# pre-last column contains class
y_train = data_train[:, -2].astype(int) - 1
y_test = data_test[:, -2].astype(int) - 1
X_train = data_train[:, :-2]
X_test = data_test[:, :-2]

# all 144 training samples are slitted into 8 CV folds
cv_indices = np.array([
    [5,   2,   1,   3,   6,   4,   7,   8],
    [14,  15,  12,  9,   11,  16,  10,  13],
    [23,  19,  20,  17,  21,  24,  18,  22],
    [31,  32,  29,  28,  26,  30,  25,  27],
    [35,  48,  38,  46,  42,  34,  47,  33],
    [44,  45,  41,  40,  37,  43,  39,  36],
    [55,  56,  49,  51,  53,  50,  52,  54],
    [63,  59,  64,  61,  60,  62,  57,  58],
    [69,  71,  67,  66,  72,  68,  70,  65],
    [87,  91,  76,  86,  81,  88,  83,  96],
    [92,  74,  89,  93,  95,  84,  79,  73],
    [85,  90,  75,  77,  82,  94,  80,  78],
    [99,  103, 98,  100, 97,  104, 102, 101],
    [105, 111, 106, 109, 107, 112, 108, 110],
    [117, 118, 120, 113, 116, 115, 119, 114],
    [128, 121, 122, 124, 125, 127, 123, 126],
    [133, 139, 137, 138, 132, 142, 144, 135],
    [136, 129, 130, 134, 141, 131, 143, 140]])
# in order to be used in GridSearchCV we need to reformat
# cv folds into the list of train-test indices
cv_indices = (cv_indices.T - 1).tolist()
cv_folds = []
for i in range(len(cv_indices)):
    train = [j for i in cv_indices[:i] + cv_indices[i + 1:] for j in i]
    cv_folds.append([train, cv_indices[i]])

## Cross-Validation
Let's write an auxilar function that calculates CV errors out of 144, its standard error and test errors out of 54.

In [3]:
def calc_cv_stat(grid_search):
    cv_errors = 18*(1 - np.vstack(
        [grid_search.cv_results_[f'split{i}_test_score']
         for i in range(8)]).T)
    best_cv_errors = cv_errors[grid_search.best_index_, :]
    cv_errors_cnt = np.sum(best_cv_errors)
    cv_errors_cnt_std = np.sqrt(np.var(best_cv_errors, ddof=1)*8)
    test_errors_cnt = np.sum(
        grid_search.best_estimator_.predict(X_test) != y_test)
    return cv_errors_cnt, cv_errors_cnt_std, test_errors_cnt

## Nearest Shrunken Centroids
Let's implement nearest shrunken centroid model ourselves. The implementation is the same as proposed in the book, but the result for this task is different.

In [4]:
class ShrunkenCentroid(BaseEstimator, ClassifierMixin):
    """Nearest shrunken centroid classifier.
    Each class is represented by its centroid, with test samples classified to
    the class with the nearest centroid.
    Parameters
    ----------
    delta : float, optional (default = None)
        Delta for shrinking centroids to remove features.
    Attributes
    ----------
    classes_:
        All classes found in the training data set.
    centroids_ :
        Centroid of each class.
    overall_centroid_:
        The overall mean of each feature.
    priors_:
        Class prior probabilities.
    vars_:
        Pooled within-class variances of features.
    shrunken_centroids_:
        Shrunken centroid of each class.
    features_used_:
        The indices of features that are not shrunken to the overall centroid.
    """
    def __init__(self, delta: float = 0):
        self.delta = delta

    def fit(self, X: np.ndarray, y: np.array) -> 'ShrunkenCentroid':
        """
        Fit the ShrunkenCentroid model according to the given training data.
        Parameters
        ----------
        X :
            Training vector, where n_samples is the number of samples and
            n_features is the number of features.
        y :
            Target values (integers)
        """
        # relabel target values to start from zero
        label_encoder = LabelEncoder()
        y = label_encoder.fit_transform(y)
        self.classes_ = label_encoder.classes_
        N, p, K = *X.shape, self.classes_.size
        # calculated overall centroid and prior probabilities
        self.overall_centroid_ = np.mean(X, axis=0)
        _, counts = np.unique(y, return_counts=True)
        self.priors_ = counts / N
        # calculate correction coefficients for each class
        m = np.atleast_2d((1/counts - 1/N)**0.5).T
        self.centroids_ =\
            np.vstack([np.mean(X[y == k, :], axis=0) for k in range(K)])
        # pooled within-class variance and deviation of features
        self.vars_ = np.zeros(shape=p)
        for k in range(K):
            self.vars_ += np.sum((X[y == k, :]-self.centroids_[k])**2, axis=0)
        self.vars_ /= (N - K)
        stds = self.vars_ ** 0.5
        # calculate shrunken centroids
        distances = self.centroids_ - self.overall_centroid_
        mean_std = np.median(stds)
        t_stats = distances / (stds + mean_std) / m
        t_stats_shrunken =\
            np.sign(t_stats) * (np.abs(t_stats) - self.delta).clip(0)
        self.shrunken_centroids_ =\
            self.overall_centroid_ + m * (stds + mean_std) * t_stats_shrunken
        self.features_used_ = np.squeeze(np.argwhere(np.sum(np.abs(
            self.shrunken_centroids_ - self.overall_centroid_), axis=0) > 0))
        return self

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Perform classification on an array of test vectors X.
        The predicted class C for each sample in X is returned.
        Parameters
        ----------
        X : array, shape = [n_samples, n_features]
        Returns
        -------
        C : array, shape = [n_samples]
        """
        N, K = X.shape[0], self.classes_.size
        discriminators = np.zeros(shape=(N, K))
        for i in range(N):
            discriminators[i] = -np.sum(
                (self.shrunken_centroids_ - X[i])**2 / self.vars_, axis=1) +\
                                2*np.log(self.priors_)
        return self.classes_[np.argmax(discriminators, axis=1)]

In [5]:
shrunken_centroid_classifier = Pipeline([
    ('scale', Normalizer()),
    ('scc', ShrunkenCentroid())])

shrunken_centroid_grid_search = GridSearchCV(
    shrunken_centroid_classifier,
    {'scc__delta': np.linspace(0, 5, 10)},
    cv=cv_folds, iid=True, scoring='accuracy'
).fit(X_train, y_train)
shrunken_centroid_grid_search.best_estimator_[1].features_used_.size

5417

In [6]:
calc_cv_stat(shrunken_centroid_grid_search)

(34.0, 4.720774754816658, 17)

## Support Vector Classifier

In [7]:
support_vector_classifier = Pipeline([
    ('scale', Normalizer()),
    ('svc', LinearSVC(tol=1e-3))])

support_vector_grid_search = GridSearchCV(
    support_vector_classifier,
    {'svc__C': np.linspace(1000, 3000, 3)},
    cv=cv_folds, iid=True, scoring='accuracy'
).fit(X_train, y_train)
support_vector_grid_search.best_params_

{'svc__C': 1000.0}

In [8]:
calc_cv_stat(support_vector_grid_search)

(27.0, 2.5911938781738653, 14)

## K-Nearest Neighbors

In [9]:
k_nearest_neighbors_classifier = Pipeline([
    ('scale', Normalizer()),
    ('knc', KNeighborsClassifier())])

k_nearest_neighbors_grid_search = GridSearchCV(
    k_nearest_neighbors_classifier,
    {'knc__n_neighbors': list(range(1, 5))},
    cv=cv_folds, iid=True, scoring='accuracy'
).fit(X_train, y_train)
k_nearest_neighbors_grid_search.best_params_

{'knc__n_neighbors': 1}

In [10]:
calc_cv_stat(k_nearest_neighbors_grid_search)

(47.0, 3.835920452027872, 24)

## Elastic-net Penalized Multinomial

In [73]:
%%time
clf = SGDClassifier(
    loss='log', penalty='elasticnet', alpha=0.08,
    l1_ratio=0.6, max_iter=10000, n_jobs=4, tol=1e-5, eta0=0.0005,
    learning_rate='adaptive')
elastic_net_classifier = Pipeline([
    ('norm', Normalizer()),
    ('scale', StandardScaler()),
    ('enc', clf)]).fit(X_train, y_train)

CPU times: user 3min 36s, sys: 4.01 ms, total: 3min 36s
Wall time: 59.3 s


In [None]:
%%time
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(
    multi_class='multinomial', 
    penalty='elasticnet', 
    solver='saga', 
    max_iter=7000,
    fit_intercept=True,
    C=0.09,
    l1_ratio=0.6,
    n_jobs=4
)
elastic_net_classifier = Pipeline([
    ('norm', Normalizer()),
    ('scale', StandardScaler()),
    ('enc', clf)]).fit(X_train, y_train)

In [None]:
np.sum(elastic_net_classifier.predict(X_test) != y_test)

In [51]:
%%time
clf = SGDClassifier(
    loss='log', penalty='elasticnet', alpha=0.08,
    l1_ratio=0.6, max_iter=100, n_jobs=4, tol=0, eta0=0.001,
    learning_rate='adaptive')
elastic_net_classifier = Pipeline([
    ('norm', Normalizer()),
    ('scale', StandardScaler()),
    ('enc', clf)])
elastic_net_grid_search = GridSearchCV(
    elastic_net_classifier,
    {'enc__alpha': [0.08], # np.linspace(0.07, 0.09, 3),
     'enc__l1_ratio': [0.06]}, # np.linspace(0.5, 0.7, 3)},
    cv=cv_folds, iid=True, scoring='accuracy'
).fit(X_train, y_train)
elastic_net_grid_search.best_params_



CPU times: user 4min 58s, sys: 244 ms, total: 4min 58s
Wall time: 1min 19s




In [52]:
calc_cv_stat(elastic_net_grid_search)

(24.0, 2.618614682831908, 14)

In [11]:
# use grid search to select best shrink_threshold
shrink_threshold_grid_search = GridSearchCV(
    #ShrunkenCentroid(),
    #{'delta': np.linspace(0, 5, 20)},
    LinearSVC(tol=1e-6),
    {'C': np.linspace(100000, 1000, 5)},
    cv=cv_folds,
    iid=True,
    scoring='accuracy'
).fit(X_train, y_train)
best_model = shrink_threshold_grid_search.best_estimator_
#print('Test accuracy score',
#      np.sum(y_test != best_model.predict(X_test)),
#      best_model.delta, best_model.features_used_.shape)
print('Test accuracy score',
      np.sum(y_test != best_model.predict(X_test)), best_model.C)

Test accuracy score 13 100000.0


In [12]:
cv_errors = 18*(1 - np.vstack(
    [shrink_threshold_grid_search.cv_results_[f'split{i}_test_score']
     for i in range(8)]).T)

In [13]:
np.sum(cv_errors, axis=1)

array([32., 32., 32., 32., 32.])

In [88]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import RandomizedSearchCV

clf = SGDClassifier(
    loss='log', penalty='elasticnet', alpha=0.1,
    l1_ratio=0.6, max_iter=1000, n_jobs=4, tol=0, eta0=0.0005,
    learning_rate='adaptive')

shrink_threshold_grid_search = RandomizedSearchCV(
    clf,
    {'alpha': np.linspace(0.05, 0.15, 2),
     'eta0': np.linspace(0.0005, 0.005, 2),
     'l1_ratio': np.linspace(0.4, 0.7, 2),
     'learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive']},
    cv=cv_folds,
    iid=True,
    scoring='accuracy', n_iter=3
).fit(X_train, y_train)
#clf.fit(X_train, y_train)



KeyboardInterrupt: 

In [82]:

np.sum(clf.predict(X_test) != y_test), np.sum(np.sum(abs(clf.coef_), axis=0) != 0)
#clf.coef_

(16, 248)

In [71]:
nearest_centroid_classifier = Pipeline([
    ('scale', StandardScaler()),
    ('ncc', NearestCentroid())]
)
# use grid search to select best shrink_threshold
shrink_threshold_grid_search = GridSearchCV(
    nearest_centroid_classifier,
    {'ncc__shrink_threshold': np.linspace(0, 100, 100)},
    cv=cv_folds,
    iid=True
).fit(X_train, y_train)
best_model = shrink_threshold_grid_search.best_estimator_
print('Test accuracy score',
      accuracy_score(y_test, best_model.predict(X_test)))

KeyboardInterrupt: 

In [4]:
from sklearn.neighbors.nearest_centroid import NearestCentroid

In [5]:
ncc = Pipeline([
    ('scale', StandardScaler()),
    ('ncc', NearestCentroid(shrink_threshold=4.5))]
).fit(X_train, y_train)
print(np.sum(y_test != ncc.predict(X_test)))

16


## Support Vector Classfier

In [6]:
from sklearn.svm import LinearSVC

In [7]:
svc = LinearSVC(random_state=0, tol=1e-6, C=100000)
svc.fit(X_train, y_train)
print(np.sum(y_test != svc.predict(X_test)))

13


## L2-penalized Discriminant Analysis

In [8]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [38]:
#scaler = StandardScaler().fit(np.vstack((X_train, X_test)))
scaler = StandardScaler().fit(X_train)

In [39]:
X_train1 = scaler.transform(X_train)
X_test1 = scaler.transform(X_test)

In [40]:
U, D, VT = np.linalg.svd(X_train1, full_matrices=False)

In [77]:
R = U@np.diag(D)
R_test = X_test1@VT.T

In [74]:
R = X_train
classes, counts = np.unique(y_train, return_counts=True)
priors = counts/y_train.size
means = np.vstack([np.mean(R[y_train == i], axis=0) for i in classes])
sigma = np.zeros(shape=(X_train.shape[1], X_train.shape[1]))
for k, m in zip(classes, means):
    R_class = R[y_train == k]
    for i in range(R_class.shape[0]):
        v = np.atleast_2d(R_class[i] - m).T
        sigma += v@v.T
sigma /= (144 - 14)
sigma_inv = np.linalg.inv(sigma)

result = np.zeros(shape=(144, 14))
for k, m, p in zip(classes, means, priors):
    m = np.atleast_2d(m).T
    dis = R.T @ sigma_inv @ m - 0.5 * m.T @ sigma_inv @ m + np.log(p)
    result[:,k-1:k] = dis
np.argmax(result, axis=1)

LinAlgError: Singular matrix

In [88]:
lda = LinearDiscriminantAnalysis(solver='eigen', shrinkage=0.001).fit(R, y_train)
sigma = lda.covariance_
means = lda.means_
priors = lda.priors_
sigma_inv = np.linalg.inv(sigma)


#result = np.zeros(shape=(144, 14))
#for k, m, p in zip(classes, means, priors):
#    m = np.atleast_2d(m).T
#    dis = R @ sigma_inv @ m - 0.5 * m.T @ sigma_inv @ m + np.log(p)
#    result[:,k-1:k] = dis
#np.argmax(result, axis=1) == y_train
x = np.atleast_2d(R[0]).T
m = np.atleast_2d(means[0]).T
x.T @ sigma_inv @ m - 0.5*m.T@sigma_inv@m + np.log(priors[0])

array([[2924.31132484]])

In [42]:
for shrinkage in np.linspace(0.1, 0.9, 100):
    lda = LinearDiscriminantAnalysis(solver='eigen', shrinkage=shrinkage).fit(R, y_train)
    print(shrinkage, np.sum((lda.predict(R) != y_train)), np.sum((lda.predict(R_test) != y_test)))

0.1 0 23
0.10808080808080808 0 23
0.11616161616161616 0 23
0.12424242424242425 0 23
0.13232323232323234 0 23
0.14040404040404042 0 23
0.1484848484848485 0 23
0.15656565656565657 0 23
0.16464646464646465 0 23
0.17272727272727273 0 23
0.1808080808080808 1 23
0.18888888888888888 1 22
0.19696969696969696 1 22
0.20505050505050504 1 21
0.21313131313131314 1 21
0.22121212121212122 1 21
0.2292929292929293 1 22
0.23737373737373738 1 22
0.24545454545454545 1 22
0.2535353535353535 1 23
0.26161616161616164 2 23
0.2696969696969697 2 23
0.2777777777777778 2 23
0.28585858585858587 2 23
0.29393939393939394 2 23
0.302020202020202 2 23
0.3101010101010101 2 23
0.3181818181818182 2 23
0.32626262626262625 2 23
0.3343434343434344 2 23
0.3424242424242424 3 23
0.35050505050505054 3 23
0.35858585858585856 3 23
0.3666666666666667 3 23
0.3747474747474747 3 23
0.38282828282828285 4 23
0.3909090909090909 4 23
0.398989898989899 4 23
0.407070707070707 4 23
0.41515151515151516 4 24
0.4232323232323232 4 24
0.431313131

In [None]:
lda = Pipeline([
    ('scale', StandardScaler()),
    ('lda', LinearDiscriminantAnalysis(solver='eigen', shrinkage=0.5))]
).fit(X_train, y_train)
print(np.sum(y_test != lda.predict(X_test)))

## L1-penalized multinominal

In [4]:
from sklearn.linear_model import LogisticRegression

In [5]:
lr = LogisticRegression(
    multi_class='multinomial', 
    penalty='l1', 
    solver='saga', 
    max_iter=7000,
    fit_intercept=True,
    C=0.097
)
pipeline = Pipeline([
    ('scale', StandardScaler()),
    ('logistic', lr)])
pipeline.fit(X_train, y_train)    
print(np.sum(y_test != pipeline.predict(X_test)), np.sum(np.sum(abs(lr.coef_), axis=0) != 0))

16 251


## L2-penalized multinominal

In [6]:
lr = LogisticRegression(
    multi_class='multinomial', 
    penalty='l2', 
    solver='saga', 
    max_iter=1000,
    fit_intercept=True,
    C=0.097
)
pipeline = Pipeline([
    ('scale', StandardScaler()),
    ('logistic', lr)])
pipeline.fit(X_train, y_train)    
print(np.sum(y_test != pipeline.predict(X_test)), np.sum(np.sum(abs(lr.coef_), axis=0) != 0))

21 16063




## Elastic-Net penalized multinominal

In [36]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(
    multi_class='multinomial', 
    penalty='elasticnet', 
    solver='saga', 
    max_iter=7000,
    fit_intercept=True,
    C=0.09,
    l1_ratio=0.6
)
pipeline = Pipeline([
    ('scale', StandardScaler()),
    ('logistic', lr)])
pipeline.fit(X_train, y_train)    
print(np.sum(y_test != pipeline.predict(X_test)), np.sum(np.sum(abs(lr.coef_), axis=0) != 0))

19 494
