# 14 Cancer
Ramaswamy et al. (2001) present a difficult microarray classification problem, involving a training set of 144 patients with 14 different types of cancer, and a test set of 54 patients. Gene expression measurements were available for 16,063 genes. One gene per row, one sample per column.

Cancer classes are labelled as follows:
1.  breast, 2.  prostate, 3.  lung, 4.  collerectal, 5.  lymphoma, 6.  bladder, 7.  melanoma, 8.  uterus, 9.  leukemia, 10. renal, 11. pancreas, 12. ovary, 13. meso, 14. cns

In [1]:
import numpy as np
from matplotlib import pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder

%matplotlib inline

# define commonly used colors
GRAY1, GRAY4, PURPLE = '#231F20', '#646369', '#A020F0'
BLUE, ORANGE, BLUE1 = '#57B5E8', '#E69E00', '#174A7E'
# configure plot font family to Arial
plt.rcParams['font.family'] = 'Arial'
plt.rcParams['axes.linewidth'] = 0.5

## Load and Prepare Data

In [2]:
data = np.load('../data/14cancer.npy.npz')['data']

In [3]:
# last column contains 'is test' flag
is_test = data[:,-1].astype(int)
data_test = data[is_test == 1, :]
data_train = data[is_test == 0, :]
# pre-last column contains class
y_train = data_train[:, -2].astype(int)
y_test = data_test[:, -2].astype(int)
X_train = data_train[:, :-2]
X_test = data_test[:, :-2]

## Nearest Shrunken Centroids

In [4]:
from sklearn.neighbors.nearest_centroid import NearestCentroid

In [5]:
ncc = Pipeline([
    ('scale', StandardScaler()),
    ('ncc', NearestCentroid(shrink_threshold=4.5))]
).fit(X_train, y_train)
print(np.sum(y_test != ncc.predict(X_test)))

16


## Support Vector Classfier

In [6]:
from sklearn.svm import LinearSVC

In [7]:
svc = LinearSVC(random_state=0, tol=1e-6, C=100000)
svc.fit(X_train, y_train)
print(np.sum(y_test != svc.predict(X_test)))

13


## L2-penalized Discriminant Analysis

In [8]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [38]:
#scaler = StandardScaler().fit(np.vstack((X_train, X_test)))
scaler = StandardScaler().fit(X_train)

In [39]:
X_train1 = scaler.transform(X_train)
X_test1 = scaler.transform(X_test)

In [40]:
U, D, VT = np.linalg.svd(X_train1, full_matrices=False)

In [77]:
R = U@np.diag(D)
R_test = X_test1@VT.T

In [74]:
R = X_train
classes, counts = np.unique(y_train, return_counts=True)
priors = counts/y_train.size
means = np.vstack([np.mean(R[y_train == i], axis=0) for i in classes])
sigma = np.zeros(shape=(X_train.shape[1], X_train.shape[1]))
for k, m in zip(classes, means):
    R_class = R[y_train == k]
    for i in range(R_class.shape[0]):
        v = np.atleast_2d(R_class[i] - m).T
        sigma += v@v.T
sigma /= (144 - 14)
sigma_inv = np.linalg.inv(sigma)

result = np.zeros(shape=(144, 14))
for k, m, p in zip(classes, means, priors):
    m = np.atleast_2d(m).T
    dis = R.T @ sigma_inv @ m - 0.5 * m.T @ sigma_inv @ m + np.log(p)
    result[:,k-1:k] = dis
np.argmax(result, axis=1)

LinAlgError: Singular matrix

In [88]:
lda = LinearDiscriminantAnalysis(solver='eigen', shrinkage=0.001).fit(R, y_train)
sigma = lda.covariance_
means = lda.means_
priors = lda.priors_
sigma_inv = np.linalg.inv(sigma)


#result = np.zeros(shape=(144, 14))
#for k, m, p in zip(classes, means, priors):
#    m = np.atleast_2d(m).T
#    dis = R @ sigma_inv @ m - 0.5 * m.T @ sigma_inv @ m + np.log(p)
#    result[:,k-1:k] = dis
#np.argmax(result, axis=1) == y_train
x = np.atleast_2d(R[0]).T
m = np.atleast_2d(means[0]).T
x.T @ sigma_inv @ m - 0.5*m.T@sigma_inv@m + np.log(priors[0])

array([[2924.31132484]])

In [42]:
for shrinkage in np.linspace(0.1, 0.9, 100):
    lda = LinearDiscriminantAnalysis(solver='eigen', shrinkage=shrinkage).fit(R, y_train)
    print(shrinkage, np.sum((lda.predict(R) != y_train)), np.sum((lda.predict(R_test) != y_test)))

0.1 0 23
0.10808080808080808 0 23
0.11616161616161616 0 23
0.12424242424242425 0 23
0.13232323232323234 0 23
0.14040404040404042 0 23
0.1484848484848485 0 23
0.15656565656565657 0 23
0.16464646464646465 0 23
0.17272727272727273 0 23
0.1808080808080808 1 23
0.18888888888888888 1 22
0.19696969696969696 1 22
0.20505050505050504 1 21
0.21313131313131314 1 21
0.22121212121212122 1 21
0.2292929292929293 1 22
0.23737373737373738 1 22
0.24545454545454545 1 22
0.2535353535353535 1 23
0.26161616161616164 2 23
0.2696969696969697 2 23
0.2777777777777778 2 23
0.28585858585858587 2 23
0.29393939393939394 2 23
0.302020202020202 2 23
0.3101010101010101 2 23
0.3181818181818182 2 23
0.32626262626262625 2 23
0.3343434343434344 2 23
0.3424242424242424 3 23
0.35050505050505054 3 23
0.35858585858585856 3 23
0.3666666666666667 3 23
0.3747474747474747 3 23
0.38282828282828285 4 23
0.3909090909090909 4 23
0.398989898989899 4 23
0.407070707070707 4 23
0.41515151515151516 4 24
0.4232323232323232 4 24
0.431313131

In [None]:
lda = Pipeline([
    ('scale', StandardScaler()),
    ('lda', LinearDiscriminantAnalysis(solver='eigen', shrinkage=0.5))]
).fit(X_train, y_train)
print(np.sum(y_test != lda.predict(X_test)))

## L1-penalized multinominal

In [4]:
from sklearn.linear_model import LogisticRegression

In [5]:
lr = LogisticRegression(
    multi_class='multinomial', 
    penalty='l1', 
    solver='saga', 
    max_iter=7000,
    fit_intercept=True,
    C=0.097
)
pipeline = Pipeline([
    ('scale', StandardScaler()),
    ('logistic', lr)])
pipeline.fit(X_train, y_train)    
print(np.sum(y_test != pipeline.predict(X_test)), np.sum(np.sum(abs(lr.coef_), axis=0) != 0))

16 251


## L2-penalized multinominal

In [6]:
lr = LogisticRegression(
    multi_class='multinomial', 
    penalty='l2', 
    solver='saga', 
    max_iter=1000,
    fit_intercept=True,
    C=0.097
)
pipeline = Pipeline([
    ('scale', StandardScaler()),
    ('logistic', lr)])
pipeline.fit(X_train, y_train)    
print(np.sum(y_test != pipeline.predict(X_test)), np.sum(np.sum(abs(lr.coef_), axis=0) != 0))

21 16063




## Elastic-Net penalized multinominal

In [None]:
lr = LogisticRegression(
    multi_class='multinomial', 
    penalty='elasticnet', 
    solver='saga', 
    max_iter=7000,
    fit_intercept=True,
    C=0.09,
    l1_ratio=0.6
)
pipeline = Pipeline([
    ('scale', StandardScaler()),
    ('logistic', lr)])
pipeline.fit(X_train, y_train)    
print(np.sum(y_test != pipeline.predict(X_test)), np.sum(np.sum(abs(lr.coef_), axis=0) != 0))