# 14 Cancer
Ramaswamy et al. (2001) present a difficult microarray classification problem, involving a training set of 144 patients with 14 different types of cancer, and a test set of 54 patients. Gene expression measurements were available for 16,063 genes. One gene per row, one sample per column.

Cancer classes are labelled as follows:
1.  breast, 2.  prostate, 3.  lung, 4.  colorectal, 5.  lymphoma, 6.  bladder, 7.  melanoma, 8.  uterus, 9.  leukemia, 10. renal, 11. pancreas, 12. ovary, 13. meso, 14. cns

In [1]:
import numpy as np
from matplotlib import pyplot as plt
from sklearn.metrics import accuracy_score

%matplotlib inline

# shared color palette used by the figures
GRAY1, GRAY4 = '#231F20', '#646369'
PURPLE = '#A020F0'
BLUE, BLUE1 = '#57B5E8', '#174A7E'
ORANGE = '#E69E00'
# global plot styling: Arial font family and thin axes borders
plt.rcParams.update({
    'font.family': 'Arial',
    'axes.linewidth': 0.5,
})

## Load and Prepare Data

In [2]:
# np.load on an .npz archive returns a lazily-read NpzFile that keeps the
# underlying file open; the context manager closes it as soon as the 'data'
# array has been read into memory.
with np.load('../data/14cancer.npy.npz') as npz_archive:
    data = npz_archive['data']

In [3]:
# the last column is the 'is test' flag (1 = test row, 0 = training row)
is_test = data[:, -1].astype(int)
# split the rows by that flag
data_train, data_test = data[is_test == 0, :], data[is_test == 1, :]
# all columns except the two trailing ones are used as features
X_train, X_test = data_train[:, :-2], data_test[:, :-2]
# the second-to-last column contains the class
y_train, y_test = data_train[:, -2].astype(int), data_test[:, -2].astype(int)

## L1-penalized multinomial

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [5]:
# L1-penalized multinomial logistic regression, fitted on standardized features
lr = LogisticRegression(
    multi_class='multinomial',
    penalty='l1',
    solver='saga',
    max_iter=7000,
    fit_intercept=True,
    C=0.097,
    # saga shuffles the training data; a fixed seed makes the fit repeatable
    random_state=0
)
pipeline = Pipeline([
    ('scale', StandardScaler()),
    ('logistic', lr)])
pipeline.fit(X_train, y_train)
# number of misclassified test samples
n_test_errors = np.sum(y_test != pipeline.predict(X_test))
# number of features whose coefficient is non-zero for at least one class
n_features_used = np.sum(np.sum(abs(lr.coef_), axis=0) != 0)
print(n_test_errors, n_features_used)

16 251


## L2-penalized multinomial

In [6]:
# L2-penalized multinomial logistic regression, fitted on standardized features
lr = LogisticRegression(
    multi_class='multinomial',
    penalty='l2',
    solver='saga',
    max_iter=1000,
    fit_intercept=True,
    C=0.097,
    # saga shuffles the training data; a fixed seed makes the fit repeatable
    random_state=0
)
pipeline = Pipeline([
    ('scale', StandardScaler()),
    ('logistic', lr)])
pipeline.fit(X_train, y_train)
# number of misclassified test samples
n_test_errors = np.sum(y_test != pipeline.predict(X_test))
# number of features whose coefficient is non-zero for at least one class
n_features_used = np.sum(np.sum(abs(lr.coef_), axis=0) != 0)
print(n_test_errors, n_features_used)

21 16063




## Elastic-Net penalized multinomial

In [None]:
# Elastic-net penalized multinomial logistic regression (l1_ratio sets the
# mix between the L1 and L2 penalties), fitted on standardized features
lr = LogisticRegression(
    multi_class='multinomial',
    penalty='elasticnet',
    solver='saga',
    max_iter=7000,
    fit_intercept=True,
    C=0.09,
    l1_ratio=0.6,
    # saga shuffles the training data; a fixed seed makes the fit repeatable
    random_state=0
)
pipeline = Pipeline([
    ('scale', StandardScaler()),
    ('logistic', lr)])
pipeline.fit(X_train, y_train)
# number of misclassified test samples
n_test_errors = np.sum(y_test != pipeline.predict(X_test))
# number of features whose coefficient is non-zero for at least one class
n_features_used = np.sum(np.sum(abs(lr.coef_), axis=0) != 0)
print(n_test_errors, n_features_used)