## Pipelines for image classification

In [1]:
%matplotlib inline

In [2]:
import time
import warnings
import operator
import pickle
from functools import partial, wraps

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
np.set_printoptions(precision=3, suppress=True)

In [4]:
sns.set_context('notebook', font_scale=1.5)

In [25]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import FeatureAgglomeration
from sklearn.feature_selection import (SelectKBest, SelectFromModel, 
                                       SelectFdr, SelectPercentile, 
                                       f_classif, mutual_info_classif, RFE, RFECV)
from sklearn.svm import SVC, LinearSVC
from sklearn.model_selection import (StratifiedKFold, cross_val_score, train_test_split,
                                     cross_val_predict, GridSearchCV)
from sklearn.linear_model import (LassoCV, RidgeClassifier, RidgeClassifierCV, ElasticNet,
                                  RandomizedLogisticRegression)
from sklearn.metrics import (accuracy_score, roc_auc_score, f1_score, make_scorer,
                             classification_report)

from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (RandomForestClassifier, 
                              AdaBoostClassifier, ExtraTreesClassifier)
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

from sklearn.pipeline import Pipeline

## Read data

In [6]:
df = pd.read_csv('dat2_features_no_blue_all.csv')

In [7]:
df.shape

(312, 4062)

In [8]:
df.groupby('class')['class_id'].mean()

class
blue_bg                   1.0
distal_acinar_tubule2     2.0
proximal_acinar_tubule    3.0
Name: class_id, dtype: float64

In [9]:
X = df.ix[:, :-3]
y = df.ix[:, -3].astype('int')

## Classification

In [26]:
alpha = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10]
Cs = [0.01, 0.1, 1.0, 10.0, 100.0]
gamma = [1e-4, 1e-3, 1e-2, 1e-1, 1, 10]
learning_rate_init = [1e-4, 1e-3, 1e-2]
l1_ratio = [1]
max_depth = [1, 5, 10, None]
max_features = [1, 5, 10, 'sqrt', 'log2', None]
n_estimators = [5, 10, 25]
n_neighbors = [3, 5, 7, 9, 11]

In [11]:
ridge_params = {'alpha': alpha}
en_params = {'l1_ratio': l1_ratio}
svc_params = [{'kernel': ['rbf'], 'gamma': gamma, 'C': Cs},
              {'kernel': ['linear'], 'C': Cs}]
dt_params = {'max_depth': max_depth, 
               'max_features': max_features}
rf_params = {'max_depth': max_depth, 'n_estimators' : n_estimators, 
               'max_features': max_features}
nn_params = {'alpha': alpha, 'learning_rate_init': learning_rate_init}
knn_params = {'n_neighbors': n_neighbors}

In [12]:
clfs = [
    GridSearchCV(RidgeClassifier(), ridge_params, cv=5),
    GridSearchCV(KNeighborsClassifier(), knn_params, cv=5),
    GridSearchCV(SVC(), svc_params, cv=5),
    GridSearchCV(DecisionTreeClassifier(), dt_params, cv=5),
    GridSearchCV(RandomForestClassifier(), rf_params, cv=5),
    # GridSearchCV(MLPClassifier(max_iter=1000), nn_params, cv=5),
    ]

In [13]:
try:
    with open('scores.pic', 'rb') as f:
        scores = pickle.load(f)
except Exception as e:
    print(e)
    scores = {}

for clf in clfs:
    with warnings.catch_warnings():
        print(clf.estimator.__class__.__name__)
        warnings.simplefilter('ignore')
        pipe = Pipeline([
          ('standard_scalar', StandardScaler()),
          ('feature_selection', SelectFdr()),
          ('classification', clf)
        ])
        if clf.estimator.__class__.__name__ in scores:
            continue
        start = time.clock()
        score = cross_val_score(pipe, X, y, cv=5, scoring='accuracy', n_jobs=-1)
        print(score)
        elapsed = time.clock() - start
        scores[clf.estimator.__class__.__name__] = score.mean(), elapsed, score
with open('scores.pic', 'wb') as f:
    pickle.dump(scores, f)

RidgeClassifier
KNeighborsClassifier
SVC
DecisionTreeClassifier
RandomForestClassifier


## Sort by average cross-validation accuracy

In [14]:
for k, v in sorted(scores.items(), key=lambda x: x[1][0], reverse=True):
    print('%s\n' % k, '%.2f' % v[0], '%8.2f' % v[1], v[2], '\n')

RidgeClassifier
 0.98     0.37 [ 0.968  0.968  0.984  1.     0.967] 

SVC
 0.96     0.63 [ 0.968  0.952  0.984  0.952  0.967] 

RandomForestClassifier
 0.92     0.46 [ 0.889  0.968  0.952  0.887  0.918] 

DecisionTreeClassifier
 0.92     0.39 [ 0.889  0.889  0.952  0.968  0.902] 

KNeighborsClassifier
 0.92     0.43 [ 0.952  0.952  0.952  0.871  0.869] 



## Sort by time taken

In [15]:
for k, v in sorted(scores.items(), key=lambda x: x[1][1], reverse=False):
    print('%-40s' % k, '%8.2f' % v[0], '%8.2f' % v[1], v[2])

RidgeClassifier                              0.98     0.37 [ 0.968  0.968  0.984  1.     0.967]
DecisionTreeClassifier                       0.92     0.39 [ 0.889  0.889  0.952  0.968  0.902]
KNeighborsClassifier                         0.92     0.43 [ 0.952  0.952  0.952  0.871  0.869]
RandomForestClassifier                       0.92     0.46 [ 0.889  0.968  0.952  0.887  0.918]
SVC                                          0.96     0.63 [ 0.968  0.952  0.984  0.952  0.967]


## Pipeline with PCA

In [16]:
clfs = [
    GridSearchCV(RidgeClassifier(), ridge_params, cv=5),
    GridSearchCV(KNeighborsClassifier(), knn_params, cv=5),
    GridSearchCV(SVC(), svc_params, cv=5),
    GridSearchCV(DecisionTreeClassifier(), dt_params, cv=5),
    GridSearchCV(RandomForestClassifier(), rf_params, cv=5),
    # GridSearchCV(MLPClassifier(max_iter=1000), nn_params, cv=5),
    ]

In [17]:
try:
    with open('scores_pca.pic', 'rb') as f:
        scores_pca = pickle.load(f)
except Exception as e:
    print(e)
    scores_pca = {}

for clf in clfs:
    with warnings.catch_warnings():
        print(clf.estimator.__class__.__name__)
        warnings.simplefilter('ignore')
        pipe = Pipeline([
          ('standard_scalar', StandardScaler()),
          ('pca', PCA(n_components=25)),
          # ('feature_selection', SelectFdr()),
          ('classification', clf)
        ])
        if clf.estimator.__class__.__name__ in scores_pca:
            continue
        start = time.clock()
        score = cross_val_score(pipe, X, y, cv=5, scoring='accuracy', n_jobs=-1)
        print(score)
        elapsed = time.clock() - start
        scores_pca[clf.estimator.__class__.__name__] = score.mean(), elapsed, score
with open('scores_pca.pic', 'wb') as f:
    pickle.dump(scores_pca, f)

RidgeClassifier
KNeighborsClassifier
SVC
DecisionTreeClassifier
RandomForestClassifier


## Sort by average cross-validation accuracy

In [18]:
for k, v in sorted(scores_pca.items(), key=lambda x: x[1][0], reverse=True):
    print('%s\n' % k, '%.2f' % v[0], '%8.2f' % v[1], v[2], '\n')

RidgeClassifier
 0.96     0.34 [ 0.937  0.952  0.952  0.968  0.984] 

SVC
 0.96     0.35 [ 0.952  0.937  1.     0.935  0.951] 

RandomForestClassifier
 0.93     0.36 [ 0.952  0.921  0.921  0.919  0.951] 

KNeighborsClassifier
 0.93     0.34 [ 0.952  0.984  0.937  0.887  0.902] 

DecisionTreeClassifier
 0.88     0.37 [ 0.889  0.857  0.905  0.823  0.902] 



## Sort by time taken

In [19]:
for k, v in sorted(scores_pca.items(), key=lambda x: x[1][1], reverse=False):
    print('%-40s' % k, '%8.2f' % v[0], '%8.2f' % v[1], v[2])

KNeighborsClassifier                         0.93     0.34 [ 0.952  0.984  0.937  0.887  0.902]
RidgeClassifier                              0.96     0.34 [ 0.937  0.952  0.952  0.968  0.984]
SVC                                          0.96     0.35 [ 0.952  0.937  1.     0.935  0.951]
RandomForestClassifier                       0.93     0.36 [ 0.952  0.921  0.921  0.919  0.951]
DecisionTreeClassifier                       0.88     0.37 [ 0.889  0.857  0.905  0.823  0.902]


## Ridge

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

clf = GridSearchCV(RidgeClassifier(), ridge_params, cv=5)
pipe = Pipeline([
  ('standard_scalar', StandardScaler()),
  ('feature_selection', SelectFdr()),
  ('classification', clf)
])

with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    pipe.fit(X_train, y_train)

# print(clf.best_params_, clf.cv_results_)

with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    y_pred = pipe.predict(X_test)
    print(classification_report(y_test, y_pred, target_names=df.groupby('class')['class_id'].mean().index))
    print("Accuracy = ", accuracy_score(y_test, y_pred))

with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    print(cross_val_score(pipe, X, y, cv=5, n_jobs=1))

                        precision    recall  f1-score   support

               blue_bg       1.00      1.00      1.00        22
 distal_acinar_tubule2       1.00      1.00      1.00        38
proximal_acinar_tubule       1.00      1.00      1.00        18

           avg / total       1.00      1.00      1.00        78

Accuracy =  1.0
[ 0.968  0.968  0.984  1.     0.967]


In [21]:
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    print(pipe.named_steps['feature_selection'].get_support().sum())

3167


In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

clf = GridSearchCV(RidgeClassifier(), ridge_params, cv=5)
pipe = Pipeline([
  ('standard_scalar', StandardScaler()),
  ('feature_selection', SelectKBest(f_classif, 400)),
  ('classification', clf)
])

with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    pipe.fit(X_train, y_train)

# print(clf.best_params_, clf.cv_results_)

with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    y_pred = pipe.predict(X_test)
    print(classification_report(y_test, y_pred, target_names=df.groupby('class')['class_id'].mean().index))
    print("Accuracy = ", accuracy_score(y_test, y_pred))

with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    print(cross_val_score(pipe, X, y, cv=5, n_jobs=1))

                        precision    recall  f1-score   support

               blue_bg       1.00      1.00      1.00        22
 distal_acinar_tubule2       1.00      1.00      1.00        38
proximal_acinar_tubule       1.00      1.00      1.00        18

           avg / total       1.00      1.00      1.00        78

Accuracy =  1.0
[ 0.952  0.984  1.     1.     0.967]


In [23]:
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    print(pipe.named_steps['feature_selection'].get_support().sum())

400


## How large does the training set have to be?

In [None]:
X.shape

In [None]:
for s in np.arange(0.02, 0.51, 0.02):

    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=s, random_state=1)

    clf = GridSearchCV(RidgeClassifier(), ridge_params, cv=3)
    pipe = Pipeline([
      ('standard_scalar', StandardScaler()),
      ('feature_selection', SelectFdr()),
      ('classification', clf)
    ])

    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        pipe.fit(X_train, y_train)

    # print(clf.best_params_, clf.cv_results_)
    print("Number of training samples", X_train.shape[0])

    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        y_pred = pipe.predict(X_test)
        # print(classification_report(y_test, y_pred, target_names=df.groupby('class')['class_id'].mean().index))
        print("Accuracy = ", accuracy_score(y_test, y_pred))