In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import h5py
import os
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
% matplotlib inline

  from ._conv import register_converters as _register_converters


In [3]:
# Read the label table (pickle) and the image array (.npy) from the data folder
labels = pd.read_pickle(os.path.join("data", "labels.pkl"))
X = np.load(os.path.join('data', 'small_images.npy'))

In [4]:
# decide what we want to classify on, let's start with normal or abnormal
# np.int was only an alias of the builtin int and was removed in NumPy 1.24
y = labels.ABNORMAL.values.astype(int)

In [5]:
def calculate_recall(y, yhat):
    """Return the recall (true-positive rate) of binary predictions.

    Parameters
    ----------
    y : array-like
        Ground-truth labels; every non-zero entry counts as positive.
    yhat : array-like
        Predicted labels, aligned element-wise with ``y``.

    Returns
    -------
    float
        ``tp / (tp + fn)``, or ``nan`` when ``y`` contains no positives.
    """
    # Cast to booleans so the element-wise logic also works for lists and
    # float labels, not only for {0, 1} integer arrays.
    y = np.asarray(y).astype(bool)
    yhat = np.asarray(yhat).astype(bool)
    tp = np.sum(y & yhat)
    fn = np.sum(y & ~yhat)
    if tp + fn == 0:
        # Recall is undefined without positives; return nan without the 0/0 warning.
        return np.nan
    recall = tp / (tp + fn)
    return recall

## VGG Features

### fc6

In [45]:
# vgg fc6 features
# explicit read-only mode; Dataset.value was removed in h5py 3.0, [()] reads the full dataset
with h5py.File('data/vgg_fc6_features.h5', 'r') as features:
    filenames = features['filenames'][()]
    vgg_fc6_features = features['vgg_19']['fc6'][()]

# flatten the array: keep the sample axis as-is instead of hard-coding 322 rows
vgg_fc6_flat = vgg_fc6_features.reshape([vgg_fc6_features.shape[0], -1])

# Split data into training and test
X_tr, X_te, y_tr, y_te = train_test_split(vgg_fc6_flat, y, test_size=0.20, random_state=0)
print("X_tr", X_tr.shape)
print("X_te", X_te.shape)
print("y_tr", y_tr.shape)
print("y_te", y_te.shape)

X_tr (257, 4096)
X_te (65, 4096)
y_tr (257,)
y_te (65,)


In [46]:
# k-NN on the features as-is; the 'scaler' step is a None placeholder
knn = KNeighborsClassifier()

pipe = Pipeline([
    ('scaler', None),
    ('knn', knn)
])

grid = {
    'knn__n_neighbors': [10, 15, 20, 25, 30],
    'knn__p': [1, 2],
}

# 5-fold cross-validated grid search over neighbor count and Minkowski power p
grid_cv = GridSearchCV(pipe, grid, cv=5)
grid_cv.fit(X_tr, y_tr)

# Collect results and sort them
# DataFrame.from_items was removed in pandas 1.0; `columns` pins the column order
df = pd.DataFrame(
    {
        'neighbors': grid_cv.cv_results_['param_knn__n_neighbors'],
        'distance_metric': grid_cv.cv_results_['param_knn__p'],
        'mean_te': grid_cv.cv_results_['mean_test_score'],
    },
    columns=['neighbors', 'distance_metric', 'mean_te'],
)

df.sort_values(by='mean_te', ascending=False).head(10)

Unnamed: 0,neighbors,distance_metric,mean_te
8,30,1,0.626459
9,30,2,0.626459
5,20,2,0.614786
7,25,2,0.610895
6,25,1,0.607004
4,20,1,0.603113
1,10,2,0.59144
0,10,1,0.587549
2,15,1,0.583658
3,15,2,0.579767


In [47]:
# Held-out evaluation of the estimator refit by the grid search
yhat = grid_cv.predict(X_te)
test_acc = grid_cv.score(X_te, y_te)
recall = calculate_recall(y_te, yhat)
print("Test accuracy:", test_acc)
print("Positive Predictions:", yhat.sum())
print("Recall:", recall)

Test accuracy: 0.7076923076923077
Positive Predictions: 0
Recall: 0.0


### Pool 5

In [46]:
# vgg pool 5 features
# explicit read-only mode; Dataset.value was removed in h5py 3.0, [()] reads the full dataset
with h5py.File('data/vgg_p5_features.h5', 'r') as features:
    filenames = features['filenames'][()]
    vgg_features = features['vgg_19']['pool5'][()]

# flatten the array: keep the sample axis as-is instead of hard-coding 322 rows
vgg_p5_flat = vgg_features.reshape([vgg_features.shape[0], -1])

# Split data into training and test
X_tr, X_te, y_tr, y_te = train_test_split(vgg_p5_flat, y, test_size=0.20, random_state=0)
print("X_tr", X_tr.shape)
print("X_te", X_te.shape)
print("y_tr", y_tr.shape)
print("y_te", y_te.shape)

X_tr (257, 25088)
X_te (65, 25088)
y_tr (257,)
y_te (65,)


In [47]:
# Share of label 0 in each split, i.e. the accuracy of always predicting 0
print("Test baseline", (y_te == 0).sum() / y_te.shape[0])
print("Train baseline", (y_tr == 0).sum() / y_tr.shape[0])

Test baseline 0.7076923076923077
Train baseline 0.6264591439688716


In [48]:
# k-NN on the features as-is; the 'scaler' step is a None placeholder
knn = KNeighborsClassifier()

pipe = Pipeline([
    ('scaler', None),
    ('knn', knn)
])

grid = {
    'knn__n_neighbors': [10, 15, 20, 25, 30],
    'knn__p': [1, 2],
}

# 5-fold cross-validated grid search over neighbor count and Minkowski power p
grid_cv = GridSearchCV(pipe, grid, cv=5)
grid_cv.fit(X_tr, y_tr)

# Collect results and sort them
# DataFrame.from_items was removed in pandas 1.0; `columns` pins the column order
df = pd.DataFrame(
    {
        'neighbors': grid_cv.cv_results_['param_knn__n_neighbors'],
        'distance_metric': grid_cv.cv_results_['param_knn__p'],
        'mean_te': grid_cv.cv_results_['mean_test_score'],
    },
    columns=['neighbors', 'distance_metric', 'mean_te'],
)

df.sort_values(by='mean_te', ascending=False).head(10)

Unnamed: 0,neighbors,distance_metric,mean_te
9,30,2,0.63035
7,25,2,0.626459
8,30,1,0.626459
0,10,1,0.622568
4,20,1,0.622568
6,25,1,0.622568
5,20,2,0.618677
2,15,1,0.610895
1,10,2,0.607004
3,15,2,0.607004


In [49]:
# Held-out evaluation of the estimator refit by the grid search
yhat = grid_cv.predict(X_te)
test_acc = grid_cv.score(X_te, y_te)
recall = calculate_recall(y_te, yhat)
print("Test accuracy:", test_acc)
print("Positive Predictions:", yhat.sum())
print("Recall:", recall)

Test accuracy: 0.7076923076923077
Recall: 0.0


### Pool 4

In [6]:
# vgg pool 4 features
# explicit read-only mode; Dataset.value was removed in h5py 3.0, [()] reads the full dataset
with h5py.File('data/vgg_p4_features.h5', 'r') as features:
    filenames = features['filenames'][()]
    vgg_p4_features = features['vgg_19']['pool4'][()]

# flatten the array: keep the sample axis as-is instead of hard-coding 322 rows
vgg_p4_flat = vgg_p4_features.reshape([vgg_p4_features.shape[0], -1])

In [7]:
# Split data into training and test
# (train_test_split is already imported in the notebook's first cell)
X_tr, X_te, y_tr, y_te = train_test_split(vgg_p4_flat, y, test_size=0.20, random_state=0)
print("X_tr", X_tr.shape)
print("X_te", X_te.shape)
print("y_tr", y_tr.shape)
print("y_te", y_te.shape)

# accuracy of always predicting label 0 on the test split
print("Most frequent baseline:", np.sum(y_te == 0) / y_te.shape[0])

X_tr (257, 100352)
X_te (65, 100352)
y_tr (257,)
y_te (65,)
Most frequent baseline: 0.7076923076923077


In [8]:
# Grid-search a k-NN classifier (5-fold CV) over neighbor count and Minkowski power p
knn = KNeighborsClassifier()
pipe = Pipeline(steps=[('scaler', None), ('knn', knn)])
grid = dict(
    knn__n_neighbors=[10, 15, 20, 25, 30],
    knn__p=[1, 2],
)
grid_cv = GridSearchCV(estimator=pipe, param_grid=grid, cv=5)
grid_cv.fit(X_tr, y_tr)

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('scaler', None), ('knn', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'knn__n_neighbors': [10, 15, 20, 25, 30], 'knn__p': [1, 2]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [9]:
# Collect results and sort them
# DataFrame.from_items was removed in pandas 1.0; `columns` pins the column order
df = pd.DataFrame(
    {
        'neighbors': grid_cv.cv_results_['param_knn__n_neighbors'],
        'distance_metric': grid_cv.cv_results_['param_knn__p'],
        'mean_te': grid_cv.cv_results_['mean_test_score'],
    },
    columns=['neighbors', 'distance_metric', 'mean_te'],
)

df.sort_values(by='mean_te', ascending=False).head(10)

Unnamed: 0,neighbors,distance_metric,mean_te
9,30,2,0.622568
0,10,1,0.618677
8,30,1,0.618677
1,10,2,0.607004
2,15,1,0.607004
3,15,2,0.607004
6,25,1,0.607004
4,20,1,0.599222
5,20,2,0.599222
7,25,2,0.59144


In [11]:
# Score the refit best model on the held-out test split
test_acc = grid_cv.score(X_te, y_te)
yhat = grid_cv.predict(X_te)
recall = calculate_recall(y_te, yhat)
print("Test accuracy:", test_acc)
print("Positive Predictions:", np.sum(yhat))
# np.sum(y_te) counts the positive ground-truth labels, not the true positives
print("Actual Positives:", np.sum(y_te))
print("Recall:", recall)

Test accuracy: 0.676923076923077
Positive Predictions: 4
True Positives: 19
Recall: 0.05263157894736842


### Pool 3

In [12]:
# vgg pool 3 features
# explicit read-only mode; Dataset.value was removed in h5py 3.0, [()] reads the full dataset
with h5py.File('data/vgg_p3_features.h5', 'r') as features:
    filenames = features['filenames'][()]
    vgg_p3_features = features['vgg_19']['pool3'][()]

# flatten the array: keep the sample axis as-is instead of hard-coding 322 rows
vgg_p3_flat = vgg_p3_features.reshape([vgg_p3_features.shape[0], -1])

In [13]:
# Split data into training and test
# (train_test_split is already imported in the notebook's first cell)
X_tr, X_te, y_tr, y_te = train_test_split(vgg_p3_flat, y, test_size=0.20, random_state=0)
print("X_tr", X_tr.shape)
print("X_te", X_te.shape)
print("y_tr", y_tr.shape)
print("y_te", y_te.shape)

# accuracy of always predicting label 0 on the test split
print("Most frequent baseline:", np.sum(y_te == 0) / y_te.shape[0])

X_tr (257, 200704)
X_te (65, 200704)
y_tr (257,)
y_te (65,)
Most frequent baseline: 0.7076923076923077


In [14]:
# k-NN on the features as-is; the 'scaler' step is a None placeholder
knn = KNeighborsClassifier()

pipe = Pipeline([
    ('scaler', None),
    ('knn', knn)
])

grid = {
    'knn__n_neighbors': [10, 15, 20, 25, 30],
    'knn__p': [1, 2],
}

# 5-fold cross-validated grid search over neighbor count and Minkowski power p
grid_cv = GridSearchCV(pipe, grid, cv=5)
grid_cv.fit(X_tr, y_tr)

# Collect results and sort them
# DataFrame.from_items was removed in pandas 1.0; `columns` pins the column order
df = pd.DataFrame(
    {
        'neighbors': grid_cv.cv_results_['param_knn__n_neighbors'],
        'distance_metric': grid_cv.cv_results_['param_knn__p'],
        'mean_te': grid_cv.cv_results_['mean_test_score'],
    },
    columns=['neighbors', 'distance_metric', 'mean_te'],
)

df.sort_values(by='mean_te', ascending=False).head(10)

Unnamed: 0,neighbors,distance_metric,mean_te
0,10,1,0.626459
3,15,2,0.622568
8,30,1,0.622568
9,30,2,0.618677
4,20,1,0.614786
2,15,1,0.610895
1,10,2,0.603113
7,25,2,0.595331
5,20,2,0.59144
6,25,1,0.587549


In [15]:
# Score the refit best model on the held-out test split
test_acc = grid_cv.score(X_te, y_te)
yhat = grid_cv.predict(X_te)
recall = calculate_recall(y_te, yhat)
print("Test accuracy:", test_acc)
print("Positive Predictions:", np.sum(yhat))
# np.sum(y_te) counts the positive ground-truth labels, not the true positives
print("Actual Positives:", np.sum(y_te))
print("Recall:", recall)

Test accuracy: 0.676923076923077
Positive Predictions: 4
True Positives: 19
Recall: 0.05263157894736842


### Pool 2

In [17]:
# vgg pool 2 features
# explicit read-only mode; Dataset.value was removed in h5py 3.0, [()] reads the full dataset
with h5py.File('data/vgg_p2_features.h5', 'r') as features:
    filenames = features['filenames'][()]
    vgg_p2_features = features['vgg_19']['pool2'][()]

# flatten the array: keep the sample axis as-is instead of hard-coding 322 rows
vgg_p2_flat = vgg_p2_features.reshape([vgg_p2_features.shape[0], -1])

In [18]:
# Split data into training and test
# (train_test_split is already imported in the notebook's first cell)
X_tr, X_te, y_tr, y_te = train_test_split(vgg_p2_flat, y, test_size=0.20, random_state=0)
print("X_tr", X_tr.shape)
print("X_te", X_te.shape)
print("y_tr", y_tr.shape)
print("y_te", y_te.shape)

X_tr (257, 401408)
X_te (65, 401408)
y_tr (257,)
y_te (65,)


In [19]:
# k-NN on the features as-is; the 'scaler' step is a None placeholder
knn = KNeighborsClassifier()

pipe = Pipeline([
    ('scaler', None),
    ('knn', knn)
])

grid = {
    'knn__n_neighbors': [10, 15, 20, 25, 30],
    'knn__p': [1, 2],
}

# 5-fold cross-validated grid search over neighbor count and Minkowski power p
grid_cv = GridSearchCV(pipe, grid, cv=5)
grid_cv.fit(X_tr, y_tr)

# Collect results and sort them
# DataFrame.from_items was removed in pandas 1.0; `columns` pins the column order
df = pd.DataFrame(
    {
        'neighbors': grid_cv.cv_results_['param_knn__n_neighbors'],
        'distance_metric': grid_cv.cv_results_['param_knn__p'],
        'mean_te': grid_cv.cv_results_['mean_test_score'],
    },
    columns=['neighbors', 'distance_metric', 'mean_te'],
)

df.sort_values(by='mean_te', ascending=False).head(10)

Unnamed: 0,neighbors,distance_metric,mean_te
9,30,2,0.63035
0,10,1,0.622568
4,20,1,0.622568
8,30,1,0.614786
2,15,1,0.610895
5,20,2,0.610895
6,25,1,0.610895
3,15,2,0.603113
7,25,2,0.599222
1,10,2,0.579767


In [20]:
# Score the refit best model on the held-out test split
test_acc = grid_cv.score(X_te, y_te)
yhat = grid_cv.predict(X_te)
recall = calculate_recall(y_te, yhat)
print("Test accuracy:", test_acc)
print("Positive Predictions:", np.sum(yhat))
# np.sum(y_te) counts the positive ground-truth labels, not the true positives
print("Actual Positives:", np.sum(y_te))
print("Recall:", recall)

Test accuracy: 0.7076923076923077
Positive Predictions: 0
True Positives: 19
Recall: 0.0


### Pool 1

In [51]:
# vgg pool 1 features
# explicit read-only mode; Dataset.value was removed in h5py 3.0, [()] reads the full dataset
with h5py.File('data/vgg_p1_features.h5', 'r') as features:
    filenames = features['filenames'][()]
    vgg_p1_features = features['vgg_19']['pool1'][()]

# flatten the array: keep the sample axis as-is instead of hard-coding 322 rows
vgg_p1_flat = vgg_p1_features.reshape([vgg_p1_features.shape[0], -1])

In [52]:
# Split data into training and test
# (train_test_split is already imported in the notebook's first cell)
X_tr, X_te, y_tr, y_te = train_test_split(vgg_p1_flat, y, test_size=0.20, random_state=0)
print("X_tr", X_tr.shape)
print("X_te", X_te.shape)
print("y_tr", y_tr.shape)
print("y_te", y_te.shape)

X_tr (257, 802816)
X_te (65, 802816)
y_tr (257,)
y_te (65,)


In [53]:
# k-NN on the features as-is; the 'scaler' step is a None placeholder
knn = KNeighborsClassifier()

pipe = Pipeline([
    ('scaler', None),
    ('knn', knn)
])

grid = {
    'knn__n_neighbors': [10, 15, 20, 25, 30],
    'knn__p': [1, 2],
}

# 5-fold cross-validated grid search over neighbor count and Minkowski power p
grid_cv = GridSearchCV(pipe, grid, cv=5)
grid_cv.fit(X_tr, y_tr)

# Collect results and sort them
# DataFrame.from_items was removed in pandas 1.0; `columns` pins the column order
df = pd.DataFrame(
    {
        'neighbors': grid_cv.cv_results_['param_knn__n_neighbors'],
        'distance_metric': grid_cv.cv_results_['param_knn__p'],
        'mean_te': grid_cv.cv_results_['mean_test_score'],
    },
    columns=['neighbors', 'distance_metric', 'mean_te'],
)

df.sort_values(by='mean_te', ascending=False).head(10)

Unnamed: 0,neighbors,distance_metric,mean_te
2,15,1,0.622568
4,20,1,0.618677
6,25,1,0.618677
9,30,2,0.618677
0,10,1,0.614786
1,10,2,0.610895
7,25,2,0.610895
8,30,1,0.610895
3,15,2,0.603113
5,20,2,0.603113


In [54]:
# Score the refit best model on the held-out test split
test_acc = grid_cv.score(X_te, y_te)
yhat = grid_cv.predict(X_te)
recall = calculate_recall(y_te, yhat)
print("Test accuracy:", test_acc)
print("Positive Predictions:", np.sum(yhat))
# np.sum(y_te) counts the positive ground-truth labels, not the true positives
print("Actual Positives:", np.sum(y_te))
print("Recall:", recall)

Test accuracy: 0.5846153846153846
Positive Predictions: 10
True Positives: 19
Recall: 0.05263157894736842


## ResNet

### Block 1

In [83]:
# resnet block1 features
# explicit read-only mode; Dataset.value was removed in h5py 3.0, [()] reads the full dataset
with h5py.File('data/rn_b1_features.h5', 'r') as features:
    filenames = features['filenames'][()]
    rn_b1_features = features['resnet_v2_152']['block1'][()]

# flatten the array: keep the sample axis as-is instead of hard-coding 322 rows
rn_b1_flat = rn_b1_features.reshape([rn_b1_features.shape[0], -1])

In [84]:
# Split data into training and test
# (train_test_split is already imported in the notebook's first cell)
X_tr, X_te, y_tr, y_te = train_test_split(rn_b1_flat, y, test_size=0.20, random_state=0)
print("X_tr", X_tr.shape)
print("X_te", X_te.shape)
print("y_tr", y_tr.shape)
print("y_te", y_te.shape)

X_tr (257, 200704)
X_te (65, 200704)
y_tr (257,)
y_te (65,)


In [85]:
# Grid-search a k-NN classifier (5-fold CV) over neighbor count and Minkowski power p
knn = KNeighborsClassifier()
pipe = Pipeline(steps=[('scaler', None), ('knn', knn)])
grid = dict(
    knn__n_neighbors=[10, 15, 20, 25, 30],
    knn__p=[1, 2],
)
grid_cv = GridSearchCV(estimator=pipe, param_grid=grid, cv=5)
grid_cv.fit(X_tr, y_tr)

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('scaler', None), ('knn', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'knn__n_neighbors': [10, 15, 20, 25, 30], 'knn__p': [1, 2]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [86]:
# Collect results and sort them
# DataFrame.from_items was removed in pandas 1.0; `columns` pins the column order
df = pd.DataFrame(
    {
        'neighbors': grid_cv.cv_results_['param_knn__n_neighbors'],
        'distance_metric': grid_cv.cv_results_['param_knn__p'],
        'mean_te': grid_cv.cv_results_['mean_test_score'],
    },
    columns=['neighbors', 'distance_metric', 'mean_te'],
)

df.sort_values(by='mean_te', ascending=False).head(10)

Unnamed: 0,neighbors,distance_metric,mean_te
0,10,1,0.618677
8,30,1,0.614786
2,15,1,0.610895
4,20,1,0.607004
9,30,2,0.607004
1,10,2,0.595331
5,20,2,0.595331
6,25,1,0.587549
7,25,2,0.587549
3,15,2,0.583658


In [87]:
# Score of the refit best estimator on the held-out test split (scoring=None, i.e. the estimator's default score)
grid_cv.score(X_te, y_te)

0.6153846153846154

In [88]:
# Predicted labels for the held-out test split
grid_cv.predict(X_te)

array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0])

### Block 3

In [94]:
# resnet block3 features
# explicit read-only mode; Dataset.value was removed in h5py 3.0, [()] reads the full dataset
with h5py.File('data/rn_b3_features.h5', 'r') as features:
    filenames = features['filenames'][()]
    rn_b3_features = features['resnet_v2_152']['block3'][()]

# flatten the array: keep the sample axis as-is instead of hard-coding 322 rows
rn_b3_flat = rn_b3_features.reshape([rn_b3_features.shape[0], -1])

# Split data into training and test
# (train_test_split is already imported in the notebook's first cell)
X_tr, X_te, y_tr, y_te = train_test_split(rn_b3_flat, y, test_size=0.20, random_state=0)
print("X_tr", X_tr.shape)
print("X_te", X_te.shape)
print("y_tr", y_tr.shape)
print("y_te", y_te.shape)

X_tr (257, 50176)
X_te (65, 50176)
y_tr (257,)
y_te (65,)


In [95]:
# k-NN on the features as-is; the 'scaler' step is a None placeholder
knn = KNeighborsClassifier()

pipe = Pipeline([
    ('scaler', None),
    ('knn', knn)
])

grid = {
    'knn__n_neighbors': [10, 15, 20, 25, 30],
    'knn__p': [1, 2],
}

# 5-fold cross-validated grid search over neighbor count and Minkowski power p
grid_cv = GridSearchCV(pipe, grid, cv=5)
grid_cv.fit(X_tr, y_tr)

# Collect results and sort them
# DataFrame.from_items was removed in pandas 1.0; `columns` pins the column order
df = pd.DataFrame(
    {
        'neighbors': grid_cv.cv_results_['param_knn__n_neighbors'],
        'distance_metric': grid_cv.cv_results_['param_knn__p'],
        'mean_te': grid_cv.cv_results_['mean_test_score'],
    },
    columns=['neighbors', 'distance_metric', 'mean_te'],
)

df.sort_values(by='mean_te', ascending=False).head(10)

Unnamed: 0,neighbors,distance_metric,mean_te
8,30,1,0.618677
0,10,1,0.607004
9,30,2,0.607004
3,15,2,0.603113
4,20,1,0.603113
6,25,1,0.599222
2,15,1,0.595331
5,20,2,0.595331
1,10,2,0.587549
7,25,2,0.583658


In [96]:
# Score of the refit best estimator on the held-out test split (default scorer)
grid_cv.score(X_te, y_te)

0.6615384615384615

In [97]:
# Predicted labels for the held-out test split
grid_cv.predict(X_te)

array([0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

### Block 4

In [98]:
# resnet block4 features
# explicit read-only mode; Dataset.value was removed in h5py 3.0, [()] reads the full dataset
with h5py.File('data/rn_b4_features.h5', 'r') as features:
    filenames = features['filenames'][()]
    rn_b4_features = features['resnet_v2_152']['block4'][()]

# flatten the array: keep the sample axis as-is instead of hard-coding 322 rows
rn_b4_flat = rn_b4_features.reshape([rn_b4_features.shape[0], -1])

# Split data into training and test
# (train_test_split is already imported in the notebook's first cell)
X_tr, X_te, y_tr, y_te = train_test_split(rn_b4_flat, y, test_size=0.20, random_state=0)
print("X_tr", X_tr.shape)
print("X_te", X_te.shape)
print("y_tr", y_tr.shape)
print("y_te", y_te.shape)

X_tr (257, 100352)
X_te (65, 100352)
y_tr (257,)
y_te (65,)


In [99]:
# k-NN on the features as-is; the 'scaler' step is a None placeholder
knn = KNeighborsClassifier()

pipe = Pipeline([
    ('scaler', None),
    ('knn', knn)
])

grid = {
    'knn__n_neighbors': [10, 15, 20, 25, 30],
    'knn__p': [1, 2],
}

# 5-fold cross-validated grid search over neighbor count and Minkowski power p
grid_cv = GridSearchCV(pipe, grid, cv=5)
grid_cv.fit(X_tr, y_tr)

# Collect results and sort them
# DataFrame.from_items was removed in pandas 1.0; `columns` pins the column order
df = pd.DataFrame(
    {
        'neighbors': grid_cv.cv_results_['param_knn__n_neighbors'],
        'distance_metric': grid_cv.cv_results_['param_knn__p'],
        'mean_te': grid_cv.cv_results_['mean_test_score'],
    },
    columns=['neighbors', 'distance_metric', 'mean_te'],
)

df.sort_values(by='mean_te', ascending=False).head(10)

Unnamed: 0,neighbors,distance_metric,mean_te
2,15,1,0.622568
9,30,2,0.622568
4,20,1,0.618677
5,20,2,0.618677
7,25,2,0.614786
8,30,1,0.614786
3,15,2,0.610895
6,25,1,0.607004
1,10,2,0.603113
0,10,1,0.599222


In [100]:
# Score of the refit best estimator on the held-out test split (default scorer)
grid_cv.score(X_te, y_te)

0.676923076923077

In [101]:
# Predicted labels for the held-out test split
grid_cv.predict(X_te)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

### Logits

In [102]:
# resnet logits features
# explicit read-only mode; Dataset.value was removed in h5py 3.0, [()] reads the full dataset
with h5py.File('data/rn_l_features.h5', 'r') as features:
    filenames = features['filenames'][()]
    rn_l_features = features['resnet_v2_152']['logits'][()]

# flatten the array: keep the sample axis as-is instead of hard-coding 322 rows
rn_l_flat = rn_l_features.reshape([rn_l_features.shape[0], -1])

# Split data into training and test
# (train_test_split is already imported in the notebook's first cell)
X_tr, X_te, y_tr, y_te = train_test_split(rn_l_flat, y, test_size=0.20, random_state=0)
print("X_tr", X_tr.shape)
print("X_te", X_te.shape)
print("y_tr", y_tr.shape)
print("y_te", y_te.shape)

X_tr (257, 1001)
X_te (65, 1001)
y_tr (257,)
y_te (65,)


In [103]:
# k-NN on the features as-is; the 'scaler' step is a None placeholder
knn = KNeighborsClassifier()

pipe = Pipeline([
    ('scaler', None),
    ('knn', knn)
])

grid = {
    'knn__n_neighbors': [10, 15, 20, 25, 30],
    'knn__p': [1, 2],
}

# 5-fold cross-validated grid search over neighbor count and Minkowski power p
grid_cv = GridSearchCV(pipe, grid, cv=5)
grid_cv.fit(X_tr, y_tr)

# Collect results and sort them
# DataFrame.from_items was removed in pandas 1.0; `columns` pins the column order
df = pd.DataFrame(
    {
        'neighbors': grid_cv.cv_results_['param_knn__n_neighbors'],
        'distance_metric': grid_cv.cv_results_['param_knn__p'],
        'mean_te': grid_cv.cv_results_['mean_test_score'],
    },
    columns=['neighbors', 'distance_metric', 'mean_te'],
)

df.sort_values(by='mean_te', ascending=False).head(10)

Unnamed: 0,neighbors,distance_metric,mean_te
6,25,1,0.63035
5,20,2,0.626459
8,30,1,0.626459
4,20,1,0.622568
7,25,2,0.618677
9,30,2,0.614786
1,10,2,0.607004
2,15,1,0.603113
3,15,2,0.603113
0,10,1,0.571984


In [104]:
# Score of the refit best estimator on the held-out test split (default scorer)
grid_cv.score(X_te, y_te)

0.7076923076923077

## Inception

### Mixed_3a

In [25]:
# inception Mixed_3a features
# explicit read-only mode; Dataset.value was removed in h5py 3.0, [()] reads the full dataset
with h5py.File('data/inc_m_3a_features.h5', 'r') as features:
    filenames = features['filenames'][()]
    inc_3a_features = features['Mixed_3a'][()]

# flatten the array: keep the sample axis as-is instead of hard-coding 322 rows
inc_3a_flat = inc_3a_features.reshape([inc_3a_features.shape[0], -1])

# Split data into training and test
# (train_test_split is already imported in the notebook's first cell)
X_tr, X_te, y_tr, y_te = train_test_split(inc_3a_flat, y, test_size=0.20, random_state=0)
print("X_tr", X_tr.shape)
print("X_te", X_te.shape)
print("y_tr", y_tr.shape)
print("y_te", y_te.shape)

X_tr (257, 852640)
X_te (65, 852640)
y_tr (257,)
y_te (65,)


In [26]:
# k-NN on the features as-is; the 'scaler' step is a None placeholder
knn = KNeighborsClassifier()

pipe = Pipeline([
    ('scaler', None),
    ('knn', knn)
])

grid = {
    'knn__n_neighbors': [10, 15, 20, 25, 30],
    'knn__p': [1, 2],
}

# 5-fold cross-validated grid search over neighbor count and Minkowski power p
grid_cv = GridSearchCV(pipe, grid, cv=5)
grid_cv.fit(X_tr, y_tr)

# Collect results and sort them
# DataFrame.from_items was removed in pandas 1.0; `columns` pins the column order
df = pd.DataFrame(
    {
        'neighbors': grid_cv.cv_results_['param_knn__n_neighbors'],
        'distance_metric': grid_cv.cv_results_['param_knn__p'],
        'mean_te': grid_cv.cv_results_['mean_test_score'],
    },
    columns=['neighbors', 'distance_metric', 'mean_te'],
)

df.sort_values(by='mean_te', ascending=False).head(10)

Unnamed: 0,neighbors,distance_metric,mean_te
0,10,1,0.626459
4,20,1,0.626459
1,10,2,0.614786
6,25,1,0.614786
7,25,2,0.614786
2,15,1,0.610895
5,20,2,0.610895
8,30,1,0.610895
3,15,2,0.607004
9,30,2,0.607004


In [28]:
# Score the refit best model on the held-out test split
test_acc = grid_cv.score(X_te, y_te)
yhat = grid_cv.predict(X_te)
recall = calculate_recall(y_te, yhat)
print("Test accuracy:", test_acc)
print("Positive Predictions:", np.sum(yhat))
# np.sum(y_te) counts the positive ground-truth labels, not the true positives
print("Actual Positives:", np.sum(y_te))
print("Recall:", recall)

Test accuracy: 0.6615384615384615
Positive Predictions: 5
True Positives: 19
Recall: 0.05263157894736842


### Mixed_5e

In [21]:
# inception Mixed_5e features
# explicit read-only mode; Dataset.value was removed in h5py 3.0, [()] reads the full dataset
with h5py.File('data/inc_m_5e_features.h5', 'r') as features:
    filenames = features['filenames'][()]
    inc_5e_features = features['Mixed_5e'][()]

# flatten the array: keep the sample axis as-is instead of hard-coding 322 rows
inc_5e_flat = inc_5e_features.reshape([inc_5e_features.shape[0], -1])

# Split data into training and test
# (train_test_split is already imported in the notebook's first cell)
X_tr, X_te, y_tr, y_te = train_test_split(inc_5e_flat, y, test_size=0.20, random_state=0)
print("X_tr", X_tr.shape)
print("X_te", X_te.shape)
print("y_tr", y_tr.shape)
print("y_te", y_te.shape)

X_tr (257, 470400)
X_te (65, 470400)
y_tr (257,)
y_te (65,)


In [22]:
# k-NN on the features as-is; the 'scaler' step is a None placeholder
knn = KNeighborsClassifier()

pipe = Pipeline([
    ('scaler', None),
    ('knn', knn)
])

grid = {
    'knn__n_neighbors': [10, 15, 20, 25, 30],
    'knn__p': [1, 2],
}

# 5-fold cross-validated grid search over neighbor count and Minkowski power p
grid_cv = GridSearchCV(pipe, grid, cv=5)
grid_cv.fit(X_tr, y_tr)

# Collect results and sort them
# DataFrame.from_items was removed in pandas 1.0; `columns` pins the column order
df = pd.DataFrame(
    {
        'neighbors': grid_cv.cv_results_['param_knn__n_neighbors'],
        'distance_metric': grid_cv.cv_results_['param_knn__p'],
        'mean_te': grid_cv.cv_results_['mean_test_score'],
    },
    columns=['neighbors', 'distance_metric', 'mean_te'],
)

df.sort_values(by='mean_te', ascending=False).head(10)

Unnamed: 0,neighbors,distance_metric,mean_te
0,10,1,0.63035
8,30,1,0.626459
9,30,2,0.626459
4,20,1,0.610895
3,15,2,0.607004
6,25,1,0.607004
2,15,1,0.603113
5,20,2,0.603113
7,25,2,0.599222
1,10,2,0.595331


In [23]:
# Held-out evaluation of the estimator refit by the grid search
yhat = grid_cv.predict(X_te)
test_acc = grid_cv.score(X_te, y_te)
recall = calculate_recall(y_te, yhat)
print("Test accuracy:", test_acc)
print("Recall:", recall)

Test accuracy: 0.6615384615384615
Recall: 0.10526315789473684


### Mixed_6a

In [18]:
# inception Mixed_6a features
# explicit read-only mode; Dataset.value was removed in h5py 3.0, [()] reads the full dataset
with h5py.File('data/inc_m_6a_features.h5', 'r') as features:
    filenames = features['filenames'][()]
    inc_6a_features = features['Mixed_6a'][()]

# flatten the array: keep the sample axis as-is instead of hard-coding 322 rows
inc_6a_flat = inc_6a_features.reshape([inc_6a_features.shape[0], -1])

# Split data into training and test
# (train_test_split is already imported in the notebook's first cell)
X_tr, X_te, y_tr, y_te = train_test_split(inc_6a_flat, y, test_size=0.20, random_state=0)
print("X_tr", X_tr.shape)
print("X_te", X_te.shape)
print("y_tr", y_tr.shape)
print("y_te", y_te.shape)

X_tr (257, 295936)
X_te (65, 295936)
y_tr (257,)
y_te (65,)


In [19]:
# k-NN on the features as-is; the 'scaler' step is a None placeholder
knn = KNeighborsClassifier()

pipe = Pipeline([
    ('scaler', None),
    ('knn', knn)
])

grid = {
    'knn__n_neighbors': [10, 15, 20, 25, 30],
    'knn__p': [1, 2],
}

# 5-fold cross-validated grid search over neighbor count and Minkowski power p
grid_cv = GridSearchCV(pipe, grid, cv=5)
grid_cv.fit(X_tr, y_tr)

# Collect results and sort them
# DataFrame.from_items was removed in pandas 1.0; `columns` pins the column order
df = pd.DataFrame(
    {
        'neighbors': grid_cv.cv_results_['param_knn__n_neighbors'],
        'distance_metric': grid_cv.cv_results_['param_knn__p'],
        'mean_te': grid_cv.cv_results_['mean_test_score'],
    },
    columns=['neighbors', 'distance_metric', 'mean_te'],
)

df.sort_values(by='mean_te', ascending=False).head(10)

Unnamed: 0,neighbors,distance_metric,mean_te
4,20,1,0.618677
8,30,1,0.618677
9,30,2,0.618677
2,15,1,0.614786
0,10,1,0.610895
3,15,2,0.603113
1,10,2,0.599222
5,20,2,0.599222
6,25,1,0.599222
7,25,2,0.599222


In [20]:
# Held-out evaluation of the estimator refit by the grid search
yhat = grid_cv.predict(X_te)
test_acc = grid_cv.score(X_te, y_te)
recall = calculate_recall(y_te, yhat)
print("Test accuracy:", test_acc)
print("Recall:", recall)

Test accuracy: 0.676923076923077
Recall: 0.05263157894736842


### Mixed_6h

In [14]:
# inception Mixed_6h features
# explicit read-only mode; Dataset.value was removed in h5py 3.0, [()] reads the full dataset
with h5py.File('data/inc_m_6h_features.h5', 'r') as features:
    filenames = features['filenames'][()]
    inc_6h_features = features['Mixed_6h'][()]

# flatten the array: keep the sample axis as-is instead of hard-coding 322 rows
inc_6h_flat = inc_6h_features.reshape([inc_6h_features.shape[0], -1])

# Split data into training and test
# (train_test_split is already imported in the notebook's first cell)
X_tr, X_te, y_tr, y_te = train_test_split(inc_6h_flat, y, test_size=0.20, random_state=0)
print("X_tr", X_tr.shape)
print("X_te", X_te.shape)
print("y_tr", y_tr.shape)
print("y_te", y_te.shape)

X_tr (257, 295936)
X_te (65, 295936)
y_tr (257,)
y_te (65,)


In [15]:
# k-NN on the features as-is; the 'scaler' step is a None placeholder
knn = KNeighborsClassifier()

pipe = Pipeline([
    ('scaler', None),
    ('knn', knn)
])

grid = {
    'knn__n_neighbors': [10, 15, 20, 25, 30],
    'knn__p': [1, 2],
}

# 5-fold cross-validated grid search over neighbor count and Minkowski power p
grid_cv = GridSearchCV(pipe, grid, cv=5)
grid_cv.fit(X_tr, y_tr)

# Collect results and sort them
# DataFrame.from_items was removed in pandas 1.0; `columns` pins the column order
df = pd.DataFrame(
    {
        'neighbors': grid_cv.cv_results_['param_knn__n_neighbors'],
        'distance_metric': grid_cv.cv_results_['param_knn__p'],
        'mean_te': grid_cv.cv_results_['mean_test_score'],
    },
    columns=['neighbors', 'distance_metric', 'mean_te'],
)

df.sort_values(by='mean_te', ascending=False).head(10)

Unnamed: 0,neighbors,distance_metric,mean_te
2,15,1,0.626459
4,20,1,0.614786
6,25,1,0.610895
8,30,1,0.607004
9,30,2,0.607004
3,15,2,0.603113
0,10,1,0.595331
5,20,2,0.595331
7,25,2,0.595331
1,10,2,0.59144


In [16]:
# Held-out evaluation of the estimator refit by the grid search
yhat = grid_cv.predict(X_te)
test_acc = grid_cv.score(X_te, y_te)
recall = calculate_recall(y_te, yhat)
print("Test accuracy:", test_acc)
print("Recall:", recall)

Test accuracy: 0.6461538461538462
Recall: 0.15789473684210525


### Mixed 7d

In [6]:
# inception Mixed_7d features
# explicit read-only mode; Dataset.value was removed in h5py 3.0, [()] reads the full dataset
with h5py.File('data/inc_m_7d_features.h5', 'r') as features:
    filenames = features['filenames'][()]
    inc_7d_features = features['Mixed_7d'][()]

# flatten the array: keep the sample axis as-is instead of hard-coding 322 rows
inc_7d_flat = inc_7d_features.reshape([inc_7d_features.shape[0], -1])

# Split data into training and test
# (train_test_split is already imported in the notebook's first cell)
X_tr, X_te, y_tr, y_te = train_test_split(inc_7d_flat, y, test_size=0.20, random_state=0)
print("X_tr", X_tr.shape)
print("X_te", X_te.shape)
print("y_tr", y_tr.shape)
print("y_te", y_te.shape)

X_tr (257, 98304)
X_te (65, 98304)
y_tr (257,)
y_te (65,)


In [7]:
# k-NN on the features as-is; the 'scaler' step is a None placeholder
knn = KNeighborsClassifier()

pipe = Pipeline([
    ('scaler', None),
    ('knn', knn)
])

grid = {
    'knn__n_neighbors': [10, 15, 20, 25, 30],
    'knn__p': [1, 2],
}

# 5-fold cross-validated grid search over neighbor count and Minkowski power p
grid_cv = GridSearchCV(pipe, grid, cv=5)
grid_cv.fit(X_tr, y_tr)

# Collect results and sort them
# DataFrame.from_items was removed in pandas 1.0; `columns` pins the column order
df = pd.DataFrame(
    {
        'neighbors': grid_cv.cv_results_['param_knn__n_neighbors'],
        'distance_metric': grid_cv.cv_results_['param_knn__p'],
        'mean_te': grid_cv.cv_results_['mean_test_score'],
    },
    columns=['neighbors', 'distance_metric', 'mean_te'],
)

df.sort_values(by='mean_te', ascending=False).head(10)

Unnamed: 0,neighbors,distance_metric,mean_te
8,30,1,0.610895
0,10,1,0.607004
9,30,2,0.599222
4,20,1,0.59144
6,25,1,0.587549
5,20,2,0.583658
7,25,2,0.571984
1,10,2,0.564202
2,15,1,0.548638
3,15,2,0.544747


In [8]:
# Held-out evaluation of the estimator refit by the grid search
yhat = grid_cv.predict(X_te)
test_acc = grid_cv.score(X_te, y_te)
recall = calculate_recall(y_te, yhat)
print("Test accuracy:", test_acc)
print("Recall:", recall)

Test accuracy: 0.6923076923076923
Recall: 0.05263157894736842


### Mixed 7a

In [9]:
# inception Mixed_7a features
# explicit read-only mode; Dataset.value was removed in h5py 3.0, [()] reads the full dataset
with h5py.File('data/inc_m_7a_features.h5', 'r') as features:
    filenames = features['filenames'][()]
    inc_7a_features = features['Mixed_7a'][()]

# flatten the array: keep the sample axis as-is instead of hard-coding 322 rows
inc_7a_flat = inc_7a_features.reshape([inc_7a_features.shape[0], -1])

# Split data into training and test
X_tr, X_te, y_tr, y_te = train_test_split(inc_7a_flat, y, test_size=0.20, random_state=0)
print("X_tr", X_tr.shape)
print("X_te", X_te.shape)
print("y_tr", y_tr.shape)
print("y_te", y_te.shape)

X_tr (257, 98304)
X_te (65, 98304)
y_tr (257,)
y_te (65,)


In [10]:
# k-NN on the features as-is; the 'scaler' step is a None placeholder
knn = KNeighborsClassifier()

pipe = Pipeline([
    ('scaler', None),
    ('knn', knn)
])

grid = {
    'knn__n_neighbors': [10, 15, 20, 25, 30],
    'knn__p': [1, 2],
}

# 5-fold cross-validated grid search over neighbor count and Minkowski power p
grid_cv = GridSearchCV(pipe, grid, cv=5)
grid_cv.fit(X_tr, y_tr)

# Collect results and sort them
# DataFrame.from_items was removed in pandas 1.0; `columns` pins the column order
df = pd.DataFrame(
    {
        'neighbors': grid_cv.cv_results_['param_knn__n_neighbors'],
        'distance_metric': grid_cv.cv_results_['param_knn__p'],
        'mean_te': grid_cv.cv_results_['mean_test_score'],
    },
    columns=['neighbors', 'distance_metric', 'mean_te'],
)

df.sort_values(by='mean_te', ascending=False).head(10)

Unnamed: 0,neighbors,distance_metric,mean_te
8,30,1,0.618677
9,30,2,0.614786
0,10,1,0.599222
4,20,1,0.595331
6,25,1,0.595331
7,25,2,0.59144
1,10,2,0.587549
2,15,1,0.587549
3,15,2,0.583658
5,20,2,0.583658


In [11]:
# Held-out evaluation of the estimator refit by the grid search
yhat = grid_cv.predict(X_te)
test_acc = grid_cv.score(X_te, y_te)
recall = calculate_recall(y_te, yhat)
print("Test accuracy:", test_acc)
print("Recall:", recall)

Test accuracy: 0.6923076923076923
Recall: 0.0


### PreLogits

In [32]:
# inception PreLogitsFlatten features
# explicit read-only mode; Dataset.value was removed in h5py 3.0, [()] reads the full dataset
with h5py.File('data/inc_flat_features.h5', 'r') as features:
    filenames = features['filenames'][()]
    inc_flat_features = features['PreLogitsFlatten'][()]

# flatten the array: keep the sample axis as-is instead of hard-coding 322 rows
inc_flat_flat = inc_flat_features.reshape([inc_flat_features.shape[0], -1])

# Split data into training and test
# (train_test_split is already imported in the notebook's first cell)
X_tr, X_te, y_tr, y_te = train_test_split(inc_flat_flat, y, test_size=0.20, random_state=0)
print("X_tr", X_tr.shape)
print("X_te", X_te.shape)
print("y_tr", y_tr.shape)
print("y_te", y_te.shape)

X_tr (257, 1536)
X_te (65, 1536)
y_tr (257,)
y_te (65,)


In [33]:
# k-NN on the features as-is; the 'scaler' step is a None placeholder
knn = KNeighborsClassifier()

pipe = Pipeline([
    ('scaler', None),
    ('knn', knn)
])

grid = {
    'knn__n_neighbors': [10, 15, 20, 25, 30],
    'knn__p': [1, 2],
}

# 5-fold cross-validated grid search over neighbor count and Minkowski power p
grid_cv = GridSearchCV(pipe, grid, cv=5)
grid_cv.fit(X_tr, y_tr)

# Collect results and sort them
# DataFrame.from_items was removed in pandas 1.0; `columns` pins the column order
df = pd.DataFrame(
    {
        'neighbors': grid_cv.cv_results_['param_knn__n_neighbors'],
        'distance_metric': grid_cv.cv_results_['param_knn__p'],
        'mean_te': grid_cv.cv_results_['mean_test_score'],
    },
    columns=['neighbors', 'distance_metric', 'mean_te'],
)

df.sort_values(by='mean_te', ascending=False).head(10)

Unnamed: 0,neighbors,distance_metric,mean_te
9,30,2,0.618677
8,30,1,0.614786
4,20,1,0.607004
5,20,2,0.599222
6,25,1,0.59144
1,10,2,0.587549
7,25,2,0.587549
2,15,1,0.583658
0,10,1,0.579767
3,15,2,0.571984


In [34]:
# Held-out evaluation of the estimator refit by the grid search
yhat = grid_cv.predict(X_te)
test_acc = grid_cv.score(X_te, y_te)
recall = calculate_recall(y_te, yhat)
print("Test accuracy:", test_acc)
print("Recall:", recall)

Test accuracy: 0.7076923076923077
Recall: 0.05263157894736842


### Mixed 4a

In [38]:
# inception Mixed_4a features
# explicit read-only mode; Dataset.value was removed in h5py 3.0, [()] reads the full dataset
with h5py.File('data/inc_m_4a_features.h5', 'r') as features:
    filenames = features['filenames'][()]
    inc_4a_features = features['Mixed_4a'][()]

# flatten the array: keep the sample axis as-is instead of hard-coding 322 rows
inc_4a_flat = inc_4a_features.reshape([inc_4a_features.shape[0], -1])

# Split data into training and test
# (train_test_split is already imported in the notebook's first cell)
X_tr, X_te, y_tr, y_te = train_test_split(inc_4a_flat, y, test_size=0.20, random_state=0)
print("X_tr", X_tr.shape)
print("X_te", X_te.shape)
print("y_tr", y_tr.shape)
print("y_te", y_te.shape)

X_tr (257, 967872)
X_te (65, 967872)
y_tr (257,)
y_te (65,)


In [39]:
# k-NN on the features as-is; the 'scaler' step is a None placeholder
knn = KNeighborsClassifier()

pipe = Pipeline([
    ('scaler', None),
    ('knn', knn)
])

grid = {
    'knn__n_neighbors': [10, 15, 20, 25, 30],
    'knn__p': [1, 2],
}

# 5-fold cross-validated grid search over neighbor count and Minkowski power p
grid_cv = GridSearchCV(pipe, grid, cv=5)
grid_cv.fit(X_tr, y_tr)

# Collect results and sort them
# DataFrame.from_items was removed in pandas 1.0; `columns` pins the column order
df = pd.DataFrame(
    {
        'neighbors': grid_cv.cv_results_['param_knn__n_neighbors'],
        'distance_metric': grid_cv.cv_results_['param_knn__p'],
        'mean_te': grid_cv.cv_results_['mean_test_score'],
    },
    columns=['neighbors', 'distance_metric', 'mean_te'],
)

df.sort_values(by='mean_te', ascending=False).head(10)

Unnamed: 0,neighbors,distance_metric,mean_te
1,10,2,0.63035
5,20,2,0.626459
8,30,1,0.626459
9,30,2,0.626459
3,15,2,0.622568
4,20,1,0.622568
7,25,2,0.622568
2,15,1,0.614786
6,25,1,0.614786
0,10,1,0.587549


In [40]:
# Held-out evaluation of the estimator refit by the grid search
yhat = grid_cv.predict(X_te)
test_acc = grid_cv.score(X_te, y_te)
recall = calculate_recall(y_te, yhat)
print("Test accuracy:", test_acc)
print("Recall:", recall)

Test accuracy: 0.6615384615384615
Recall: 0.05263157894736842


### Mixed_5a

In [42]:
# inception Mixed_5a features
# explicit read-only mode; Dataset.value was removed in h5py 3.0, [()] reads the full dataset
with h5py.File('data/inc_m_5a_features.h5', 'r') as features:
    filenames = features['filenames'][()]
    inc_5a_features = features['Mixed_5a'][()]

# flatten the array: keep the sample axis as-is instead of hard-coding 322 rows
inc_5a_flat = inc_5a_features.reshape([inc_5a_features.shape[0], -1])

# Split data into training and test
# (train_test_split is already imported in the notebook's first cell)
X_tr, X_te, y_tr, y_te = train_test_split(inc_5a_flat, y, test_size=0.20, random_state=0)
print("X_tr", X_tr.shape)
print("X_te", X_te.shape)
print("y_tr", y_tr.shape)
print("y_te", y_te.shape)

X_tr (257, 470400)
X_te (65, 470400)
y_tr (257,)
y_te (65,)


In [43]:
# k-NN on the features as-is; the 'scaler' step is a None placeholder
knn = KNeighborsClassifier()

pipe = Pipeline([
    ('scaler', None),
    ('knn', knn)
])

grid = {
    'knn__n_neighbors': [10, 15, 20, 25, 30],
    'knn__p': [1, 2],
}

# 5-fold cross-validated grid search over neighbor count and Minkowski power p
grid_cv = GridSearchCV(pipe, grid, cv=5)
grid_cv.fit(X_tr, y_tr)

# Collect results and sort them
# DataFrame.from_items was removed in pandas 1.0; `columns` pins the column order
df = pd.DataFrame(
    {
        'neighbors': grid_cv.cv_results_['param_knn__n_neighbors'],
        'distance_metric': grid_cv.cv_results_['param_knn__p'],
        'mean_te': grid_cv.cv_results_['mean_test_score'],
    },
    columns=['neighbors', 'distance_metric', 'mean_te'],
)

df.sort_values(by='mean_te', ascending=False).head(10)

Unnamed: 0,neighbors,distance_metric,mean_te
8,30,1,0.634241
9,30,2,0.622568
1,10,2,0.618677
2,15,1,0.618677
3,15,2,0.618677
4,20,1,0.618677
5,20,2,0.618677
6,25,1,0.618677
7,25,2,0.618677
0,10,1,0.599222


In [44]:
# Held-out evaluation of the estimator refit by the grid search
yhat = grid_cv.predict(X_te)
test_acc = grid_cv.score(X_te, y_te)
recall = calculate_recall(y_te, yhat)
print("Test accuracy:", test_acc)
print("Recall:", recall)

Test accuracy: 0.6923076923076923
Recall: 0.0
