In [1]:
import numpy as np
from keras.utils import np_utils
from sklearn.svm import SVC
from sklearn.svm import SVC
from keras.preprocessing.image import ImageDataGenerator
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from time import time
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.cluster import KMeans
import cv2
import scipy
import os
%matplotlib inline
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

Using TensorFlow backend.


In [2]:
# Root folder of the assignment data. Set the SEISMIC_BASE_DIR environment
# variable to point the notebook at another location without editing it; the
# original path is kept as the fallback. os.path.join(..., '') guarantees the
# trailing separator that the later BASE_DIR + 'images/...' concatenations need.
BASE_DIR = os.path.join(
    os.environ.get('SEISMIC_BASE_DIR', '/Users/eslemus/Desktop/ML/hwk_2/'), '')



In [3]:
def get_data(folder):
    """
    Load grayscale images and their labels from a folder of class sub-directories.

    Every non-hidden sub-directory of ``folder`` is treated as one class:
    images found under ``Class1`` are labelled ``'0'`` and images found under
    any other sub-directory are labelled ``'1'``. Hidden entries, stray files
    next to the class folders, and files OpenCV cannot decode are skipped.

    Parameters
    ----------
    folder : str
        Directory that contains the class sub-directories. A trailing path
        separator is optional.

    Returns
    -------
    X : numpy.ndarray
        The decoded images stacked with ``np.asarray``, in sorted
        (class folder, file name) order.
    y : numpy.ndarray
        The string labels (``'0'`` or ``'1'``), aligned with ``X``.
    """
    X = []
    y = []

    # Sorted listings make the sample order independent of the file system.
    for seismic_type in sorted(os.listdir(folder)):
        class_dir = os.path.join(folder, seismic_type)
        # Skip hidden entries (names starting with '.') and anything that is
        # not a directory, which would otherwise make os.listdir raise.
        if seismic_type.startswith('.') or not os.path.isdir(class_dir):
            continue
        label = '0' if seismic_type == 'Class1' else '1'
        for image_filename in sorted(os.listdir(class_dir)):
            # Flag 0 asks OpenCV for a single-channel grayscale image.
            img_file = cv2.imread(os.path.join(class_dir, image_filename), 0)
            # cv2.imread returns None for files it cannot read as an image.
            if img_file is not None:
                X.append(np.asarray(img_file))
                y.append(label)
    X = np.asarray(X)
    y = np.asarray(y)
    return X, y

In [4]:
# Load the grayscale images and their string labels for both splits.
X_train, y_train = get_data(BASE_DIR + 'images/Train/')
X_test, y_test = get_data(BASE_DIR + 'images/Test/')

# scikit-learn's fit functions expect a 2-D (n_samples, n_features) array,
# whereas the loaded images form a 3-D array, so every image is flattened
# into a single row of pixels.
(nsamples, nx, ny), (nsamples2, nx2, ny2) = X_train.shape, X_test.shape
X_train_new = np.reshape(X_train, (nsamples, nx * ny))
X_test = np.reshape(X_test, (nsamples2, nx2 * ny2))

# Fit the label encoding on the training labels only, then apply that same
# encoding to both splits.
encoder = LabelEncoder().fit(y_train)
y_train, y_test = encoder.transform(y_train), encoder.transform(y_test)

In [5]:

# Cluster the flattened training images into two groups. K-means is
# unsupervised, so the labels are not passed to fit (scikit-learn ignores them).
KM = KMeans(n_clusters=2, random_state=0)
kmeans = KM  # second name kept because the original cell defined it too
KM.fit(X_train_new)

# Cluster ids are arbitrary (cluster 0 is not necessarily class 0). Name each
# cluster after the majority class of the training images assigned to it, so
# the scores below do not depend on how K-means happened to number the clusters.
n_classes = int(y_train.max()) + 1
cluster_to_class = np.array([
    np.bincount(y_train[KM.labels_ == cluster], minlength=n_classes).argmax()
    for cluster in range(KM.n_clusters)
])

print("Predicting images on the test set")
t0 = time()
test_clusters = KM.predict(X_test)
print("done in %0.3fs" % (time() - t0))

# Translate cluster ids into class labels before comparing with y_test.
y_pred = cluster_to_class[test_clusters]

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Predicting images on the test set
done in 0.030s
              precision    recall  f1-score   support

           0       1.00      0.88      0.94        25
           1       0.89      1.00      0.94        25

    accuracy                           0.94        50
   macro avg       0.95      0.94      0.94        50
weighted avg       0.95      0.94      0.94        50

[[22  3]
 [ 0 25]]


In [6]:
from sklearn.metrics import accuracy_score
# Fraction of test images whose predicted label matches the true label.
score = accuracy_score(y_true=y_test, y_pred=y_pred)
print(score)

0.94


In [7]:
from sklearn.metrics import confusion_matrix
# Rows are the true classes, columns the predicted classes.
print(confusion_matrix(y_true=y_test, y_pred=y_pred))

[[22  3]
 [ 0 25]]


In [8]:
from time import time
# Number of principal components kept by the projection.
n_components = 175

print("Extracting the top %d seismic-images from %d images"
      % (n_components, X_train_new.shape[0]))
t0 = time()
# random_state pins the randomized SVD solver so that re-running the notebook
# yields the same components (0 matches the seed used for K-means).
# NOTE(review): whiten=True rescales every kept component to unit variance, so
# low-variance components weigh as much as the leading ones in any
# distance-based step that follows; consider comparing against whiten=False.
pca = PCA(n_components=n_components, svd_solver='randomized',
          whiten=True, random_state=0).fit(X_train_new)
print("done in %0.3fs" % (time() - t0))

print("Projecting the input data on the seismic-images")
t0 = time()
# The projection is fitted on the training images only and then applied to
# both splits.
X_train_pca = pca.transform(X_train_new)
X_test_pca = pca.transform(X_test)
print("done in %0.3fs" % (time() - t0))

Extracting the top 175 seismic-images from 400 images
done in 6.504s
Projecting the input data on the seismic-images
done in 0.447s


In [9]:
# Report the share of training-set variance retained by the fitted components.
# The component count is read from the fitted PCA object instead of being
# hard-coded, so the message stays correct if the number of components changes.
print('{} principal components account for {:.4f}% of the variance.'.format(
    pca.n_components_, 100 * np.sum(pca.explained_variance_ratio_)))

175 principal components account for 98.8637% of the variance.


In [14]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from time import time
# #############################################################################
# Quantitative evaluation of K-means on the PCA features, using the test set

# K-means is unsupervised, so the labels are not passed to fit (scikit-learn
# ignores them).
KM = KMeans(n_clusters=2, random_state=0)
KM.fit(X_train_pca)

# Cluster ids are arbitrary (cluster 0 is not necessarily class 0). Name each
# cluster after the majority class of the training images assigned to it, so
# the scores below do not depend on how K-means happened to number the clusters.
n_classes = int(y_train.max()) + 1
cluster_to_class = np.array([
    np.bincount(y_train[KM.labels_ == cluster], minlength=n_classes).argmax()
    for cluster in range(KM.n_clusters)
])

print("Predicting images on the test set")
t0 = time()
test_clusters = KM.predict(X_test_pca)
print("done in %0.3fs" % (time() - t0))

# Translate cluster ids into class labels before comparing with y_test.
y_pred = cluster_to_class[test_clusters]

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Predicting images on the test set
done in 0.002s
              precision    recall  f1-score   support

           0       0.50      1.00      0.67        25
           1       0.00      0.00      0.00        25

    accuracy                           0.50        50
   macro avg       0.25      0.50      0.33        50
weighted avg       0.25      0.50      0.33        50

[[25  0]
 [25  0]]


  _warn_prf(average, modifier, msg_start, len(result))


In [15]:
# Build the confusion matrix (rows: true class, columns: predicted class) and
# derive the raw accuracy, sensitivity and specificity from it.
# Class 0 is treated as the positive class here: sensitivity is the recall of
# class 0 and specificity is the recall of class 1.
cm = confusion_matrix(y_test, y_pred)
total = cm.sum()
acc = (cm[0][0] + cm[1][1]) / total
sensitivity = cm[0][0] / (cm[0][0] + cm[0][1])
specificity = cm[1][1] / (cm[1][0] + cm[1][1])

# Show the confusion matrix followed by the three derived scores.
print(
    cm,
    "acc: {:.4f}".format(acc),
    "sensitivity: {:.4f}".format(sensitivity),
    "specificity: {:.4f}".format(specificity),
    sep="\n",
)

[[25  0]
 [25  0]]
acc: 0.5000
sensitivity: 1.0000
specificity: 0.0000


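K-means clustering applied directly to the raw pixel vectors reaches an accuracy of 94% on the seismic test images. We then applied PCA: the 175 whitened principal components retain 98.86% of the variance, but K-means on these components assigns every test image to the same cluster and reaches an accuracy of only 50%. Applying PCA as a feature-extraction step before K-means is therefore more efficient computationally (prediction takes 0.002 s instead of 0.030 s), but in this experiment it does not improve the clustering: K-means on the PCA features performs no better than chance, whereas K-means on the raw pixels separates the two classes well.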

Estefany Lemus