In [None]:
import os

import cv2
import matplotlib.pyplot as plt
import numpy as np
import scipy
import seaborn as sns
#from keras.utils import np_utils
#from keras.preprocessing.image import ImageDataGenerator
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    auc,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    roc_curve,
)
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder

%matplotlib inline

In [None]:
# Root folder that holds the 'images/Train/' and 'images/Test/' trees.
# The original absolute path is kept as the default; set the ANTICL_BASE_DIR
# environment variable to run the notebook elsewhere without editing this cell.
# os.path.join(..., '') guarantees the trailing separator that the string
# concatenations in the data-loading cell rely on.
BASE_DIR = os.path.join(
    os.environ.get('ANTICL_BASE_DIR', '/Users/eslemus/Desktop/ML/anticl/'), '')

# Image-size settings.
# NOTE(review): none of these three values is referenced by any cell visible
# in this notebook - confirm whether they are still needed.
img_size = 150
im_l =240
im_b =320

# Baseline classifier: all parameters not specified are set to their defaults
logisticRegr = LogisticRegression()

In [None]:
def get_data(folder):
    """
    Load grayscale images and their binary labels from a folder of class
    sub-folders.

    Every non-hidden sub-directory of ``folder`` is treated as one class:
    images found under ``Class1`` are labelled ``'0'`` and images found under
    any other sub-directory are labelled ``'1'``. Files that ``cv2.imread``
    cannot decode (it returns ``None`` for them) are skipped.

    Parameters
    ----------
    folder : str
        Directory that contains one sub-directory per class. A trailing path
        separator is optional.

    Returns
    -------
    X : numpy.ndarray
        The images, each read with ``cv2.imread(path, 0)`` (grayscale), stacked
        in sorted (class folder, file name) order.
    y : numpy.ndarray
        The string labels (``'0'`` / ``'1'``), aligned with ``X``.
    """
    X = []
    y = []

    # os.listdir returns entries in arbitrary order; sorting makes the sample
    # order reproducible from run to run.
    for seismic_type in sorted(os.listdir(folder)):
        class_dir = os.path.join(folder, seismic_type)
        # Skip hidden entries (e.g. '.DS_Store') and stray files that sit next
        # to the class folders - calling os.listdir on a file would raise.
        if seismic_type.startswith('.') or not os.path.isdir(class_dir):
            continue

        label = '0' if seismic_type == 'Class1' else '1'

        for image_filename in sorted(os.listdir(class_dir)):
            # Flag 0 asks OpenCV for a single-channel grayscale image.
            img_file = cv2.imread(os.path.join(class_dir, image_filename), 0)
            if img_file is None:
                # Not a readable image; ignore it.
                continue
            X.append(np.asarray(img_file))
            y.append(label)

    return np.asarray(X), np.asarray(y)

In [None]:
# Read the training and test splits from disk.
X_train, y_train = get_data(BASE_DIR + 'images/Train/')
X_test, y_test = get_data(BASE_DIR + 'images/Test/')

# scikit-learn's fit/predict expect a 2-D (samples, features) array, whereas
# the loaded data is 3-D (one 2-D image per sample), so every image is
# flattened into a single row.
nsamples, nx, ny = X_train.shape
nsamples2, nx2, ny2 = X_test.shape
X_train_new = np.reshape(X_train, (nsamples, nx * ny))
X_test = np.reshape(X_test, (nsamples2, nx2 * ny2))

# Turn the string labels into integer codes. The encoder is fitted on the
# training labels only and then reused for the test labels.
encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.transform(y_test)

In [None]:
# Train the baseline logistic regression on the flattened training images.
logisticRegr.fit(X=X_train_new, y=y_train)

In [None]:
# Evaluate the baseline logistic regression on the held-out test images.
print('Predicting on test data')
# predict() already returns the encoded class labels, so no rounding is
# needed (rounding only turned the integer labels into floats).
y_pred = logisticRegr.predict(X_test)

# `score` and `cm` are reused by the confusion-matrix heatmap cell.
score = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

print(score)
print(cm)

Grid search for model tuning: A model hyperparameter is a setting that is external to the model and whose value cannot be estimated from the data; it has to be set before the learning process begins. Grid search tries every combination of the candidate hyperparameter values and keeps the one that yields the most accurate predictions. The hyperparameters we tuned are: 1) Penalty: `l1` or `l2`, which specifies the norm used in the penalization. 2) C: the inverse of the regularization strength; smaller values of C specify stronger regularization.

In [None]:
#Grid Search
# The grid mixes 'l1' and 'l2' penalties. The default solver of current
# scikit-learn releases (lbfgs) does not support 'l1', so those candidates
# would fail to fit; liblinear supports both penalties.
logisticRegr = LogisticRegression(solver='liblinear')
grid_values = {'penalty': ['l1', 'l2'],'C':[0.001,.009,0.01,.09,1,5,10,25]}
# Accuracy is what the classifier's default scorer computes; it is spelled out
# here so the selection criterion is visible.
grid_LR_acc = GridSearchCV(logisticRegr, param_grid = grid_values, scoring = 'accuracy')
grid_LR_acc.fit(X_train_new, y_train)

#Predict values based on new parameters
y_pred_acc = grid_LR_acc.predict(X_test)

# New Model Evaluation metrics
# average=None returns one value per class instead of a single aggregate.
print('Accuracy Score : ' + str(accuracy_score(y_test,y_pred_acc)))
print('Precision Score : ' + str(precision_score(y_test,y_pred_acc, average = None)))
print('Recall Score : ' + str(recall_score(y_test,y_pred_acc, average = None)))
print('F1 Score : ' + str(f1_score(y_test,y_pred_acc, average = None)))

print(confusion_matrix(y_test, y_pred_acc))

In [None]:
#----------KNN Classifier
# Try KNN for every k in range(1, 10), i.e. k = 1 .. 9, and record the
# accuracy on both splits.
neighbors_setting = range(1,10)
training_accuracy = []
test_accuracy = []

for k in neighbors_setting:
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train_new, y_train)
    training_accuracy.append(knn.score(X_train_new, y_train))
    test_accuracy.append(knn.score(X_test, y_test))

# One curve per split, both plotted against k.
for curve, caption in ((training_accuracy, 'Accuracy of the training set'),
                       (test_accuracy, 'Accuracy of the test set')):
    plt.plot(neighbors_setting, curve, label=caption)
plt.ylabel('Accuracy')
plt.xlabel('Number of Neighbors')
plt.legend()
plt.show()
# Author's reading of the plot: the best result occurs when n_neighbors is 3

In [None]:
# The accuracy lists are ordered like neighbors_setting, which starts at k=1,
# so position 3 holds the k=4 result. Look up the position of k=3 instead of
# hard-coding it.
idx_3nn = neighbors_setting.index(3)
# '.3f' prints three decimals ('3f' only set a minimum field width).
print("Accuracy of the training set for 3NN: {:.3f}".format(training_accuracy[idx_3nn]))
print("Accuracy of the test set for 3NN: {:.3f}".format(test_accuracy[idx_3nn]))

In [None]:
# Refit KNN with k=3 and report its metrics on the test split.
knnop = KNeighborsClassifier(n_neighbors=3).fit(X_train_new, y_train)
predictionsknn = knnop.predict(X_test)

for report in (accuracy_score(y_test, predictionsknn),
               confusion_matrix(y_test, predictionsknn),
               classification_report(y_test, predictionsknn)):
    print(report)

In [None]:
# ROC curve for the logistic regression.
# roc_curve needs a continuous score per sample: with hard 0/1 predictions the
# curve collapses to a single operating point. `logisticRegr` is rebound to an
# unfitted estimator in the grid-search cell, so the class-1 probabilities are
# taken from the fitted grid search (refit on its best parameters by default).
y_score = grid_LR_acc.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)

plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=1, label='ROC curve (area = %0.2f)' % roc_auc)
# Diagonal = performance of a random classifier, for reference.
plt.plot([0, 1], [0, 1], color='navy', lw=1, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show()

In [None]:
# Heatmap of the baseline confusion matrix (`cm`) with its accuracy (`score`)
# in the title; both come from the baseline evaluation cell.
plt.figure(figsize=(8,8))
# The cells are integer sample counts, so annotate them as integers ("d")
# rather than as floats with three decimals.
sns.heatmap(cm, annot=True, fmt="d", linewidths=.5, square = True, cmap = 'Blues_r');
plt.ylabel('Actual label');
plt.xlabel('Predicted label');
all_sample_title = 'Accuracy Score: {0}'.format(score)
plt.title(all_sample_title, size = 8);

Conclusion: The most crucial aspect of training a machine learning algorithm is having a dependable dataset. There exist several methods for enhancing predictions and accuracy. In this case, we used Logistic Regression with parameter optimization, followed by an implementation of KNN. For the classification of seismic images, this specific Python notebook needs a larger dataset than the one provided. -Estefany Lemus