# Import Library

In [35]:
import matplotlib.pyplot as plt
import numpy as np
import platform
import os
from sklearn.utils import class_weight
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

%matplotlib inline
%load_ext autoreload
%aimport utils
%aimport imc
%autoreload 1

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Settings

In [31]:
pdiv = "/"
### LABELS ###

normal_label = 0
pneumonia_label = 1
labels = sorted([normal_label, pneumonia_label])

### IMAGE SETTINGS ###

dimension = (64,0)
resize_dim = dimension[:2]

# Training and Test data preparation

In [32]:
path_train_normal = ".{}chest_xray{}train{}NORMAL{}".format(pdiv, pdiv, pdiv, pdiv)
path_train_pneumonia = ".{}chest_xray{}train{}PNEUMONIA{}".format(pdiv, pdiv, pdiv, pdiv)

path_test_normal = ".{}chest_xray{}test{}NORMAL{}".format(pdiv, pdiv, pdiv, pdiv)
path_test_pneumonia = ".{}chest_xray{}test{}PNEUMONIA{}".format(pdiv, pdiv, pdiv, pdiv)

# number of images to be loaded from each directory
train_images_limit = 200
test_images_limit = 200
load_all_images = True

paths_train_normal = utils.extract_image_paths(path_train_normal)
paths_train_pneumonia = utils.extract_image_paths(path_train_pneumonia)
paths_test_normal = utils.extract_image_paths(path_test_normal)
paths_test_pneumonia = utils.extract_image_paths(path_test_pneumonia)

if not load_all_images:
    paths_train_normal = paths_train_normal[1:train_images_limit]
    paths_train_pneumonia = paths_train_pneumonia[1:train_images_limit]
    paths_test_normal = paths_test_normal[1:test_images_limit]
    paths_test_pneumonia = paths_train_pneumonia[1:test_images_limit]

# Build Sift Vocabulary

In [47]:
# from imc import build_vocabulary_from_dirs
import os.path as osp
import pickle

vocab_filename = 'vocab.pkl'
vocab_size = 40
vocab = imc.build_vocabulary_from_dirs(paths_train_normal, paths_train_pneumonia, vocab_size)
with open(vocab_filename, 'wb') as f:
    pickle.dump(vocab, f)
    
print(f"Built {vocab_size}-word vocabulary from training dataset, saved to {vocab_filename}")

Built 40-word vocabulary from training dataset, saved to vocab.pkl


# Generate Image Histograms

In [52]:
from imc import bags_of_sifts

data_train_normal = bags_of_sifts(paths_train_normal, vocab_filename)
data_train_pneumonia = bags_of_sifts(paths_train_pneumonia, vocab_filename)

data_test_normal = bags_of_sifts(paths_test_normal, vocab_filename)
data_test_pneumonia = bags_of_sifts(paths_test_pneumonia, vocab_filename)

print("Image histograms generated for training and test images")


Image histograms generated for training and test images


# Combine Datasets

In [54]:
# Number of training and test images for normal and pneumonia

label_train_normal = [0] * len(paths_train_normal)
label_train_pneumonia = [1] * len(paths_train_pneumonia)
label_test_normal = [0] * len(paths_test_normal)
label_test_pneumonia = [1] * len(paths_test_pneumonia)

# Combine training images and labels
data_train = np.concatenate((data_train_normal, data_train_pneumonia), axis=0)
label_train = np.asarray(label_train_normal + label_train_pneumonia)

# Combine training images and labels
data_test = np.concatenate((data_test_normal, data_test_pneumonia[1:234]), axis=0)
label_test = np.asarray(label_test_normal + label_test_pneumonia[1:234])

print ("Data concatenated")
print (f"Train data dimensions: {data_train.shape}")
print (f"Test data dimensions: {data_test.shape}")

Data concatenated
Train data dimensions: (5216, 200)
Test data dimensions: (467, 200)


# Oversample Datasets

In [38]:
from imblearn.over_sampling import SMOTE, RandomOverSampler, KMeansSMOTE, BorderlineSMOTE
# import imblearn
oversample = BorderlineSMOTE(random_state=100)
data_train, label_train = oversample.fit_resample(data_train.reshape(data_train.shape[0], -1), label_train)

print(data_train.shape)

(7750, 200)


# Shuffle Data and Labels

In [48]:
# Shuffle
rand_order = np.random.permutation(data_train.shape[0])

data_train = data_train[rand_order]
label_train = label_train[rand_order]


# Shuffle
rand_order = np.random.permutation(data_test.shape[0])
data_test = data_test[rand_order]
label_test = label_test[rand_order]

print("Datasets shuffled")

Datasets shuffled
(7750, 200)


# SVM - Scale Gamma

In [40]:
svclassifier = SVC(gamma='scale')
svclassifier.fit(data_train, label_train)
      
# Make prediction
label_pred = svclassifier.predict(data_test)

print(confusion_matrix(label_test, label_pred))
print(classification_report(label_test, label_pred))

[[ 17 217]
 [  1 232]]
              precision    recall  f1-score   support

           0       0.94      0.07      0.13       234
           1       0.52      1.00      0.68       233

    accuracy                           0.53       467
   macro avg       0.73      0.53      0.41       467
weighted avg       0.73      0.53      0.41       467



# SVM - polynomial kernel

In [49]:
svclassifier = SVC(kernel='poly', degree=8)
svclassifier.fit(data_train, label_train)

# Make prediction
label_pred = svclassifier.predict(data_test)

cm = confusion_matrix(label_test, label_pred)
print(confusion_matrix(label_test, label_pred))
print(classification_report(label_test, label_pred))


print('### CONFUSION MATRIX ###')
print(cm)
tn, fp, fn, tp = cm.ravel()
acc = (tp + tn) / sum([tn, fp, fn, tp])

### Calculate Recall and Precision ###
print('\n ### TEST METRICS ###')
precision = tp/(tp+fp)*100
recall = tp/(tp+fn)*100
print('Accuracy: {}%'.format(acc))
print('Precision: {}%'.format(precision))
print('Recall: {}%'.format(recall))
print('F1-score: {}'.format(2*precision*recall/(precision+recall)))



[[ 44 190]
 [  3 230]]
              precision    recall  f1-score   support

           0       0.94      0.19      0.31       234
           1       0.55      0.99      0.70       233

    accuracy                           0.59       467
   macro avg       0.74      0.59      0.51       467
weighted avg       0.74      0.59      0.51       467

### CONFUSION MATRIX ###
[[ 44 190]
 [  3 230]]

 ### TEST METRICS ###
Accuracy: 0.6346153846153846%
Precision: 63.10679611650486%
Recall: 100.0%
F1-score: 77.38095238095238


# SVM - Gaussian kernel

In [42]:
svclassifier = SVC(kernel='rbf')
svclassifier.fit(data_train, label_train)

label_pred = svclassifier.predict(data_test)

print(confusion_matrix(label_test, label_pred))
print(classification_report(label_test, label_pred))

[[ 17 217]
 [  1 232]]
              precision    recall  f1-score   support

           0       0.94      0.07      0.13       234
           1       0.52      1.00      0.68       233

    accuracy                           0.53       467
   macro avg       0.73      0.53      0.41       467
weighted avg       0.73      0.53      0.41       467



# SVM - Sigmoid kernel

In [44]:
svclassifier = SVC(kernel='sigmoid')
svclassifier.fit(data_train, label_train)

label_pred = svclassifier.predict(data_test)
cm = confusion_matrix(label_test, label_pred)
print(classification_report(label_test, label_pred))


print('### CONFUSION MATRIX ###')
acc = (tp + tn) / sum([tn, fp, fn, tp])

### Calculate Recall and Precision ###
print('\n ### TEST METRICS ###')
precision = tp/(tp+fp)*100
recall = tp/(tp+fn)*100
print('Accuracy: {}%'.format(acc))
print('Precision: {}%'.format(precision))
print('Recall: {}%'.format(recall))
print('F1-score: {}'.format(2*precision*recall/(precision+recall)))



              precision    recall  f1-score   support

           0       0.75      0.52      0.61       234
           1       0.63      0.83      0.72       233

    accuracy                           0.67       467
   macro avg       0.69      0.67      0.66       467
weighted avg       0.69      0.67      0.66       467

### CONFUSION MATRIX ###
[[121 113]
 [ 40 193]]

 ### TEST METRICS ###
Accuracy: 0.6875%
Precision: 72.20956719817767%
Recall: 81.28205128205128%
F1-score: 76.47768395657418
