In [1]:
import glob
import pandas as pd
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
import cv2
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, classification_report, roc_auc_score
import random
from skimage.feature import hog
from skimage.io import imread
from skimage.transform import rescale
import skimage
from sklearn.svm import SVC, LinearSVC
import random

In [2]:
adimuc = glob.glob("../kaggle/input/tumor-detection/data/ADIMUC/*")
strmus = glob.glob("../kaggle/input/tumor-detection/data/STRMUS/*")
tumstu = glob.glob("../kaggle/input/tumor-detection/data/TUMSTU/*")


len(adimuc)

3977

In [3]:
full_data = adimuc + strmus + tumstu
len(full_data)

11977

In [4]:
small_data = adimuc[:1000] + strmus[:1000] + tumstu[:1000]
len(small_data)

3000

In [5]:
X = []
y = []



for img in small_data:#full_data:
    img_label = img.split('/')[-2]
    y.append(img_label)
    
    try:
        flattened_hog = hog(imread(img, as_gray=True), # convert to grey and turn into hog
                                 pixels_per_cell=(12, 12), # reduces number of features. Smaller more granular hogs
                                 cells_per_block=(2,2),
                                 orientations=8,
                                 block_norm='L2-Hys')
    except:
        print(img)
        break
        
    if len(y) % 2000 == 0:
        print(f"{len(y)} images have been processed")

    X.append(flattened_hog)
    
X = np.array(X)    
#y = np.where(np.array(y) == 'MSIMUT', 1, 0)

2000 images have been processed


In [11]:
# Y BINARY! adimuc and strmus == 0 (non cancer)
# tumstu = cancer
y_bin = np.where(np.array(y) == 'TUMSTU', 1, 0)

In [14]:
X.shape

(3000, 53792)

In [15]:
def standard_scaler(a):
    return (a-np.mean(a))/np.std(a)

X = np.apply_along_axis(standard_scaler, 0, X)

In [16]:
# Train test split

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y_bin, 
                                                    train_size = 0.7, 
                                                    random_state = 42, 
                                                    stratify = y_bin)

In [28]:
# Incremental PCA due to memory issues:
# https://stackoverflow.com/questions/44334950/how-to-use-sklearns-incrementalpca-partial-fit
from sklearn.decomposition import IncrementalPCA

iPCA = IncrementalPCA(n_components = 10)

num_rows = X_train.shape[0]
chunk_size = 1000

# FIT on Training Data
for i in range(0, num_rows // chunk_size +1):
    iPCA.partial_fit(X_train[i*chunk_size : (i+1)*chunk_size])

# Transform Training Data
X_train_pca = iPCA.transform(X_train[0: chunk_size])

for i in range(1, num_rows // chunk_size +1):
    transformed_chunk = iPCA.transform(X_train[i*chunk_size : (i+1)*chunk_size])
    X_train_pca = np.concatenate((X_train_pca, transformed_chunk), axis = 0)
    
# Transform Test Data
X_test_pca = iPCA.transform(X_test[0: chunk_size])
num_rows = X_test.shape[0]

for i in range(1, num_rows // chunk_size +1):
    transformed_chunk = iPCA.transform(X_test[i*chunk_size : (i+1)*chunk_size])
    X_test_pca = np.concatenate((X_test_pca, transformed_chunk), axis = 0)

In [29]:
np.sum(iPCA.explained_variance_ratio_)

0.19636914487335108

### PCA HOG SVM

In [30]:
svc = LinearSVC(C = 0.1, tol = 0.001, random_state=42)

svc.fit(X_train_pca, y_train)

y_train_preds = svc.predict(X_train_pca)
y_test_preds = svc.predict(X_test_pca)

train_acc = accuracy_score(y_train, y_train_preds)
test_acc = accuracy_score(y_test, y_test_preds)

train_recall = recall_score(y_train, y_train_preds)
test_recall = recall_score(y_test, y_test_preds)

train_auc = roc_auc_score(y_train, y_train_preds)
test_auc = roc_auc_score(y_test, y_test_preds)

print(f"Train Accuracy: {train_acc}")
print(f"Test Accuracy: {test_acc}")

print(f"Train Recall: {train_recall}")
print(f"Test Recall: {test_recall}")

print(f"Train AUC: {train_auc}")
print(f"Test AUC: {test_auc}")

Train Accuracy: 0.8142857142857143
Test Accuracy: 0.79
Train Recall: 0.8557142857142858
Test Recall: 0.8533333333333334
Train AUC: 0.8246428571428572
Test AUC: 0.8058333333333334




In [31]:
np.sum(iPCA.explained_variance_ratio_)

0.19636914487335108

In [None]:
# 10 COMPONENNTS
# Train Accuracy: 0.8142857142857143
# Test Accuracy: 0.79
# Train Recall: 0.8557142857142858
# Test Recall: 0.8533333333333334
# Train AUC: 0.8246428571428572
# Test AUC: 0.8058333333333334

In [22]:
svc = LinearSVC(C = 0.1, tol = 0.001, random_state=42)

svc.fit(X_train, y_train)

y_train_preds = svc.predict(X_train)
y_test_preds = svc.predict(X_test)

train_acc = accuracy_score(y_train, y_train_preds)
test_acc = accuracy_score(y_test, y_test_preds)

train_recall = recall_score(y_train, y_train_preds)
test_recall = recall_score(y_test, y_test_preds)

train_auc = roc_auc_score(y_train, y_train_preds)
test_auc = roc_auc_score(y_test, y_test_preds)

print(f"Train Accuracy: {train_acc}")
print(f"Test Accuracy: {test_acc}")

print(f"Train Recall: {train_recall}")
print(f"Test Recall: {test_recall}")

print(f"Train AUC: {train_auc}")
print(f"Test AUC: {test_auc}")

Train Accuracy: 1.0
Test Accuracy: 0.52
Train Recall: 1.0
Test Recall: 0.95
Train AUC: 1.0
Test AUC: 0.6275


### HOG SVM

In [18]:
from sklearn.model_selection import GridSearchCV

In [19]:
param_grid = {"C": [0.1, 1, 10, 100, 1000], 
              "tol": [1e-3,1e-4,1e-5]}

gcv = GridSearchCV(LinearSVC(random_state=42), param_grid)

gcv.fit(X_train, y_train)

y_train_preds = svc.predict(X_train)
y_test_preds = svc.predict(X_test)

train_acc = accuracy_score(y_train, y_train_preds)
test_acc = accuracy_score(y_test, y_test_preds)

train_recall = recall_score(y_train, y_train_preds)
test_recall = recall_score(y_test, y_test_preds)

train_auc = roc_auc_score(y_train, y_train_preds)
test_auc = roc_auc_score(y_test, y_test_preds)

print(f"Train Accuracy: {train_acc}")
print(f"Test Accuracy: {test_acc}")

print(f"Train Recall: {train_recall}")
print(f"Test Recall: {test_recall}")

print(f"Train AUC: {train_auc}")
print(f"Test AUC: {test_auc}")

Train Accuracy: 1.0
Test Accuracy: 0.53
Train Recall: 1.0
Test Recall: 0.7833333333333333
Train AUC: 1.0
Test AUC: 0.5933333333333333


In [20]:
print(gcv.best_estimator_)

LinearSVC(C=0.1, random_state=42, tol=0.001)
