In [13]:
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Created on Fri Jan 12 17:36:09 2018

@author: goncalofigueira
"""
# =============================================================================
# MODULES IMPORT
# =============================================================================
from __future__ import print_function
import sys
import os
from tqdm import tqdm
import cv2
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import auc
from sklearn.metrics import roc_curve
from sklearn.metrics import f1_score
import scipy as sc
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA
from FisherVector import FeatureExtract
import pickle
from sklearn import preprocessing
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest

# Make the project source directory the working directory, report it, and
# put its "utils" sub-directory on the import path for the local imports below.
os.chdir("/Users/goncalofigueira/Desktop/gf/capstone_project/src/")
cwd = os.getcwd()
print(cwd)
sys.path.append(os.path.join(cwd, "utils"))
from data_utils import getFileList, sortTarget, ReadImage, rgb_normalized
from FisherVector import FeatureExtract, computeFV
from sklearn.model_selection import RepeatedStratifiedKFold


/Users/goncalofigueira/Desktop/gf/capstone_project/src


In [14]:
#==============================================================================
# IMAGE PARAMETERS
#==============================================================================
# Dataset root, image file extension and the fraction of images held out for
# testing.
path = '/Users/goncalofigueira/Documents/capstone_project/datasets/ICIAR2018_BACH_Challenge/Photos/'
im_type = '.tif'
test_perc = 0.2

#==============================================================================
# GET IMAGE LIST AND INFO
#==============================================================================
# Image paths are kept as a numpy array so they can be indexed with the
# train/test index arrays later on.
im_folder = np.array(getFileList(path, im_type))

# The first .csv file found under `path` holds the per-image information; it
# has no header row, so the two column names are assigned explicitly.
csv_path = getFileList(path, '.csv')[0]
im_info = pd.read_csv(csv_path, header=None)
im_info.columns = ['filename', 'target']

# =============================================================================
# MATCH IMAGE LIST AND LABELS
# =============================================================================
# NOTE(review): sortTarget presumably reorders the label rows to follow the
# order of `im_folder` -- confirm in data_utils.
im_info = sortTarget(im_folder, im_info)

# Encode the class labels as integers.
le = preprocessing.LabelEncoder()
T = np.array(le.fit_transform(im_info.target))

In [15]:
# =============================================================================
# TRAIN/TEST SPLIT
# =============================================================================
# A single stratified shuffle split; with n_splits = 1 the splitter yields
# exactly one (train, test) pair of index arrays, so take it directly.
split = StratifiedShuffleSplit(n_splits=1, test_size=test_perc, random_state=0)
train_index, test_index = next(split.split(im_folder, T))
train_files, test_files = train_index, test_index

# Encoded labels for each partition.
y_train, y_test = T[train_files], T[test_files]

In [None]:
# =============================================================================
# FISHER VECTOR PARAMETERS
# =============================================================================
# n_cmp : number of PCA components kept for the SIFT descriptors
# k     : number of GMM components (centroids)
# fnum  : number of SIFT features requested from the detector per image
n_cmp, k, fnum = 10, 512, 8192

In [None]:
# ============================================================================
# EXTRACT SIFT DESCRIPTORS FROM TRAIN SET
# =============================================================================
# Collects the normalised SIFT descriptors of every training image into one
# 2-D array (`dictionary`, one descriptor per row) used to fit the PCA / GMM.

# The detector does not depend on the image, so it is built once up front
# instead of once per iteration.
sift = cv2.xfeatures2d.SIFT_create(nfeatures = fnum)

dictionary = []
for file in tqdm(im_folder[train_files]):
    im = ReadImage(file)
    im_gray = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)

    # Extract sift descriptors
    kp, raw_descriptors = sift.detectAndCompute(im_gray, None)

    # detectAndCompute yields no descriptor array when the image has no
    # keypoints; such an image contributes nothing to the dictionary.
    if raw_descriptors is None or len(raw_descriptors) == 0:
        continue

    # RootSIFT-style normalisation: L1-normalise each descriptor (the small
    # epsilon guards against division by zero), then take the element-wise
    # square root.
    descriptors = raw_descriptors / (raw_descriptors.sum(axis=1, keepdims=True) + 1e-7)
    descriptors = np.sqrt(descriptors)

    dictionary.append(descriptors)

if not dictionary:
    raise ValueError("No SIFT descriptors could be extracted from the training images")

# Images generally yield different numbers of descriptors, so the list is
# ragged; stack it row-wise directly rather than first wrapping it in an
# array (which fails for ragged input on recent NumPy versions).
dictionary = np.concatenate(dictionary, axis=0).astype(np.float64)

In [None]:
# =============================================================================
# APPLY PCA TO DESCRIPTORS LIBRARY
# =============================================================================
# Standardise the full descriptor library, then reduce it to `n_cmp` whitened
# principal components. Both fitted transformers are handed to FeatureExtract
# later, so they must be fitted on the same data and in the same order.
#
# The scaler is fitted on (and applied to) the whole `dictionary`; fitting it
# on `descriptors` would only see the last image processed by the extraction
# loop and would leave the PCA trained on unscaled data.
# NOTE(review): assumes FeatureExtract applies `scaler` before `pca` --
# confirm in FisherVector.
sift_scaler = preprocessing.StandardScaler()
dictionary = sift_scaler.fit_transform(dictionary)

# random_state makes the projection reproducible when a randomized SVD solver
# is selected (same seed as the other estimators in this notebook).
sift_pca = PCA(n_components=n_cmp, whiten=True, random_state=42)
dictionary = sift_pca.fit_transform(dictionary)
dictionary = np.float32(dictionary)

#with open('pca_transform.pickle', 'wb') as handle:
#    pickle.dump(sift_pca, handle, protocol=pickle.HIGHEST_PROTOCOL)
#with open('scaler.pickle', 'wb') as handle:
#    pickle.dump(sift_scaler, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
## =============================================================================
## BUILD DICTIONARY MODEL
## =============================================================================
# Fit a diagonal-covariance Gaussian mixture with `k` components on the
# PCA-reduced descriptors. The fit starts from a random initialisation, so the
# seed is fixed (same value as the other estimators in this notebook) to make
# the dictionary -- and every Fisher vector derived from it -- reproducible.
gmm_pca = GaussianMixture(n_components = k, covariance_type = "diag", random_state = 42).fit(dictionary)

In [None]:
# =============================================================================
# COMPUTE FISHER VECTORS FOR TRAIN SET
# =============================================================================
# One row per training image. The row length is k + 2 * d * k, where d is the
# number of columns of the PCA-reduced `dictionary`.
fv_dim = k + 2 * dictionary.shape[1] * k
X_train = np.empty((y_train.shape[0], fv_dim))

for idx, file in enumerate(tqdm(im_folder[train_files])):
    X_train[idx, :] = FeatureExtract(file, nkeys = fnum, pca = sift_pca, gmm = gmm_pca, scaler = sift_scaler)

In [None]:
# =============================================================================
# COMPUTE FISHER VECTORS FOR TEST SET
# =============================================================================
# One row per test image, encoded with the scaler / PCA / GMM fitted on the
# training descriptors. The row length is k + 2 * d * k, where d is the number
# of columns of the PCA-reduced `dictionary`.
fv_dim = k + 2 * dictionary.shape[1] * k
X_test = np.empty((y_test.shape[0], fv_dim))

for idx, file in enumerate(tqdm(im_folder[test_files])):
    X_test[idx, :] = FeatureExtract(file, nkeys = fnum, pca = sift_pca, gmm = gmm_pca, scaler = sift_scaler)

In [None]:
# =============================================================================
# PRE PROCESSING
# =============================================================================
# Alternative feature selection, kept for reference:
#ch2 = SelectKBest(chi2, k=100)
#X_train = ch2.fit_transform(X_train, y_train)
#X_test = ch2.transform(X_test)

# Alternative scaler, kept for reference:
#scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1), copy=True)

# Reduce the Fisher vectors to 20 whitened principal components and then
# standardise them.
pca = PCA(n_components=20, whiten=True, random_state=42)
scaler = preprocessing.StandardScaler()

# Both transformers are fitted on the training set only; the test set is
# pushed through the already-fitted transformers.
X_train = scaler.fit_transform(pca.fit_transform(X_train))
X_test = scaler.transform(pca.transform(X_test))

In [None]:
# =============================================================================
# SVM MODEL parameters
# =============================================================================
# Cross-validation scheme used by the grid search: 5 stratified folds,
# repeated 200 times with a fixed seed.
sss = RepeatedStratifiedKFold(n_splits=5, n_repeats=200, random_state=42)

# Candidate values: C = 2**e for e in [0, 1) in steps of 0.05, and 20 gamma
# values spaced logarithmically between 1e-2 and 1e-1.
C_range = np.power(2., np.arange(0, 1, step=0.05))
g_range = np.logspace(-2, -1, num=20)

# Grid actually searched (RBF kernel); other kernels kept for reference.
tuned_parameters = [
    {'kernel': ['rbf'], 'gamma': g_range, 'C': C_range},
]
#tuned_parameters = [{'kernel': ['linear'],  'C': C_range}]
#tuned_parameters = [{'kernel': ['poly'],  'C': C_range,'degree': [2,3,4,5,6,7,8]}]


In [None]:
# ==============================================================================
# GRID SEARCH
# ==============================================================================
# Metrics to optimise; each one is searched with its macro-averaged variant.
scores = ['f1']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    # Exhaustive search over `tuned_parameters`, cross-validated with `sss`
    # and using all available cores.
    svc = SVC(cache_size=5000, random_state=42, decision_function_shape='ovr')
    clf = GridSearchCV(
        svc,
        tuned_parameters,
        cv=sss,
        scoring='%s_macro' % score,
        n_jobs=-1,
    )

    # -------------------------------------------------------------------------
    # COMPUTE PARAMETERS (timed)
    # -------------------------------------------------------------------------
    t2 = time.time()
    clf.fit(X_train, y_train)
    elapsed2 = time.time() - t2

    print()
    print("Best parameters set found on development set:")
    print(clf.best_params_)
    print()

    print('Training time: ', elapsed2)


In [None]:
# ==============================================================================
# EVALUATION
# ==============================================================================
# Evaluates the best estimator found by the grid search (`clf`, fitted in the
# previous cell). The code sits at top level so that this cell can be executed
# on its own; indented code at the start of a cell is a syntax error.
clf2 = clf.best_estimator_

# ==============================================================================
# TRAIN SET
# ==============================================================================
print("Classification on training set:")
y_true, y_pred = y_train, clf2.predict(X_train)
#print('Confusion matrix:')
#print(confusion_matrix(y_true, y_pred))
print(" Train set f1 score: " + str(f1_score(y_true, y_pred, average='macro')))

# ==============================================================================
# TESTING
# ==============================================================================
y_true, y_pred = y_test, clf2.predict(X_test)
# Decision-function values on the test set, taken from the same estimator
# that produced the predictions.
y_pred_ci = clf2.decision_function(X_test)
print("Classification on test set:")
print(classification_report(y_true, y_pred))
print('Confusion matrix:')
print(confusion_matrix(y_true, y_pred))
