# Programming workflow
### I - Build image dictionary
### II - Fit GMM model
### III - Compute Fisher Vector for Training and Test set
### IV - Grid Search Cross-validation SVM
### V - Prediction on test set




In [None]:
# -----------------------------------------------------------------------------
# Fisher vector parameters
# -----------------------------------------------------------------------------
n_cmp = 20    # number of PCA components kept per descriptor
k = 256       # number of GMM components (centroids)
fnum = 4096   # value passed as `nfeatures` to the SIFT detector

# -----------------------------------------------------------------------------
# Image parameters
# -----------------------------------------------------------------------------
path = '/Users/goncalofigueira/Documents/capstone_project/datasets/ICIAR2018_BACH_Challenge/Photos/'
im_type = '.tif'
test_perc = 0.2   # fraction of the images held out as test set

# -----------------------------------------------------------------------------
# Image list and label table
# -----------------------------------------------------------------------------
# All files of type `im_type` found under `path`, as a numpy array.
im_folder = np.array(getFileList(path, im_type))

# The first '.csv' file found under `path` holds the image information;
# it has no header row, so the column names are assigned by hand.
im_info = pd.read_csv(getFileList(path, '.csv')[0], header=None)
im_info.columns = ['filename', 'target']

# -----------------------------------------------------------------------------
# Match image list and labels
# -----------------------------------------------------------------------------
# NOTE(review): sortTarget presumably reorders im_info to follow im_folder —
# confirm against its definition.
im_info = sortTarget(im_folder, im_info)

# Encode the class labels as integers (0-3 according to the original note).
le = preprocessing.LabelEncoder()
T = np.array(le.fit_transform(im_info.target))

Split the dataset into a train set (80%) and a test set (20%).
Here I used a stratified shuffle split so that each class keeps the same proportion of samples in the train and test sets.

In [None]:
# Single stratified shuffle split; the test fraction is `test_perc` and the
# seed is fixed so the partition is reproducible.
split = StratifiedShuffleSplit(n_splits = 1, test_size = test_perc, random_state = 0)

# With n_splits = 1 the generator yields exactly one (train, test) index pair.
train_index, test_index = next(split.split(im_folder, T))
train_files, test_files = train_index, test_index

# Targets for each partition.
y_train, y_test = T[train_files], T[test_files]

## I - Build SIFT dictionary


### For each image in training set:
#### 1. Image pre - processing
#### 2. Compute SIFT descriptor
#### 3. Normalization

In [None]:
dictionary = []

# The detector configuration does not change between images, so build it once
# instead of on every iteration.
sift = cv2.xfeatures2d.SIFT_create(nfeatures = fnum)

for file in tqdm(im_folder[train_files]):

    # 1. Read image with opencv [1536, 2048, 3]
    im = ReadImage(file)

    # 2. Convert to grayscale
    im_gray = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)

    # 3. Compute SIFT descriptors (at most ~fnum per image)
    kp, descriptors = sift.detectAndCompute(im_gray, None)

    # detectAndCompute yields no descriptor array when no keypoint is found;
    # such an image contributes nothing to the dictionary.
    if descriptors is None or len(descriptors) == 0:
        continue

    # 4. Normalize each descriptor: L1-normalise, then element-wise sqrt
    #    (the epsilon guards against all-zero descriptors).
    descriptors /= (descriptors.sum(axis=1, keepdims=True) + 1e-7)
    descriptors = np.sqrt(descriptors)

    # 5. Collect the per-image descriptor matrix
    dictionary.append(descriptors)

# Stack the per-image (n_i, D) matrices into one (sum(n_i), D) float64 matrix.
# The images can yield different numbers of descriptors, so the list must be
# concatenated directly: wrapping it in np.asarray first would try to build a
# ragged array, which recent NumPy versions reject.
dictionary = np.concatenate(dictionary).astype(np.float64)

## II - Fit GMM model
#### Standardise descriptor features
#### Apply PCA-whitening
#### Fit GMM

In [None]:
# =============================================================================
# APPLY PCA TO DESCRIPTORS LIBRARY
# =============================================================================

# Scaling: fit the scaler on the WHOLE descriptor dictionary and scale it.
# (Fitting on `descriptors` would only use the last image processed by the
# dictionary loop and would leave `dictionary` unscaled, so the PCA/GMM would
# be trained on data that does not match what FeatureExtract produces with
# scaler.transform -> pca.transform.)
sift_scaler = preprocessing.StandardScaler()
dictionary = sift_scaler.fit_transform(dictionary)

# Apply PCA with whitening (n_cmp = 20); seeded for reproducible results.
# NOTE: `dictionary` is overwritten in place, so re-run the dictionary cell
# before re-running this one.
sift_pca = PCA(n_components=n_cmp, whiten=True, random_state=0)
dictionary = sift_pca.fit_transform(dictionary)

# Fit the diagonal-covariance GMM (k = 256) on the reduced descriptors;
# seeded because the GMM initialisation is random.
gmm_pca = GaussianMixture(n_components = k, covariance_type = "diag",
                          random_state = 0).fit(dictionary)

## III - Compute Fisher Vector for Training and Test set
#### 1. Apply same pre-processing
#### 2. Apply scaling and PCA transformation
#### 3. Compute Fisher Vector
#### 4. Normalise Fisher Vector

In [None]:
# -----------------------------------------------------------------------------
# Fisher vectors for the training images
# -----------------------------------------------------------------------------

# One row per training image; row length is k + 2 * n_cmp * k, the size of
# the vector returned for a k-component GMM over n_cmp-dimensional descriptors.
X_train = np.empty((len(y_train), k + 2 * n_cmp * k))

for idx, file in enumerate(tqdm(im_folder[train_files])):
    X_train[idx] = FeatureExtract(
        file,
        nkeys = fnum,
        pca = sift_pca,
        gmm = gmm_pca,
        scaler = sift_scaler,
    )


In [None]:
# -----------------------------------------------------------------------------
# Fisher vectors for the test images
# -----------------------------------------------------------------------------

# One row per test image; row length is k + 2 * n_cmp * k, the size of the
# vector returned for a k-component GMM over n_cmp-dimensional descriptors.
X_test = np.empty((len(y_test), k + 2 * n_cmp * k))

for idx, file in enumerate(tqdm(im_folder[test_files])):
    X_test[idx] = FeatureExtract(
        file,
        nkeys = fnum,
        pca = sift_pca,
        gmm = gmm_pca,
        scaler = sift_scaler,
    )

### FeatureExtract function

In [None]:
def FeatureExtract(im_file, nkeys, pca, gmm, scaler):
    """Compute the normalised Fisher vector of a single image.

    The image is read, converted to grayscale and described with SIFT; the
    descriptors are L1-normalised and square-rooted, standardised with
    `scaler`, projected with `pca`, encoded with `computeFV` and finally
    power- and L2-normalised.

    Parameters
    ----------
    im_file: path of the image, as accepted by `ReadImage`.
    nkeys: value passed as `nfeatures` to the SIFT detector.
    pca: fitted transformer exposing `transform` (applied after `scaler`).
    gmm: fitted mixture model passed to `computeFV`.
    scaler: fitted transformer exposing `transform` (applied first).

    Returns
    -------
    fv: 1-D array, the normalised Fisher vector returned by `computeFV`
        (length K + 2 * D * K per that function's docstring).

    Raises
    ------
    ValueError
        If no SIFT descriptor could be extracted from the image.
    """
    # read image
    im = ReadImage(im_file)

    # to gray
    im_gray = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)

    # extract SIFT descriptors
    sift = cv2.xfeatures2d.SIFT_create(nfeatures = nkeys)
    kp, descriptors = sift.detectAndCompute(im_gray, None)

    # Without keypoints there is no descriptor array to work on; fail with an
    # explicit message instead of an opaque TypeError on the division below.
    if descriptors is None or len(descriptors) == 0:
        raise ValueError("No SIFT descriptors found in image: %s" % im_file)

    # L1-normalise each descriptor, then take the element-wise square root
    # (the epsilon guards against all-zero descriptors).
    descriptors /= (descriptors.sum(axis=1, keepdims=True) + 1e-7)
    descriptors = np.sqrt(descriptors)

    # apply scaling, then pca transform
    descriptors = scaler.transform(descriptors)
    descriptors = pca.transform(descriptors)

    # compute Fisher Vector
    fv = computeFV(descriptors, gmm)

    # power-normalization (signed square root)
    fv = np.sign(fv) * np.abs(fv) ** 0.5

    # L2 normalize; an all-zero vector is left untouched so that it does not
    # turn into NaNs through a 0/0 division.
    norm = np.sqrt(np.sum(fv ** 2))
    if norm > 0:
        fv /= norm

    return fv


### computeFV function

In [None]:
def computeFV(xx, gmm):
    """Fisher vector of a set of descriptors under a fitted mixture model.

    Parameters
    ----------
    xx: array_like, shape (N, D) or (D, )
        Descriptors to encode; a single descriptor is promoted to one row.

    gmm: fitted mixture model
        Must expose `predict_proba`, `weights_`, `means_` and `covariances_`.
        `covariances_` is combined element-wise with (K, D) statistics, so a
        diagonal-covariance model is assumed.

    Returns
    -------
    fv: ndarray, shape (K + 2 * D * K, )
        Concatenated derivatives with respect to the mixing weights, the
        means and the variances.

    Reference
    ---------
    J. Krapac, J. Verbeek, F. Jurie.  Modeling Spatial Layout with Fisher
    Vectors for Image Categorization.  In ICCV, 2011.
    http://hal.inria.fr/docs/00/61/94/03/PDF/final.r1.pdf

    """
    samples = np.atleast_2d(xx)
    n_samples = samples.shape[0]

    # Posterior probability of each component for each descriptor (N x K).
    posteriors = gmm.predict_proba(samples)
    posteriors_t = posteriors.T

    # Sufficient statistics of order 0, 1 and 2, averaged over descriptors.
    stat0 = posteriors.sum(axis=0)[:, np.newaxis] / n_samples   # (K, 1)
    stat1 = np.dot(posteriors_t, samples) / n_samples           # (K, D)
    stat2 = np.dot(posteriors_t, samples ** 2) / n_samples      # (K, D)

    means = gmm.means_

    # Derivatives with respect to mixing weights, means and variances.
    grad_weights = stat0.squeeze() - gmm.weights_
    grad_means = stat1 - stat0 * means
    grad_vars = (
        - stat2
        - stat0 * means ** 2
        + stat0 * gmm.covariances_
        + 2 * stat1 * means
    )

    # Flatten everything into a single vector: [weights | means | variances].
    return np.hstack((grad_weights, grad_means.flatten(), grad_vars.flatten()))


## IV - Grid Search Cross-validation SVM

In [None]:
# Search grid for the RBF SVM: 13 log-spaced values each for C (1e-2 .. 1e10)
# and gamma (1e-9 .. 1e3).
C_range = np.logspace(start=-2, stop=10, num=13)
g_range = np.logspace(start=-9, stop=3, num=13)

# Five stratified shuffle splits, with the same hold-out fraction as the
# train/test split, serve as cross-validation folds.
cv = StratifiedShuffleSplit(n_splits=5, test_size=test_perc, random_state=42)

tuned_parameters = [dict(kernel=['rbf'], gamma=g_range, C=C_range)]

In [None]:
# Exhaustive grid search over `tuned_parameters`, scored with macro-averaged
# F1 so that model selection uses the same metric that is reported below.
# (The scorer name must be the plain string 'f1_macro'; the former
# '%f1_macro' % score expression referenced an undefined variable and did not
# produce a valid scorer name.)
clf = GridSearchCV(SVC(cache_size=2500), tuned_parameters, cv=cv,
                   scoring='f1_macro', n_jobs=-1)

# =============================================================================
#     COMPUTE PARAMETERS
# =============================================================================
t2 = time.time()
clf.fit(X_train, y_train)
elapsed2 = time.time() - t2
print()
print(clf.best_params_)
print()
print('Training time: ', elapsed2)

# ==============================================================================
# TRAIN SET
# ==============================================================================
# Best estimator, refitted on the full training set by GridSearchCV.
clf2 = clf.best_estimator_
print("Classification on training set:")
y_true, y_pred = y_train, clf2.predict(X_train)
#print('Confusion matrix:')
#print(confusion_matrix(y_true, y_pred))
print(" Train set f1 score: " + str(f1_score(y_true, y_pred, average='macro')))


## V - Predict Test Set

In [None]:
# ------------------------------------------------------------------------------
# Evaluation on the held-out test set
# ------------------------------------------------------------------------------
# Ground truth and predictions of the selected classifier for the test images.
y_true = y_test
y_pred = clf2.predict(X_test)

# Per-class precision / recall / F1 report.
print("Classification on test set:")
print(classification_report(y_true, y_pred))

# Confusion matrix of true versus predicted labels.
print('Confusion matrix:')
print(confusion_matrix(y_true, y_pred))