## Preprocessing

In [1]:
import numpy as np
import os
import cv2
import sklearn
from sklearn.preprocessing import StandardScaler
import random
from sklearn.model_selection import train_test_split
from sklearn_extra.cluster import KMedoids
import scipy.cluster.vq as vq
from sklearn_extra.cluster import KMedoids

data_directory = 'C:\\Users\i\Documents\Python Scripts\SPM_docs\docs\docs_path'
categories = ['News', 'Resume', 'Scientific']

data = []
X = []
y = []

def load_data():
    '''
    Loading the data by the category, converting jpg into array.
    Output:
    X - images dataset
    y - labels
    '''
    img_height = 1000
    img_width = 800

    for category in categories:
        path = os.path.join(data_directory, category)
        class_number = categories.index(category)
        
        for img in os.listdir(path): 
            try:
                image_array = cv2.imread(os.path.join(path, img))
                resized_image_array = cv2.resize(image_array, (img_height, img_width))
                data.append([resized_image_array, class_number])
            except Exception as e:
                pass
            
    random.shuffle(data)
    
    for features, label in data:
        X.append(features) 
        y.append(label)

    return X, y



def denoising_images(image_path):
    """
    Denoising for better generalization.
    """

    thresh = cv2.threshold(image_path, 220, 255, cv2.THRESH_BINARY_INV)[1]
    
    kernel1 = cv2.getStructuringElement(cv2.MORPH_RECT, (2,2))
    opening = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel1)
    
    result = 255 - opening
    return result



def ORB_extractor(X):
    
    '''
    Extracting interesting points from images.
    
    Input: 
    X - list of image dataset
    Output:
    descriptors_scaled - array containing scaled descriptors
    descriptors_list - array containing images and corresponding set of descriptors
    
    '''
    
    #if type(X) is list:
        #pass
    #else:
        #X = [X]
    
    descriptors_list = []
    orb = cv2.ORB_create(nfeatures = 1200)

    for image in X:
        keypoint = orb.detect(image,None)
        keypoints, descriptor = orb.compute(image, keypoint)
        descriptors_list.append((image, descriptor))
    
    descriptors = descriptors_list[0][1]
    for image_path, descriptor in descriptors_list[1:]:
        descriptors = np.vstack((descriptors,descriptor))
        
    standard_scaler = StandardScaler().fit(descriptors)
    descriptors_scaled = standard_scaler.transform(descriptors)
    
    return descriptors_scaled, descriptors_list




def build_codebook(descriptors, voc_size):
    """
    Creating the codebook, vocabulary.
    
    Inupt: 
    descriptors - scaled array of descriptors
    voc_size  - vocabulary size (the number of desired clusters)
    Output:
    codebook - squeezed centers of descriptor clusters
    """
    features  = np.vstack((descriptor for descriptor in descriptors))
    kmedoids = KMedoids(n_clusters=voc_size,  init='k-medoids++', random_state = 0).fit(features)
    y_kmed = kmedoids.fit_predict(features)
    codebook = kmedoids.cluster_centers_.squeeze()
    return codebook



def vector_encoder(X, descriptors_list, codebook, voc_size):
    '''
    Vector Quantization
    
    Input: 
    X - image dataset
    descriptors_list - array: images and corresponding list of its descriptors
    codebook,
    voc_size  - vocabulary size (the number of desired clusters)
    Output:
    image_features - list of compressed images
    '''
    image_features = np.zeros((len(X),voc_size), "float32")
    
    for i in range(len(X)):
        words, distance = vq.vq(descriptors_list[i][1], codebook)    
        
        for w in words:
            image_features[i][w] += 1 
    
    standard_scaler = StandardScaler().fit(image_features)
    image_features = standard_scaler.transform(image_features)
    
    return image_features

In [2]:
X = load_data()[0]
y = load_data()[1]
X_codeboook = X[:40]
X_codebook_denoised = [denoising_images(img) for img in X_codeboook]

X_codebook_descriptors = ORB_extractor(X_codebook_denoised)[0]

codebook = build_codebook(X_codebook_descriptors,15)

  features  = np.vstack((descriptor for descriptor in descriptors))


In [None]:
import json, codecs
list_codebook = codebook.tolist() # nested lists with same data, indices
with codecs.open('codebook.json', 'w', encoding='utf-8') as handle:
        json.dump(list_codebook, handle, ensure_ascii=False, separators=(',', ':'), sort_keys=True, indent=4)

list_codebook_new = json.loads(codecs.open('codebook.json', 'r', encoding='utf-8').read())
_codebook = np.array(list_codebook_new)

# source: https://stackoverflow.com/questions/12309269/how-do-i-write-json-data-to-a-file
# https://stackoverflow.com/questions/26646362/numpy-array-is-not-json-serializable

In [6]:
X_denoised = [denoising_images(img) for img in X]



X_train, X_test, y_train, y_test = train_test_split(X_denoised, y, train_size = 0.6, random_state=42)


descriptors_list_train = ORB_extractor(X_train)[1]
descriptors_list_test = ORB_extractor(X_test)[1]


vector_encoder_train = vector_encoder(X_train, descriptors_list_train, codebook, 15) 
vector_encoder_test = vector_encoder(X_test, descriptors_list_test, codebook, 15)

In [41]:
_X_train = X_train[0].tolist() # nested lists with same data, indices
with codecs.open('X_train.json', 'w', encoding='utf-8') as handle:
        json.dump(_X_train, handle, ensure_ascii=False, separators=(',', ':'), sort_keys=True, indent=4)
_X_test = X_test[0].tolist() # nested lists with same data, indices
with codecs.open('X_test.json', 'w', encoding='utf-8') as handle:
        json.dump(_X_test, handle, ensure_ascii=False, separators=(',', ':'), sort_keys=True, indent=4)

with codecs.open('y_train.json', 'w', encoding='utf-8') as handle:
        json.dump(y_train, handle, ensure_ascii=False, separators=(',', ':'), sort_keys=True, indent=4)
with codecs.open('y_test.json', 'w', encoding='utf-8') as handle:
        json.dump(y_test, handle, ensure_ascii=False, separators=(',', ':'), sort_keys=True, indent=4)



_vector_encoder_train = vector_encoder_train.tolist() # nested lists with same data, indices
with codecs.open('vector_encoder_train.json', 'w', encoding='utf-8') as handle:
        json.dump(_vector_encoder_train, handle, ensure_ascii=False, separators=(',', ':'), sort_keys=True, indent=4)

_vector_encoder_test = vector_encoder_test.tolist() # nested lists with same data, indices
with codecs.open('vector_encoder_test.json', 'w', encoding='utf-8') as handle:
        json.dump(_vector_encoder_test, handle, ensure_ascii=False, separators=(',', ':'), sort_keys=True, indent=4)