In [57]:
#Run the Master Code to Call all Feature Extraction Protocols
# need to install PIL / pillow to run this
# $ pip install Pillow

#Import all the Modules Needed for FEATURE_EXTRACTION
import os
from concurrent.futures import ProcessPoolExecutor

import numpy as np
import pandas as pd
from PIL import Image
from scipy import stats
from skimage import color
from skimage.feature import corner_harris, corner_peaks, blob_log, blob_doh, blob_dog
from skimage.filters import roberts, sobel, threshold_otsu
from skimage.filters.rank import gradient
from skimage.morphology import disk
from skimage.util import img_as_float, img_as_ubyte
from skimage.util.dtype import dtype_range

In [58]:
#This function walks through all the files in the main 50_categories folder
# to generate an iterable list of file paths to submit later for feature extraction

def directory_search(directory):
    """Collect the paths of every file found one level below ``directory``.

    The expected layout is ``directory/<folder>/<file>``. Hidden entries
    (names starting with '.') are ignored at both levels, and top-level
    entries that are not folders are skipped.

    Parameters
    ----------
    directory : str
        Path of the root folder, with or without a trailing separator.

    Returns
    -------
    list of str
        One path per file, sorted by folder name and then file name so the
        result is the same on every run (os.listdir order is arbitrary).
    """
    directory_list = [] #flat list of file paths, one entry per file
    for folder in sorted(os.listdir(directory)):
        #os.path.join copes with a root given without a trailing '/'
        subdirectory = os.path.join(directory, folder)
        #ignore hidden folders and stray files sitting next to the folders
        if folder.startswith('.') or not os.path.isdir(subdirectory):
            continue
        for file in sorted(os.listdir(subdirectory)):
            if not file.startswith('.'): #ignores hidden files
                directory_list.append(os.path.join(subdirectory, file))
    return directory_list

#Assemble the list of file paths by calling directory_search on the dataset root
# `directory` is a machine-specific absolute path: edit it before running on another computer
directory = '/Users/Charlie/Desktop/50_categories/' #define this as YOUR path to the 50_categories folder
# list_of_paths is the iterable later handed to the feature extraction
list_of_paths = directory_search(directory) 

In [61]:
def _second_most_frequent_color(im, n_pixels):
    """Return the second most frequent pixel colour of ``im`` as one integer.

    The runner-up is used rather than the winner because many of the images
    have white borders, which would otherwise be the most common colour.

    Parameters
    ----------
    im : PIL image
        Image whose pixels are tuples starting with (R, G, B).
    n_pixels : int
        Upper bound on the number of distinct colours, passed to getcolors().

    Returns
    -------
    int
        ``65536*R + 256*G + B`` of the selected colour.
    """
    #getcolors() returns an UNSORTED list of (count, pixel) tuples, so it has
    # to be ranked by count before an entry can be picked by position
    ranked = sorted(im.getcolors(n_pixels), key=lambda entry: entry[0], reverse=True)
    #a single flat colour has no runner-up: fall back to the only colour present
    pixel = ranked[1][1] if len(ranked) > 1 else ranked[0][1]
    #Convert RGB Values to single unique number
    return 65536*pixel[0] + 256*pixel[1] + pixel[2]


def _mean_blob_size(blobs):
    """Mean of the third column of a blob array, or NaN when no blob was found."""
    return np.mean(blobs[:, 2]) if len(blobs) else np.nan


def _linear_fit(coords):
    """Least-squares line through ``coords[:, 0]`` (x) and ``coords[:, 1]`` (y).

    Returns (slope, intercept, r squared, p value, std err). All five values
    are NaN when a regression is undefined (fewer than two points, or every
    x value identical) so that one degenerate image cannot abort a whole run.
    """
    xs, ys = coords[:, 0], coords[:, 1]
    if xs.size < 2 or np.all(xs == xs[0]):
        return (np.nan,) * 5
    slope, intercept, r_value, p_value, std_err = stats.linregress(xs, ys)
    return slope, intercept, r_value**2, p_value, std_err


def feature_extraction(path):
    """Extract a fixed-length list of features from one image file.

    Parameters
    ----------
    path : str
        Path of the image, laid out as ``.../<category>/<file name>``.

    Returns
    -------
    list
        [file_name, x, y, most_freq_color, red_val, grn_val, blu_val, grey_val,
        binary_per, mean_sharp, avg_sharp_freq, roberts_score, sobel_score,
        lin_slope, lin_intercept, lin_rsq_value, lin_p_value, lin_std_err,
        gaussian_blobs, guassian_means, hessain_blobs, hessain_means, category].
        Colour features are NaN for grey scale images; fit and blob-size
        features are NaN when nothing could be measured.
    """
    #Category and file name are the last two path components, so this works
    # wherever the dataset folder lives (no hard-coded path depth)
    file_name = os.path.basename(path)
    category = os.path.basename(os.path.dirname(path))

    #Read image (the with-block closes the file handle once the pixels are copied out)
    with Image.open(path) as im:
        image = np.array(im)

        #Image Size
        y, x = image.shape[:2]

        is_gray = image.ndim == 2
        #Most Frequent Color (really the 2nd most frequent, see helper)
        most_freq_color = np.nan if is_gray else _second_most_frequent_color(im, x * y)

    #Image Color
    if is_gray: #image is grey scale: no colour information
        #real NaN (not the string 'nan') keeps the DataFrame columns numeric
        red_val = grn_val = blu_val = np.nan
        #already single channel: only rescale to floats, rgb2gray expects RGB input
        image_gray = img_as_float(image)
    else: #image is RGB color
        #Avg Value per Color channel
        red_val = np.mean(image[:,:,0])
        grn_val = np.mean(image[:,:,1])
        blu_val = np.mean(image[:,:,2])
        image_gray = color.rgb2gray(image)

    #Grey Scale value
    grey_val = np.mean(image_gray)

    #Otsu Binary Score
    thresh = threshold_otsu(image_gray)
    binary = image_gray > thresh
    #images differ in size, so use the fraction of pixels above the threshold, not the raw count
    binary_per = np.sum(binary)/(x*y)

    #Sharpness and Sharpness Frequencies
    selection_element = disk(5) # matrix of n pixels with a disk shape
    #rank filters work on integer images; converting explicitly avoids the precision-loss warning
    sharpness = gradient(img_as_ubyte(image_gray), selection_element)
    mean_sharp = np.mean(sharpness)
    #NOTE(review): this is the mean of the FFT sample-frequency grid, which depends only on
    # the number of pixels and not on the image content. The value is kept unchanged for
    # compatibility (the unused FFT itself is no longer computed) - confirm what was intended.
    avg_sharp_freq = np.mean(np.fft.fftfreq(sharpness.size, d=1))

    #Edge Detection
    roberts_score = np.sum(roberts(image_gray))/(x*y) #normalize to size of image
    sobel_score = np.sum(sobel(image_gray))/(x*y)

    #Blobs - Laplacian of Gaussian (count, and mean of the third column)
    blobs_log = blob_log(image_gray, max_sigma=30, num_sigma=10, threshold=.1)
    gaussian_blobs = len(blobs_log)
    guassian_means = _mean_blob_size(blobs_log)

    #Blobs - Determinant of Hessian (count, and mean of the third column)
    blobs_doh = blob_doh(image_gray, max_sigma=30, threshold=.01)
    hessain_blobs = len(blobs_doh)
    hessain_means = _mean_blob_size(blobs_doh)

    #Linear Fitting With Corners Detection (Linear Fit to detect corner locations, help determine shape orientation)
    coords = corner_peaks(corner_harris(image_gray), min_distance=5)
    if coords.shape[0] == 0:
        #no corners found on this image: use the Difference of Gaussian blob centres instead
        # (only computed here because it is not needed otherwise)
        coords = blob_dog(image_gray, max_sigma=30, threshold=.1)[:, :2]

    lin_slope, lin_intercept, lin_rsq_value, lin_p_value, lin_std_err = _linear_fit(coords)

    return [file_name, x, y, most_freq_color, red_val, grn_val, blu_val, grey_val,
          binary_per, mean_sharp, avg_sharp_freq, roberts_score, sobel_score,
          lin_slope, lin_intercept, lin_rsq_value, lin_p_value, lin_std_err,
          gaussian_blobs, guassian_means, hessain_blobs, hessain_means, category]

In [62]:
#Parallelize the feature extraction across worker processes with concurrent.futures
#Be patient, this might take a second or two, or a few minutes
with ProcessPoolExecutor() as e: #leaving the with-block shuts the pool down
    #map calls feature_extraction once per path; an exception raised for any
    # image is re-raised here when its result is collected
    results = list(e.map(feature_extraction, list_of_paths))
#results is a list of per-image feature lists, in the same order as list_of_paths

#Organize Extracted Features
columns=['file name','x', 'y', 'most_freq_color', 'red_val', 'grn_val', 'blu_val', 'grey_val','binary_per', 'mean_sharp', 'avg_sharp_freq', 'roberts_score', 'sobel_score',
          'lin_slope', 'lin_intercept', 'lin_rsq_value', 'lin_p_value', 'lin_std_err',
          'gaussian_blobs', 'guassian_means', 'hessain_blobs', 'hessain_means', 'category']
category_vals = pd.DataFrame(results,columns=columns)

#NOTE: this cell only builds the DataFrame; nothing here writes it to a .CSV file

NameError: name 'rgb2gray' is not defined

In [6]:
category_vals #check if it worked! (displays the DataFrame built from the extraction results)

NameError: name 'category_vals' is not defined

In [43]:
#Scratch check of the colour statistics on a single image
path = '/Users/Charlie/Desktop/50_categories/gorilla/gorilla_0112.jpg'
im = Image.open(path)
image = np.array(im)

y, x = image.shape[:2]

#Most Frequent Color (really go with 2nd most freq, many of the images will have white borders and make it the most common, which is not helpful)
# getcolors() returns an unsorted list of (count, pixel) tuples, so rank it by count first
pixels = sorted(im.getcolors(x * y), key=lambda entry: entry[0], reverse=True)
most_frequent_pixel = pixels[1]


In [45]:
#Average value of each of the first three colour channels (red, green, blue)
red_val, grn_val, blu_val = (np.mean(image[:, :, channel]) for channel in range(3))


NameError: name 'image_gray' is not defined

In [54]:
image.shape[2] #size of the array's third axis (the one indexed 0, 1, 2 for the channel means)

3