# # Machine Learning Homework 6
Run each cell of the notebook in order, make sure to change "directory" in the following cell to the path to your test folder

## Import needed Utilities and Packages

In [None]:
directory = "/Users/Charlie/Desktop/Tester/" #define this as YOUR path to the 50_categories folder 
#Please end this path with '/' or the file finder will not work

In [14]:
# need to install PIL / pillow to run this
# $ pip install Pillow

#Import all the Modules Needed
import os
import numpy as np
from scipy import stats
import pandas as pd
from PIL import Image
from skimage import color
from skimage.util.dtype import dtype_range
from skimage.util import img_as_ubyte
from skimage.morphology import disk
from skimage.filters.rank import gradient
from skimage.filters import roberts, sobel, threshold_otsu
from skimage.feature import corner_harris, corner_peaks, blob_log, blob_doh, blob_dog
from sklearn import model_selection, metrics, cross_validation
from sklearn.ensemble import RandomForestClassifier
from sklearn import model_selection, metrics, cross_validation, preprocessing
from sklearn.ensemble import RandomForestClassifier

## Feature Extraction on the Test Directory

In [15]:
#This function walks through all the files in directory to generate an iterable list of file paths to submit later for feature extraction

def directory_search(directory):
    """
    This function takes the input "directory" which is a file path i.e. "/new/directory/path/" to a directory 
    that contains files like:
          validation1.jpg
          validation2.jpg
          ...
    and returns a list with all file paths to images in "directory"
    """
    
    directory_list = [] #initialize list of lists to collect file names 
    subdirectories = [f for f in os.listdir(directory) if not f.startswith('.')] #ignores hidden folders
    for i in range(len(subdirectories)):
        file = subdirectories[i]
        file_name = directory+file
        directory_list.append(file_name)
    return directory_list

#Assemble directory List w/above function
list_of_paths = directory_search(directory)

In [9]:
def feature_extraction(path):
    """This function takes as input a file path to an image and returns features about that image"""
    #Parse file name
    parts = path.split('/')
    file_name = parts[-1]
    
    #Read image
    im = Image.open(path)
    image = np.array(im)
    #Basic Feature Extraction
    
    #Image Size
    y = image.shape[0]
    x = image.shape[1]
    aspect_ratio = x/y

    #Image Color
    if len(image.shape) == 2: #image is grey scale
        image_gray = image
        red_val = 'nan'
        grn_val = 'nan'
        blu_val = 'nan'
        most_freq_color = 'nan' #will clear later
    elif len(image.shape) == 3: #image is RGB color
    
    #Most Frequent Color (really go with 2nd most freq, many of the images will have white borders and make it the most common, which is not helpful)
        pixels = im.getcolors(x * y)
        most_frequent_pixel = pixels[1]
    #Convert RGB Values to single unique number
        most_freq_color = 65536*most_frequent_pixel[1][0] + 256*most_frequent_pixel[1][1] + most_frequent_pixel[1][2]

    #Avg Value per Color channel
        red_val = np.mean(image[:,:,0])
        grn_val = np.mean(image[:,:,1])
        blu_val = np.mean(image[:,:,2])
    
    #Grey Scale value 
    image_gray = color.rgb2gray(image)
    grey_val = np.mean(image_gray)

    #Outsu Binary Score
    thresh = threshold_otsu(image_gray)
    binary = image_gray > thresh
    binary_per = np.sum(binary)/(x*y) #since we'll be comparing images of different sizes, its more appropriate to use percent of image converted in the binary and not total binary score (# of black pixels)

    #Sharpness and Sharpness Frequencies
    selection_element = disk(5) # matrix of n pixels with a disk shape
    sharpness = (gradient(image_gray, selection_element))
    mean_sharp=np.mean(sharpness)
#     sharp = sharpness.flatten()
#     shrp = np.fft.fft(sharp)
#     freq = np.fft.fftfreq(shrp.size, d=1)
#     avg_sharp_freq = np.mean(freq)

    #Edge Detection
    edge_roberts = roberts(image_gray)
    edge_sobel = sobel(image_gray)
    roberts_score = np.sum(edge_roberts)/(x*y) #normalize to size of image
    sobel_score = np.sum(edge_sobel)/(x*y)

#     #Blobs - Difference of Gaussian [x coord, y coord, radius size]
    blobs_dog = blob_dog(image_gray, max_sigma=30, threshold=.1)
#     # use this method for image coordinates for fitting if corners detection fails later
    
    #Blobs - Laplacian of Gaussian [x coord, y coord, radius size]
    blobs_log = blob_log(image_gray, max_sigma=30, num_sigma=10, threshold=.1) 
    gaussian_blobs = len(blobs_log)
    guassian_means = np.mean(blobs_log[:,2])

    #Blobs - Determinant of Hessian [x coord, y coord, radius size]
    blobs_doh = blob_doh(image_gray, max_sigma=30, threshold=.01) 
    hessain_blobs = len(blobs_doh)
    hessain_means = np.mean(blobs_doh[:,2])
    
    #Linear Fitting With Corners Detection (Linear Fit to detect corner locations, help determine shape orientation)
    coords = corner_peaks(corner_harris(image_gray), min_distance=5)
    if coords.shape[0] == 0: #for some reason, corners fails on certain images, use diff method to find feature coordinates
        coords=np.empty([len(blobs_dog),2])
        coords[:,0] = blobs_dog[:,0]
        coords[:,1] = blobs_dog[:,1]
        
    lin_slope, lin_intercept, lin_r_value, lin_p_value, lin_std_err = stats.linregress(coords[:,0],coords[:,1])
    #lin_rsq_value = lin_r_value**2

    return [file_name, x, y, aspect_ratio, most_freq_color, red_val, grn_val, blu_val, grey_val,
          binary_per, mean_sharp, roberts_score, sobel_score,
          lin_slope, lin_intercept, lin_std_err, gaussian_blobs, guassian_means, hessain_blobs, hessain_means]

#lin_rsq_value, lin_p_value, avg_sharp_freq : unused features

In [10]:
#concurrent.futures for parallelizing the feature extraction

#Runs the Feature Extraction Function (above) on the given file path for the test set. And returns a csv with the filenames and feature vals
#Depending on number of validation files, this may take a few minutes (took ~25 mins on the full 50 categories on my laptop)

from concurrent.futures import ProcessPoolExecutor
e = ProcessPoolExecutor()
future = e.submit(feature_extraction,list_of_paths)
results = list(e.map(feature_extraction, list_of_paths)) 
#returns a list of lists of the results from feature extraction
#collected into a pandas data frame below
e.shutdown

#Organize Extracted Features
columns=['file name','x', 'y','aspect_ratio', 'most_freq_color', 'red_val', 'grn_val', 'blu_val', 'grey_val','binary_per', 
         'mean_sharp','roberts_score', 'sobel_score', 'lin_slope', 'lin_intercept', 'lin_std_err',
         'gaussian_blobs', 'guassian_means', 'hessain_blobs', 'hessain_means']

unknown_vals = pd.DataFrame(results,columns=columns)

#Save Data to csv so dont have to wait again
unknown_vals.to_csv('unknown_vals', sep='\t')


#Ignore the error warnings...

  .format(dtypeobj_in, dtypeobj_out))
  .format(dtypeobj_in, dtypeobj_out))
  .format(dtypeobj_in, dtypeobj_out))
  .format(dtypeobj_in, dtypeobj_out))
  .format(dtypeobj_in, dtypeobj_out))
  .format(dtypeobj_in, dtypeobj_out))
  .format(dtypeobj_in, dtypeobj_out))
  .format(dtypeobj_in, dtypeobj_out))


## Prepare Data for sklearn

In [16]:
#Import Data From Feature Processing
train_data = pd.read_csv('category_vals', delimiter = '\t',index_col=0) #this csv is from feature processing on the full 50 categories folder
test_data = pd.read_csv('unknown_vals', delimiter = '\t',index_col=0) #this is from YOUR testing set

#Shuffle the rows of the training data so that categories mix
train_data = train_data.sample(frac=1).reset_index(drop=True) 

#Some Pre-processing to convert from pandas form to useable np.arrays in sklearn

#convert infinities to NaN
train_data = train_data.replace([np.inf, -np.inf], np.nan)
test_data = test_data.replace([np.inf, -np.inf], np.nan)
#remove NaNs from data set (set to 0)
train_data = train_data.fillna(value=0)
test_data = test_data.fillna(value=0)

size = train_data.shape 
Y_pos = size[1]-1 #will use later when extracting the category from the data frame. Categories in the trainging set are in the last column of the dataframe
# data_len = size[0]
    
#split data into X (features) and Y (categories)
#remove the file name and the category from the features
X_train = train_data.iloc[:,1:-1] 
X_test = test_data.iloc[:,1:] #testing data does not come with category

#Scale the features AND convert the X_ dataframes into an np.arrays useable in sklearn
X_train_scaled = preprocessing.scale(X_train)
X_test_scaled = preprocessing.scale(X_test)

# Extract Categories of the Training Set
Y = train_data.iloc[:,Y_pos]
Y = Y.as_matrix()

#training set
Xtr = X_train_scaled
Ytr = Y
print("training size: " + str(len(Ytr)))
# testing set
Xte = X_test_scaled
print("testing size: " + str(len(Xte)))

#Concatenates Features to number of digits useable in sklearn - feature extraction returns values w/too much specificity to be used as dtype float32
Xtr = np.around(Xtr, decimals=8)
Xte = np.around(Xte, decimals=8)

training size: 4244
testing size: 31


## Run the Random Forest

In [23]:
# Create a classifier -instantiate classifier object
classifier = RandomForestClassifier(n_estimators=50)
# fit the classification model on training set
classifier.fit(Xtr, Ytr)
# make predictions for testing set
predictions = classifier.predict(Xte)

In [36]:
#Save Predicitions as Text File
Unknown_Category_Predictions=[]
Unknown_Category_Predictions.append(('Filename', 'Predicted Category'))
Unknown_Category_Predictions.append(('-------------------------------'))
for i in range(len(predictions)):
    parts = list_of_paths[i].split('/')
    file_name = parts[-1]
    Unknown_Category_Predictions.append((file_name,predictions[i]))
    

[('Filename', 'Predicted Category'),
 '-------------------------------',
 ('goose_0007.jpg', 'goat'),
 ('gorilla_0002.jpg', 'bat'),
 ('gorilla_0003.jpg', 'gorilla'),
 ('goose_0006.jpg', 'snake'),
 ('goose_0012.jpg', 'cormorant'),
 ('goose_0004.jpg', 'goose'),
 ('goose_0010.jpg', 'goose'),
 ('gorilla_0001.jpg', 'starfish'),
 ('goose_0011.jpg', 'airplanes'),
 ('goose_0005.jpg', 'goose'),
 ('goose_0001.jpg', 'airplanes'),
 ('gorilla_0004.jpg', 'bat'),
 ('gorilla_0005.jpg', 'airplanes'),
 ('goose_0002.jpg', 'blimp'),
 ('gorilla_0007.jpg', 'iguana'),
 ('kangaroo_0009.jpg', 'mars'),
 ('kangaroo_0008.jpg', 'bat'),
 ('gorilla_0006.jpg', 'leopards'),
 ('goose_0003.jpg', 'duck'),
 ('kangaroo_0005.jpg', 'bat'),
 ('kangaroo_0010.jpg', 'elk'),
 ('kangaroo_0004.jpg', 'snake'),
 ('gorilla_0008.jpg', 'owl'),
 ('kangaroo_0006.jpg', 'kangaroo'),
 ('kangaroo_0007.jpg', 'bat'),
 ('gorilla_0009.jpg', 'airplanes'),
 ('goose_0008.jpg', 'airplanes'),
 ('kangaroo_0003.jpg', 'mussels'),
 ('kangaroo_0002.jpg', '

In [None]:
with open("file.txt", "w") as output:
    output.write(str(values))