**Kate Groschner** <br>
**AY250 Homework 6**

## Question 1

In [4]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import skimage 
from skimage import io
from skimage.color import rgb2gray
import os
import skimage.feature
import skimage.filters
import skimage.color
from skimage.util import invert
from skimage.morphology import skeletonize
from scipy.signal import correlate2d
import pandas as pd
import cv2
from pathlib import Path
from joblib import Parallel, delayed
from tqdm import tqdm

def import_exImg():
    sample_images = {}
    if os.getcwd().split('/')[-1] != '50_categories':
        exit()
    files = os.listdir()
    for idx, direct in enumerate(files):
        if idx == 0:
            pass
        else:
            os.chdir(direct)
            file = os.listdir()[0]
            img = io.imread(file)
            sample_images[direct] = img
            os.chdir('..')
    print('Done!')
    return(sample_images)

def avg_red(img):
    """returns average value of red channel"""
    if len(img.shape) != 3:
        img = skimage.color.gray2rgb(img)
    avg = [img[:,:,0].mean()]
    return np.array(avg)

def avg_blue(img):
    """returns average value of blue channel"""
    if len(img.shape) != 3:
        img = skimage.color.gray2rgb(img)
    avg = [img[:,:,1].mean()]
    return np.array(avg)

def avg_green(img):
    """returns average value of green channel"""
    if len(img.shape) != 3:
        img = skimage.color.gray2rgb(img)
    avg = [img[:,:,2].mean()]
    return np.array(avg)

# edges from sobel
def sobel_edges(gray_image):
    """Returns histogram of edges and average of edges"""
    edges = skimage.filters.sobel(gray_image)
    edge_hist = np.histogram(edges,bins=250, density = True)[0]
    return edge_hist

# number of blobs from difference of gaussian
def blobs_dog(gray_image):
    """returns two features: average blob size and total number of blobs detected by difference of gaussians"""
    blob = skimage.feature.blob_dog(gray_image, max_sigma=50, threshold=.1)
    blobs = blob[:,2]
    avg_blob = blobs.mean()
    num_blobs = len(blobs)
    blob_info = np.array([avg_blob, num_blobs])
    return blob_info

def blobs_log(gray_image):
    """returns two features: average blob size and total number of blobs detected by laplace of gaussians"""
    blob = skimage.feature.blob_log(gray_image, max_sigma=5, num_sigma=10, threshold=.1)
    blobs = blob[:,2]
    avg_blob = blobs.mean()
    num_blobs = len(blobs)
    blob_info = np.array([avg_blob, num_blobs])
    return blob_info

def fft_hist(gray_image):
    """returns 250 bin histogram of frequencies from fft of image"""
    fft = np.log2(abs(np.fft.rfft2(gray_image)))
    fhist = np.histogram(fft,bins=250,density = True)[0]
    return fhist

def lbp_hist(gray_image):
    """returns 250 bin histogram of local binary patterns from image"""
    lbp = skimage.feature.local_binary_pattern(gray_image,30,30)
    lhist = np.histogram(lbp,bins=250,density = True)[0]
    return lhist

def center_cut(image):
    """returns a 1d array of length 1200 which is 20x20 center of image of all three channels flattened"""
    if len(image.shape) != 3:
        image = skimage.color.gray2rgb(image)
    middle = (image.shape[0]//2,image.shape[1]//2)
    cut = image[(middle[0]-10):(middle[0]+10),(middle[1]-10):(middle[1]+10),:].flatten()
    return cut

def avg_HSV(image):
    """returns the average for hue, saturation, and value from image"""
    if len(image.shape) == 3:
        img_out = skimage.color.rgb2hsv(image)
    else:
        img_out = skimage.color.rgb2hsv(skimage.color.grey2rgb(image))
    results = [img_out[:,:,0].mean(),img_out[:,:,1].mean(),img_out[:,:,2].mean()]
    return np.array(results)

def sift10(image):
    """returns 1d array of length 1280 which represents the first 10 SIFT features of the image"""
    if len(image.shape) != 3:
        image = skimage.color.gray2rgb(image)
    sift = cv2.xfeatures2d.SIFT_create()
    _, des = sift.detectAndCompute(image,None)
    if des.shape[0] <= 10:
        dif = 10 - des.shape[0]
        filler = np.zeros((dif,128))
        sift = np.concatenate((des,filler),axis=0).flatten()
    else:
        sift = des[:10,:].flatten()
    return sift

def color_hist(image):
    """returns 1d array of length 300 representing the color histogram for red,blue, and green channels"""
    hist = []
    if len(image.shape) == 3:
        for i in range(0,3):
            hist.append(np.histogram(image[:,:,i],bins=100, density = True)[0])
    else:
        for i in range(0,3):
            hist.append(np.histogram(image,bins=100, density = True)[0])
    hist = np.concatenate(hist)
    return hist

def get_features(file):
    image = io.imread(file)
    label = file.parent.name
    gray_image = rgb2gray(image)
#     print(file)
    features = [] 
    features.append(color_hist(image))
    features.append(sift10(image))
    features.append(avg_HSV(image))
    features.append(center_cut(image))
    features.append(lbp_hist(gray_image))
    features.append(fft_hist(gray_image))
    features.append(blobs_log(gray_image))
    features.append(blobs_dog(gray_image))
    features.append(sobel_edges(gray_image))
    features.append(avg_green(image))
    features.append(avg_blue(image))
    features.append(avg_red(image))
    features = np.concatenate(features)
    return (features,label)


In [2]:
def feature_frame(directory):
    files = list(Path(directory).glob('*/*.jpg'))
    np.random.seed(42)
    file_list = np.random.choice(files,100)
    features = Parallel(n_jobs=-1)(delayed(get_features)(file) for file in tqdm(file_list))
    print('Done!')
    feat_list, labels_list = zip(*features)
    df = pd.DataFrame.from_records(feat_list)
    df['Label'] = labels_list
    return df

In [5]:
directory = '/Users/cgroschner/Documents/pythonLearn/groschner-python-ay250-homework/hw_6/50_categories'
df = feature_frame(directory)
csv_name = directory + '/features.csv'
df.to_csv(csv_name)


  0%|          | 0/100 [00:00<?, ?it/s][A
  8%|▊         | 8/100 [00:00<00:02, 40.55it/s][A
 10%|█         | 10/100 [00:00<00:03, 28.38it/s][A
 14%|█▍        | 14/100 [00:00<00:03, 28.67it/s][A
 16%|█▌        | 16/100 [00:00<00:03, 22.50it/s][A
 18%|█▊        | 18/100 [00:00<00:03, 21.63it/s][A
 20%|██        | 20/100 [00:00<00:03, 21.43it/s][A
 22%|██▏       | 22/100 [00:01<00:03, 20.87it/s][A
 24%|██▍       | 24/100 [00:01<00:03, 19.03it/s][A
 26%|██▌       | 26/100 [00:01<00:05, 13.75it/s][A
 28%|██▊       | 28/100 [00:02<00:06, 10.38it/s][A
 30%|███       | 30/100 [00:03<00:07,  9.60it/s][A
 34%|███▍      | 34/100 [00:04<00:09,  6.83it/s][A
 35%|███▌      | 35/100 [00:05<00:09,  6.84it/s][A
 36%|███▌      | 36/100 [00:05<00:09,  6.85it/s][A
 37%|███▋      | 37/100 [00:05<00:09,  6.63it/s][A
 38%|███▊      | 38/100 [00:07<00:11,  5.39it/s][A
 40%|████      | 40/100 [00:07<00:10,  5.48it/s][A
 41%|████      | 41/100 [00:07<00:10,  5.52it/s][A
 42%|████▏     | 42/1

Done!


In [8]:
np.random.seed(42)
files = list(Path(directory).glob('*/*.jpg'))
file_list = np.random.choice(files,100)
file_list[4]

PosixPath('/Users/cgroschner/Documents/pythonLearn/groschner-python-ay250-homework/hw_6/50_categories/saturn/saturn_0002.jpg')

In [None]:
a = io.imread(file_list[0])
plt.imshow(a)

In [10]:
file_list[1]

PosixPath('/Users/cgroschner/Documents/pythonLearn/groschner-python-ay250-homework/hw_6/50_categories/speed-boat/speed-boat_0058.jpg')

In [None]:
b = sift10(a)

In [None]:
plt.plot(b)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Columns: 3541 entries, 0 to Label
dtypes: float64(3540), object(1)
memory usage: 2.7+ MB


In [7]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3531,3532,3533,3534,3535,3536,3537,3538,3539,Label
0,0.02511,0.01877,0.020958,0.034464,0.018808,0.026511,0.014981,0.020233,0.012167,0.015781,...,0.0,0.0,0.0,0.0,0.0,0.012332,32.523756,33.06942,30.501913,comet
1,0.001691,0.003303,0.002824,0.008314,0.00797,0.005322,0.003088,0.004229,0.003518,0.004731,...,0.0,0.0,0.014096,0.007048,0.0,0.007048,125.360291,133.424845,135.834574,speed-boat
2,0.001169,0.000889,0.000912,0.001536,0.001294,0.002642,0.001902,0.003196,0.00205,0.003274,...,0.007928,0.007928,0.0,0.0,0.007928,0.007928,95.778772,156.762552,138.274975,ostrich
3,0.000205,0.00023,0.000211,0.000547,0.00054,0.001453,0.001174,0.002609,0.00205,0.004298,...,0.019038,0.025384,0.031729,0.006346,0.0,0.019038,91.767532,83.981644,70.983497,airplanes
4,0.204038,0.01315,0.007371,0.006915,0.003806,0.004942,0.004129,0.005017,0.001492,0.002338,...,0.0,0.002914,0.002914,0.002914,0.014569,0.008742,38.448656,40.500436,44.199605,saturn


## Question 2

In [14]:
half = df.shape[0]//2
X_train = df.iloc[0:half,0:3539]
Y_train = df.iloc[0:half,3540]
X_test = df.iloc[half:,0:3539]
Y_test = df.iloc[half:,3540]

In [19]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=50)
classifier.fit(X_train, Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=50, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)