In [1]:
from skimage import feature, transform, measure,filters
from skimage.io import imread
from skimage.color import gray2rgb, rgb2gray
from skimage.filters import threshold_otsu, frangi
from skimage.segmentation import felzenszwalb


import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
import os
# pip install imutils

#import imutils
from itertools import combinations
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

from multiprocessing import Pool
from glob import glob
from time import time

import pickle 

#Note, I am just ignoring warnings because I get many flags that are unnecesary. 
#I understand this is not good practice, but just for visual aesthetic of the 
#notebook I don't want them to print
import warnings
warnings.filterwarnings("ignore")



## Functions to extract features from images

In [2]:
def get_dumb_features(img):
    """
    Get simple features such as the min,max,mean,etc of the r,g,b channels
    """
    
    ratios = []
    correlations = []
    max_norm = []
    mean_std = []
        
    a = img[:,:,0]
    b = img[:,:,1]
    c = img[:,:,2]    
        
    for ii,jj in combinations([a,b,c],2):
        ratios.append(np.mean(ii)/np.mean(jj))
        corr = np.corrcoef(np.ndarray.flatten(ii),np.ndarray.flatten(jj))[0,1]
        correlations.append(corr)
    for ii in [a,b,c]:
        max_norm.append(np.max(ii)/np.mean(ii))    
        mean_std.append(np.mean(ii)/np.std(ii)) 
    maxes = np.array([np.sum(a),np.sum(b),np.sum(c)])
    dominant_color = np.argmax(maxes)    

    
    return np.asarray(ratios + correlations + max_norm + mean_std +dominant_color)

def get_smart_features(img):
    """
    get "smart" features, using more advanced tools from skimage
    """
    img_grey = rgb2gray(img)
    #img_grey_rz = transform.resize(img_grey,(338,424))
    #it seems resizing the images makes it worse, so I 
    #taking out the above line of code
    img_grey_rz = img_grey
    thresh = threshold_otsu(img_grey_rz)
    binary_img = img_grey_rz > thresh
    
    
    edginess = np.sum(feature.canny(img_grey_rz,sigma=3))
    corners = feature.corner_harris(img_grey_rz).size
    shape = np.mean(feature.shape_index(img_grey_rz))
    blobbiness = (feature.blob_dog(img_grey_rz)).shape[0]
    perimeter = measure.perimeter(binary_img)
    peaks = (feature.peak_local_max(img_grey_rz)).size
    
    smart_features = [edginess,corners,shape,blobbiness,perimeter,peaks]
    
    for ii,jj in combinations(smart_features,2):
        try:
            smart_features.append(ii/jj)
        except:
            smart_features.append(0)
        
    return np.asarray(smart_features)


def get_features(filename):
    """
    Takes a filename and reads in corresponding image,
    then called get_dumb_features() and get_smart_features()
    to get the image features
    """

    img = imread(filename)

    if len(img.shape) != 3:
        img = gray2rgb(img)
    
        
    dumb_features = get_dumb_features(img)
    smart_features = get_smart_features(img)
    
    smart_features[np.isnan(smart_features)] = 10000.0
    smart_features[np.isinf(smart_features)] = 0.0
    
    
    good_features = np.concatenate((dumb_features , smart_features))

    
    return good_features

### Read in all the images to explore the data

In [13]:
#path = '/Users/cwfink/Documents/School/UC_Berkeley/Classes/Spring_2018/AY250/HW/Fink_ay250_homework/hw_6/50_categories/'
path = '50_categories/'


img_list =[]

for subdir, dirs, files in os.walk(path):
    print('On directory: {}'.format(subdir))
    for file in files:
        if '.DS_Store' not in file:
            img = imread(subdir+'/'+file)
            img_list.append(img)





On directory: 50_categories/
On directory: 50_categories/gorilla
On directory: 50_categories/raccoon
On directory: 50_categories/crab
On directory: 50_categories/blimp
On directory: 50_categories/snail
On directory: 50_categories/airplanes
On directory: 50_categories/dog
On directory: 50_categories/dolphin
On directory: 50_categories/goldfish
On directory: 50_categories/giraffe
On directory: 50_categories/bear
On directory: 50_categories/killer-whale
On directory: 50_categories/penguin
On directory: 50_categories/zebra
On directory: 50_categories/duck
On directory: 50_categories/conch
On directory: 50_categories/camel
On directory: 50_categories/owl
On directory: 50_categories/helicopter
On directory: 50_categories/starfish
On directory: 50_categories/saturn
On directory: 50_categories/galaxy
On directory: 50_categories/goat
On directory: 50_categories/iguana
On directory: 50_categories/elk
On directory: 50_categories/hummingbird
On directory: 50_categories/triceratops
On directory: 50

In [4]:
img_list=np.asarray(img_list)

In [14]:
# we now find the ave width and higth to rescale the images by
h = []
w = []
for ii, im in enumerate(img_list):
    if len(im.shape) == 3:
        h.append(im.shape[0])
        w.append(im.shape[1])
    

In [15]:
### Strangly, it seems that the model does better if we don't resize the images. so I removed this feature from the code
print(np.mean(h))
print(np.mean(w))



338.281472684
424.582185273


## Loop over all files in folder


In [3]:
#path = '/Users/cwfink/Documents/School/UC_Berkeley/Classes/Spring_2018/AY250/HW/Fink_ay250_homework/hw_6/50_categories/'

def classify_training_images(path,lgc_Save = False):
    #base_path = '50_categories/'
    subdirs = glob(path + '*/')

    X = [] 
    Y = []

    num_process = 16
    pool = Pool(processes=num_process)
    start_time = time()
    for ii, folder in enumerate(subdirs):

        label = folder.split('/')[1]
        print('Processing Files for: {}, On folder {} out of 50'.format(label,ii+1))

        files = glob(folder+'*.jpg')
        Features = pool.map(get_features,files)
        X.append(np.vstack(Features))
        Y.append(np.repeat(label,len(Features)))
        #print(Features)
    end_time = time()
    print('time to complete: {} min'.format(round((end_time-start_time)*60),3))
    pool.terminate()
    del pool

    X = np.vstack(X)
    Y = np.concatenate(Y)
    
    if lgc_Save:
        np.save('X_data.npy', X)
        np.save('Y_data.npy', Y)
    
    return X,Y

## Function to build ML random forest model

In [4]:
def build_model(X,Y,lgc_Save=False):
    """
    Scales the X data, trains the model, saves it, and prints the accuracy
    """
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2,stratify = Y, random_state=42)
    #scale training data
    X_scale = StandardScaler()
    X_train = X_scale.fit_transform(X_train)
    X_test = X_scale.transform(X_test)
    #get a baseline 
    dummy_clf = DummyClassifier(strategy='prior')
    dummy_clf.fit(X_train, Y_train)
    baseline_score = dummy_clf.score(X_test, Y_test)
    
    clf_rand_forest = RandomForestClassifier(n_estimators=1000)
    clf_rand_forest = RandomForestClassifier(class_weight='balanced',n_jobs=-1, random_state=42)
    clf_rand_forest.fit(X_train, Y_train)
    score = clf_rand_forest.score(X_test, Y_test)
    
    print('Baseline score: {:2f} '.format(baseline_score))
    print('Model score: {:2f}'.format(score))
    
    cv_scores = cross_val_score(clf_rand_forest, X, Y, cv=5)
    print("Accuracy from cross validation: {:2f} +/- {:2f}".format(cv_scores.mean(), cv_scores.std() * 2))
    
    print('Feature importance: \n{}'.format(clf_rand_forest.feature_importances_))
    
    if lgc_Save:
        save_file = {'model' : clf_rand_forest,
                     'Scaler' : X_scale}
        with open('ML_model.pkl','wb') as file:
            pickle.dump(save_file,file)
    
    return clf_rand_forest

## Function to be used for the final classifier to test model

In [5]:
def final_classifier(path, model = 'ML_model.pkl'):
    
    X_final = []
    label = []
    
    with open(model, 'rb') as file:
        ML_model = pickle.load(file)
    model = ML_model['model']
    X_scale = ML_model['Scaler']
    
    files = glob(path+'*.jpg')
    
    for ii, file in enumerate(files):
        X_final.append(get_features(file))
        label.append(file.split('/')[-1])
        
    X_final = np.asarray(X_final)
    
    X_final_scaled = X_scale.transform(X_final)
    
    prediction = model.predict(X_final_scaled)
        
    with open('final_predictions.txt', 'w') as file:    
        file.write('{:15} {}'.format('filename','predicted_class'))
        file.write('\n------------------------------------------\n')
        for ii, p in enumerate(prediction):
            file.write('\n{:15} ==> {}\n'.format(label[ii],p))
            
            
    print('predictions have been saved')      
    return
    

## Build model and test final clasifier functionality

In [None]:
#Classify the data and Build the model
X,Y = classify_training_images(path = '50_categories/',lgc_Save=True)
model = build_model(X,Y,lgc_Save=True)

#Just to test that the final classifier is working as expected
#path = '50_categories/bat/'
#final_classifier(path)

Processing Files for: gorilla, On folder 1 out of 50
Processing Files for: raccoon, On folder 2 out of 50
Processing Files for: crab, On folder 3 out of 50
Processing Files for: blimp, On folder 4 out of 50
Processing Files for: snail, On folder 5 out of 50
Processing Files for: airplanes, On folder 6 out of 50
Processing Files for: dog, On folder 7 out of 50
Processing Files for: dolphin, On folder 8 out of 50
Processing Files for: goldfish, On folder 9 out of 50
Processing Files for: giraffe, On folder 10 out of 50
Processing Files for: bear, On folder 11 out of 50
Processing Files for: killer-whale, On folder 12 out of 50
Processing Files for: penguin, On folder 13 out of 50
Processing Files for: zebra, On folder 14 out of 50
Processing Files for: duck, On folder 15 out of 50
Processing Files for: conch, On folder 16 out of 50
Processing Files for: camel, On folder 17 out of 50
Processing Files for: owl, On folder 18 out of 50
Processing Files for: helicopter, On folder 19 out of 50

## Conclusion: I wouldn't bet my life on the accuracy of this model...
So it seems that the all the features have about the same importance, and the 'dumb' features seem to be weighted slighly higher. This may be because all my features may in fact be pretty dumb.... 