# Samuel Watkins, 3032132676

# HW 6: Homebrew Computer Vision
## Due Monday Apr 2, 2018 at 2 PM

1. Download the [zip file](https://www.dropbox.com/s/cst9awcjpp08k33/50_categories.tar.gz). Look at some of the images, noting that there are 50 classes in 4244 images (e.g. "goldfish", "llama", "speed-boat", ...). Caution: it's a pretty large file (~208M).
2. Write a set of methods that takes one of these images as input and returns real-valued features. You should produce at least 15 features.
3. Based on the feature set for each image, build a random forest classifier. Produce metrics on your estimated error rates using cross-validation. How much better is this than the expectation with random guessing? What are the 3 most important features?
4. Make sure your final classifier can run on a directory of different images, where a call like `run_final_classifier("/new/directory/path/")` on a directory that contains files like `validation1.jpg`, `validation2.jpg`, etc. will produce an output file that looks like:  
```
filename              predicted_class
-----------------------------------------------------------------
validation1.jpg       unicorn
validation2.jpg       camel
```

    We will have a validation set to test how good your classifier is.

# Function to Extract Features from an Image

In [1]:
import matplotlib.pyplot as plt
import numpy as np
from skimage.feature import corner_harris,peak_local_max,canny,corner_peaks
from skimage.segmentation import slic
from skimage.color.colorconv import rgb2grey,grey2rgb
from skimage.filters import frangi
from skimage.transform import rescale
from skimage.measure import shannon_entropy
from skimage.filters import threshold_otsu

In [2]:
def _safe_ratio(numerator, denominator):
    """Divide two scalars using NumPy semantics, never raising on zero.

    Both operands are coerced to ``np.float64`` so that plain Python ints
    (such as the results of ``len``) yield ``inf``/``nan`` on a zero
    denominator instead of raising ``ZeroDivisionError``. The caller is
    responsible for cleaning up non-finite results.
    """
    with np.errstate(divide="ignore", invalid="ignore"):
        return np.float64(numerator) / np.float64(denominator)


def extractImageFeatures(pathToImage,smallImageSize=16000):
    """Compute a fixed-length numeric feature vector for one image file.

    Parameters
    ----------
    pathToImage : str
        Path of the image, read with ``plt.imread``.
    smallImageSize : int, optional
        Approximate pixel count of the downscaled greyscale copy on which the
        structural features (corners, peaks, segments, edges, Otsu threshold)
        are computed.

    Returns
    -------
    numpy.ndarray
        1-D float array of 31 features, in this order:
        0-2   image size (pixel count), height, width
        3-8   mean/std of channel 0, channel 1, channel 2
        9-11  ratios of channel means: ch0/ch1, ch1/ch2, ch0/ch2
        12-14 std/mean of channel 0, channel 1, channel 2
        15-17 mean, std and std/mean over all channels
        18-21 corner count, local-peak count, max SLIC label, Canny edge pixels
        22-27 pairwise ratios of the four counts above
        28-30 Shannon entropy, Otsu threshold, pixels at or below the threshold
        NaN and infinite entries are replaced by 0.0.

    Notes
    -----
    The path is printed whenever at least one feature came out as NaN, so
    problematic images can be inspected afterwards.
    """
    imageArray = plt.imread(pathToImage).astype("float")
    if imageArray.ndim < 3:
        # single-channel image: replicate it so the per-channel code below works
        imageArray = grey2rgb(imageArray)
    greyImgArr = rgb2grey(imageArray)

    # basic geometry (computed once; also drives the downscaling factor)
    imgSize = np.prod(greyImgArr.shape)
    imgHeight = greyImgArr.shape[0]
    imgWidth = greyImgArr.shape[1]

    # scale both axes so the small image has roughly smallImageSize pixels
    scaleFactor = np.sqrt(smallImageSize/imgSize)
    smallImage = rescale(greyImgArr,scaleFactor,mode = "constant")

    # --- simple intensity statistics -------------------------------------
    avgAllChans = np.mean(imageArray)
    stdAllChans = np.std(imageArray)
    ratioStdAvgAllChans = _safe_ratio(stdAllChans, avgAllChans)

    # Channel index 1 is green and index 2 is blue for an RGB array; the
    # position of every feature in the output vector is unchanged.
    redChan = imageArray[:,:,0]
    greenChan = imageArray[:,:,1]
    blueChan = imageArray[:,:,2]

    avgRedChan, stdRedChan = np.mean(redChan), np.std(redChan)
    avgGreenChan, stdGreenChan = np.mean(greenChan), np.std(greenChan)
    avgBlueChan, stdBlueChan = np.mean(blueChan), np.std(blueChan)

    ratioRedGreen = _safe_ratio(avgRedChan, avgGreenChan)
    ratioGreenBlue = _safe_ratio(avgGreenChan, avgBlueChan)
    ratioRedBlue = _safe_ratio(avgRedChan, avgBlueChan)

    ratioStdAvgRedChan = _safe_ratio(stdRedChan, avgRedChan)
    ratioStdAvgGreenChan = _safe_ratio(stdGreenChan, avgGreenChan)
    ratioStdAvgBlueChan = _safe_ratio(stdBlueChan, avgBlueChan)

    # --- structural features on the downscaled greyscale image -----------
    corners = corner_harris(smallImage)
    numCorners = len(corner_peaks(corners))

    peaks = peak_local_max(smallImage)
    numPeaks = len(peaks)

    segments = slic(smallImage)
    # NOTE(review): this is the largest label, which equals the segment count
    # only if labels start at 1 -- confirm for the installed skimage version.
    numSegments = np.max(segments)

    edges = canny(frangi(smallImage))
    edgeLength = np.sum(edges)

    # numCorners and numPeaks are plain Python ints, so a bare "/" would raise
    # ZeroDivisionError for an image without peaks; _safe_ratio yields nan/inf
    # instead, which the clean-up below turns into 0.0.
    ratioCornersPeak = _safe_ratio(numCorners, numPeaks)
    ratioCornersSegments = _safe_ratio(numCorners, numSegments)
    ratioCornersEdges = _safe_ratio(numCorners, edgeLength)
    ratioPeaksSegments = _safe_ratio(numPeaks, numSegments)
    ratioPeaksEdges = _safe_ratio(numPeaks, edgeLength)
    ratioSegmentsEdges = _safe_ratio(numSegments, edgeLength)

    shanent = shannon_entropy(imageArray)

    thresh = threshold_otsu(smallImage)
    foreground = np.sum(smallImage <= thresh)

    features = np.array([
        imgSize, imgHeight, imgWidth,
        avgRedChan, stdRedChan,
        avgGreenChan, stdGreenChan,
        avgBlueChan, stdBlueChan,
        ratioRedGreen, ratioGreenBlue, ratioRedBlue,
        ratioStdAvgRedChan, ratioStdAvgGreenChan, ratioStdAvgBlueChan,
        avgAllChans, stdAllChans, ratioStdAvgAllChans,
        numCorners, numPeaks, numSegments, edgeLength,
        ratioCornersPeak, ratioCornersSegments, ratioCornersEdges,
        ratioPeaksSegments, ratioPeaksEdges, ratioSegmentsEdges,
        shanent, thresh, foreground,
    ])

    # report images that produced undefined (0/0) features
    if np.any(np.isnan(features)):
        print(pathToImage)
    # the classifier cannot handle nan/inf, so neutralise them
    features[~np.isfinite(features)] = 0.0

    return features

In [None]:
# Scratch cell: preview the Canny edges of the Otsu-thresholded, downscaled
# greyscale version of a single example image.
pathToImage = "/home/sam/Documents/watkins-ay250-s2018-hw/hw_6/50_categories/bear/bear_0010.jpg"
imageArray = plt.imread(pathToImage).astype("float")
# Only expand genuinely single-channel images, matching the guard used in
# extractImageFeatures; an RGB array is passed through untouched.
if len(imageArray.shape)<3:
    imageArray = grey2rgb(imageArray)
greyImgArr = rgb2grey(imageArray)
# shrink to roughly 16000 pixels, the default working size of the extractor
scaleFactor = np.sqrt(16000.0/np.prod(greyImgArr.shape))
imgScaled = rescale(greyImgArr,scaleFactor,mode = "constant")
# edge map of the Frangi-filtered image (the one used for the edgeLength feature)
contedges = canny(frangi(imgScaled))


# plt.imshow(contedges)

# binary mask of pixels at or below the Otsu threshold, then its edges
binary = imgScaled <= threshold_otsu(imgScaled)
plt.imshow(canny(binary))
# timenow=time()
# extractImageFeatures(pathToImage);
# print(time()-timenow)

# Extract Features from All Images

In [3]:
from glob import glob
from sklearn.model_selection import train_test_split
from multiprocessing import Pool
from time import time

In [4]:
# Build the feature matrix X and label vector Y from every class folder, then
# split them into stratified training and test halves.
pathToImageFolders = "50_categories/"
eachFolder = glob(pathToImageFolders+"*/")
train_size = 0.5 # ratio of training dataset to total dataset
X = list()
Y = list()

# open up 16 processes to extract features in parallel
num_processes = 16

starttime = time()
# The context manager terminates the worker processes even if feature
# extraction raises part-way through.
with Pool(processes=num_processes) as pool:
    for iFolder,folder in enumerate(eachFolder):
        print(f"Looking in folder {iFolder+1} of {len(eachFolder)} folders...")
        filesInFolder = glob(folder+"*.jpg")
        if not filesInFolder:
            # np.vstack cannot stack an empty list; a folder without images
            # simply contributes no samples
            continue
        parallelFeatures = pool.map(extractImageFeatures,filesInFolder)
        X.append(np.vstack(parallelFeatures))
        # class label = folder name with the root prefix and trailing separator removed
        Y.append(np.repeat(folder[len(pathToImageFolders):-1],len(filesInFolder)))

    print(time()-starttime)

del pool

X = np.vstack(X)
Y = np.concatenate(Y)

X_train,X_test,Y_train,Y_test = train_test_split(X,Y,train_size=train_size,stratify=Y)


Looking in folder 1 of 50 folders...
Looking in folder 2 of 50 folders...
Looking in folder 3 of 50 folders...
Looking in folder 4 of 50 folders...
Looking in folder 5 of 50 folders...
Looking in folder 6 of 50 folders...
Looking in folder 7 of 50 folders...
Looking in folder 8 of 50 folders...
Looking in folder 9 of 50 folders...
Looking in folder 10 of 50 folders...
Looking in folder 11 of 50 folders...
Looking in folder 12 of 50 folders...
Looking in folder 13 of 50 folders...
Looking in folder 14 of 50 folders...
Looking in folder 15 of 50 folders...
Looking in folder 16 of 50 folders...
Looking in folder 17 of 50 folders...
Looking in folder 18 of 50 folders...
Looking in folder 19 of 50 folders...
Looking in folder 20 of 50 folders...
Looking in folder 21 of 50 folders...
Looking in folder 22 of 50 folders...
Looking in folder 23 of 50 folders...




Looking in folder 24 of 50 folders...
Looking in folder 25 of 50 folders...
Looking in folder 26 of 50 folders...




Looking in folder 27 of 50 folders...
Looking in folder 28 of 50 folders...
Looking in folder 29 of 50 folders...
Looking in folder 30 of 50 folders...
Looking in folder 31 of 50 folders...




Looking in folder 32 of 50 folders...
Looking in folder 33 of 50 folders...
Looking in folder 34 of 50 folders...
Looking in folder 35 of 50 folders...
Looking in folder 36 of 50 folders...
Looking in folder 37 of 50 folders...
Looking in folder 38 of 50 folders...




Looking in folder 39 of 50 folders...
Looking in folder 40 of 50 folders...
Looking in folder 41 of 50 folders...
Looking in folder 42 of 50 folders...
Looking in folder 43 of 50 folders...
Looking in folder 44 of 50 folders...
Looking in folder 45 of 50 folders...




Looking in folder 46 of 50 folders...
Looking in folder 47 of 50 folders...
Looking in folder 48 of 50 folders...
Looking in folder 49 of 50 folders...
Looking in folder 50 of 50 folders...
302.83050751686096




# Build A Random Forest Classifier

In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.model_selection import cross_val_score

In [6]:
# Train a 50-tree random forest on the training half, then label the held-out half.
randforclf = RandomForestClassifier(n_estimators=50).fit(X_train, Y_train)
pred_rf = randforclf.predict(X_test)

In [7]:
# Accuracy on the held-out half of the data.
print(f"Score: {metrics.accuracy_score(Y_test,pred_rf)}")
# 5-fold cross-validation over the full data set. `groups` is only meaningful
# for group-aware splitters, so it is not passed with an integer `cv`.
scores = cross_val_score(randforclf,X,Y,cv=5)
print(f"Accuracy from cross-validation: {np.mean(scores)} (+/- {np.std(scores)})")

Score: 0.29688972667295005
Accuracy from cross-validation: 0.29979769744902585 (+/- 0.012319788085494752)


In [8]:
# Per-feature importances of the fitted forest. Entries follow the column
# order of X, i.e. the order of the vector returned by extractImageFeatures.
randforclf.feature_importances_

array([0.05261205, 0.06150218, 0.05622375, 0.02648454, 0.02782996,
       0.0284188 , 0.02592573, 0.0257615 , 0.02967849, 0.03422208,
       0.03410235, 0.03207322, 0.02728713, 0.02694469, 0.02831861,
       0.02715394, 0.02641158, 0.02404731, 0.024617  , 0.0369651 ,
       0.03026014, 0.03024907, 0.02895527, 0.02803103, 0.02707987,
       0.03065793, 0.03383542, 0.02991957, 0.04433411, 0.02996621,
       0.0301314 ])

# Compare to Random Guessing

In [9]:
from sklearn.dummy import DummyClassifier

In [10]:
# Baseline that ignores the features, fitted and evaluated on the same split
# as the random forest.
# NOTE(review): strategy="prior" presumably always predicts the most frequent
# class rather than guessing at random -- confirm this is the intended baseline.
dummyclf = DummyClassifier(strategy="prior", random_state=42).fit(X_train, Y_train)
dummypred_rf = dummyclf.predict(X_test)

In [12]:
# Held-out accuracy of the baseline classifier.
print(metrics.accuracy_score(Y_test,dummypred_rf))
# Same 5-fold cross-validation as for the random forest. `groups` is dropped
# because it only matters for group-aware splitters, not an integer `cv`.
scores = cross_val_score(dummyclf,X,Y,cv=5)
print(f"{np.mean(scores)} (+/- {np.std(scores)})")

0.12582469368520263
0.12562240614744166 (+/- 0.0018389617183439597)
