In [206]:
#import all the things
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database

from PIL import Image
from PIL import ImageStat

from matplotlib.pyplot import imshow
import matplotlib.pyplot as plt

import numpy as np
import fnmatch
import os
import pylab
import pandas as pd
from tqdm import tqdm

from skimage.filters import threshold_otsu
from skimage.feature import corner_harris, corner_subpix, corner_peaks
from skimage import color
from skimage.feature import canny
from skimage.transform import (hough_line, hough_line_peaks,
                               probabilistic_hough_line)

%matplotlib inline

In [207]:
#connect to database
# Connection settings for the local PostgreSQL server; the URL carries no password.
dbname = 'coffee_db'
username = 'emclinden'
# 'postgresql://' is the canonical SQLAlchemy dialect name. The older
# 'postgres://' alias is rejected by SQLAlchemy 1.4 and later.
engine = create_engine('postgresql://%s@localhost/%s' % (username, dbname))
# Single-argument print() behaves the same on Python 2 and Python 3.
print(engine.url)

postgres://emclinden@localhost/coffee_db


In [None]:
# Read every row of the image_address table into a DataFrame named `data`.
sql_query = (
    "\n"
    "SELECT * FROM image_address;\n"
)
data = pd.read_sql_query(sql=sql_query, con=engine)

In [209]:
# Weights applied to the linearised red, green and blue channels by gray().
rY, gY, bY = 0.212655, 0.715158, 0.072187

# Inverse of sRGB "gamma" function. (approx 2.2)
def inv_gam_sRGB(ic):
    """Map a 0-255 sRGB channel value to its linear-light value.

    The input is first scaled to [0, 1]. Scaled values up to 0.04045 use the
    linear segment (divide by 12.92); larger values use the 2.4 power curve.
    """
    c = ic/255.0
    if c > 0.04045:
        return ((c+0.055)/(1.055))**2.4
    return c/12.92

# sRGB "gamma" function (approx 2.2)
def gam_sRGB(v):
    """Encode a linear-light value as a rounded 0-255 sRGB channel value.

    Values up to 0.0031308 use the linear segment (multiply by 12.92);
    larger values use the 1/2.4 power curve. The result is rounded to the
    nearest integer by adding 0.5 before truncating.

    The power curve is applied exactly once. Applying it a second time to
    the already-encoded value brightens every input above the linear
    segment (linear 0.5 would encode to 223 instead of 188).
    """
    if v <= 0.0031308:
        v = v * 12.92
    else:
        v = 1.055 * (v ** (1.0/2.4)) - 0.055
    return int(v*255+.5)

#  GRAY VALUE ("brightness")
def gray(r, g, b):
    """Return the 0-255 grey value for one pixel given its 0-255 r, g, b values.

    Each channel is linearised with inv_gam_sRGB, the three are combined
    with the rY / gY / bY weights, and the sum is re-encoded with gam_sRGB.
    """
    lin_r = inv_gam_sRGB(r)
    lin_g = inv_gam_sRGB(g)
    lin_b = inv_gam_sRGB(b)
    luminance = rY*lin_r + gY*lin_g + bY*lin_b
    return gam_sRGB(luminance)

In [210]:
# Fixed 16-colour palette, each entry an [r, g, b] list of 0-255 values.
white = [255,255,255]
yellow = [255,255,0]
orange = [255,165,0]
# Pure red. It must differ from yellow, otherwise the red bucket can never
# be the nearest colour (the first of two identical entries always wins).
red = [255,0,0]
magenta = [255,0,255]
purple = [128,0,128]
blue =[0,0,255]
cyan = [0,255,255]
green = [0,128,0]
dark_green = [0,100,0]
brown = [140,72,22]
# An earlier value [215, 180, 130] was immediately overwritten by this one,
# so only the value that actually took effect is kept.
tan = [245,215,200]
light_grey = [204,204,204]
med_grey = [102,102,102]
dark_grey = [51,51,51]
black = [0,0,0]
# The order is significant: code that indexes into palette depends on it.
palette = [white,yellow,orange,red,magenta,purple,blue,cyan,green,dark_green,brown,tan,light_grey,med_grey,dark_grey,black]

In [217]:
#this cell is the meat - calculate lots of image features for each image in tables
#
# One value per image is appended to each list below, in the order of
# data.picture_address. As a side effect, a 50x50 palette-quantised copy of
# every image is written to disk.

bright = [] #mean gray() value over the 50x50 thumbnail
contrast = [] #distance between first and last grey-level bin holding more than 10 pixels
rgb1 = [] #most common palette colour (an [r,g,b] list taken from palette)
rgb2 = [] #second most common palette colour
rgb3 = [] #third most common palette colour
nedges = [] #fraction of pixels marked by the Canny edge detector
nforeground = [] #fraction of pixels above the Otsu threshold
ncorners = [] #number of Harris corner peaks found in the thresholded image
nlines = [] #number of probabilistic-Hough line segments
nbucket = [] #number of palette colours that received at least one pixel
colorfrac = [] #share of thumbnail pixels in the largest palette bucket

# int64 copy of the palette so colour differences cannot wrap around in uint8
palette_arr = np.asarray(palette, dtype=np.int64)

for pim in tqdm(data.picture_address):
    img = Image.open(pim,'r')
    # Force three channels after resizing so greyscale, palette-mode and
    # 4-channel images all take the same path. Greyscale used to be handled
    # by a bare except, and 4-channel pixels could not be compared against
    # the 3-channel palette at all.
    new_img = img.resize((50,50)).convert('RGB')
    n = np.array(new_img)
    pixels = n.reshape(-1, 3)  # one [r,g,b] row per pixel, row-major order

    #get dominant colors
    # Squared distance has the same argmin as Euclidean distance, so the
    # square root is skipped. Ties resolve to the earlier palette entry.
    diff = pixels.astype(np.int64)[:, np.newaxis, :] - palette_arr[np.newaxis, :, :]
    nearest = np.argmin(np.sum(diff**2, axis=2), axis=1)
    quantized = np.reshape(palette_arr[nearest], (50,50,3))
    im = Image.fromarray(np.uint8(quantized))
    new_name = '/Users/emclinden/coffee012316/s50_'+ pim.split('012316/')[1]
    im.save(new_name)

    # Pixels per palette entry, indexed like palette. Counting by nearest
    # index credits each pixel to exactly one bucket, even if two palette
    # entries happen to hold the same colour.
    number = np.bincount(nearest, minlength=len(palette))

    #how many color buckets actually have pixels in them?
    nbucket.append(np.count_nonzero(number))

    #size of biggest color bucket
    colorfrac.append(number.max() / float(nearest.size))

    #get the three biggest color buckets
    # Stable sort on the negated counts gives descending order with ties going
    # to the earlier palette entry. The indices refer to the full palette, so
    # no offset correction is needed for the 2nd and 3rd entries.
    top = np.argsort(-number, kind='mergesort')[:3]
    rgb1.append(palette[top[0]])
    rgb2.append(palette[top[1]])
    rgb3.append(palette[top[2]])

    #get brightness
    bb = [gray(r, g, b) for r, g, b in pixels]
    bright.append(np.mean(bb))

    #get contrast
    # Computed on the full-size greyscale image, not the thumbnail.
    gimg = img.convert('L')
    ngimg = np.asarray(gimg)
    # 256 unit-wide bins so that grey levels 254 and 255 are counted separately.
    hist,bins = np.histogram(ngimg.ravel(),256,[0,256])
    populated = np.nonzero(hist > 10)[0]
    if populated.size:
        contrast.append(populated[-1] - populated[0]) #store contrast
    else:
        # No grey level has more than 10 pixels (e.g. a very small image).
        contrast.append(0)

    #get fraction of foreground pixels
    thresh = threshold_otsu(ngimg)
    binary = ngimg > thresh
    # float() forces true division; int / int truncates to 0 on Python 2.
    nforeground.append(np.count_nonzero(binary) / float(binary.size))

    #get number of corners (from thresholded image)
    coords = corner_peaks(corner_harris(binary), min_distance=5,threshold_rel=0.001)
    ncorners.append(len(coords))

    #get fraction of edge pixels
    edges = canny(ngimg, sigma=2, low_threshold=100, high_threshold=150)
    nedges.append(np.count_nonzero(edges) / float(edges.size))

    #get number of straight lines
    # Lower Canny thresholds than above, so fainter edges can also form lines.
    line_edges = canny(ngimg, sigma=2, low_threshold=50, high_threshold=100)
    lines = probabilistic_hough_line(line_edges, threshold=10, line_length=20,
                                     line_gap=3)
    nlines.append(len(lines))




In [218]:
#read image_stats because we want to append few columns
sql_query = """
SELECT * FROM image_stats;
"""
data2 = pd.read_sql_query(sql_query,engine)

In [223]:
#append columns
data2['RGB_1'] = rgb1
data2['RGB_2'] = rgb2
data2['RGB_3'] = rgb3
data2['brightness'] = bright
data2['contrast'] = contrast
data2['n_edges'] = nedges
data2['n_foreground'] = nforeground
data2['n_corners'] = ncorners
data2['n_lines'] = nlines
data2['color_fraction'] = colorfrac
data2['nbuckets'] = np.asarray(nbucket) / 16.

In [229]:
# Show the type of the camera_make column as the cell's output.
type(data2['camera_make'])

pandas.core.series.Series

In [199]:
# Remove the 'index' column if it is present. axis is passed by keyword
# because pandas 2.0 no longer accepts it positionally, and errors='ignore'
# lets the cell be re-run after the column is already gone.
data2 = data2.drop('index', axis=1, errors='ignore')

In [None]:
# Overwrite the image_stats table with the current contents of data2.
# The DataFrame index is written too (to_sql's default).
data2.to_sql(name='image_stats', con=engine, if_exists='replace')

In [481]:
# Successively discard rows whose camera_make, iso_speed, focal_length or
# aperture is the literal string 'None'. Each step filters the previous result.
make_present = data2['camera_make'] != 'None'
df = data2.loc[make_present]
iso_present = df['iso_speed'] != 'None'
df2 = df.loc[iso_present]
focal_present = df2['focal_length'] != 'None'
df3 = df2.loc[focal_present]
aperture_present = df3['aperture'] != 'None'
df4 = df3.loc[aperture_present]

In [482]:
# Drop the stored 'index' column if it is present, then renumber the
# remaining rows 0..n-1. errors='ignore' matters because the column may
# already have been removed from the frame these rows were filtered from.
df = df4.drop('index', axis=1, errors='ignore')
# drop takes a real boolean; the string 'True' only worked because it is truthy.
df4 = df.reset_index(drop=True)

In [485]:
#fix exposure syntax
exp = []
for d in df4.exposure:
    try:
        dd = d.split('/')
        exp.append(float(dd[0]) / float(dd[1]))
    except:
        exp.append(float(d))
df = df4.drop('exposure',1)
df['exposure']= exp

In [486]:
#fix focal length syntax
foc = []
for d in df.focal_length:
    foc.append(float(d.split(' mm')[0]))
df.drop('focal_length',1,inplace='True')
df['focal_length'] = foc

In [487]:
#define and add binary definitions for cameras
camone=[]; camtwo=[]; camthree=[]; camfour=[];

for d in df.camera_make:
    if d.find('Canon') == 0:
        cam1 = 1; cam2 = 0; cam3 = 0; cam4 = 0; 
    elif d.find('NIKON') == 0: 
        cam1 = 0; cam2 = 1; cam3 = 0; cam4 = 0;
    elif d.find('Apple') == 0:
        cam1 = 0; cam2 = 0; cam3 = 1; cam4 = 0;
    elif d.find('Sony') == 0:
        cam1 = 0; cam2 = 0; cam3 = 0; cam4 = 1; 
    else:
        cam1 = 0; cam2 = 0; cam3 = 0; cam4 = 0;
    camone.append(cam1); camtwo.append(cam2); camthree.append(cam3); camfour.append(cam4);

In [488]:
# Replace the text camera_make column with four 0/1 indicator columns.
# Reassign instead of inplace='True': inplace expects a real boolean and
# axis must be passed by keyword in pandas 2.0.
df = df.drop('camera_make', axis=1)
# Use the per-row lists. The scalars cam1..cam4 only hold the flags of the
# last row processed and would be broadcast to every row.
df['camera1'] = camone
df['camera2'] = camtwo
df['camera3'] = camthree
df['camera4'] = camfour

In [489]:
# Feature matrix for clustering: one row per image, columns n_edges, n_lines, n_corners.
# .values replaces DataFrame.as_matrix, which was removed in pandas 1.0.
# NOTE(review): the three columns are clustered unscaled - confirm that is intended.
arr = df[['n_edges','n_lines','n_corners']].values

from sklearn.cluster import KMeans
# Two clusters; random_state is fixed so the labels are reproducible.
kmeans = KMeans(n_clusters=2, random_state=0).fit(arr)

# Cluster label (0 or 1) for each row of arr.
a= (kmeans.labels_)

In [None]:
# Attach each row's KMeans label, then write the frame to the database.
df['cluster_member'] = a
# NOTE(review): this replaces the image_stats table that was read into data2
# earlier with the filtered, transformed frame - confirm that overwriting the
# source table is intended before re-running the notebook from the top.
df.to_sql(name='image_stats', con=engine, if_exists='replace')

In [595]:
# Start from an empty frame; later cells add one column at a time.
newdf = pd.DataFrame(data=None)

In [596]:
#popularity metric
y = np.asarray(df.n_favs).astype('float') / np.asarray(df.n_views).astype('float')
newdf['picture_id'] = df.picture_id
newdf['popularity'] = y

In [597]:
#float aperture value and scale
aperture = np.asarray(df.aperture).astype('string').astype('float')
newdf['aperture_scale'] = (aperture - np.mean(aperture))/ np.std(aperture)

In [598]:
#parse bad iso values and scale
iso=[]
for d in df.iso_speed:
    try:
        iso.append(d.astype('str').astype('float'))
    except:
        iso.append(float(d.split(',')[0]))
newdf['iso_speed_scale'] = (iso - np.mean(iso)) / np.std(iso)

In [613]:
def _zscore(col):
    """Return col minus its mean, divided by its standard deviation.

    Mean and deviation are taken with NumPy on the underlying array, so the
    deviation is NumPy's default (population) standard deviation.
    """
    values = np.asarray(col)
    return (col - np.mean(values)) / np.std(values)

# Count-like features are standardised; fraction-like features and exposure
# are copied across unchanged.
newdf['brightness_scale'] = _zscore(df['brightness'])
newdf['contrast_scale'] = _zscore(df['contrast'])
newdf['frac_edges'] = df['n_edges']
newdf['n_corners_scale'] = _zscore(df['n_corners'])
newdf['frac_foreground'] = df['n_foreground']
newdf['n_lines_scale'] = _zscore(df['n_lines'])
newdf['domcolor_frac'] = df['color_fraction']
newdf['frac_buckets'] = df['nbuckets']
newdf['exposure'] = df['exposure']
newdf['focal_length_scale'] = _zscore(df['focal_length'])

In [615]:
# Copy the four camera indicator columns across unchanged.
for cam_col in ('camera1', 'camera2', 'camera3', 'camera4'):
    newdf[cam_col] = df[cam_col]

In [617]:
# Copy the cluster label column across unchanged.
newdf['cluster_member'] = df['cluster_member']

In [619]:
# Write the feature frame to the image_features table, replacing any existing table.
newdf.to_sql(name='image_features', con=engine, if_exists='replace')