In [64]:
# Ref : http://blog.zeevgilovitz.com/detecting-dominant-colours-in-python/

from PIL import Image
import random
import numpy as np
import pandas as pd
import itertools
import webcolors

In [16]:
df = pd.read_csv('../data/data_updated.csv')
df = df.drop('Unnamed: 0', 1)

In [17]:
df.head()

Unnamed: 0,target_market,score,cover_image,sugar_result,fiber_result,sodium_result,sugar_per_serving,fiber_per_serving,sodium_per_serving,calories_per_serving,...,has_cha,down_gaze,b_mean,g_mean,r_mean,b_sd,g_sd,r_sd,most_fre_color,most_fre_color_count
0,Family,50,http://cerealfacts.org/media/cereal_images/Ann...,24%,3%,293,7,1,85,120,...,1,0,35.58,71.37,208.29,54.28,86.32,62.01,#e70000,256
1,Family,52,http://cerealfacts.org/media/cereal_images/Ann...,24%,3%,310,7,1,90,110,...,1,0,53.31,107.21,195.62,46.53,48.07,54.99,#d8501e,256
2,Family,50,http://cerealfacts.org/media/cereal_images/Ann...,24%,3%,293,7,1,85,120,...,1,0,65.4,123.12,191.4,52.63,60.91,40.15,#bd582a,256
3,Family,64,http://cerealfacts.org/media/cereal_images/Ann...,7%,3%,379,2,1,110,120,...,1,0,95.78,78.63,115.69,47.58,71.51,58.18,#581680,1272
4,Family,58,http://cerealfacts.org/media/cereal_images/Bar...,23%,10%,267,7,3,80,120,...,1,0,108.41,153.04,186.66,48.37,51.09,53.44,#ffffff,681


In [18]:
def gen_file_name(row):
    file_name = row.cover_image.split('/')[len(row.cover_image.split('/'))-1]
    file_name = 'img_' + str(row['index']).zfill (3) + '_' + (file_name).lower()
    return(file_name)

In [19]:
df['file_name'] = df.apply(gen_file_name, axis=1)

In [20]:
df.head()

Unnamed: 0,target_market,score,cover_image,sugar_result,fiber_result,sodium_result,sugar_per_serving,fiber_per_serving,sodium_per_serving,calories_per_serving,...,down_gaze,b_mean,g_mean,r_mean,b_sd,g_sd,r_sd,most_fre_color,most_fre_color_count,file_name
0,Family,50,http://cerealfacts.org/media/cereal_images/Ann...,24%,3%,293,7,1,85,120,...,0,35.58,71.37,208.29,54.28,86.32,62.01,#e70000,256,img_001_anniesfruitybunnies.jpg
1,Family,52,http://cerealfacts.org/media/cereal_images/Ann...,24%,3%,310,7,1,90,110,...,0,53.31,107.21,195.62,46.53,48.07,54.99,#d8501e,256,img_002_annieshoneybunnies.jpg
2,Family,50,http://cerealfacts.org/media/cereal_images/Ann...,24%,3%,293,7,1,85,120,...,0,65.4,123.12,191.4,52.63,60.91,40.15,#bd582a,256,img_003_anniescinnamonrollbunnyos.jpg
3,Family,64,http://cerealfacts.org/media/cereal_images/Ann...,7%,3%,379,2,1,110,120,...,0,95.78,78.63,115.69,47.58,71.51,58.18,#581680,1272,img_004_anniesorganicbunnyos.jpg
4,Family,58,http://cerealfacts.org/media/cereal_images/Bar...,23%,10%,267,7,3,80,120,...,0,108.41,153.04,186.66,48.37,51.09,53.44,#ffffff,681,img_005_barbarasbakerypuffinspuffscrunchycocoa...


In [41]:
# K-mean Clustering

# Class : Cluster
class Cluster(object):

    def __init__(self):
        self.pixels = []
        self.lastpixels = []
        self.centroid = None

    def addPoint(self, pixel):
        self.pixels.append(pixel)
        
    def pixelCount(self):
        return len(self.lastpixels)

    def setNewCentroid(self):

        R = [colour[0] for colour in self.pixels]
        G = [colour[1] for colour in self.pixels]
        B = [colour[2] for colour in self.pixels]

        R = sum(R) / (len(R) + 1)
        G = sum(G) / (len(G) + 1)
        B = sum(B) / (len(B) + 1)
        
        self.centroid = (R, G, B)
        self.lastpixels = self.pixels
        self.pixels = []

        return self.centroid

In [42]:
# Class : Kmeans
class Kmeans(object):

    def __init__(self, k=3, max_iterations=5, min_distance=2.0, size=300):
        self.k = k
        self.max_iterations = max_iterations
        self.min_distance = min_distance
        self.size = (size, size + 1)

    def run(self, image):
        self.image = image
        self.image.thumbnail(self.size)
        self.pixels = np.array(image.getdata(), dtype=np.uint8)

        self.clusters = [None for i in range(self.k)]
        self.oldClusters = None

        randomPixels = random.sample(self.pixels, self.k)

        for idx in range(self.k):
            self.clusters[idx] = Cluster()
            self.clusters[idx].centroid = randomPixels[idx]

        iterations = 0

        while self.shouldExit(iterations) is False:

            self.oldClusters = [cluster.centroid for cluster in self.clusters]

            # print iterations

            for pixel in self.pixels:
                self.assignClusters(pixel)

            for cluster in self.clusters:
                cluster.setNewCentroid()

            iterations += 1
        
        pixelCounts = [cluster.pixelCount() for cluster in self.clusters]
        
        clusters_centroid = [cluster.centroid for cluster in self.clusters]
        
        return pixelCounts

    def assignClusters(self, pixel):
        shortest = float('Inf')
        for cluster in self.clusters:
            distance = self.calcDistance(cluster.centroid, pixel)
            if distance < shortest:
                shortest = distance
                nearest = cluster

        nearest.addPoint(pixel)

    def calcDistance(self, a, b):

        result = np.sqrt(sum((a - b) ** 2))
        return result

    def shouldExit(self, iterations):

        if self.oldClusters is None:
            return False

        for idx in range(self.k):
            dist = self.calcDistance(
                np.array(self.clusters[idx].centroid),
                np.array(self.oldClusters[idx])
            )
            if dist < self.min_distance:
                return True

        if iterations <= self.max_iterations:
            return False

        return True

    # The remaining methods are used for debugging
    
    def showImage(self):
        self.image.show()

    def showCentroidColours(self):

        for cluster in self.clusters:
            image = Image.new("RGB", (200, 200), cluster.centroid)
            image.show()

    def showClustering(self):

        localPixels = [None] * len(self.image.getdata())

        for idx, pixel in enumerate(self.pixels):
                shortest = float('Inf')
                for cluster in self.clusters:
                    distance = self.calcDistance(
                        cluster.centroid,
                        pixel
                    )
                    if distance < shortest:
                        shortest = distance
                        nearest = cluster

                localPixels[idx] = nearest.centroid

        w, h = self.image.size
        localPixels = np.asarray(localPixels)\
            .astype('uint8')\
            .reshape((h, w, 3))

        colourMap = Image.fromarray(localPixels)
        colourMap.show()
        
    def printCentroidColoursHexCode(self):
        hexcodes = []
        for cluster in self.clusters:
            hexcode = '#'+''.join(map(chr, cluster.centroid)).encode('hex')
            hexcodes.append(hexcode)
            # print cluster.centroid, hexcode
          
        print hexcodes
        
        return hexcodes
    
    def printCentroidColoursRGB(self):
        RGBs = []
        for cluster in self.clusters:
            RGBs.append(cluster.centroid)
        
        print RGB
        
        return RGB

In [43]:
img_name = df['file_name']

In [44]:
hexcodes_dominant_color = []
pixels_num_color = []

for i in range(260):
    img = Image.open('../img/' + img_name[i])
    
    k = Kmeans()
    pixels_num = k.run(img)
    
    hexcode = k.printCentroidColoursHexCode()
    print pixels_num
    
    hexcodes_dominant_color.append(hexcode)
    pixels_num_color.append(pixels_num)

['#fdfcfc', '#d31510', '#c7a43e']
[40685, 32671, 16644]
['#df5a22', '#644f30', '#c4b26e']
[39109, 10467, 13032]
['#c8b347', '#f2e9d8', '#b15329']
[16192, 11510, 35207]
['#4e186f', '#c3af5a', '#756437']
[34797, 16500, 12816]
['#5c4147', '#c89e5d', '#e0ceb3']
[10803, 34860, 15139]
['#623849', '#cda66d', '#e6d6bd']
[10832, 29772, 19596]
['#cdb28d', '#be7c3b', '#412d42']
[32909, 18499, 6083]
['#d4bd9f', '#b8936a', '#603636']
[20685, 25494, 11613]
['#bb8956', '#502f34', '#d2b893']
[18918, 10631, 27641]
['#bf986d', '#523543', '#dccbb4']
[35640, 7256, 14294]
['#b18c66', '#d4bea1', '#403340']
[22261, 23869, 15876]
['#d9c8b0', '#ba976e', '#443c4c']
[13969, 32544, 10376]
['#b7875b', '#d0b695', '#5a3530']
[19082, 27832, 10878]
['#b9946a', '#674334', '#d4c0a6']
[29000, 9081, 19410]
['#b68f6a', '#d4bc9f', '#4c4050']
[25742, 23176, 8874]
['#85765c', '#abb3a6', '#3a3d55']
[42244, 13667, 11814]
['#cfaf94', '#a76443', '#281315']
[52950, 21778, 14368]
['#b79571', '#5e3a38', '#d5bfa5']
[25726, 11433, 203

In [47]:
print len(pixels_num_color)

260


In [48]:
print len(hexcodes_dominant_color)

260


In [49]:
pixels_num_color[0]

[40685, 32671, 16644]

In [50]:
hexcodes_dominant_color[0][np.argmax(pixels_num_color[0])]

'#fdfcfc'

In [51]:
most_domin_colors = []
for i in range(len(pixels_num_color)):
    most_domin_colors.append(hexcodes_dominant_color[i][np.argmax(pixels_num_color[i])])

In [57]:
# most_domin_colors return the most dominant color in each image
print most_domin_colors

['#fdfcfc', '#df5a22', '#b15329', '#4e186f', '#c89e5d', '#cda66d', '#cdb28d', '#b8936a', '#d2b893', '#bf986d', '#d4bea1', '#ba976e', '#d0b695', '#b9946a', '#b68f6a', '#85765c', '#cfaf94', '#b79571', '#c3a17b', '#c3a37c', '#c19f77', '#c5a365', '#cfad81', '#ccad80', '#c7a37a', '#fbfaf9', '#f8f8f7', '#fbfbf9', '#f7f4ed', '#e5e8e9', '#f0f0ec', '#f6f6f5', '#eaeae4', '#f6f8f8', '#f9f8f4', '#dadad8', '#e1e7e7', '#f9f8f4', '#f4f3f0', '#ecf0f3', '#faf9f8', '#eef6fa', '#91904d', '#f4e7a6', '#eee374', '#ecf0ee', '#d9161c', '#f5f5f4', '#47366b', '#fcfcfb', '#f6f6f5', '#f4f2ed', '#d8c896', '#492e29', '#803924', '#f5f5f4', '#ebedd9', '#f7f7f5', '#f9f7f6', '#e8ecec', '#f6f6f5', '#dfd1d5', '#c92249', '#3e2f27', '#3f1218', '#f8f8f8', '#eff1f1', '#571d0f', '#f6f6f6', '#f8f6f3', '#eff0ee', '#e7aa68', '#f3e5c5', '#f8f3eb', '#f6e7c4', '#f1ecce', '#f8e8cd', '#e7ebed', '#e189b0', '#e7b72f', '#d4a329', '#8c2673', '#f9f8f5', '#f9f9f7', '#fcfbfa', '#fbf9f7', '#e1dcd0', '#efeced', '#814557', '#fbfbf8', '#441e5b'

In [56]:
print most_domin_colors[48]

#47366b


In [69]:
def hex_to_rgb(value):
    value = value.lstrip('#')
    lv = len(value)
    return tuple(int(value[i:i + lv // 3], 16) for i in range(0, lv, lv // 3))

In [70]:
hex_to_rgb('#958d8d')

(149, 141, 141)

In [65]:
def closest_colour(requested_colour):
    pri_color = ['Red', 'Orange', 'Yellow', 'Green', 'Blue', 'Violet', 'Brown', 'Black', 'Grey', 'White']
    pri_color = [x.lower() for x in pri_color]
    min_colours = {}
    #for key, name in webcolors.css3_hex_to_names.items():
    for key, name in [x for x in itertools.ifilter(lambda x: x[1]  in pri_color, webcolors.css3_hex_to_names.items())]:
        r_c, g_c, b_c = webcolors.hex_to_rgb(key)
        rd = (r_c - requested_colour[0]) ** 2
        gd = (g_c - requested_colour[1]) ** 2
        bd = (b_c - requested_colour[2]) ** 2
        min_colours[(rd + gd + bd)] = name
    return min_colours[min(min_colours.keys())]

In [73]:
closest_colour(hex_to_rgb(most_domin_colors[0]))

u'white'

In [74]:
domin_colors = []
for i in range(260):
    domin_colors.append(closest_colour(hex_to_rgb(most_domin_colors[i])))

In [75]:
print domin_colors

[u'white', u'brown', u'brown', u'brown', u'grey', u'grey', u'grey', u'grey', u'grey', u'grey', u'violet', u'grey', u'grey', u'grey', u'grey', u'grey', u'grey', u'grey', u'grey', u'grey', u'grey', u'grey', u'grey', u'grey', u'grey', u'white', u'white', u'white', u'white', u'white', u'white', u'white', u'white', u'white', u'white', u'white', u'white', u'white', u'white', u'white', u'white', u'white', u'grey', u'white', u'yellow', u'white', u'red', u'white', u'grey', u'white', u'white', u'white', u'violet', u'brown', u'brown', u'white', u'white', u'white', u'white', u'white', u'white', u'white', u'brown', u'black', u'black', u'white', u'white', u'brown', u'white', u'white', u'white', u'orange', u'white', u'white', u'white', u'white', u'white', u'white', u'violet', u'orange', u'orange', u'brown', u'white', u'white', u'white', u'white', u'white', u'white', u'brown', u'white', u'brown', u'grey', u'white', u'white', u'brown', u'black', u'white', u'white', u'white', u'green', u'white', u'white

In [76]:
def genDominColor(row):
    return domin_colors[row['index'] - 1]

In [77]:
df['most_domin_color'] = df.apply(genDominColor, axis = 1)

In [78]:
df.head()

Unnamed: 0,target_market,score,cover_image,sugar_result,fiber_result,sodium_result,sugar_per_serving,fiber_per_serving,sodium_per_serving,calories_per_serving,...,b_mean,g_mean,r_mean,b_sd,g_sd,r_sd,most_fre_color,most_fre_color_count,file_name,most_domin_color
0,Family,50,http://cerealfacts.org/media/cereal_images/Ann...,24%,3%,293,7,1,85,120,...,35.58,71.37,208.29,54.28,86.32,62.01,#e70000,256,img_001_anniesfruitybunnies.jpg,white
1,Family,52,http://cerealfacts.org/media/cereal_images/Ann...,24%,3%,310,7,1,90,110,...,53.31,107.21,195.62,46.53,48.07,54.99,#d8501e,256,img_002_annieshoneybunnies.jpg,brown
2,Family,50,http://cerealfacts.org/media/cereal_images/Ann...,24%,3%,293,7,1,85,120,...,65.4,123.12,191.4,52.63,60.91,40.15,#bd582a,256,img_003_anniescinnamonrollbunnyos.jpg,brown
3,Family,64,http://cerealfacts.org/media/cereal_images/Ann...,7%,3%,379,2,1,110,120,...,95.78,78.63,115.69,47.58,71.51,58.18,#581680,1272,img_004_anniesorganicbunnyos.jpg,brown
4,Family,58,http://cerealfacts.org/media/cereal_images/Bar...,23%,10%,267,7,3,80,120,...,108.41,153.04,186.66,48.37,51.09,53.44,#ffffff,681,img_005_barbarasbakerypuffinspuffscrunchycocoa...,grey


In [79]:
df.tail()

Unnamed: 0,target_market,score,cover_image,sugar_result,fiber_result,sodium_result,sugar_per_serving,fiber_per_serving,sodium_per_serving,calories_per_serving,...,b_mean,g_mean,r_mean,b_sd,g_sd,r_sd,most_fre_color,most_fre_color_count,file_name,most_domin_color
255,Adult,70,http://cerealfacts.org/media/cereal_images/Qua...,16%,9%,339,9,5,190,210,...,164.55,136.89,90.92,62.61,58.52,98.7,#0093d8,249,img_256_quakeroatmealsquaresbrownsugar.jpg,grey
256,Adult,70,http://cerealfacts.org/media/cereal_images/Qua...,16%,9%,339,9,5,190,210,...,97.41,126.17,178.93,58.6,61.11,85.87,#26397b,1792,img_257_quakeroatmealsquarescinnamon.jpg,orange
257,Adult,70,http://cerealfacts.org/media/cereal_images/Qua...,16%,9%,339,9,5,190,210,...,91.22,121.25,167.26,65.59,61.71,74.12,#1b4482,256,img_258_quakeroatmealsquaresgoldenmaple.jpg,brown
258,Family,58,http://cerealfacts.org/media/cereal_images/Qua...,19%,19%,281,6,6,90,110,...,158.91,175.17,203.98,90.9,73.27,47.04,#e4e3e1,148,img_259_quakerlifecrunchtimestrawberry.jpg,white
259,Family,58,http://cerealfacts.org/media/cereal_images/Qua...,22%,19%,281,7,6,90,110,...,88.67,104.01,126.5,51.3,39.4,38.09,#817d7c,256,img_260_quakerlifecrunchtimeapplecinnamon.jpg,grey


In [80]:
df.to_csv('../data/data_updated.csv')