In [380]:
import pandas as pd
import numpy as np
import operator as op
import os
import urllib
import string
from ggplot import *
from IPython.display import IFrame
from PIL import Image
import colorsys
from sklearn import feature_selection
import seaborn as sns
import webcolors
from __future__ import unicode_literals
import itertools

%matplotlib inline

We used Kimono to help and get the cereal data from CerealFacts.com

In [381]:
IFrame('http://cerealfacts.org/', width=1024, height=850)

<img src="../assets/3027659-poster-p-kimono.jpg">

###Let's begin!

In [382]:
## Load in the dataset
df = pd.read_csv('../data/data.csv')

In [383]:
df.describe()

Unnamed: 0,score,sodium_result,sugar_per_serving,fiber_per_serving,sodium_per_serving,calories_per_serving,serving_size,index
count,260.0,260.0,260.0,260.0,260.0,260.0,260.0,260.0
mean,55.623077,390.165385,9.130769,3.911538,144.596154,150.480769,40.342308,130.5
std,11.464072,195.988829,3.960009,2.757206,70.477757,48.26587,12.519411,75.199734
min,26.0,0.0,0.0,0.0,0.0,60.0,19.0,1.0
25%,48.0,258.0,6.0,2.0,110.0,110.0,30.0,65.75
50%,54.0,400.0,9.0,3.0,140.0,120.0,32.0,130.5
75%,58.0,545.5,12.0,5.25,190.0,200.0,55.0,195.25
max,82.0,889.0,19.0,14.0,360.0,250.0,62.0,260.0


In [384]:
df.head()

Unnamed: 0,target_market,score,cover_image,sugar_result,fiber_result,sodium_result,sugar_per_serving,fiber_per_serving,sodium_per_serving,calories_per_serving,serving_size,index,url
0,Family,50,http://cerealfacts.org/media/cereal_images/Ann...,24%,3%,293,7,1,85,120,29,1,http://cerealfacts.org/cereal_nutrition_scores...
1,Family,52,http://cerealfacts.org/media/cereal_images/Ann...,24%,3%,310,7,1,90,110,29,2,http://cerealfacts.org/cereal_nutrition_scores...
2,Family,50,http://cerealfacts.org/media/cereal_images/Ann...,24%,3%,293,7,1,85,120,29,3,http://cerealfacts.org/cereal_nutrition_scores...
3,Family,64,http://cerealfacts.org/media/cereal_images/Ann...,7%,3%,379,2,1,110,120,29,4,http://cerealfacts.org/cereal_nutrition_scores...
4,Family,58,http://cerealfacts.org/media/cereal_images/Bar...,23%,10%,267,7,3,80,120,30,5,http://cerealfacts.org/cereal_nutrition_scores...


###Data pre-processing

In [385]:
pd.get_option("display.max_columns")

50

In [386]:
pd.set_option("display.max_columns", 50)

In [387]:
df['sugar_result_perc'] = df.sugar_result.str.replace('%','').astype(int)
df['fiber_result_perc'] = df.fiber_result.str.replace('%','').astype(int)
df['calories_result_ratio'] = df.calories_per_serving / df.serving_size 

In [388]:
df.iloc[:,-3:].describe()

Unnamed: 0,sugar_result_perc,fiber_result_perc,calories_result_ratio
count,260.0,260.0,260.0
mean,23.35,9.323077,3.734876
std,9.734536,6.29054,0.330563
min,0.0,0.0,2.0
25%,18.0,5.75,3.571429
50%,23.0,8.5,3.728814
75%,30.0,11.0,4.0
max,56.0,47.0,4.385965


In [389]:
df['target_market'].value_counts()

Adult     152
Family     73
Child      34
None        1
dtype: int64

In [390]:
df = df[df.target_market  != "None"]

In [391]:
df['target_market'].value_counts()

Adult     152
Family     73
Child      34
dtype: int64

In [392]:
pd.get_dummies(df['target_market']).head()

Unnamed: 0,Adult,Child,Family
0,0,0,1
1,0,0,1
2,0,0,1
3,0,0,1
4,0,0,1


In [393]:
df = pd.concat([df, pd.get_dummies(df['target_market'])], axis=1)

In [394]:
## 
def gen_file_name(row):
    file_name = row.cover_image.split('/')[len(row.cover_image.split('/'))-1]
    file_name = 'img_' + str(row['index']).zfill (3) + '_' + (file_name).lower()
    #print('Processing ' + str(row['index']))
    return(file_name)

## 
def gen_cropped_file_path(row):
    file_name = row.file_name
    path_name = '../img/cropped/' + string.replace(file_name, '.jpg', '_cropped.png')
    return(path_name)
    #file_name = 'img_' + str(row['index']).zfill (3) + '_' + (file_name).lower()
    #print('Processing ' + str(row['index']))
    #return(file_name)

In [395]:
df['file_name'] = df.apply(gen_file_name, axis=1)
df['cropped_path'] = df.apply(gen_cropped_file_path, axis=1)

In [396]:
df.head()

Unnamed: 0,target_market,score,cover_image,sugar_result,fiber_result,sodium_result,sugar_per_serving,fiber_per_serving,sodium_per_serving,calories_per_serving,serving_size,index,url,sugar_result_perc,fiber_result_perc,calories_result_ratio,Adult,Child,Family,file_name,cropped_path
0,Family,50,http://cerealfacts.org/media/cereal_images/Ann...,24%,3%,293,7,1,85,120,29,1,http://cerealfacts.org/cereal_nutrition_scores...,24,3,4.137931,0,0,1,img_001_anniesfruitybunnies.jpg,../img/cropped/img_001_anniesfruitybunnies_cro...
1,Family,52,http://cerealfacts.org/media/cereal_images/Ann...,24%,3%,310,7,1,90,110,29,2,http://cerealfacts.org/cereal_nutrition_scores...,24,3,3.793103,0,0,1,img_002_annieshoneybunnies.jpg,../img/cropped/img_002_annieshoneybunnies_crop...
2,Family,50,http://cerealfacts.org/media/cereal_images/Ann...,24%,3%,293,7,1,85,120,29,3,http://cerealfacts.org/cereal_nutrition_scores...,24,3,4.137931,0,0,1,img_003_anniescinnamonrollbunnyos.jpg,../img/cropped/img_003_anniescinnamonrollbunny...
3,Family,64,http://cerealfacts.org/media/cereal_images/Ann...,7%,3%,379,2,1,110,120,29,4,http://cerealfacts.org/cereal_nutrition_scores...,7,3,4.137931,0,0,1,img_004_anniesorganicbunnyos.jpg,../img/cropped/img_004_anniesorganicbunnyos_cr...
4,Family,58,http://cerealfacts.org/media/cereal_images/Bar...,23%,10%,267,7,3,80,120,30,5,http://cerealfacts.org/cereal_nutrition_scores...,23,10,4.0,0,0,1,img_005_barbarasbakerypuffinspuffscrunchycocoa...,../img/cropped/img_005_barbarasbakerypuffinspu...


In [397]:
## The following code download the image with the URL privedes in the csv
'''
for name, url in zip(df.file_name, df.cover_image):
    if (os.path.isfile('../img/' + name) == False):
        resource = urllib.urlopen(url)
        output = open('../img/' + name, 'wb')
        output.write(resource.read())
        output.close()
'''

u"\nfor name, url in zip(df.file_name, df.cover_image):\n    if (os.path.isfile('../img/' + name) == False):\n        resource = urllib.urlopen(url)\n        output = open('../img/' + name, 'wb')\n        output.write(resource.read())\n        output.close()\n"

In [398]:
for file_name in df.file_name:
    if (os.path.isfile('../img/cropped/' + string.replace(file_name, '.jpg', '_cropped.png')) == False):
        img = Image.open('../img/' + file_name)
        img = img.convert("RGBA")
        pixdata = img.load()
        for y in xrange(img.size[1]):
            for x in xrange(img.size[0]):
                if pixdata[x, y] == (255, 255, 255, 255):
                    pixdata[x, y] = (255, 255, 255, 0)
        img.save('../img/cropped/' + string.replace(file_name, '.jpg', '_cropped.png'), "PNG")

In [399]:
## Helper function to get the RGB and HSL

def open_convert_image(input_img_file, numcolors=1):
    image = Image.open(input_img_file)
    result = image.convert('P', palette=Image.ADAPTIVE, colors=numcolors)
    result.putalpha(0)
    return(result)

def get_list_color(input_img_file, numcolors=1):
    return(open_convert_image(input_img_file, numcolors).getcolors())

def get_rgb_list(input_img_file, numcolors=2):
    colors_list = get_list_color(input_img_file, numcolors)
    result = []
    for color in colors_list:
        result.append(list(color[1][0:3]))
    return(result)

def df_get_rgb_list(row, numcolors=3):
    return(get_rgb_list(row.cropped_path, numcolors))

def get_hls_list(input_img_file, numcolors=1):
    result = []
    for rgb_set in get_rgb_list(input_img_file, numcolors):
        result.append(list(colorsys.rgb_to_hls(*[x/255.0 for x in rgb_set])))
    return(result)

def df_get_hls_list(row, numcolors=1):
    return(list(get_hls_list(row.cropped_path, numcolors))[numcolors - 1])

def df_get_h(row):
    return(list(get_hls_list(row.cropped_path, numcolors)[0]))

def hls(row, numcolors=1):
    return(list(get_hls_list(row, numcolors))[numcolors - 1])

def rgb(row, numcolors=1):
    return(get_rgb_list(row, numcolors)[numcolors - 1])

def closest_colour(requested_colour):
    pri_color = ['Red', 'Orange', 'Yellow', 'Green', 'Blue', 'Violet', 'Brown', 'Black', 'Grey', 'White']
    pri_color = [x.lower() for x in pri_color]
    min_colours = {}
    #for key, name in webcolors.css3_hex_to_names.items():
    for key, name in [x for x in itertools.ifilter(lambda x: x[1]  in pri_color, webcolors.css3_hex_to_names.items())]:
        r_c, g_c, b_c = webcolors.hex_to_rgb(key)
        rd = (r_c - requested_colour[0]) ** 2
        gd = (g_c - requested_colour[1]) ** 2
        bd = (b_c - requested_colour[2]) ** 2
        min_colours[(rd + gd + bd)] = name
    return min_colours[min(min_colours.keys())]

def get_colour_name(requested_colour, result_act = False):
    try:
        closest_name = actual_name = webcolors.rgb_to_name(requested_colour)
    except ValueError:
        closest_name = closest_colour(requested_colour)
        actual_name = None
    if result_act:
        return actual_name, closest_name
    return closest_name

In [400]:
rgb(df.iloc[0,:].cropped_path, 2)

[239, 13, 16]

In [401]:
hls(df.iloc[0,:].cropped_path, 2)

[0.9977876106194691, 0.49411764705882355, 0.8968253968253967]

In [402]:
for num in range(1, 4):
    hls_color = lambda x: hls(x, num)
    rgb_color = lambda x: rgb(x, num)
    rgb_name = lambda x: get_colour_name(x)
    pri_rgb_color = lambda x: webcolors.name_to_rgb(x)
    
    df['color_org_h' + str(num)], df['color_org_l' + str(num)], df['color_org_s' + str(num)] = zip(*df['cropped_path'].map(hls_color))
    #df['color_r' + str(num)], df['color_g' + str(num)], df['color_b' + str(num)] = zip(*df['cropped_path'].map(rgb_color))
    df['color_hls' + str(num)] = df['cropped_path'].map(hls_color)
    df['color_rgb' + str(num)] = df['cropped_path'].map(rgb_color)
    df['color_rgb_name' + str(num)] = df['color_rgb' + str(num)].map(rgb_name)
    df['color_rgb_pri' + str(num)] = df['color_rgb_name' + str(num)].map(pri_rgb_color)
    df['color_r' + str(num)], df['color_g' + str(num)], df['color_b' + str(num)] = zip(*df['color_rgb_name' + str(num)].map(pri_rgb_color))

In [412]:
for num in range(1, 4):
    df_dummy = pd.get_dummies(df['color_rgb_name' + str(num)])
    df_dummy.columns = ['color_' + str(num) + '_' + x for x in list(df_dummy.columns.values)]
    df = pd.concat([df, df_dummy], axis=1)

In [413]:
df.head()

Unnamed: 0,target_market,score,cover_image,sugar_result,fiber_result,sodium_result,sugar_per_serving,fiber_per_serving,sodium_per_serving,calories_per_serving,serving_size,index,url,sugar_result_perc,fiber_result_perc,calories_result_ratio,Adult,Child,Family,file_name,cropped_path,color_org_h1,color_org_l1,color_org_s1,color_hls1,...,color_1_orange,color_1_red,color_1_violet,color_1_white,color_1_yellow,color_2_black,color_2_blue,color_2_brown,color_2_green,color_2_grey,color_2_orange,color_2_red,color_2_violet,color_2_white,color_2_yellow,color_3_black,color_3_blue,color_3_brown,color_3_green,color_3_grey,color_3_orange,color_3_red,color_3_violet,color_3_white,color_3_yellow
0,Family,50,http://cerealfacts.org/media/cereal_images/Ann...,24%,3%,293,7,1,85,120,29,1,http://cerealfacts.org/cereal_nutrition_scores...,24,3,4.137931,0,0,1,img_001_anniesfruitybunnies.jpg,../img/cropped/img_001_anniesfruitybunnies_cro...,0.0,1.0,0.0,"[0.0, 1.0, 0.0]",...,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0
1,Family,52,http://cerealfacts.org/media/cereal_images/Ann...,24%,3%,310,7,1,90,110,29,2,http://cerealfacts.org/cereal_nutrition_scores...,24,3,3.793103,0,0,1,img_002_annieshoneybunnies.jpg,../img/cropped/img_002_annieshoneybunnies_crop...,0.045139,0.501961,0.755906,"[0.0451388888889, 0.501960784314, 0.755905511811]",...,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0
2,Family,50,http://cerealfacts.org/media/cereal_images/Ann...,24%,3%,293,7,1,85,120,29,3,http://cerealfacts.org/cereal_nutrition_scores...,24,3,4.137931,0,0,1,img_003_anniescinnamonrollbunnyos.jpg,../img/cropped/img_003_anniescinnamonrollbunny...,0.052381,0.427451,0.642202,"[0.052380952381, 0.427450980392, 0.642201834862]",...,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
3,Family,64,http://cerealfacts.org/media/cereal_images/Ann...,7%,3%,379,2,1,110,120,29,4,http://cerealfacts.org/cereal_nutrition_scores...,7,3,4.137931,0,0,1,img_004_anniesorganicbunnyos.jpg,../img/cropped/img_004_anniesorganicbunnyos_cr...,0.769663,0.264706,0.659259,"[0.769662921348, 0.264705882353, 0.659259259259]",...,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
4,Family,58,http://cerealfacts.org/media/cereal_images/Bar...,23%,10%,267,7,3,80,120,30,5,http://cerealfacts.org/cereal_nutrition_scores...,23,10,4.0,0,0,1,img_005_barbarasbakerypuffinspuffscrunchycocoa...,../img/cropped/img_005_barbarasbakerypuffinspu...,0.103333,0.607843,0.5,"[0.103333333333, 0.607843137255, 0.5]",...,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0


In [404]:
pd.concat([df, pd.get_dummies(df['color_rgb_name1'], prefix_sep = ['color_1_'])], axis=1)

Unnamed: 0,target_market,score,cover_image,sugar_result,fiber_result,sodium_result,sugar_per_serving,fiber_per_serving,sodium_per_serving,calories_per_serving,serving_size,index,url,sugar_result_perc,fiber_result_perc,calories_result_ratio,Adult,Child,Family,file_name,cropped_path,color_org_h1,color_org_l1,color_org_s1,color_hls1,...,color_rgb_name2,color_rgb_pri2,color_r2,color_g2,color_b2,color_org_h3,color_org_l3,color_org_s3,color_hls3,color_rgb3,color_rgb_name3,color_rgb_pri3,color_r3,color_g3,color_b3,black,blue,brown,green,grey,orange,red,violet,white,yellow
0,Family,50,http://cerealfacts.org/media/cereal_images/Ann...,24%,3%,293,7,1,85,120,29,1,http://cerealfacts.org/cereal_nutrition_scores...,24,3,4.137931,0,0,1,img_001_anniesfruitybunnies.jpg,../img/cropped/img_001_anniesfruitybunnies_cro...,0.000000,1.000000,0.000000,"[0.0, 1.0, 0.0]",...,red,"(255, 0, 0)",255,0,0,0.005631,0.317647,0.913580,"[0.00563063063063, 0.317647058824, 0.913580246...","[155, 12, 7]",brown,"(165, 42, 42)",165,42,42,0,0,0,0,0,0,0,0,1,0
1,Family,52,http://cerealfacts.org/media/cereal_images/Ann...,24%,3%,310,7,1,90,110,29,2,http://cerealfacts.org/cereal_nutrition_scores...,24,3,3.793103,0,0,1,img_002_annieshoneybunnies.jpg,../img/cropped/img_002_annieshoneybunnies_crop...,0.045139,0.501961,0.755906,"[0.0451388888889, 0.501960784314, 0.755905511811]",...,orange,"(255, 165, 0)",255,165,0,0.068871,0.401961,0.590244,"[0.068870523416, 0.401960784314, 0.590243902439]","[163, 92, 42]",brown,"(165, 42, 42)",165,42,42,0,0,1,0,0,0,0,0,0,0
2,Family,50,http://cerealfacts.org/media/cereal_images/Ann...,24%,3%,293,7,1,85,120,29,3,http://cerealfacts.org/cereal_nutrition_scores...,24,3,4.137931,0,0,1,img_003_anniescinnamonrollbunnyos.jpg,../img/cropped/img_003_anniescinnamonrollbunny...,0.052381,0.427451,0.642202,"[0.052380952381, 0.427450980392, 0.642201834862]",...,brown,"(165, 42, 42)",165,42,42,0.111111,0.952941,0.250000,"[0.111111111111, 0.952941176471, 0.25]","[246, 244, 240]",white,"(255, 255, 255)",255,255,255,0,0,1,0,0,0,0,0,0,0
3,Family,64,http://cerealfacts.org/media/cereal_images/Ann...,7%,3%,379,2,1,110,120,29,4,http://cerealfacts.org/cereal_nutrition_scores...,7,3,4.137931,0,0,1,img_004_anniesorganicbunnyos.jpg,../img/cropped/img_004_anniesorganicbunnyos_cr...,0.769663,0.264706,0.659259,"[0.769662921348, 0.264705882353, 0.659259259259]",...,grey,"(128, 128, 128)",128,128,128,0.766355,0.303922,0.690323,"[0.766355140187, 0.303921568627, 0.690322580645]","[88, 24, 131]",grey,"(128, 128, 128)",128,128,128,0,0,1,0,0,0,0,0,0,0
4,Family,58,http://cerealfacts.org/media/cereal_images/Bar...,23%,10%,267,7,3,80,120,30,5,http://cerealfacts.org/cereal_nutrition_scores...,23,10,4.000000,0,0,1,img_005_barbarasbakerypuffinspuffscrunchycocoa...,../img/cropped/img_005_barbarasbakerypuffinspu...,0.103333,0.607843,0.500000,"[0.103333333333, 0.607843137255, 0.5]",...,grey,"(128, 128, 128)",128,128,128,0.105882,0.531373,0.355649,"[0.105882352941, 0.53137254902, 0.355648535565]","[178, 147, 93]",grey,"(128, 128, 128)",128,128,128,0,0,0,0,1,0,0,0,0,0
5,Family,54,http://cerealfacts.org/media/cereal_images/Bar...,23%,3%,267,7,1,80,120,30,6,http://cerealfacts.org/cereal_nutrition_scores...,23,3,4.000000,0,0,1,img_006_barbarasbakerypuffinspuffsfruitmedley.jpg,../img/cropped/img_006_barbarasbakerypuffinspu...,0.106322,0.776471,0.508772,"[0.10632183908, 0.776470588235, 0.508771929825]",...,white,"(255, 255, 255)",255,255,255,0.095960,0.686275,0.412500,"[0.0959595959596, 0.686274509804, 0.4125]","[208, 180, 142]",grey,"(128, 128, 128)",128,128,128,0,0,0,0,0,0,0,0,1,0
6,Family,56,http://cerealfacts.org/media/cereal_images/Bar...,20%,20%,500,6,6,150,90,30,7,http://cerealfacts.org/cereal_nutrition_scores...,20,20,3.000000,0,0,1,img_007_barbarasbakerypuffinscinnamon.jpg,../img/cropped/img_007_barbarasbakerypuffinsci...,0.091398,0.674510,0.373494,"[0.0913978494624, 0.674509803922, 0.373493975904]",...,grey,"(128, 128, 128)",128,128,128,0.091503,0.615686,0.520408,"[0.0915032679739, 0.61568627451, 0.520408163265]","[208, 162, 106]",grey,"(128, 128, 128)",128,128,128,0,0,0,0,1,0,0,0,0,0
7,Family,68,http://cerealfacts.org/media/cereal_images/Bar...,20%,10%,267,6,3,80,120,30,8,http://cerealfacts.org/cereal_nutrition_scores...,20,10,4.000000,0,0,1,img_008_barbarasbakerypuffinshoneyrice.jpg,../img/cropped/img_008_barbarasbakerypuffinsho...,0.091398,0.678431,0.378049,"[0.0913978494624, 0.678431372549, 0.378048780488]",...,grey,"(128, 128, 128)",128,128,128,0.091398,0.678431,0.378049,"[0.0913978494624, 0.678431372549, 0.378048780488]","[204, 176, 142]",grey,"(128, 128, 128)",128,128,128,0,0,0,0,1,0,0,0,0,0
8,Family,46,http://cerealfacts.org/media/cereal_images/Bar...,20%,7%,767,6,2,230,110,30,9,http://cerealfacts.org/cereal_nutrition_scores...,20,7,3.666667,0,0,1,img_009_barbarasbakerypuffinspeanutbutter.jpg,../img/cropped/img_009_barbarasbakerypuffinspe...,0.088889,0.674510,0.361446,"[0.0888888888889, 0.674509803922, 0.361445783133]",...,grey,"(128, 128, 128)",128,128,128,0.088889,0.674510,0.361446,"[0.0888888888889, 0.674509803922, 0.361445783133]","[202, 174, 142]",grey,"(128, 128, 128)",128,128,128,0,0,0,0,1,0,0,0,0,0
9,Family,50,http://cerealfacts.org/media/cereal_images/Bar...,19%,19%,704,5,5,190,90,27,10,http://cerealfacts.org/cereal_nutrition_scores...,19,19,3.333333,0,0,1,img_010_barbarasbakerypuffinsoriginal.jpg,../img/cropped/img_010_barbarasbakerypuffinsor...,0.089080,0.678431,0.353659,"[0.0890804597701, 0.678431372549, 0.353658536585]",...,grey,"(128, 128, 128)",128,128,128,0.092857,0.560784,0.312500,"[0.0928571428571, 0.560784313725, 0.3125]","[178, 147, 108]",grey,"(128, 128, 128)",128,128,128,0,0,0,0,1,0,0,0,0,0


In [311]:
min(abs(0.45 - 0.50), abs(0.65 - 0.50), abs(0.51 - 0.50))

0.010000000000000009

In [307]:
#plus = lambda x: hls(x, num)
(pd.concat([(df['color_org_s1'] * df['color_org_l1'] - 0.5).abs(), (df['color_org_s2'] * df['color_org_l2'] - 0.5).abs(), (df['color_org_s3'] * df['color_org_l3'] - 0.5).abs()], axis=1).min(axis=1))


0      0.056863
1      0.120565
2      0.225490
3      0.290196
4      0.196078
5      0.104954
6      0.179592
7      0.243520
8      0.256201
9      0.260067
10     0.253723
11     0.204041
12     0.249911
13     0.258111
14     0.258111
15     0.401961
16     0.264298
17     0.251793
18     0.253723
19     0.260067
20     0.260067
21     0.300300
22     0.248075
23     0.264328
24     0.258111
25     0.045343
26     0.130460
27     0.045304
28     0.233725
29     0.453789
         ...   
230    0.410423
231    0.420392
232    0.001961
233    0.188235
234    0.174139
235    0.224271
236    0.001961
237    0.040056
238    0.040056
239    0.045541
240    0.031373
241    0.313725
242    0.225490
243    0.003673
244    0.147059
245    0.390196
246    0.385882
247    0.238439
248    0.362745
249    0.445343
250    0.308345
251    0.052874
252    0.500000
253    0.441176
254    0.207843
255    0.090196
256    0.013695
257    0.217647
258    0.465460
259    0.388235
dtype: float64

In [315]:
df["color_diff"] = pd.concat([(df['color_org_s1'] * df['color_org_l1'] - 0.5).abs(), (df['color_org_s2'] * df['color_org_l2'] - 0.5).abs(), (df['color_org_s3'] * df['color_org_l3'] - 0.5).abs()], axis=1).std(axis=1)
df["color_diff"] = pd.concat([(df['color_org_s1'] * df['color_org_l1'] - 0.5).abs(), (df['color_org_s2'] * df['color_org_l2'] - 0.5).abs(), (df['color_org_s3'] * df['color_org_l3'] - 0.5).abs()], axis=1).std(axis=1)

In [314]:
pd.concat([(df['color_org_s1'] * df['color_org_l1'] - 0.5).abs(), (df['color_org_s2'] * df['color_org_l2'] - 0.5).abs(), (df['color_org_s3'] * df['color_org_l3'] - 0.5).abs()], axis=1).std(axis=1).describe()

count    259.000000
mean       0.093832
std        0.081814
min        0.000000
25%        0.010882
50%        0.078725
75%        0.152270
max        0.287543
dtype: float64

In [371]:
from sklearn.cross_validation import cross_val_score
from sklearn.datasets import make_blobs
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier

In [414]:
col_list = list(df.columns.values)

In [415]:
col_list = col_list[16:]

In [416]:
col_list

['Adult',
 'Child',
 'Family',
 u'file_name',
 u'cropped_path',
 u'color_org_h1',
 u'color_org_l1',
 u'color_org_s1',
 u'color_hls1',
 u'color_rgb1',
 u'color_rgb_name1',
 u'color_rgb_pri1',
 u'color_r1',
 u'color_g1',
 u'color_b1',
 u'color_org_h2',
 u'color_org_l2',
 u'color_org_s2',
 u'color_hls2',
 u'color_rgb2',
 u'color_rgb_name2',
 u'color_rgb_pri2',
 u'color_r2',
 u'color_g2',
 u'color_b2',
 u'color_org_h3',
 u'color_org_l3',
 u'color_org_s3',
 u'color_hls3',
 u'color_rgb3',
 u'color_rgb_name3',
 u'color_rgb_pri3',
 u'color_r3',
 u'color_g3',
 u'color_b3',
 u'color_1_black',
 u'color_1_blue',
 u'color_1_brown',
 u'color_1_green',
 u'color_1_grey',
 u'color_1_orange',
 u'color_1_red',
 u'color_1_violet',
 u'color_1_white',
 u'color_1_yellow',
 u'color_1_black',
 u'color_1_blue',
 u'color_1_brown',
 u'color_1_green',
 u'color_1_grey',
 u'color_1_orange',
 u'color_1_red',
 u'color_1_violet',
 u'color_1_white',
 u'color_1_yellow',
 u'color_1_black',
 u'color_1_blue',
 u'color_1_bro

In [417]:
'''
col_list.remove('target_market')
col_list.remove('cover_image')
col_list.remove('url')
col_list.remove('score')
col_list.remove('index')
'''
col_list.remove('cropped_path')
col_list.remove('file_name')
col_list.remove('color_rgb_name1')
col_list.remove('color_rgb_name2')
col_list.remove('color_rgb_name3')
col_list.remove('color_hls1') 
col_list.remove('color_rgb1')
col_list.remove('color_hls2') 
col_list.remove('color_rgb2')
col_list.remove('color_hls3') 
col_list.remove('color_rgb3')
col_list.remove('color_rgb_pri1')
col_list.remove('color_rgb_pri2')
col_list.remove('color_rgb_pri3')

In [376]:
df[col_list].head()

Unnamed: 0,Adult,Child,Family,color_r1,color_g1,color_b1,color_r2,color_g2,color_b2,color_r3,color_g3,color_b3,color_org_h1,color_org_l1,color_org_s1,color_org_h2,color_org_l2,color_org_s2,color_org_h3,color_org_l3,color_org_s3,color_diff
0,0,0,1,255,255,255,255,0,0,165,42,42,0.0,1.0,0.0,0.997788,0.494118,0.896825,0.005631,0.317647,0.91358,0.225083
1,0,0,1,165,42,42,255,165,0,165,42,42,0.045139,0.501961,0.755906,0.113843,0.503922,0.72332,0.068871,0.401961,0.590244,0.078133
2,0,0,1,165,42,42,165,42,42,255,255,255,0.052381,0.427451,0.642202,0.052381,0.427451,0.642202,0.111111,0.952941,0.25,0.020943
3,0,0,1,165,42,42,128,128,128,128,128,128,0.769663,0.264706,0.659259,0.766355,0.303922,0.690323,0.766355,0.303922,0.690323,0.020377
4,0,0,1,128,128,128,128,128,128,128,128,128,0.103333,0.607843,0.5,0.103333,0.607843,0.5,0.105882,0.531373,0.355649,0.06636


In [421]:
X = df[col_list]
#y = df.sugar_result_perc
y = df.fiber_result_perc

In [511]:
clf = DecisionTreeClassifier(max_depth=None, min_samples_split=1,
       random_state=0)
scores = cross_val_score(clf, X, y)
print "DecisionTreeClassifier:", scores.mean()

## RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100, max_depth=None,
     min_samples_split=1, random_state=0)
scores = cross_val_score(clf, X, y)
print "RandomForestClassifier:", scores.mean()

## ExtraTreesClassifier
clf = ExtraTreesClassifier(n_estimators=100, max_depth=None,
    min_samples_split=1, random_state=0)
scores = cross_val_score(clf, X, y)
print "ExtraTreesClassifier:  ", scores.mean()

DecisionTreeClassifier: 0.336684906521
RandomForestClassifier: 0.332146706463
ExtraTreesClassifier:   0.382719810969


In [510]:
## RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100, max_depth=None,
     min_samples_split=1, random_state=0)
scores = cross_val_score(clf, X, y)
print "RandomForestClassifier:", scores.mean()

## ExtraTreesClassifier
clf = ExtraTreesClassifier(n_estimators=100, max_depth=None,
    min_samples_split=1, random_state=0)
scores = cross_val_score(clf, X, y)
print "ExtraTreesClassifier:  ", scores.mean()

RandomForestClassifier: 0.332146706463
ExtraTreesClassifier:   0.382719810969


In [423]:
df.fiber_result_perc.describe()

count    259.000000
mean       9.339768
std        6.296948
min        0.000000
25%        6.000000
50%        9.000000
75%       11.000000
max       47.000000
Name: fiber_result_perc, dtype: float64

In [431]:
df.sugar_result_perc.describe()

count    259.000000
mean      23.440154
std        9.644017
min        0.000000
25%       18.000000
50%       23.000000
75%       30.000000
max       56.000000
Name: sugar_result_perc, dtype: float64

In [562]:
def setFiber(fiber):
    result = ""
    if fiber >= 11:
        result = "Good"
    elif fiber <= 6:
        #result = "Bad"
        result = "Average"
    else:
        result = "Average"
    return(result)
set_fiber = lambda x: setFiber(x)

def setSugar(sugar):
    result = ""
    if sugar >= 30:
        #result = "Bad"
        result = "Average"
    elif sugar <= 18:
        result = "Good"
    else:
        result = "Average"
    return(result)
set_sugar = lambda x: setSugar(x)

df['fiber_label'] = df['fiber_result_perc'].map(set_fiber)
df['sugar_label'] = df['sugar_result_perc'].map(set_sugar)

In [588]:
##balance the class
import random
print(df.fiber_label.value_counts())
#random.sample(df[df.fiber_label == 'Good'].index, 180)
idx_good = [random.choice(df[df.fiber_label == 'Good'].index) for _ in range(200)]
idx_average = [random.choice(df[df.fiber_label == 'Average'].index) for _ in range(200)]
idx = idx_good + idx_average

Average    179
Good        80
dtype: int64


In [589]:
X = df[col_list]
X = X[idx:]
#y = df.sugar_result_perc
y = df.fiber_label

## RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100, max_depth=None,
     min_samples_split=10, random_state=0)
scores = cross_val_score(clf, X, y, cv = 5)
print "RandomForestClassifier:", scores.mean()

## ExtraTreesClassifier
clf = ExtraTreesClassifier(n_estimators=100, max_depth=None,
    min_samples_split=10, random_state=0)
scores = cross_val_score(clf, X, y, cv = 5)
print "ExtraTreesClassifier:  ", scores.mean()

IndexError: invalid slice

In [565]:
X = df[col_list]
#y = df.sugar_result_perc
y = df.sugar_label

## RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100, max_depth=None,
     min_samples_split=10, random_state=0)
scores = cross_val_score(clf, X, y, cv = 5)
print "RandomForestClassifier:", scores.mean()

## ExtraTreesClassifier
clf = ExtraTreesClassifier(n_estimators=100, max_depth=None,
    min_samples_split=10)
scores = cross_val_score(clf, X, y, cv = 5)
print "ExtraTreesClassifier:  ", scores.mean()

RandomForestClassifier: 0.672553288369
ExtraTreesClassifier:   0.641406983693


In [536]:
from sklearn.metrics import confusion_matrix
from sklearn.cross_validation import train_test_split

clf.fit(X, y)
confusion_matrix(y, clf.predict(X))

array([[113,   9,   5],
       [  8,  52,   6],
       [ 16,   2,  48]])

In [538]:
from sklearn.metrics import classification_report 
print(classification_report(y, clf.predict(X)))

             precision    recall  f1-score   support

    Average       0.82      0.89      0.86       127
        Bad       0.83      0.79      0.81        66
       Good       0.81      0.73      0.77        66

avg / total       0.82      0.82      0.82       259



In [577]:
from sklearn.cross_validation import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
clf.fit(x_train, y_train)

print(confusion_matrix(y_test, clf.predict(x_test)))
print(classification_report(y_test, clf.predict(x_test)))

[[34  5]
 [11  2]]
             precision    recall  f1-score   support

    Average       0.76      0.87      0.81        39
       Good       0.29      0.15      0.20        13

avg / total       0.64      0.69      0.66        52



In [251]:
df['color_rgb_name1'].value_counts()

white     179
brown      29
grey       22
orange      9
black       8
violet      3
yellow      3
blue        3
red         2
green       1
dtype: int64

In [252]:
df['color_rgb_name2'].value_counts()

white     130
brown      44
grey       37
orange     19
red         7
black       7
yellow      5
violet      4
blue        4
green       2
dtype: int64

In [253]:
df['color_rgb_name3'].value_counts()

brown     66
white     64
grey      63
orange    26
black     14
red        8
yellow     7
violet     5
blue       4
green      2
dtype: int64

In [279]:
from scipy.spatial import distance

the_max_dist = 0
for each_comb in [x for x in itertools.combinations('012', 2)]:
    a = df['color_rgb1'][1][int(each_comb[0])]
    b = df['color_rgb1'][1][int(each_comb[1])]
    dst = distance.euclidean(a,b)
    the_max_dist = max(dst, the_max_dist)

print(the_max_dist)



192.0


In [276]:
[x for x in itertools.combinations('012', 2)][1][0]

u'0'

In [270]:
df['color_rgb1'][1]

0      [255, 255, 255]
1        [224, 84, 32]
2        [179, 83, 39]
3        [78, 23, 112]
4      [205, 167, 105]
5      [227, 206, 169]
6      [203, 175, 141]
7      [204, 176, 142]
8      [202, 174, 142]
9      [202, 175, 144]
10     [203, 175, 144]
11     [202, 175, 143]
12     [203, 175, 142]
13     [202, 176, 143]
14     [202, 175, 143]
15      [138, 118, 88]
16     [204, 176, 152]
17     [203, 176, 143]
18     [255, 255, 255]
19     [255, 255, 255]
20     [255, 255, 255]
21      [182, 156, 99]
22     [255, 255, 255]
23     [255, 255, 255]
24     [255, 255, 255]
25     [255, 255, 255]
26     [255, 255, 255]
27     [255, 255, 255]
28     [255, 255, 255]
29     [216, 220, 219]
            ...       
230    [255, 255, 255]
231    [255, 255, 255]
232    [255, 255, 255]
233    [255, 255, 255]
234    [255, 255, 255]
235    [231, 240, 242]
236    [255, 255, 255]
237    [255, 255, 255]
238    [255, 255, 255]
239    [232, 242, 247]
240    [231, 244, 247]
241    [234, 240, 241]
242      [1

In [225]:
webcolors.css3_hex_to_names.items()[1][1]

True

In [222]:
3 in [1, 2, 3]

True

In [197]:
len(df['color_rgb_name2'].value_counts())

52

In [198]:
len(df['color_rgb_name3'].value_counts())

55

(255, 255, 255)

In [None]:
rgb_name = lambda x: get_colour_name(x)
#df['color_rgb1_name'] = df['color_rgb1'].map(rgb_name)
df['color_rgb1'].map(rgb_name)

In [None]:
requested_colour = (119, 172, 152)
closest_name = get_colour_name(requested_colour)

print "Closest colour name:", closest_name

In [None]:
df.loc[140,]

In [None]:


ggplot(df, aes(x='sugar_result', y='serving_size', color='target_market ')) +\
    geom_point(position = "jitter") +\
    scale_color_brewer(type='qual') +\
    xlab("Sugar Per Serving") + ylab("Serving Size") + ggtitle("Cereal (Sugar)") +\
    facet_wrap("target_market", scales="free_x")

###EDA

Suger Per Serving
 - Interesting to see larger serving size does not have a hight correlation to the sugar
 - Is unstandable to see cereal which have child as targeted market has smaller serving size
 - Interesting to see cereal which have adult as targeted market has two culster of serving size
 - There seems to be a data point missing target_market 

In [None]:
ggplot(df, aes(x='sugar_per_serving', y='serving_size', color='target_market ')) +\
    geom_point(position = "jitter") +\
    scale_color_brewer(type='qual') +\
    xlab("Sugar Per Serving") + ylab("Serving Size") + ggtitle("Cereal (Sugar)") +\
    facet_wrap("target_market", scales="free_x")

* * * * *

Let's remove those has no target market

In [None]:
df = df[df.target_market  != "None"]

* * * * *

Fiber Per Serving
 - Only Family cereal has a strong corelation to the serving size

In [None]:
ggplot(df, aes(x='fiber_per_serving', y='serving_size', color='target_market ')) +\
    geom_point(position = "jitter") +\
    scale_color_brewer(type='qual') +\
    xlab("Fiber Per Serving") + ylab("Serving Size") + ggtitle("Cereal (Fiber)") +\
    facet_wrap("target_market", scales="free_x")

In [None]:
sns.set(style="ticks")

# Show the results of a linear regression within each dataset
sns.lmplot(x="fiber_per_serving", y="serving_size", col="target_market", hue="target_market", col_wrap=3
           , data=df)

* * * * *

Sodium Per Serving
 - Interesting to see only Adult and Family offer ZERO sodium cereal

In [None]:
ggplot(df, aes(x='sodium_per_serving ', y='serving_size', color='target_market ')) +\
    geom_point(position = "jitter") +\
    scale_color_brewer(type='qual') +\
    xlab("Sodium Per Serving") + ylab("Serving Size") + ggtitle("Cereal (Sodium)") +\
    facet_wrap("target_market", scales="free_x")

* * * * *

Calories Per Serving
 - Is safe to say that the bigger serving it is, the more calories it has. No exception

In [None]:
ggplot(df, aes(x='calories_per_serving  ', y='serving_size', color='target_market ')) +\
    geom_point(position = "jitter") +\
    scale_color_brewer(type='qual') +\
    xlab("Calories Per Serving") + ylab("Serving Size") + ggtitle("Cereal (Calories)") +\
    facet_wrap("target_market", scales="free_x")

In [None]:
sns.set(style="ticks")

# Show the results of a linear regression within each dataset
sns.lmplot(x="calories_per_serving", y="serving_size", col="target_market", hue="target_market", col_wrap=3
           , data=df)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
rs = np.random.RandomState(10)

# Set up the matplotlib figure
sns.despine(left=True)

# Generate a random univariate dataset
d = rs.normal(size=100)

# Plot a historgram and kernel density estimate
sns.distplot(df.calories_per_serving, color="m")

plt.tight_layout()

In [None]:
import seaborn as sns
sns.set()

sns.pairplot(df, hue="target_market")

In [None]:
import seaborn as sns
sns.set()

sns.pairplot(df, hue="score")

Features =]

In [None]:
## 
def gen_file_name(row):
    file_name = row.cover_image.split('/')[len(row.cover_image.split('/'))-1]
    file_name = 'img_' + str(row['index']).zfill (3) + '_' + (file_name).lower()
    #print('Processing ' + str(row['index']))
    return(file_name)

## 
def gen_cropped_file_path(row):
    file_name = row.file_name
    path_name = '../img/cropped/' + string.replace(file_name, '.jpg', '_cropped.png')
    return(path_name)
    #file_name = 'img_' + str(row['index']).zfill (3) + '_' + (file_name).lower()
    #print('Processing ' + str(row['index']))
    #return(file_name)

In [None]:
df['file_name'] = df.apply(gen_file_name, axis=1)
df['cropped_path'] = df.apply(gen_cropped_file_path, axis=1)

In [None]:
df.head()

In [None]:
for name, url in zip(df.file_name, df.cover_image):
    if (os.path.isfile('../img/' + name) == False):
        resource = urllib.urlopen(url)
        output = open('../img/' + name, 'wb')
        output.write(resource.read())
        output.close()

In [None]:
for file_name in df.file_name:
    if (os.path.isfile('../img/cropped/' + string.replace(file_name, '.jpg', '_cropped.png')) == False):
        img = Image.open('../img/' + file_name)
        img = img.convert("RGBA")
        pixdata = img.load()
        for y in xrange(img.size[1]):
            for x in xrange(img.size[0]):
                if pixdata[x, y] == (255, 255, 255, 255):
                    pixdata[x, y] = (255, 255, 255, 0)
        img.save('../img/cropped/' + string.replace(file_name, '.jpg', '_cropped.png'), "PNG")

In [None]:
## Helper function to get the RGB and HSL

def open_convert_image(input_img_file, numcolors=1):
    image = Image.open(input_img_file)
    result = image.convert('P', palette=Image.ADAPTIVE, colors=numcolors)
    result.putalpha(0)
    return(result)

def get_list_color(input_img_file, numcolors=1):
    return(open_convert_image(input_img_file, numcolors).getcolors())

def get_rgb_list(input_img_file, numcolors=2):
    colors_list = get_list_color(input_img_file, numcolors)
    result = []
    for color in colors_list:
        result.append(list(color[1][0:3]))
    return(result)

def df_get_rgb_list(row, numcolors=3):
    return(get_rgb_list(row.cropped_path, numcolors))

def get_hls_list(input_img_file, numcolors=1):
    result = []
    for rgb_set in get_rgb_list(input_img_file, numcolors):
        result.append(list(colorsys.rgb_to_hls(*[x/255.0 for x in rgb_set])))
    return(result)

def df_get_hls_list(row, numcolors=1):
    return(list(get_hls_list(row.cropped_path, numcolors))[numcolors - 1])

def df_get_h(row):
    return(list(get_hls_list(row.cropped_path, numcolors)[0]))

def hls(row, numcolors=1):
    return(list(get_hls_list(row, numcolors))[numcolors - 1])

def rgb(row, numcolors=1):
    return(get_rgb_list(row, numcolors)[numcolors - 1])



In [None]:
for num in range(1, 4):
    hls_color = lambda x: hls(x, num)
    rgb_color = lambda x: rgb(x, num)

    df['color_h' + str(num)], df['color_l' + str(num)], df['color_s' + str(num)] = zip(*df['cropped_path'].map(hls_color))
    df['color_r' + str(num)], df['color_g' + str(num)], df['color_b' + str(num)] = zip(*df['cropped_path'].map(rgb_color))

In [None]:
def setScore(score):
    result = ""
    if score >= 58:
        result = "Good"
    elif score < 48:
        result = "Bad"
    else:
        result = "Average"
    return(result)
set_score = lambda x: setScore(x)

df['score_label'] = df['score'].map(set_score)



In [None]:
df['score_label']

In [None]:
import seaborn as sns
sns.set()

sns.pairplot(df, hue="score_label")

In [None]:
df.corr().fiber_per_serving 

In [None]:
g = sns.PairGrid(df, vars=["color_h1", "color_l1", "color_s1"],
                 hue="score_label", aspect=1, size=5)
g.map(plt.scatter)

In [None]:
g = sns.PairGrid(df, vars=["color_r1", "color_g1", "color_b1"],
                 hue="score_label", aspect=1, size=5)
g.map(plt.scatter)

In [None]:
np.log(df.color_r1)

In [None]:
df_log = np.log(df[s for s in list(df.columns.values) if 'color' in s])

In [None]:
[s for s in list(df.columns.values) if 'color' in s]

In [None]:
df_log = np.log(df.loc[:, [s for s in list(df.columns.values) if 'color' in s]])

In [None]:
df_log = np.log(df.loc[:,'color_h1':'color_b3'])

In [None]:
df_log['score_label'] = df.score_label

In [None]:
df_log.head()

In [None]:
g = sns.PairGrid(df_log, vars=["color_r1", "color_g1", "color_b1"],
                 hue="score_label", aspect=1, size=5)
g.map(plt.scatter)

In [None]:
df.corr().score

In [None]:
df.corr().sugar_per_serving

In [None]:
df.corr().fiber_per_serving

In [None]:
corr = df.corr()
fig, ax = plt.subplots(figsize=(20, 20))
ax.matshow(corr)
plt.xticks(range(len(corr.columns)), corr.columns);
plt.yticks(range(len(corr.columns)), corr.columns);

In [None]:
X = df.drop(['score'], axis=1)
y = df['score']

In [None]:
X = df.ix[:,16:33]

In [None]:
X.head()

In [None]:
F = feature_selection.f_classif(X, y)[0]
title = 'Cereal Features with F-Values'
plt.figure(figsize=(13, 8))
ax = sns.barplot(x=df.columns[16:33],y=F)
ax.set_title(title)
ax.set(xlabel="Features");


In [None]:
X = df.drop(['score'], axis=1)
y = df['score_label']
X = df.ix[:,16:33]
F = feature_selection.f_classif(X, y)[0]
title = 'Cereal Features with F-Values'
plt.figure(figsize=(13, 8))
ax = sns.barplot(x=df.columns[16:33],y=F)
ax.set_title(title)
ax.set(xlabel="Features");

In [237]:
X = df.ix[:,16:33]

In [238]:
X

Unnamed: 0,Adult,Child,Family,file_name,cropped_path,color_hls1,color_rgb1,color_rgb_name1,color_hls2,color_rgb2,color_rgb_name2,color_hls3,color_rgb3,color_rgb_name3,color_rgb_pri1,color_rgb_pri2,color_rgb_pri3
0,0,0,1,img_001_anniesfruitybunnies.jpg,../img/cropped/img_001_anniesfruitybunnies_cro...,"[0.0, 1.0, 0.0]","[255, 255, 255]",white,"[0.997787610619, 0.494117647059, 0.896825396825]","[239, 13, 16]",red,"[0.00563063063063, 0.317647058824, 0.913580246...","[155, 12, 7]",brown,"(255, 255, 255)","(255, 0, 0)","(165, 42, 42)"
1,0,0,1,img_002_annieshoneybunnies.jpg,../img/cropped/img_002_annieshoneybunnies_crop...,"[0.0451388888889, 0.501960784314, 0.755905511811]","[224, 84, 32]",brown,"[0.113843351548, 0.503921568627, 0.723320158103]","[220, 162, 37]",orange,"[0.068870523416, 0.401960784314, 0.590243902439]","[163, 92, 42]",brown,"(165, 42, 42)","(255, 165, 0)","(165, 42, 42)"
2,0,0,1,img_003_anniescinnamonrollbunnyos.jpg,../img/cropped/img_003_anniescinnamonrollbunny...,"[0.052380952381, 0.427450980392, 0.642201834862]","[179, 83, 39]",brown,"[0.052380952381, 0.427450980392, 0.642201834862]","[179, 83, 39]",brown,"[0.111111111111, 0.952941176471, 0.25]","[246, 244, 240]",white,"(165, 42, 42)","(165, 42, 42)","(255, 255, 255)"
3,0,0,1,img_004_anniesorganicbunnyos.jpg,../img/cropped/img_004_anniesorganicbunnyos_cr...,"[0.769662921348, 0.264705882353, 0.659259259259]","[78, 23, 112]",brown,"[0.766355140187, 0.303921568627, 0.690322580645]","[88, 24, 131]",grey,"[0.766355140187, 0.303921568627, 0.690322580645]","[88, 24, 131]",grey,"(165, 42, 42)","(128, 128, 128)","(128, 128, 128)"
4,0,0,1,img_005_barbarasbakerypuffinspuffscrunchycocoa...,../img/cropped/img_005_barbarasbakerypuffinspu...,"[0.103333333333, 0.607843137255, 0.5]","[205, 167, 105]",grey,"[0.103333333333, 0.607843137255, 0.5]","[205, 167, 105]",grey,"[0.105882352941, 0.53137254902, 0.355648535565]","[178, 147, 93]",grey,"(128, 128, 128)","(128, 128, 128)","(128, 128, 128)"
5,0,0,1,img_006_barbarasbakerypuffinspuffsfruitmedley.jpg,../img/cropped/img_006_barbarasbakerypuffinspu...,"[0.10632183908, 0.776470588235, 0.508771929825]","[227, 206, 169]",white,"[0.10632183908, 0.776470588235, 0.508771929825]","[227, 206, 169]",white,"[0.0959595959596, 0.686274509804, 0.4125]","[208, 180, 142]",grey,"(255, 255, 255)","(255, 255, 255)","(128, 128, 128)"
6,0,0,1,img_007_barbarasbakerypuffinscinnamon.jpg,../img/cropped/img_007_barbarasbakerypuffinsci...,"[0.0913978494624, 0.674509803922, 0.373493975904]","[203, 175, 141]",grey,"[0.0928571428571, 0.560784313725, 0.3125]","[178, 147, 108]",grey,"[0.0915032679739, 0.61568627451, 0.520408163265]","[208, 162, 106]",grey,"(128, 128, 128)","(128, 128, 128)","(128, 128, 128)"
7,0,0,1,img_008_barbarasbakerypuffinshoneyrice.jpg,../img/cropped/img_008_barbarasbakerypuffinsho...,"[0.0913978494624, 0.678431372549, 0.378048780488]","[204, 176, 142]",grey,"[0.0913978494624, 0.678431372549, 0.378048780488]","[204, 176, 142]",grey,"[0.0913978494624, 0.678431372549, 0.378048780488]","[204, 176, 142]",grey,"(128, 128, 128)","(128, 128, 128)","(128, 128, 128)"
8,0,0,1,img_009_barbarasbakerypuffinspeanutbutter.jpg,../img/cropped/img_009_barbarasbakerypuffinspe...,"[0.0888888888889, 0.674509803922, 0.361445783133]","[202, 174, 142]",grey,"[0.0888888888889, 0.674509803922, 0.361445783133]","[202, 174, 142]",grey,"[0.0888888888889, 0.674509803922, 0.361445783133]","[202, 174, 142]",grey,"(128, 128, 128)","(128, 128, 128)","(128, 128, 128)"
9,0,0,1,img_010_barbarasbakerypuffinsoriginal.jpg,../img/cropped/img_010_barbarasbakerypuffinsor...,"[0.0890804597701, 0.678431372549, 0.353658536585]","[202, 175, 144]",grey,"[0.0890804597701, 0.678431372549, 0.353658536585]","[202, 175, 144]",grey,"[0.0928571428571, 0.560784313725, 0.3125]","[178, 147, 108]",grey,"(128, 128, 128)","(128, 128, 128)","(128, 128, 128)"


In [None]:
X = df.drop(['score'], axis=1)
y = df['score_label']
X = df.ix[:,16:33]
F = feature_selection.f_classif(X, y)[0]
title = 'Cereal Features with F-Values'
plt.figure(figsize=(13, 8))
ax = sns.barplot(x=df.columns[16:33],y=F)
ax.set_title(title)
ax.set(xlabel="Features");

In [None]:
zip(*df.head()['rgb_list'])

In [None]:
get_hls_list('../img/cropped/img_001_anniesfruitybunnies_cropped.png', 1)

In [None]:
df['hls_list'] = df.apply(df_get_hls_list, axis=1)

#get_rgb_list('../img/cropped/img_001_anniesfruitybunnies_cropped.png', 3)

In [None]:
df['score'].describe()


In [None]:
df['h'], df['l'], df['s'] = zip(*df.apply(df_get_hls_list, axis=1))

In [None]:




hls_color = lambda x: hls(x, 2)

zip(*df['cropped_path'].map(hls_color))

In [None]:
zip(*df.apply(df_get_hls_list, axis=1))

In [None]:
df["hls_list"].str

In [None]:
df.head()