## Imports

In [1]:
import pandas as pd
import numpy as np

from tqdm import tqdm
import time

from selenium import webdriver
from selenium.webdriver.common.by import By

from urllib.request import urlretrieve

import cv2

import sklearn
from sklearn.cluster import KMeans
from collections import Counter

import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

## Collect information and images

In [2]:
# this is the page where the paint swatch colors are from
url = 'https://www.williamsburgoils.com/products/colors'

# prepare the option for the chrome driver
options = webdriver.ChromeOptions()
options.add_argument('headless')

# start chrome browser
browser = webdriver.Chrome(options=options)

# open url
browser.get(url)

# allow to load
time.sleep(5)

In [3]:
# create an empty list to store the results of the following for loop
colors = []

# iterate through all of elements ("e") with the class name of "colorColumn"
for e in tqdm(browser.find_elements(By.CLASS_NAME, "colorColumn")):
    # create a new dictionary for each loop where the information from
    # each color can be stored
    color = {}

    # the text in each element returns the official name of the color
    # and the alternate name of the color as one item with a line break
    
    # splitting the text up creates a list that can be used to identify
    # the official name and the alternate name by index value, 
    # as done below
    text = e.text.split('\n')
    
    # add each value to the color dictionary created at the 
    # top of the loop
    color['name'] = text[0]
    color['alt_name'] = text[1]
    
    # each "e" contained multiple child elements ("line"), not all 
    # of which have the tag "img". if the line element contains 
    # the "img" tag, the code below grabs the "src" attribute and 
    # saves it to the color dictionary to be used a couple cells down
    for line in e.find_elements(By.TAG_NAME, 'img'):
        color['swatch_image_url'] = line.get_attribute('src')
    
    # append the color dict to the list created above
    colors.append(color)
    
    # sleep for a couple seconds to not hit the page too many times
    time.sleep(2)

# create a dataframe from the information collected
df = pd.DataFrame(colors)

100%|██████████| 176/176 [05:57<00:00,  2.03s/it]


In [4]:
# sorting the dataframe alphabetically by name
df = df.sort_values(by='name').reset_index(drop=True)

In [5]:
# create new columns with the paths to where the original 
# image files and the returned image files will be stored
df['path_to_image'] = df['name'].map(
    lambda x: f"./images/{x.lower().replace(' ','_')}_swatch.jpg")

df['path_to_returned_color'] = df['name'].map(
    lambda x: f"./returned_colors/{x.lower().replace(' ','_')}_returned.jpg")

In [6]:
# iterate through each row in the dataframe
for i in tqdm(range(len(df))):

    # call the "src" attribute collected while scraping
    # Williamsburg Oils' website for the url
    img_url = df.loc[i, 'swatch_image_url']

    # call the path created in the cell above that will
    # store the images in the images directory
    save_path = df.loc[i, 'path_to_image']

    # use url retrive with the variables above to save the images
    urlretrieve(img_url, save_path)

    # sleep to not hit the site too many times
    time.sleep(2)

100%|██████████| 176/176 [06:03<00:00,  2.07s/it]


## Computer vision & color identification

In this section, I adapted and/or pulled much of the code from [this article](https://towardsdatascience.com/image-color-identification-with-machine-learning-and-image-processing-using-python-f3dd0606bdca) by Piero Paialunga in Italy.

I used functions from the article to read in the image and convert to RGB using `cv2`, and then created another function the combined and adapted code that would read in the array, reshape it and use KMeans clustering to create the label and find the centroid for the image.

In [7]:
# create a function to read the image unchanged and return
# the image as an array of arrays in RGB format
def get_image(path):
    image = cv2.imread(path, cv2.IMREAD_UNCHANGED)
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    return image

# create a function that returns the hex value
def RGB2HEX(color):
    return "#{:02x}{:02x}{:02x}".format(int(color[0]), int(color[1]),
                                        int(color[2]))

In the function below, I set the default for the hyperparameter `number_of_colors` to 10, following from the original code I adapted this from. However, in the cell where I ultimately use the function, I pass in 1 for `number_of_colors` because I want the cetroid for the image as a whole. 

In [8]:
# create a function that takes in an image and creates labels
# using KMeans clustering and returns the most common label
# as both a hex and an RGB color

# note that the default number_of_colors is set to 10, but
# this can be tuned based on the user's needs
def get_most_common_color(path, number_of_colors=10):
    # get the image path
    image = get_image(path)

    # get the number of colors we want to retrieve
    number_of_colors = number_of_colors

    # reshape the image
    modified_image = image.reshape(image.shape[0] * image.shape[1], 3)

    #instantiate the KMeans model
    clf = KMeans(n_clusters=number_of_colors)

    # get the labels for the image from the model
    labels = clf.fit_predict(modified_image)

    # get the counts of the labels produced from the prediction
    counts = Counter(labels)

    # get the RGB values from the center of the clusters
    center_colors = clf.cluster_centers_

    # get ordered colors by iterating through the keys
    ordered_colors = [center_colors[i] for i in counts.keys()]

    # get the hex and rgb colors as dicts
    hex_dict = {i: RGB2HEX(ordered_colors[i]) for i in counts.keys()}
    rgb_dict = {i: ordered_colors[i] for i in counts.keys()}

    # most common label
    most_common_label = counts.most_common()[0][0]

    # most common hex color
    most_common_hex = hex_dict[most_common_label]

    # most common RGB values as integers
    most_common_rgb = [int(i) for i in rgb_dict[most_common_label]]

    return most_common_hex, most_common_rgb

In [9]:
# get the most common hex and rgb colors with n_clusters == 1
hex_rgb = df.path_to_image.map(lambda x: get_most_common_color(x,1))

# pull out the hex and rgb colors for the returned tuples and
# save them in the dataframe
df['most_common_hex'] = hex_rgb.map(lambda x: x[0])
df['most_common_rgb'] = hex_rgb.map(lambda x: x[1])

Unnamed: 0,name,alt_name,swatch_image_url,path_to_image,path_to_returned_color,most_common_hex,most_common_rgb
0,Alizarin Crimson,#6000684,https://goldenhub.goldenpaints.com/storage/upl...,./images/alizarin_crimson_swatch.jpg,./returned_colors/alizarin_crimson_returned.jpg,#542223,"[84, 34, 35]"
1,Alizarin Orange,#6000534,https://goldenhub.goldenpaints.com/storage/upl...,./images/alizarin_orange_swatch.jpg,./returned_colors/alizarin_orange_returned.jpg,#b25417,"[178, 84, 23]"
2,Alizarin Yellow,#6000514,https://goldenhub.goldenpaints.com/storage/upl...,./images/alizarin_yellow_swatch.jpg,./returned_colors/alizarin_yellow_returned.jpg,#956a22,"[149, 106, 34]"
3,Bismuth Vanadate Yellow,#6001929,https://goldenhub.goldenpaints.com/storage/upl...,./images/bismuth_vanadate_yellow_swatch.jpg,./returned_colors/bismuth_vanadate_yellow_retu...,#f4dd0c,"[244, 221, 12]"
4,Bohemian Green Earth,#6001021,https://goldenhub.goldenpaints.com/storage/upl...,./images/bohemian_green_earth_swatch.jpg,./returned_colors/bohemian_green_earth_returne...,#353821,"[53, 56, 33]"


In [10]:
# save the resulting dataframe as a csv file
df.to_csv('./williamsburg_oil_colors.csv', index=False)

## Create files from each of the resulting colors

In [11]:
# create jpg files for each of the resulting colors

# iterate through each row in the dataframe
for i in tqdm(range(len(df))):
    # create an array of arrays filled with the RGB color
    # identified from the computer vision and clustering
    rgb_img = [[df.loc[i, 'most_common_rgb']]*100]*100
    
    # update parameters for the figure
    plt.rcParams['figure.figsize'] = [2.5, 2.5]
    plt.rcParams['figure.constrained_layout.h_pad'] = 0
    plt.rcParams['figure.constrained_layout.w_pad'] = 0
    plt.rcParams['figure.constrained_layout.hspace'] = 0
    plt.rcParams['figure.constrained_layout.wspace'] = 0
    plt.rcParams['figure.constrained_layout.use'] = True
    plt.rcParams['axes.spines.left'] = 0
    plt.rcParams['axes.spines.right'] = 0
    plt.rcParams['axes.spines.bottom'] = 0
    plt.rcParams['axes.spines.top'] = 0

    # remove x and y ticks
    plt.xticks(ticks=[])
    plt.yticks(ticks=[])
    
    # plot the figure filled with the full color
    plt.imshow(rgb_img)
    
    # save the figure
    plt.savefig(df.loc[i, 'path_to_returned_color'])
    
    # close the figure
    plt.close()

100%|██████████| 176/176 [00:09<00:00, 18.43it/s]


In [12]:
print('|Color Name|Hex color|RGB Color|Paint swatch|Returned color|')
print('|-----|-----|-----|-----|-----|')
for i in range(len(df)):
    print(f"|{df.loc[i,'name']}|{df.loc[i,'most_common_hex']}|{df.loc[i,'most_common_rgb']}|![]({df.loc[i,'path_to_image']})|![]({df.loc[i,'path_to_returned_color']})|")

|Color Name|Hex color|RGB Color|Paint swatch|Returned color|
|-----|-----|-----|-----|-----|
|Alizarin Crimson|#542223|[84, 34, 35]|![](./images/alizarin_crimson_swatch.jpg)|![](./returned_colors/alizarin_crimson_returned.jpg)|
|Alizarin Orange|#b25417|[178, 84, 23]|![](./images/alizarin_orange_swatch.jpg)|![](./returned_colors/alizarin_orange_returned.jpg)|
|Alizarin Yellow|#956a22|[149, 106, 34]|![](./images/alizarin_yellow_swatch.jpg)|![](./returned_colors/alizarin_yellow_returned.jpg)|
|Bismuth Vanadate Yellow|#f4dd0c|[244, 221, 12]|![](./images/bismuth_vanadate_yellow_swatch.jpg)|![](./returned_colors/bismuth_vanadate_yellow_returned.jpg)|
|Bohemian Green Earth|#353821|[53, 56, 33]|![](./images/bohemian_green_earth_swatch.jpg)|![](./returned_colors/bohemian_green_earth_returned.jpg)|
|Brilliant Yellow Extra Pale|#f4f0d4|[244, 240, 212]|![](./images/brilliant_yellow_extra_pale_swatch.jpg)|![](./returned_colors/brilliant_yellow_extra_pale_returned.jpg)|
|Brilliant Yellow Pale|#f4eca