In [3]:
import pandas as pd
df = pd.read_json('./data.json')
df.head()
df.columns
pd.set_option('display.max_colwidth', None)



In [4]:

import requests
from colorthief import ColorThief
from io import BytesIO

# Function to get the dominant color from an image URL
def get_main_color(image_url):
    try:
        # Fetch the image using the requests library
        response = requests.get(image_url)
        image = BytesIO(response.content)
        
        # Use ColorThief to get the dominant color
        color_thief = ColorThief(image)
        dominant_color = color_thief.get_color(quality=10)  # Get the dominant color (10 = lower quality for faster processing)
        
        # Convert dominant color to a hex string
        return f"#{dominant_color[0]:02x}{dominant_color[1]:02x}{dominant_color[2]:02x}"
    except Exception as e:
        return None  # In case of any error (e.g., invalid image link)



In [5]:
import pandas as pd
import requests
from PIL import Image
from sklearn.cluster import KMeans
import numpy as np
from io import BytesIO


# Function to get multiple main colors from an image URL using KMeans clustering
def get_main_colors2(image_url, n_colors=3):
    try:
        # Fetch the image using the requests library
        response = requests.get(image_url)
        image = Image.open(BytesIO(response.content))

        # Resize the image to speed up processing (optional)
        image = image.resize((image.width // 5, image.height // 5))

        # Convert the image to RGB (in case it's in another format like RGBA)
        image = image.convert('RGB')

        # Get the pixels as a numpy array
        pixels = np.array(image)

        # Reshape the pixel array to a 2D array (each row is a pixel)
        pixels = pixels.reshape(-1, 3)

        # Use KMeans clustering to find the most common colors
        kmeans = KMeans(n_clusters=n_colors, random_state=42)
        kmeans.fit(pixels)

        # Get the RGB values of the cluster centers (the main colors)
        colors = kmeans.cluster_centers_

        # Convert RGB values to hex
        hex_colors = [f"#{int(color[0]):02x}{int(color[1]):02x}{int(color[2]):02x}" for color in colors]
        return hex_colors
    except Exception as e:
        print(f"Error processing image {image_url}: {e}")
        return []



In [6]:
import numpy as np
from PIL import Image
from sklearn.cluster import KMeans
import requests
from io import BytesIO
def rgb_to_hex(rgb):
    return '#{:02x}{:02x}{:02x}'.format(rgb[0], rgb[1], rgb[2])

# Function to get the main colors and the bounding boxes for those colors
def get_main_colors_and_swatch_coords(image_url, n_colors=3, tolerance=30):
    try:
        # Fetch the image using the requests library
        response = requests.get(image_url)
        image = Image.open(BytesIO(response.content))

        # Resize the image to speed up processing (optional)
        image = image.resize((image.width // 5, image.height // 5))

        # Convert the image to RGB (in case it's in another format like RGBA)
        image = image.convert('RGB')

        # Get the pixels as a numpy array
        pixels = np.array(image)

        # Reshape the pixel array to a 2D array (each row is a pixel)
        pixels = pixels.reshape(-1, 3)

        # Use KMeans clustering to find the most common colors
        kmeans = KMeans(n_clusters=n_colors, random_state=42)
        kmeans.fit(pixels)

        # Get the RGB values of the cluster centers (the main colors)
        colors = kmeans.cluster_centers_.astype(int)

        # Assign each pixel to the nearest cluster
        labels = kmeans.labels_

        # Prepare a list to store bounding box details (x, y, width, height, color)
        swatches = []

        # For each color (cluster), find the bounding box (min, max x and y)
        for color in colors:
            # Find the pixels that are closest to the current color (within tolerance)
            # We'll find pixels where the color difference is within a given tolerance
            diffs = np.abs(pixels - color)
            distances = np.sqrt(np.sum(diffs**2, axis=1))
            matching_pixels = np.where(distances <= tolerance)[0]  # Pixel indices that match the color

            if len(matching_pixels) == 0:
                continue  # Skip if no matching pixels are found for this color

            # Get the x, y coordinates of these pixels
            y_coords, x_coords = np.unravel_index(matching_pixels, (image.height, image.width))

            # Find the bounding box for the current color
            min_x = np.min(x_coords)
            max_x = np.max(x_coords)
            min_y = np.min(y_coords)
            max_y = np.max(y_coords)

            # Add the swatch information (x, y, width, height, color)
            swatches.append({
                'color': rgb_to_hex(color),  # RGB color as a tuple
                'x': int(min_x),
                'y': int(min_y),
                'width': int(max_x - min_x),
                'height': int(max_y - min_y),
                'origional_image_width':image.size[0],
                'selected':0 #0 if unselected, 1 if selectd
            })

        # Return the list of swatch details (coordinates, size, color)
        return swatches

    except Exception as e:
        print(f"Error processing image {image_url}: {e}")
        return []

# Example usage
image_url = 'https://ids.si.edu/ids/deliveryService?id=NMAAHC-2007_3_568_001'  # Replace with your image URL
swatches = get_main_colors_and_swatch_coords(image_url, n_colors=5, tolerance = 1)

# Print swatch details: x, y, width, height, color
import pprint
pprint.pprint(swatches)



[{'color': '#aa5f30',
  'height': 0,
  'origional_image_width': 248,
  'selected': 0,
  'width': 0,
  'x': 83,
  'y': 64},
 {'color': '#dedfda',
  'height': 354,
  'origional_image_width': 248,
  'selected': 0,
  'width': 247,
  'x': 0,
  'y': 0},
 {'color': '#6e3112',
  'height': 45,
  'origional_image_width': 248,
  'selected': 0,
  'width': 165,
  'x': 53,
  'y': 95}]


In [7]:

# Apply the function to the DataFrame
df = df[:1000]
df['main_color'] = df['image_link'].apply(get_main_colors2)
df['swatches'] =df['image_link'].apply(get_main_colors_and_swatch_coords)


In [8]:
#Get the most vibrant color out of the selected colors
import colorsys
def hex_to_rgb(hex_color):
    hex_color = hex_color.lstrip('#') 
    return tuple(int(hex_color[i:i+2], 16) for i in (0, 2, 4))

def rgb_to_hsl(r, g, b):
    r, g, b = r / 255.0, g / 255.0, b / 255.0
    h, s, l = colorsys.rgb_to_hls(r, g, b)
    return h, s, l

def get_vibrant_color(colors):
    return max(colors, key=lambda color: rgb_to_hsl(*hex_to_rgb(color))[1])
df['vibrant_color'] = df['main_color'].apply(get_vibrant_color)


In [9]:
#cluster each datapoint based on the vibrant color

vibrant_colors = np.array(df['vibrant_color'].apply(hex_to_rgb).tolist()) 
vibrant_colors_hsl = np.array([rgb_to_hsl(r, g, b) for r, g, b in vibrant_colors])


#DO Kmeans clustering
num_clusters = 10 
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
kmeans.fit(vibrant_colors_hsl)

# save cluster info for each datapoint
df['cluster_num'] = kmeans.labels_

cluster_centers = kmeans.cluster_centers_
df['cluster_x'] =[cluster_centers[label][0] for label in kmeans.labels_]
df['cluster_y'] = [cluster_centers[label][1] for label in kmeans.labels_]


cluster_x_range =(cluster_centers[:,0].min(), cluster_centers[:,0].max())
cluster_y_range =(cluster_centers[:,1].min(), cluster_centers[:,1].max())
df['cluster_x_range'] = [cluster_x_range] *len(df)
df['cluster_y_range'] = [cluster_y_range]* len(df)


In [10]:
#create a new dataset with the clustering info
cluster_info =[]
def rgb_to_hex(r, g, b):
    # Convert RGB to a hex color code and return it
    return f"#{r:02x}{g:02x}{b:02x}".upper()
for i in range(num_clusters):
    cluster_data = df[df['cluster_num'] == i]
    cluster_center = kmeans.cluster_centers_[i]
    
    #cluster positon info
    cluster_x = cluster_center[0]  # Hue
    cluster_y = cluster_center[1]  # Saturation
    cluster_x_range = (cluster_data['cluster_x'].min(), cluster_data['cluster_x'].max())
    cluster_y_range = (cluster_data['cluster_y'].min(), cluster_data['cluster_y'].max())


    #cluster HSL averages, maxes, etc
    cluster_rgb_avg = np.mean(vibrant_colors[cluster_data.index], axis=0)
    cluster_rgb_avg_hex = rgb_to_hex(int(cluster_rgb_avg[0]), int(cluster_rgb_avg[1]), int(cluster_rgb_avg[2]))

    cluster_hsl = vibrant_colors_hsl[cluster_data.index]
    saturations = cluster_hsl[:, 1] 
    max_saturation_idx = np.argmax(saturations)
    most_saturated_rgb = vibrant_colors[cluster_data.index][max_saturation_idx]
    most_saturated_rgb_hex = rgb_to_hex(int(most_saturated_rgb[0]), int(most_saturated_rgb[1]), int(most_saturated_rgb[2]))

    #creating the cluster entry
    cluster_info.append({
        'cluster_num': i,
        'cluster_x': cluster_x,
        'cluster_y': cluster_y,
        'cluster_x_range': cluster_x_range,
        'cluster_y_range': cluster_y_range,
        'cluster_size': len(cluster_data),
        'average_color': cluster_rgb_avg_hex,
        'most_saturated_color': most_saturated_rgb_hex,
    })
cluster_info_df = pd.DataFrame(cluster_info)
cluster_info_df.head()


Unnamed: 0,cluster_num,cluster_x,cluster_y,cluster_x_range,cluster_y_range,cluster_size,average_color,most_saturated_color
0,0,0.23107,0.776179,"(0.2310695512901395, 0.2310695512901395)","(0.7761791202967674, 0.7761791202967674)",74,#C6C7C3,#ECEEEC
1,1,0.922268,0.66436,"(0.9222682571480604, 0.9222682571480604)","(0.6643598615916955, 0.6643598615916955)",17,#ADA5A7,#CBC9CA
2,2,0.128767,0.741912,"(0.1287668688394064, 0.1287668688394064)","(0.7419117647058824, 0.7419117647058824)",24,#E0C599,#DDE9FD
3,3,0.122383,0.880344,"(0.12238285088894844, 0.12238285088894844)","(0.8803443328550933, 0.8803443328550933)",82,#E2E1DE,#FEFEFE
4,4,0.105586,0.578161,"(0.10558627773500176, 0.10558627773500176)","(0.57816091954023, 0.57816091954023)",58,#98948E,#B8B3B1


In [11]:
#save each dataset
df.head()
df.to_json('df_colorImage.json', orient="records", indent=4)
cluster_info_df.to_json('cluster_info.json', orient="records", indent=4)