In [1]:
from PIL.ImageColor import getrgb
import numpy as np

class Palette:
    colors = []  # html color codes
    def __init__(self, colors):
        assert len(colors) == 6
        self.colors = colors
        
    def to_numpy(self):
        numpy_colors = np.array([getrgb(color) for color in self.colors])
        return numpy_colors

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm

def parse_root(root_url="https://www.design-seeds.com/"):
    soup = BeautifulSoup(requests.get(root_url).content, 'html.parser')
    category_urls = []
    for url in [url["href"] if "category" in url["href"] else None for url in soup.find_all("a")]:
        if url:
            category_urls.append(url)
    return category_urls


def parse_category(category_url="https://www.design-seeds.com/category/seasons/summer/"):
    soup = BeautifulSoup(requests.get(category_url).content, 'html.parser')
    design_urls = [elem.find("a")["href"] for elem in soup.find_all(class_="parker-featured-img")]
    return design_urls


def parse_design(design_url="https://www.design-seeds.com/seasons/summer/sweet-hues-7/"):
    soup = BeautifulSoup(requests.get(design_url).content, 'html.parser')
    image_url = soup.find(class_="parker-featured-img").contents[1]["src"]
    colors = [elem.text for elem in  soup.find_all(class_="palette-chips")[0].find_all("li")]
    return image_url, colors


def parse_all():
    image_urls = []
    palettes = []
    for category_url in tqdm(parse_root()):
        for design_url in parse_category(category_url):
            image_url, colors = parse_design(design_url)
            if colors:
                assert len(colors) == 6
                palette = Palette(colors)
                image_urls.append(image_url)
                palettes.append(palette.to_numpy())
            
    return pd.DataFrame({"image_url": image_urls, "palette": palettes})

In [214]:
data = parse_all()

100%|██████████| 20/20 [01:23<00:00,  4.17s/it]


In [215]:
data.to_csv("design-seeds-download.csv", index=False)

In [216]:
data = pd.read_csv("design-seeds-download.csv")

In [217]:
data.head()

Unnamed: 0,image_url,palette
0,https://www.design-seeds.com/wp-content/upload...,[[238 222 206]\n [214 181 170]\n [170 124 114]...
1,https://www.design-seeds.com/wp-content/upload...,[[232 216 204]\n [206 167 156]\n [ 97 86 78]...
2,https://www.design-seeds.com/wp-content/upload...,[[225 182 211]\n [145 70 95]\n [ 87 70 68]...
3,https://www.design-seeds.com/wp-content/upload...,[[243 240 242]\n [247 233 228]\n [251 185 171]...
4,https://www.design-seeds.com/wp-content/upload...,[[175 189 184]\n [ 84 130 121]\n [ 55 46 46]...


In [3]:
import requests

def save_image(url, path):
    img_data = requests.get(url).content
    with open(path, 'wb') as handler:
        handler.write(img_data)

In [4]:
import os

def save_images(df):
    if not os.path.exists("data"):
        os.mkdir("data")
    for i, row in tqdm(df.iterrows(), total=len(data)):
        image_url = row["image_url"]
        save_image(image_url, f"data/{i}.jpg")

In [220]:
save_images(data)

100%|██████████| 120/120 [02:27<00:00,  1.23s/it]


In [221]:
for i, row in data.iterrows():
    row["image_url"] = f"data/{i}.jpg"
data.columns = ["image_path", "palette"]
data.to_csv("design-seeds.csv", index=False)

In [5]:
data = pd.read_csv("design-seeds.csv")

In [6]:
data

Unnamed: 0,image_path,palette
0,data/0.jpg,[[238 222 206]\n [214 181 170]\n [170 124 114]...
1,data/1.jpg,[[232 216 204]\n [206 167 156]\n [ 97 86 78]...
2,data/2.jpg,[[225 182 211]\n [145 70 95]\n [ 87 70 68]...
3,data/3.jpg,[[243 240 242]\n [247 233 228]\n [251 185 171]...
4,data/4.jpg,[[175 189 184]\n [ 84 130 121]\n [ 55 46 46]...
...,...,...
115,data/115.jpg,[[252 230 232]\n [247 122 146]\n [146 218 183]...
116,data/116.jpg,[[253 209 201]\n [177 215 184]\n [241 110 136]...
117,data/117.jpg,[[177 227 208]\n [103 189 151]\n [214 164 86]...
118,data/118.jpg,[[240 211 199]\n [232 125 116]\n [179 46 34]...


In [76]:
import cv2
from PIL import Image
def crop_image(path="data/0.jpg"):
    MIN_PIXELS = 30467
    img = cv2.imread(path)
    if np.prod(img.shape) < MIN_PIXELS:
        print(np.prod(img.shape), path)
        return None
    if img.shape[0] / img.shape[1] < 1:
        y1_relative = 740 / 1197
        y2_relative = 745 / 1197
        y1 = int(y1_relative * img.shape[1])
        y2 = int(y2_relative * img.shape[1])
        
        y3_relative = 302 / 470
        y4_relative = 307 / 470
        y3 = int(y3_relative * img.shape[1])
        y4 = int(y4_relative * img.shape[1])
        
        if img[:, y1:y2, :].mean() > 254:
            y_cut = int(725 / 1197 * img.shape[1])
            return img[:, :y_cut, :]
        elif img[:, y3:y4, :].mean() > 254:
            y_cut = int(300 / 740 * img.shape[1])
            return img[:, :y_cut, :]
        else:
            print(path)
            return None
    else:
        y1_relative = 800 / 1307
        y2_relative = 805 / 1307
        y1 = int(y1_relative * img.shape[0])
        y2 = int(y2_relative * img.shape[0])
        
        y3_relative = 268 / 436
        y4_relative = 272 / 436
        y3 = int(y3_relative * img.shape[0])
        y4 = int(y4_relative * img.shape[0])
        
        if img[y1:y2, :, :].mean() > 254:
            y_cut = int(795 / 1307 * img.shape[0])
            return img[:y_cut, :, :]
        elif img[y3:y4, :, :].mean() > 254:
            y_cut = int(265 / 436 * img.shape[0])
            return img[:y_cut, :, :]
        else:
            print(path)
            return None

In [77]:
import os
def crop_images():
    image_paths = []
    palettes = []
    if not os.path.exists("data-cropped"):
        os.makedirs("data-cropped")
    i = 0
    for _, row in tqdm(data.iterrows()):
#         print(row["image_path"])
        img = crop_image(row["image_path"])
        if img is not None:
            cv2.imwrite(f"data-cropped/{i}.jpg", img)
            i += 1
            image_paths.append(f"data-cropped/{i}.jpg")
            palettes.append(row["palette"])
            
    return pd.DataFrame({"path": image_paths, "palette": palettes})

In [78]:
data_cropped = crop_images()

120it [00:04, 27.42it/s]


In [79]:
data_cropped

Unnamed: 0,path,palette
0,data-cropped/1.jpg,[[238 222 206]\n [214 181 170]\n [170 124 114]...
1,data-cropped/2.jpg,[[232 216 204]\n [206 167 156]\n [ 97 86 78]...
2,data-cropped/3.jpg,[[225 182 211]\n [145 70 95]\n [ 87 70 68]...
3,data-cropped/4.jpg,[[243 240 242]\n [247 233 228]\n [251 185 171]...
4,data-cropped/5.jpg,[[175 189 184]\n [ 84 130 121]\n [ 55 46 46]...
...,...,...
115,data-cropped/116.jpg,[[252 230 232]\n [247 122 146]\n [146 218 183]...
116,data-cropped/117.jpg,[[253 209 201]\n [177 215 184]\n [241 110 136]...
117,data-cropped/118.jpg,[[177 227 208]\n [103 189 151]\n [214 164 86]...
118,data-cropped/119.jpg,[[240 211 199]\n [232 125 116]\n [179 46 34]...


In [81]:
data_cropped.to_csv("design-seeds-cropped.csv", index=False)

255.0

(398, 470, 3)