In [1]:
import requests

def save_image(url, path, timeout=30):
    """Download the resource at ``url`` and write its raw bytes to ``path``.

    Args:
        url: Address fetched with an HTTP GET.
        path: Destination file. It is opened in binary write mode, so an
            existing file at that location is overwritten.
        timeout: Seconds to wait for the server. Without a timeout
            ``requests`` may block indefinitely on a stalled connection.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or a timeout.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly rather than storing an error page under an image file name.
    response.raise_for_status()
    with open(path, 'wb') as handler:
        handler.write(response.content)
        

from PIL.ImageColor import getrgb
import numpy as np

class Palette:
    """A group of exactly six colours given as HTML colour codes."""

    colors = []  # html color codes

    def __init__(self, colors):
        """Keep a reference to ``colors`` after asserting there are six of them."""
        n_colors = len(colors)
        assert n_colors == 6
        self.colors = colors

    def to_numpy(self):
        """Return an array built from the ``getrgb`` result of every colour."""
        rgb_rows = []
        for code in self.colors:
            rgb_rows.append(getrgb(code))
        return np.array(rgb_rows)

In [6]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm

def parse_root(root_url="https://www.design-seeds.com/"):
    """Collect the category links found on the root page.

    Args:
        root_url: Page to download and scan for links.

    Returns:
        list[str]: The ``href`` of every ``<a>`` tag whose ``href`` contains
        the substring ``"category"``, in document order (duplicates are kept).
    """
    soup = BeautifulSoup(requests.get(root_url).content, 'html.parser')
    # href=True skips anchors that carry no href attribute; indexing such a
    # tag with ["href"] would raise KeyError and abort the whole scrape.
    return [
        anchor["href"]
        for anchor in soup.find_all("a", href=True)
        if "category" in anchor["href"]
    ]


def parse_category(category_url="https://www.design-seeds.com/category/seasons/summer/"):
    """Return the design-page links listed on one category page.

    Downloads ``category_url`` and, for every element carrying the class
    ``parker-featured-img``, takes the ``href`` of its first ``<a>`` descendant.
    """
    response = requests.get(category_url)
    soup = BeautifulSoup(response.content, 'html.parser')
    design_urls = []
    for featured in soup.find_all(class_="parker-featured-img"):
        link = featured.find("a")
        design_urls.append(link["href"])
    return design_urls


def parse_design(design_url="https://www.design-seeds.com/seasons/summer/sweet-hues-7/"):
    """Extract the featured image URL and the palette colour codes of a design page.

    Args:
        design_url: Page to download and parse.

    Returns:
        tuple: ``(image_url, colors)``. ``image_url`` is the ``src`` of the
        second child node of the first ``parker-featured-img`` element.
        ``colors`` is the text of each ``<li>`` inside the first
        ``palette-chips`` element, or an empty list when the page has no such
        element.
    """
    soup = BeautifulSoup(requests.get(design_url).content, 'html.parser')
    # contents[1] is the second child node; presumably index 0 is a whitespace
    # text node in the page markup — TODO confirm against the live HTML.
    image_url = soup.find(class_="parker-featured-img").contents[1]["src"]
    chips = soup.find(class_="palette-chips")
    # A page without a palette used to fail with IndexError on [0]; returning
    # no colours lets parse_all's `if colors:` check skip the design instead.
    colors = [elem.text for elem in chips.find_all("li")] if chips is not None else []
    return image_url, colors


def parse_all():
    """Scrape every category and return one row per design that has colours.

    Walks the category links from ``parse_root``, the design links from
    ``parse_category`` and the details from ``parse_design``.

    Returns:
        pandas.DataFrame: Column ``image_url`` holds the image address and
        column ``palette`` holds the array from ``Palette.to_numpy`` for the
        same design.

    Raises:
        AssertionError: If a design lists a number of colours other than six.
    """
    image_urls = []
    palettes = []
    for category_url in tqdm(parse_root()):
        for design_url in parse_category(category_url):
            image_url, colors = parse_design(design_url)
            if not colors:
                continue  # designs without any colour are left out
            assert len(colors) == 6
            # The name must match the one used below; a misspelt assignment
            # here makes `palette` undefined (or a stale notebook global).
            palette = Palette(colors)
            image_urls.append(image_url)
            palettes.append(palette.to_numpy())

    return pd.DataFrame({"image_url": image_urls, "palette": palettes})

In [257]:
# Run the full scrape (root page -> categories -> designs) and keep the table.
data = parse_all()

Category ['category', 'seasons'] parsed...
Category ['seasons', 'summer'] parsed...
Category ['seasons', 'autumn'] parsed...
Category ['seasons', 'winter'] parsed...
Category ['seasons', 'spring'] parsed...
Category ['category', 'in-nature'] parsed...
Category ['in-nature', 'flora'] parsed...
Category ['in-nature', 'heavens'] parsed...
Category ['in-nature', 'succulents'] parsed...
Category ['in-nature', 'creatures'] parsed...
Category ['in-nature', 'nature-made'] parsed...
Category ['category', 'wander'] parsed...
Category ['wander', 'wanderlust'] parsed...
Category ['wander', 'sea'] parsed...
Category ['category', 'studio-hues'] parsed...
Category ['studio-hues', 'collage'] parsed...
Category ['studio-hues', 'maker'] parsed...
Category ['category', 'edible-hues'] parsed...
Category ['edible-hues', 'culinary-color'] parsed...
Category ['edible-hues', 'sweet-tooth'] parsed...


In [263]:
# Write the scraped table to design-seeds.csv in the working directory.
# NOTE(review): the palette column holds numpy arrays, which a CSV can only
# store in their text form — confirm that is acceptable for later readers.
data.to_csv("design-seeds.csv")

In [3]:
# Load the table from design-seeds.csv instead of scraping the site again.
data = pd.read_csv("design-seeds.csv")

In [7]:
import os

def _palette_to_array(palette):
    """Return ``palette`` as an array, parsing it first when it is text."""
    if not isinstance(palette, str):
        return np.asarray(palette)
    # Text form looks like "[[r g b]\n [r g b] ...]": drop the opening
    # brackets, split on the closing ones and read each row's integers.
    rows = [row for row in palette.replace("[", "").split("]") if row.strip()]
    return np.array([[int(value) for value in row.split()] for row in rows])


def save_images(df):
    """Download every image in ``df`` and store its palette next to it.

    For each row, the image at ``image_url`` is written to ``data/<index>.jpg``
    and the palette to ``data/<index>.npy``. The ``data`` directory is created
    when it does not exist.

    Args:
        df: DataFrame with the columns ``image_url`` and ``palette``. A palette
            may be an array or the text form of one, which is what a table
            reloaded from CSV contains; text is parsed back into integers so
            the ``.npy`` file holds numbers rather than a string.
    """
    if not os.path.exists("data"):
        os.mkdir("data")
    for i, row in tqdm(df.iterrows()):
        image_url = row["image_url"]
        save_image(image_url, f"data/{i}.jpg")
        np.save(f"data/{i}.npy", _palette_to_array(row["palette"]))

In [8]:
# Download each row's image and store its palette under data/.
save_images(data)

120it [02:22,  1.19s/it]
