In [1]:
import requests
import argparse
from bs4 import BeautifulSoup
from tqdm import tqdm
import io
import os
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from sqlalchemy.exc import IntegrityError

from multiprocessing import Pool
from PIL import Image

from repalette.constants import BASE_DATA_DIR, RAW_DATA_DIR, DATA_DIR, DATABASE_PATH
from repalette.utils.models import RawImage
from repalette.utils.models import Base

In [2]:
# Base URL of the paginated Design Seeds blog; the page number is appended to it.
DESIGN_SEEDS_PAGES_ROOT = "https://www.design-seeds.com/blog/page/"

In [3]:
from repalette.utils.data_preparation import get_image_urls_and_palettes, get_image_name

In [4]:
# Upper bound on the number of listing pages to crawl (pages 0 .. MAX_PAGES - 1).
MAX_PAGES = 5
# Seconds to wait on the server before a request is abandoned.
REQUEST_TIMEOUT = 30

# Parallel lists: palettes[k] belongs to image_urls[k].
image_urls = []
palettes = []

# The context manager closes the progress bar even if a request or parse fails.
with tqdm(desc="Pages downloaded") as bar:
    for page_index in range(MAX_PAGES):
        response = requests.get(
            DESIGN_SEEDS_PAGES_ROOT + str(page_index),
            timeout=REQUEST_TIMEOUT,
        )
        # A 404 means we ran past the last available page.
        if response.status_code == 404:
            break

        bs = BeautifulSoup(response.content, "html.parser")
        posts = bs.find_all(class_="entry-content")

        for post in posts:
            image_tag = post.find(class_="attachment-full")
            if image_tag is None or not image_tag.get("src"):
                # Post without a full-size image: skip it entirely so that
                # image_urls and palettes stay index-aligned.
                continue

            # The first <h5> is skipped; presumably it is not a colour entry
            # -- TODO confirm against the page markup.
            palette = [header.text for header in post.find_all("h5")[1:]]

            image_urls.append(image_tag["src"])
            palettes.append(palette)

        # Count only pages that were actually fetched and parsed.
        bar.update(n=1)

Pages downloaded: 6it [00:06,  1.23s/it]

In [4]:
# Make sure every data directory exists. makedirs(exist_ok=True) also creates
# missing parent directories and does not fail when the directory already
# exists (or appears between a check and the creation).
for directory in (BASE_DATA_DIR, RAW_DATA_DIR, DATA_DIR):
    os.makedirs(directory, exist_ok=True)

# Engine for the SQLite database file located at DATABASE_PATH.
engine = create_engine(f"sqlite:///{DATABASE_PATH}")
# Session factory bound to that engine; call Session() to open a session.
Session = sessionmaker(bind=engine)
# Create the tables of all models registered on Base.
Base.metadata.create_all(engine)

In [6]:
def download_image_to_database(image, timeout=30):
    """Download one image, register it in the database and save it to disk.

    Parameters
    ----------
    image : tuple
        ``(image_url, palette)`` pair describing the image to fetch.
    timeout : float, optional
        Seconds ``requests`` waits on the server before raising, so a stalled
        download cannot block the caller forever. Defaults to 30.

    Notes
    -----
    An ``IntegrityError`` raised while committing is treated as "image already
    in the database": the transaction is rolled back and nothing is written to
    disk. Any other exception propagates to the caller. The session is always
    closed before the function returns.
    """
    image_url, palette = image
    image_name = get_image_name(image_url)
    image_save_path = os.path.join(DATA_DIR, image_name)

    # One short-lived session per call.
    session = Session()
    try:
        raw_image = RawImage(
            path=image_save_path,
            palette=palette,
            url=image_url,
            name=image_name,
        )
        # add() only stages the row; constraints are checked when commit()
        # flushes the INSERT below.
        session.add(raw_image)

        image_data = requests.get(image_url, timeout=timeout).content
        # Separate name so the ``image`` argument is not shadowed.
        pil_image = Image.open(io.BytesIO(image_data))
        raw_image.height = pil_image.height
        raw_image.width = pil_image.width

        session.commit()
        # Write the file only after the row has been stored successfully.
        pil_image.save(image_save_path, "PNG")
    except IntegrityError:  # image already in the database
        # Discard the failed transaction so the connection is left clean.
        session.rollback()
    finally:
        # Always release the connection, whatever happened above.
        session.close()


Pages downloaded: 6it [00:13,  2.21s/it]A

  3%|▎         | 1/30 [00:01<00:49,  1.71s/it][A
  7%|▋         | 2/30 [00:01<00:35,  1.27s/it][A
 13%|█▎        | 4/30 [00:02<00:23,  1.09it/s][A
 17%|█▋        | 5/30 [00:02<00:18,  1.38it/s][A
 23%|██▎       | 7/30 [00:02<00:12,  1.85it/s][A
 30%|███       | 9/30 [00:02<00:08,  2.34it/s][A
 33%|███▎      | 10/30 [00:03<00:08,  2.39it/s][A
 43%|████▎     | 13/30 [00:03<00:05,  2.92it/s][A
 47%|████▋     | 14/30 [00:04<00:05,  3.15it/s][A
 50%|█████     | 15/30 [00:04<00:05,  2.88it/s][A
 57%|█████▋    | 17/30 [00:04<00:03,  3.35it/s][A
 63%|██████▎   | 19/30 [00:05<00:02,  4.21it/s][A
 70%|███████   | 21/30 [00:05<00:02,  3.93it/s][A
 73%|███████▎  | 22/30 [00:06<00:03,  2.41it/s][A
 77%|███████▋  | 23/30 [00:06<00:02,  2.79it/s][A
 83%|████████▎ | 25/30 [00:06<00:01,  3.62it/s][A
 87%|████████▋ | 26/30 [00:06<00:00,  4.41it/s][A
 90%|█████████ | 27/30 [00:07<00:00,  5.22it/s][A
 93%|█████████▎| 28/30 [00:07<00:00,  3.98it

In [5]:
# Open a session and fetch every stored RawImage row as a list.
session = Session()
raw_image_query = session.query(RawImage)
raw_images = raw_image_query.all()

In [6]:
# Bare last expression: the notebook displays the full list of loaded RawImage rows.
raw_images

[<repalette.utils.models.raw_image.RawImage at 0x7ffa6397ab80>,
 <repalette.utils.models.raw_image.RawImage at 0x7ffa6397abe0>,
 <repalette.utils.models.raw_image.RawImage at 0x7ffa63986160>,
 <repalette.utils.models.raw_image.RawImage at 0x7ffa639861f0>,
 <repalette.utils.models.raw_image.RawImage at 0x7ffa63986280>,
 <repalette.utils.models.raw_image.RawImage at 0x7ffa63986310>,
 <repalette.utils.models.raw_image.RawImage at 0x7ffa639863a0>,
 <repalette.utils.models.raw_image.RawImage at 0x7ffa63986430>,
 <repalette.utils.models.raw_image.RawImage at 0x7ffa639864c0>,
 <repalette.utils.models.raw_image.RawImage at 0x7ffa63986550>,
 <repalette.utils.models.raw_image.RawImage at 0x7ffa63986610>,
 <repalette.utils.models.raw_image.RawImage at 0x7ffa639866d0>,
 <repalette.utils.models.raw_image.RawImage at 0x7ffa63986790>,
 <repalette.utils.models.raw_image.RawImage at 0x7ffa63986850>,
 <repalette.utils.models.raw_image.RawImage at 0x7ffa63986910>,
 <repalette.utils.models.raw_image.RawIm