In [1]:
import os
import shutil
from logger import logger

In [2]:
# Dataset locations, given relative to the working directory.
# train/ and test/ hold the image sets; discarded/ receives images that
# are filtered out of train/ further down.
train_dir = '../data/images_clothes/train/'
test_dir = '../data/images_clothes/test/'
discarded_dir = '../data/images_clothes/discarded/'

In [3]:
# Dataset folder name; in the visible notebook it only feeds the
# os.path.join experiment in the following cell.
main_dir = 'images_clothes'

In [4]:
# Scratch check: shows how os.path.join assembles a dataset path from parts.
os.path.join('/data', main_dir, 'train')

'/data/images_clothes/train'

### Prep Images

In [13]:
# Mirror the first-level sub-directory structure of one directory into another
def mirror_dir_structure(current_dir, new_dir):
    """Recreate the first-level entry names of ``current_dir`` under ``new_dir``.

    Every name returned by ``os.listdir(current_dir)`` that does not start
    with ``'.'`` is created as a directory below ``new_dir`` (intermediate
    directories are created as needed by ``os.makedirs``).

    Args:
        current_dir: Directory whose entry names are mirrored.
        new_dir: Directory under which the same names are created.

    Raises:
        OSError: If a target cannot be created for any reason other than
            already existing as a directory (for example missing
            permissions, or a regular file occupying the path).
    """
    for image_dir in os.listdir(current_dir):
        if image_dir.startswith('.'):
            continue

        target_dir = os.path.join(new_dir, image_dir)
        try:
            os.makedirs(target_dir)
        except OSError:
            # makedirs raises OSError for many reasons; only "the directory
            # is already there" is benign. Everything else must surface
            # instead of being reported as "already exists".
            if not os.path.isdir(target_dir):
                raise
            print('{} already exists in {}'.format(image_dir, new_dir))

In [49]:
# Create a matching sub-directory in discarded_dir for every entry of train_dir.
mirror_dir_structure(train_dir, discarded_dir)

In [50]:
# Discard images below threshold size
def discard_images_below_size(train_dir, discard_dir, threshold_size=4000):
    """Move files smaller than ``threshold_size`` bytes out of ``train_dir``.

    For every non-hidden sub-directory of ``train_dir``, each non-hidden file
    whose size (``os.path.getsize``, in bytes) is below ``threshold_size`` is
    moved to the sub-directory of the same name under ``discard_dir``. One
    summary line per sub-directory is printed.

    The destination sub-directories are not created here; they are expected
    to exist already.

    Args:
        train_dir: Root directory containing one sub-directory per category.
        discard_dir: Root directory that receives the discarded files.
        threshold_size: Minimum file size in bytes for a file to be kept.
    """
    for image_dir in os.listdir(train_dir):
        if image_dir.startswith('.'):
            continue

        category_dir = os.path.join(train_dir, image_dir)
        discarded_images = 0
        for image in os.listdir(category_dir):
            if image.startswith('.'):
                continue

            image_path = os.path.join(category_dir, image)
            if os.path.getsize(image_path) < threshold_size:
                # Use the discard_dir argument, not a notebook-level global,
                # so the caller actually controls the destination.
                discarded_path = os.path.join(discard_dir, image_dir, image)
                shutil.move(image_path, discarded_path)
                discarded_images += 1

        print('{} has {} images discarded'.format(image_dir, discarded_images))

In [51]:
# Move every training image smaller than 4000 bytes into the discarded tree.
discard_images_below_size(train_dir, discarded_dir, 4000)

Clothing, Shoes & Jewelry -> Men -> Accessories -> Belts has 11 images discarded
Clothing, Shoes & Jewelry -> Men -> Accessories -> Bow Ties & Cummerbunds -> Bow Ties has 5 images discarded
Clothing, Shoes & Jewelry -> Men -> Accessories -> Gloves & Mittens -> Cold Weather Gloves has 4 images discarded
Clothing, Shoes & Jewelry -> Men -> Accessories -> Hats & Caps -> Baseball Caps has 6 images discarded
Clothing, Shoes & Jewelry -> Men -> Accessories -> Hats & Caps -> Fedoras has 2 images discarded
Clothing, Shoes & Jewelry -> Men -> Accessories -> Hats & Caps -> Skullies & Beanies has 4 images discarded
Clothing, Shoes & Jewelry -> Men -> Accessories -> Neckties has 16 images discarded
Clothing, Shoes & Jewelry -> Men -> Accessories -> Sunglasses & Eyewear Accessories -> Sunglasses has 16 images discarded
Clothing, Shoes & Jewelry -> Men -> Accessories -> Wallets, Card Cases & Money Organizers -> Wallets has 9 images discarded
Clothing, Shoes & Jewelry -> Men -> Clothing -> Active -> 

### Create a sample of all categories for the proof of concept (POC)

In [18]:
# Destination tree for the per-category sample copied out of train_dir.
train_samp_dir = '../data/images_clothes/train_samp/'

In [19]:
# Create a matching sub-directory in train_samp_dir for every entry of train_dir.
mirror_dir_structure(train_dir, train_samp_dir)

Clothing, Shoes & Jewelry -> Men -> Accessories -> Belts already exists in ../data/images_clothes/train_samp/
Clothing, Shoes & Jewelry -> Men -> Accessories -> Bow Ties & Cummerbunds -> Bow Ties already exists in ../data/images_clothes/train_samp/
Clothing, Shoes & Jewelry -> Men -> Accessories -> Gloves & Mittens -> Cold Weather Gloves already exists in ../data/images_clothes/train_samp/
Clothing, Shoes & Jewelry -> Men -> Accessories -> Hats & Caps -> Baseball Caps already exists in ../data/images_clothes/train_samp/
Clothing, Shoes & Jewelry -> Men -> Accessories -> Hats & Caps -> Fedoras already exists in ../data/images_clothes/train_samp/
Clothing, Shoes & Jewelry -> Men -> Accessories -> Hats & Caps -> Skullies & Beanies already exists in ../data/images_clothes/train_samp/
Clothing, Shoes & Jewelry -> Men -> Accessories -> Neckties already exists in ../data/images_clothes/train_samp/
Clothing, Shoes & Jewelry -> Men -> Accessories -> Sunglasses & Eyewear Accessories -> Sunglasse

In [16]:
def copy_to_dir(current_dir, new_dir, number_to_copy=1000):
    """Copy up to ``number_to_copy`` files per sub-directory into ``new_dir``.

    For every non-hidden sub-directory of ``current_dir``, the first
    ``number_to_copy`` non-hidden files (in ``os.listdir`` order) are copied
    with ``shutil.copy`` into the sub-directory of the same name under
    ``new_dir``. The destination sub-directories are expected to exist.

    The progress line printed per sub-directory reports the requested
    ``number_to_copy``, not the number of files actually available.

    Args:
        current_dir: Root directory containing one sub-directory per category.
        new_dir: Root directory that receives the copies.
        number_to_copy: Maximum number of files copied per sub-directory.
    """
    # Walk the directory that was passed in, not a notebook-level global.
    for image_dir in os.listdir(current_dir):
        if image_dir.startswith('.'):
            continue

        source_dir = os.path.join(current_dir, image_dir)
        image_paths = os.listdir(source_dir)

        image_count = number_to_copy
        print('{}: {} being copied'.format(image_dir, image_count))
        for image in image_paths:
            if image_count <= 0:
                # Quota reached; no need to scan the remaining entries.
                break
            if image.startswith('.'):
                continue

            original_image = os.path.join(source_dir, image)
            image_to_copy = os.path.join(new_dir, image_dir, image)
            shutil.copy(original_image, image_to_copy)
            image_count -= 1

In [17]:
# Copy up to 1000 images per category from train_dir into the sample tree.
copy_to_dir(train_dir, train_samp_dir, 1000)

Clothing, Shoes & Jewelry -> Men -> Accessories -> Belts: 1000 being copied
Clothing, Shoes & Jewelry -> Men -> Accessories -> Bow Ties & Cummerbunds -> Bow Ties: 1000 being copied
Clothing, Shoes & Jewelry -> Men -> Accessories -> Gloves & Mittens -> Cold Weather Gloves: 1000 being copied
Clothing, Shoes & Jewelry -> Men -> Accessories -> Hats & Caps -> Baseball Caps: 1000 being copied
Clothing, Shoes & Jewelry -> Men -> Accessories -> Hats & Caps -> Fedoras: 1000 being copied
Clothing, Shoes & Jewelry -> Men -> Accessories -> Hats & Caps -> Skullies & Beanies: 1000 being copied
Clothing, Shoes & Jewelry -> Men -> Accessories -> Neckties: 1000 being copied
Clothing, Shoes & Jewelry -> Men -> Accessories -> Sunglasses & Eyewear Accessories -> Sunglasses: 1000 being copied
Clothing, Shoes & Jewelry -> Men -> Accessories -> Wallets, Card Cases & Money Organizers -> Wallets: 1000 being copied
Clothing, Shoes & Jewelry -> Men -> Clothing -> Active -> Active Shirts & Tees: 1000 being copied

### Split the sample into train and test sets

In [20]:
# Destination tree for the images moved out of train_samp_dir as the test split.
test_samp_dir = '../data/images_clothes/test_samp/'

In [21]:
# Create a matching sub-directory in test_samp_dir for every entry of train_samp_dir.
mirror_dir_structure(train_samp_dir, test_samp_dir)

In [22]:
# Move to new directory (for splitting train and test data)
def move_to_dir(current_dir, new_dir, percentage_to_move=0.1):
    """Move a fraction of each sub-directory's files from ``current_dir`` to ``new_dir``.

    For every non-hidden sub-directory of ``current_dir``, the non-hidden
    files are counted and ``count * percentage_to_move`` of them are moved
    with ``shutil.move`` into the sub-directory of the same name under
    ``new_dir``. A fractional target is effectively rounded up, because files
    keep being moved while the moved count is below the target. The
    destination sub-directories are expected to exist.

    Files are taken in ``os.listdir`` order; no shuffling is done, so the
    selection is not a random sample.

    Args:
        current_dir: Root directory containing one sub-directory per category.
        new_dir: Root directory that receives the moved files.
        percentage_to_move: Fraction (e.g. ``0.1`` for 10%) of each
            sub-directory's files to move.
    """
    for image_dir in os.listdir(current_dir):
        if image_dir.startswith('.'):
            continue

        source_dir = os.path.join(current_dir, image_dir)
        # Hidden files are never moved, so they must not count towards the
        # total either; otherwise the number moved is inflated.
        image_paths = [name for name in os.listdir(source_dir)
                       if not name.startswith('.')]
        image_count = len(image_paths)
        move_count = image_count * percentage_to_move
        print('{} has {} images; Images to move: {}'.format(image_dir, image_count, move_count))

        moved_count = 0
        for image in image_paths:
            if moved_count >= move_count:
                break
            original_image = os.path.join(source_dir, image)
            image_to_move = os.path.join(new_dir, image_dir, image)
            shutil.move(original_image, image_to_move)
            moved_count += 1

        print('{} has {} images moved'.format(image_dir, moved_count))

In [23]:
# Move 10% of each category's sample images from train_samp_dir into test_samp_dir.
move_to_dir(train_samp_dir, test_samp_dir, 0.1)

Clothing, Shoes & Jewelry -> Men -> Accessories -> Belts has 1000 images; Images to move: 100.0
Clothing, Shoes & Jewelry -> Men -> Accessories -> Belts has 100 images moved
Clothing, Shoes & Jewelry -> Men -> Accessories -> Bow Ties & Cummerbunds -> Bow Ties has 1000 images; Images to move: 100.0
Clothing, Shoes & Jewelry -> Men -> Accessories -> Bow Ties & Cummerbunds -> Bow Ties has 100 images moved
Clothing, Shoes & Jewelry -> Men -> Accessories -> Gloves & Mittens -> Cold Weather Gloves has 1000 images; Images to move: 100.0
Clothing, Shoes & Jewelry -> Men -> Accessories -> Gloves & Mittens -> Cold Weather Gloves has 100 images moved
Clothing, Shoes & Jewelry -> Men -> Accessories -> Hats & Caps -> Baseball Caps has 1000 images; Images to move: 100.0
Clothing, Shoes & Jewelry -> Men -> Accessories -> Hats & Caps -> Baseball Caps has 100 images moved
Clothing, Shoes & Jewelry -> Men -> Accessories -> Hats & Caps -> Fedoras has 1000 images; Images to move: 100.0
Clothing, Shoes & J