In [None]:
%pylab inline
import pandas as pd
import numpy as np
from pathlib import Path
import random
import shutil
import tqdm

from nb_005 import *

In [None]:
# Root of the full ImageNet localization download. Every other location below
# is derived from it so that changing the root cannot leave stale paths behind.
IMGNET = Path('/DATA/kaggle/imgnetloc/')

# Images and annotations live under parallel ILSVRC/<kind>/CLS-LOC/<split> trees.
IMAGES_TRAIN = IMGNET/'ILSVRC'/'Data'/'CLS-LOC'/'train'
IMAGES_VAL = IMGNET/'ILSVRC'/'Data'/'CLS-LOC'/'val'
ANNO_TRAIN = IMGNET/'ILSVRC'/'Annotations'/'CLS-LOC'/'train'
ANNO_VAL = IMGNET/'ILSVRC'/'Annotations'/'CLS-LOC'/'val'

# Ground-truth CSVs sit directly in the dataset root.
TRAIN_SOLUTION_CSV = IMGNET/'LOC_train_solution.csv'
VALID_SOLUTION_CSV = IMGNET/'LOC_val_solution.csv'


In [None]:
def parse_class_line(l):
    """Split one class-mapping line into ``(class_id, first_description)``.

    The class id is the text before the first space. The remainder is treated
    as a comma-separated list of descriptions, of which only the first is kept
    (with surrounding whitespace removed).
    """
    class_id, _, remainder = l.partition(' ')
    first_description = remainder.strip().split(',')[0]
    return class_id, first_description.strip()


def read_classes(fn):
    """Read the class-mapping file ``fn`` into ``{class_id: first_description}``.

    The file is opened in a ``with`` block so the handle is always closed, and
    blank lines are skipped so they do not become a bogus empty-description key.
    """
    with open(fn, 'r') as mapping_file:
        classes = dict(
            parse_class_line(line) for line in mapping_file if line.strip()
        )
    return classes

In [None]:
# Build the class-id -> description lookup from the dataset's mapping file
synset_mapping_path = IMGNET/'LOC_synset_mapping.txt'
classes = read_classes(synset_mapping_path)

In [None]:
def get_img_fns(img_train_path, class_id):
    """Return a list of every entry found directly inside ``img_train_path/class_id``."""
    class_dir = img_train_path/class_id
    return list(class_dir.iterdir())

In [None]:
def plot_samples(clsid):
    """Show a 3x3 grid of randomly chosen training images for class ``clsid``.

    Nine file names are drawn with ``np.random.choice`` from the class's
    training directory, opened with ``open_image``, and each is shown on its
    own axis of the grid.
    """
    candidates = get_img_fns(IMAGES_TRAIN, clsid)
    picked = np.random.choice(candidates, 9)
    images = [open_image(fn) for fn in picked]
    _, axes = plt.subplots(3, 3, figsize=(12,6))
    # axes.flat walks the grid row by row, matching image order
    for image, ax in zip(images, axes.flat):
        image.show(ax)

In [None]:
# Class ids to copy into the small dataset, with a human-readable label for
# each. The labels are only used for display; the keys decide what is copied.
to_pull = {
    'n01443537': 'goldfish',
    'n01669191': 'box turtle',
    'n01774750': 'tarantula',
    'n01641577': 'bullfrog',
    'n01882714': 'koala',
    'n01983481': 'American lobster',
    'n02114367': 'timber wolf',
    'n02115641': 'dingo',
    'n02317335': 'starfish',
    'n01806143': 'peacock',
    'n01484850': 'great white shark',
    'n03063689': 'coffeepot',
    'n03272010': 'electric guitar',
    'n03124170': 'cowboy hat',
    'n02799071': 'baseball',
    'n03400231': 'frying pan',
    'n03452741': 'grand piano',
    'n02802426': 'basketball',
    'n02692877': 'airship',
    'n02787622': 'banjo',
    'n03785016': 'moped',
    'n04252077': 'snowmobile',
    'n02088466': 'bloodhound',
    'n04254680': 'soccer ball',
    'n02504458': 'African elephant',
    'n03345487': 'fire engine',
    # NOTE(review): this id was labelled 'squirrel', but n03642806 is the
    # ILSVRC synset for laptop -- confirm against LOC_synset_mapping.txt.
    # Only the label is corrected here; the set of pulled classes is unchanged.
    'n03642806': 'laptop',
    'n03063599': 'coffee mug',
}

# Live view of the selected class ids, used for membership tests and iteration
pull_classes = to_pull.keys()

In [None]:
# Eyeball a sample grid for every selected class
for class_id in to_pull.keys():
    plot_samples(class_id)

In [None]:
# Report how many training images each selected class has, then the grand total
per_class_counts = [
    (name, len(get_img_fns(IMAGES_TRAIN, clsid)))
    for clsid, name in to_pull.items()
]
for name, num_images in per_class_counts:
    print(name, num_images)
total_images = sum(num_images for _, num_images in per_class_counts)
print('total images:', total_images)

In [None]:
# Load the ground-truth tables for both splits and show their row counts
train_df = pd.read_csv(TRAIN_SOLUTION_CSV)
valid_df = pd.read_csv(VALID_SOLUTION_CSV)

train_df.shape[0], valid_df.shape[0]

In [None]:
# The class id of a training row is the part of ImageId before the first underscore
train_df['classid'] = train_df['ImageId'].map(lambda image_id: image_id.partition('_')[0])

def parse_prediction_string(s):
    """Return the class id of the first entry in a prediction string.

    ``s`` is split on single spaces and read in groups of five tokens; the
    first token of each complete group is collected as an id and the first
    collected id is returned. Raises ``IndexError`` when ``s`` holds fewer
    than five tokens.
    """
    tokens = s.split(' ')
    group_count = len(tokens) // 5
    group_ids = [tokens[5 * g] for g in range(group_count)]
    return group_ids[0]

# Validation rows carry their class id inside PredictionString rather than ImageId
valid_df['classid'] = valid_df['PredictionString'].map(parse_prediction_string)

In [None]:
# Keep only the rows whose class was selected for the small dataset
train_selected = train_df['classid'].isin(pull_classes)
valid_selected = valid_df['classid'].isin(pull_classes)
small_train_df = train_df[train_selected]
small_valid_df = valid_df[valid_selected]
len(pull_classes), small_train_df.shape, small_valid_df.shape

In [None]:
# Destination root for the reduced dataset; sub-trees are laid out under the
# same ILSVRC/<kind>/CLS-LOC structure used by the source paths
IMGNET_SMALL = Path('/DATA/kaggle/imgnetloc_small/')
SMALL_DATA = IMGNET_SMALL.joinpath('ILSVRC/Data/CLS-LOC')
SMALL_ANNO = IMGNET_SMALL.joinpath('ILSVRC/Annotations/CLS-LOC')


In [None]:
# Create the image and annotation roots of the small dataset (no-op if present)
for small_root in (SMALL_DATA, SMALL_ANNO):
    small_root.mkdir(parents=True, exist_ok=True)

In [None]:
# Create train/ and val/ under both the image and the annotation root.
# The earlier version referenced an undefined `dirpath` (NameError on a fresh
# kernel) and only created SMALL_DATA/train and SMALL_ANNO/val.
for small_root in (SMALL_DATA, SMALL_ANNO):
    for split in ('train', 'val'):
        (small_root/split).mkdir(parents=True, exist_ok=True)

In [None]:
# Copy each selected class's training folder, first images and then
# annotations, replacing any copy left over from a previous run
for k in tqdm.tqdm_notebook(pull_classes):
    for src_root, dest_root in ((IMAGES_TRAIN, SMALL_DATA), (ANNO_TRAIN, SMALL_ANNO)):
        src_data_path = src_root/k
        dest_data_path = dest_root/'train'/k
        # copytree needs a non-existent destination, so clear it first
        if dest_data_path.exists():
            shutil.rmtree(dest_data_path)
        shutil.copytree(src_data_path, dest_data_path)
    

In [None]:
# Rebuild the validation folders from scratch, then copy the image and the
# annotation file for every selected validation row
dest_val_data = SMALL_DATA/'val'
dest_val_anno = SMALL_ANNO/'val'
for stale_dir in (dest_val_data, dest_val_anno):
    if stale_dir.exists():
        shutil.rmtree(stale_dir)
for fresh_dir in (dest_val_data, dest_val_anno):
    fresh_dir.mkdir(parents=True, exist_ok=True)

for image_id in tqdm.tqdm_notebook(list(small_valid_df.ImageId)):
    jpeg_name = f'{image_id}.JPEG'
    shutil.copyfile(IMAGES_VAL/jpeg_name, dest_val_data/jpeg_name)

    xml_name = f'{image_id}.xml'
    shutil.copyfile(ANNO_VAL/xml_name, dest_val_anno/xml_name)

In [None]:
# Write a class-mapping file that contains only the selected classes
src_file = IMGNET/'LOC_synset_mapping.txt'
dst_file = IMGNET_SMALL/'LOC_synset_mapping.txt'

with open(src_file,'r') as rf:
    # the class id is the first space-separated token of each line
    kept_lines = [ln for ln in rf if ln.split(' ')[0] in pull_classes]
with open(dst_file,'w') as wf:
    wf.writelines(kept_lines)
        

# Copy the train solution CSV, keeping the header row and only the rows that
# belong to a selected class.
# The earlier version split each row on ' ', which yields "ImageId,classid"
# and never matches a class id, so it wrote an empty file without a header.
src_file = IMGNET/'LOC_train_solution.csv'
dst_file = IMGNET_SMALL/'LOC_train_solution.csv'

with open(src_file,'r') as rf:
    src_lines = rf.readlines()
with open(dst_file,'w') as wf:
    wf.write(src_lines[0])
    for line in src_lines[1:]:
        # assumes ImageId is the first column and looks like '<classid>_<n>',
        # matching how the classid column is derived from ImageId elsewhere
        # in this notebook
        clsid = line.split(',')[0].split('_')[0]
        if clsid in pull_classes:
            wf.write(line)


In [None]:
# Write the train solution CSV for the small dataset: header row first, then
# every row whose first nine characters are a selected class id
src_file = IMGNET/'LOC_train_solution.csv'
dst_file = IMGNET_SMALL/'LOC_train_solution.csv'

with open(src_file,'r') as rf:
    all_rows = rf.readlines()

with open(dst_file,'w') as wf:
    wf.write(all_rows[0])
    wf.writelines(row for row in all_rows[1:] if row[:9] in pull_classes)


In [None]:
# Copy the val solution CSV, keeping the header row and only the rows that
# belong to a selected class.
# The earlier version used line[0:9], which on a validation row is the start
# of the ImageId rather than a class id, so nothing but the header was kept.
src_file = IMGNET/'LOC_val_solution.csv'
dst_file = IMGNET_SMALL/'LOC_val_solution.csv'

with open(src_file,'r') as rf:
    src_lines = rf.readlines()

with open(dst_file,'w') as wf:
    wf.write(src_lines[0])
    for line in src_lines[1:]:
        # the class id is the first space-separated token of the second
        # column (PredictionString), the same token parse_prediction_string reads
        clsid = line.split(',')[1].split(' ')[0]
        if clsid in pull_classes:
            wf.write(line)


In [None]:
# Write the val solution CSV for the small dataset: header row first, then
# every row whose second column starts with a selected class id
src_file = IMGNET/'LOC_val_solution.csv'
dst_file = IMGNET_SMALL/'LOC_val_solution.csv'

with open(src_file,'r') as rf:
    all_rows = rf.readlines()

with open(dst_file,'w') as wf:
    wf.write(all_rows[0])
    wf.writelines(
        row for row in all_rows[1:]
        if row.split(',')[1][:9] in pull_classes
    )

In [None]:
# Sanity check: number of training rows kept per selected class
small_train_df.groupby('classid')['ImageId'].count()