In [1]:
%matplotlib inline
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("dark")
plt.rcParams['figure.figsize'] = 16, 12
import pandas as pd
from tqdm import tqdm_notebook
import io
from PIL import Image
from glob import glob
from collections import defaultdict
import os
import pickle

import torchvision.transforms as transforms

from kaggle_camera_model_id_lib.utils import NCrops

In [6]:
raw_train_dir = '/home/mephistopheies/storage2/data/camera-model-id/raw/train/'
train_files = glob(os.path.join(raw_train_dir, '*/*.*'))
print(len(train_files))

2750


In [9]:
out_dir = '/home/mephistopheies/storage2/data/camera-model-id/'

with open(os.path.join(out_dir, 'train_all.tsv'), 'w') as f:
    f.write('\n'.join(train_files))

In [7]:
df = pd.DataFrame(
    [(s.split('/')[-2], s) for s in train_files], 
    columns=['class', 'fname'])

In [12]:
df.head()

Unnamed: 0,class,fname
0,Samsung-Galaxy-S4,/home/mephistopheies/storage2/data/camera-mode...
1,Samsung-Galaxy-S4,/home/mephistopheies/storage2/data/camera-mode...
2,Samsung-Galaxy-S4,/home/mephistopheies/storage2/data/camera-mode...
3,Samsung-Galaxy-S4,/home/mephistopheies/storage2/data/camera-mode...
4,Samsung-Galaxy-S4,/home/mephistopheies/storage2/data/camera-mode...


In [14]:
df['class'].value_counts().shape

(10,)

In [15]:
df['class'].value_counts()

Samsung-Galaxy-Note3    275
HTC-1-M7                275
Motorola-Droid-Maxx     275
Sony-NEX-7              275
Samsung-Galaxy-S4       275
iPhone-6                275
iPhone-4s               275
LG-Nexus-5x             275
Motorola-X              275
Motorola-Nexus-6        275
Name: class, dtype: int64

In [16]:
m = 25

train = []
val = []

for c in df['class'].unique():
    df_tmp = df[df['class'] == c].copy()
    val_tmp = np.random.choice(df_tmp['fname'].tolist(), size=m, replace=False).tolist()
    val.extend(val_tmp)
    train_tmp = set(df_tmp['fname'].tolist()).difference(val)
    print(c, len(val_tmp), len(train_tmp))    
    train.extend(train_tmp)
    
print(len(train))
print(len(val))

Samsung-Galaxy-S4 25 250
iPhone-4s 25 250
HTC-1-M7 25 250
Samsung-Galaxy-Note3 25 250
Motorola-X 25 250
LG-Nexus-5x 25 250
iPhone-6 25 250
Motorola-Nexus-6 25 250
Sony-NEX-7 25 250
Motorola-Droid-Maxx 25 250
2500
250


In [17]:
with open(os.path.join(out_dir, 'train.tsv'), 'w') as f:
    f.write('\n'.join(train))

In [18]:
shapes = []
for fname in tqdm_notebook(val):
    key = fname.split('/')[-2]
    img = Image.open(fname).convert('RGB')
    shapes.append(img.size)




In [19]:
pd.Series(shapes).value_counts()

(3264, 2448)    50
(4128, 2322)    50
(4160, 3120)    31
(6000, 4000)    25
(3024, 4032)    24
(1520, 2688)    22
(4320, 2432)    21
(3120, 4160)    18
(2432, 4320)     4
(2688, 1520)     3
(3088, 4160)     1
(4032, 3024)     1
dtype: int64

In [20]:
len(NCrops(np.zeros((1520, 1520, 3)), crop_size=512, step=504))

9

In [21]:
center_crop = transforms.CenterCrop(1520)

In [22]:
crop_size = 512
step = 504

data = defaultdict(lambda: defaultdict(list))

for fname in tqdm_notebook(val):
    key = fname.split('/')[-2]
    img = Image.open(fname).convert('RGB')
    img = center_crop(img)
    img = np.array(img)   
    data[key][fname].extend(NCrops(img, crop_size=crop_size, step=step))




In [23]:
[(k, sum([len(b) for (a, b) in v.items()])) for (k, v) in data.items()]

[('HTC-1-M7', 225),
 ('LG-Nexus-5x', 225),
 ('Motorola-X', 225),
 ('iPhone-6', 225),
 ('Sony-NEX-7', 225),
 ('iPhone-4s', 225),
 ('Motorola-Nexus-6', 225),
 ('Samsung-Galaxy-S4', 225),
 ('Motorola-Droid-Maxx', 225),
 ('Samsung-Galaxy-Note3', 225)]

In [24]:
n = min([sum([len(b) for (a, b) in v.items()]) for (k, v) in data.items()])
n

225

In [25]:
val = []
for k in data.keys():
    for fname in data[k].keys():
        val.extend([(k, img) for img in data[k][fname]])

#         ixs = np.random.choice(range(len(data[k][fname])), size=int(n/m), replace=False)
#         for i in ixs:
#             val.append((k, data[k][fname][i]))
            
print(len(val))

2250


In [26]:
val_dir = '/home/mephistopheies/storage2/data/camera-model-id/val/'

for ix, (key, img) in tqdm_notebook(enumerate(val), total=len(val)):
    key_dir = os.path.join(val_dir, key)
    if not os.path.isdir(key_dir):
        os.mkdir(key_dir)
    np.savez(os.path.join(key_dir, '%i.npz' % ix), data=img)


