# WebVision data download and preparation

## Data download

Data can be downloaded from: https://data.vision.ee.ethz.ch/cvl/webvision/download.html   


e.g. wget https://data.vision.ee.ethz.ch/cvl/webvision/flickr_resized_256.tar 

In [None]:
import sys
sys.path.append("..")
import numpy as np
import random


import pickle
import time
import itertools
import logging
from tqdm import tqdm
import os
import datetime
import random
import pandas as pd
import models
import shutil
import matplotlib.pyplot as plt
import numpy as npn
import torchvision
from collections import Counter
from sklearn.model_selection import StratifiedKFold
import utils
import warnings
from PIL import Image
import h5py


import utils
import warnings

plt.ion()
plt.show()
%load_ext autoreload
%autoreload 2
from sklearn.exceptions import ConvergenceWarning
with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category=ConvergenceWarning)

In [None]:
# Root folder of the WebVision download and the configuration for this run.
path = "../../../datasets/webvision"
dataset_name = "webvision"
google_only = True

# The validation file list is space separated with no header row; the code
# below treats column 0 as the image path and column 1 as the class label.
fn = f"{path}/info/val_filelist.txt"
df = pd.read_csv(fn, header=None, sep=" ")

n_val_classes = len(df[1].unique())
print(df.shape, n_val_classes)

# Prefix every file name with the folder holding the validation images.
df[0] = df[0].map(lambda rel_name: f"val_images_256/{rel_name}")
df.head()

In [None]:
# Training file lists, one per image source.
google = f"{path}/info/train_filelist_google.txt"
flicker = f"{path}/info/train_filelist_flickr.txt"

# Read only the Google list, or both lists stacked with a fresh index.
train_lists = [google] if google_only else [google, flicker]
frames = [pd.read_csv(list_file, sep=" ", header=None) for list_file in train_lists]
df_noisy = frames[0] if len(frames) == 1 else pd.concat(frames, ignore_index=True)

print(df_noisy.shape)
df_noisy.head()

# Select the number of classes to use

In [None]:
# Keep only samples whose label is below this threshold (labels 0..nb_classes-1).
nb_classes = 50

clean_mask = df[1] < nb_classes
noisy_mask = df_noisy[1] < nb_classes
df = df[clean_mask]
df_noisy = df_noisy[noisy_mask]

# The filtered validation list is stored as the test split.
test_split = df.reset_index(drop=True)
test_split.to_pickle(f"{path}/info/test_{nb_classes}.pkl")

## Create train - val splits

In [None]:
# Shuffled, seeded 20-fold stratified splitter used to carve out the train/val split.
skf = StratifiedKFold(random_state=1, shuffle=True, n_splits=20)
# Positional sample indices; the splitter only needs one entry per row.
X = np.arange(len(df_noisy))

In [None]:
# Only the first fold produced by the splitter is needed: its held-out part
# becomes the validation set and the remainder the training set. The split is
# stratified on the label column (column 1).
train_index, val_index = next(skf.split(X, df_noisy[1].values))

len(train_index), len(val_index)

In [None]:
# Materialise both halves of the split with a fresh 0-based index, then
# persist them (validation first, training second).
val_split = df_noisy.iloc[val_index].reset_index(drop=True)
val_split.to_pickle(f"{path}/info/val_{nb_classes}.pkl")

train_split = df_noisy.iloc[train_index].reset_index(drop=True)
train_split.to_pickle(f"{path}/info/train_{nb_classes}.pkl")

In [None]:
# Peek at the first entry of the clean list: (relative image path, class label).
first_row = df.iloc[0]
img_path, img_class = first_row.values
img_path, img_class

# Plot randomly selected images per class

In [None]:
# Visual sanity check: draw three randomly chosen images for every selected
# class. Iterating up to `nb_classes` keeps the plots in sync with the class
# filter applied to `df` earlier in the notebook.
for i in range(nb_classes):
    # Rows belonging to the current class, computed once per class instead of
    # once per subplot.
    ds = df[df[1] == i].reset_index(drop=True)
    plt.figure(figsize=(7, 3))
    for j in range(3):
        plt.subplot(1, 3, j + 1)
        img_path, img_class = ds.iloc[np.random.randint(0, len(ds))].values
        full_img_path = f"{path}/{img_path}"
        # The context manager releases the file handle once the image is drawn.
        with Image.open(full_img_path) as img:
            plt.title(f"Class {i}")
            plt.imshow(img)
    plt.show()

# Create h5 file

In [None]:
# Side length (in pixels) of the square images stored in the HDF5 files, and
# the number of classes identifying which pickled splits to load.
img_size, nb_classes = 128, 50

In [None]:
# Convert every split into an HDF5 file of resized RGB images plus a .npy file
# holding the matching integer labels.
for name in ["val", "test", "train"]:
    # A dedicated name keeps the notebook-level `df` from being overwritten.
    split_df = pd.read_pickle(f"{path}/info/{name}_{nb_classes}.pkl")
    if google_only:
        uid = f"{path}/info/google{name}_{nb_classes}_{img_size}"
    else:
        uid = f"{path}/info/{name}_{nb_classes}_{img_size}"
    # 'w' recreates the file on every run. With 'a', re-running the cell fails
    # because the 'images' dataset already exists. The labels file below is
    # likewise overwritten on every run.
    with h5py.File(f"{uid}.hdf5", 'w') as h:
        # Create dataset inside HDF5 file to store images
        images = h.create_dataset('images',
                                  (split_df.shape[0], img_size, img_size, 3),
                                  dtype='uint8')
        print(
            f"\n {name} : Reading {split_df.shape} images and captions, storing to file...\n"
        )

        # Column 0 holds the image path relative to `path`.
        for i, img_path in enumerate(tqdm(split_df[0].values)):
            with Image.open(f"{path}/{img_path}") as image:
                # Force three channels: grayscale, palette, RGBA or CMYK files
                # would otherwise not fit the (img_size, img_size, 3) slot.
                # LANCZOS is the filter formerly exposed as Image.ANTIALIAS,
                # which was removed in Pillow 10.
                resized = image.convert("RGB").resize((img_size, img_size),
                                                      Image.LANCZOS)
            images[i] = np.asarray(resized)  # Save image to HDF5 file

        labels = split_df[1].values.astype(int)
        print(f"Min label {labels.min()}, Max {labels.max()}")
        np.save(f"{uid}.npy", labels)
    # No explicit close: the `with` block has already closed the HDF5 file.
    print(f"Saved to {uid}")