In [14]:
%matplotlib inline

import sys

sys.path.append("..")

# Create sketchy test dataset

It will contain only 10 images per class per mode, for testing purposes.

In [8]:
image_root = r'C:\Users\Chopan\Documents\Data\sketchy\photo\tx_000000000000'

sketch_root = r'C:\Users\Chopan\Documents\Data\sketchy\sketch\tx_000000000000'

test_image_root = r'C:\Users\Chopan\Documents\Data\sketchy_test\photo'

test_sketch_root = r'C:\Users\Chopan\Documents\Data\sketchy_test\sketch'

In [2]:
# Create subfolders if they do not exist

import os

# test image root
if not os.path.exists(test_image_root):
    os.makedirs(test_image_root)
    
# test sketch root
if not os.path.exists(test_image_root):
    os.makedirs(test_image_root)

# test image subfolders
for cls_name in os.listdir(image_root):
    if not os.path.exists(os.path.join(test_image_root, cls_name)):
        os.makedirs(os.path.join(test_image_root, cls_name))
        
# test sketch subfolers
for cls_name in os.listdir(sketch_root):
    if not os.path.exists(os.path.join(test_sketch_root, cls_name)):
        os.makedirs(os.path.join(test_sketch_root, cls_name))

In [3]:
from shutil import copyfile

n_copy = 10

for cls_name in os.listdir(image_root):
    for img_path in os.listdir(os.path.join(image_root, cls_name))[:n_copy]:
        src = os.path.join(image_root, cls_name, img_path)
        dst = os.path.join(test_image_root, cls_name, img_path)
        copyfile(src, dst)

for cls_name in os.listdir(sketch_root):
    for img_path in os.listdir(os.path.join(sketch_root, cls_name))[:n_copy]:
        src = os.path.join(sketch_root, cls_name, img_path)
        dst = os.path.join(test_sketch_root, cls_name, img_path)
        copyfile(src, dst)

# Create sample vector datasets

The purpose of this dataset is to have a simple task to test discriminators on. Discriminators will classify feature vectors as belonging to a particular mode.

Two classes are generated, with n-dimensional feature vectors (x), where n should correspond to the dimension of feature vectors for our Common Vector Space. Feature vectors for the first class are randomnly generated using a uniform distribuition, while feature vectors for the second class are generated by adding $1.0$ to a randomly generated number from a uniform distribuition, making them easy to distinguish.

A variant using one-hot encodings as labels is also generated, to explore their impact on model performance.

In [3]:
import os, pickle, random

import pandas as pd

from settings import ROOT_DIR

Binary labels

In [11]:
n = 100000
dimension = 100

data = pd.DataFrame(columns=['class', 'vector'])
data['class'] = [random.randint(0,1) for i in range(n)]
data['vector'] = data['class'].apply(lambda c: [c + random.uniform(0, 1) for i in range(dimension)])
data.head()

Unnamed: 0,class,vector
0,0,"[0.6041758526284728, 0.4841415290691602, 0.918..."
1,0,"[0.9606537468168959, 0.4109090515245104, 0.777..."
2,0,"[0.9373595670444902, 0.6821984845245117, 0.413..."
3,1,"[1.034660754472529, 1.0549446192799117, 1.2925..."
4,1,"[1.2651677344896024, 1.1693411321433889, 1.758..."


In [12]:
pickle.dump(data, open(os.path.join(ROOT_DIR, 'static', 'pickles', 'discriminators', 'sample-vectors.pickle'), 'wb'))

One-hot encoded labels

In [5]:
n = 100000
dimension = 100

data = pd.DataFrame(columns=['class', 'vector'])
data['class'] = [random.randint(0,1) for i in range(n)]
data['vector'] = data['class'].apply(lambda c: [c + random.uniform(0, 1) for i in range(dimension)])
data['class'] = data['class'].apply(lambda c: [1 - c, c])
data.head()

Unnamed: 0,class,vector
0,"[1, 0]","[0.09496853316286591, 0.7606455964516935, 0.69..."
1,"[0, 1]","[1.3017434475616079, 1.1067716520584432, 1.624..."
2,"[0, 1]","[1.6181892431335871, 1.3569118964677038, 1.749..."
3,"[0, 1]","[1.2077994265391951, 1.0290185558746596, 1.189..."
4,"[1, 0]","[0.11540202312160885, 0.6322493661006999, 0.14..."


In [6]:
pickle.dump(data, open(os.path.join(ROOT_DIR, 'static', 'pickles', 'discriminators', 'sample-vectors-onehot.pickle'), 'wb'))

# NL annotations

First off, let's process the class names. We will be using the information encoded in class names to endow the common vector space with semantic information.

In [10]:
class_names = os.listdir(image_root)

classes = pd.DataFrame(columns=['class', 'vector', 'tsne'])
classes['class'] = class_names
classes.head()

Unnamed: 0,class,vector,tsne
0,airplane,,
1,alarm_clock,,
2,ant,,
3,ape,,
4,apple,,


In [11]:
# Split compound names

import re

classes['class'] = classes['class'].apply(lambda cls: ' '.join(re.split(r'(?:_|-)', cls)))
classes.head()

Unnamed: 0,class,vector,tsne
0,airplane,,
1,alarm clock,,
2,ant,,
3,ape,,
4,apple,,


In [18]:
# Clean class names with the textpreprocess package

from modules.textpreprocess.compound_cleaners.en import full_clean

classes['class'] = classes['class'].apply(full_clean)
classes.head()

Unnamed: 0,class,vector,tsne
0,airplane,,
1,alarm clock,,
2,ant,,
3,ape,,
4,apple,,


In [20]:
# Generate word vectors

from modules.wordvectors.en import document_vector

classes['vector'] = classes['class'].apply(document_vector)
classes.head()

Unnamed: 0,class,vector,tsne
0,airplane,"[0.15440000593662262, 0.08799999952316284, 0.0...",
1,alarm clock,"[0.021550001576542854, -0.06300000101327896, -...",
2,ant,"[-0.04919999837875366, 0.14069999754428864, 0....",
3,ape,"[-0.05689999833703041, -0.09179999679327011, 0...",
4,apple,"[-0.06599999964237213, -0.027000000700354576, ...",


In [21]:
# Create a TSNE projection for visualization purposes

from sklearn.manifold import TSNE

classes['tsne'] = list(TSNE(n_components=2).fit_transform(np.vstack(classes['vector'].values)))
classes.head()

Unnamed: 0,class,vector,tsne
0,airplane,"[0.15440000593662262, 0.08799999952316284, 0.0...","[46.075233, -45.859596]"
1,alarm clock,"[0.021550001576542854, -0.06300000101327896, -...","[-3.020899, 19.042398]"
2,ant,"[-0.04919999837875366, 0.14069999754428864, 0....","[-32.18552, 4.3863773]"
3,ape,"[-0.05689999833703041, -0.09179999679327011, 0...","[-60.452953, -24.02946]"
4,apple,"[-0.06599999964237213, -0.027000000700354576, ...","[20.124369, 69.90395]"


In [22]:
# Visualization

import plotly.graph_objs as go

from plotly.offline import init_notebook_mode, iplot

init_notebook_mode(connected=True)

aux = np.vstack(classes['tsne'])

trace = go.Scattergl(
    x = aux[:, 0],
    y = aux[:, 1],
    text = classes['class'].values,
    mode = 'markers',
    marker = dict(
        size=16,
        color = np.random.randn(len(aux)),
        colorscale='Viridis'
    )
)
data = [trace]
iplot(data)