# Data Preprocessing

In [1]:
import sys
import shutil
from pathlib import Path 

In [2]:
def preprocess_emoji(dset_path):
    """Copy the emoji images named in the unicode vocabulary into ``dataset/edited/``.

    Each non-empty line of ``dataset/description/unicode.txt`` is treated as a
    code, and its zero-based line number is used as the output file name.
    Every ``*.png`` found below ``dset_path`` under a ``64`` directory whose
    base name (the text before the first ``.``) equals one of the codes is
    copied to ``dataset/edited/<line number>.png``.

    Args:
        dset_path: Root directory of the original emoji images.

    Raises:
        FileNotFoundError: If ``dset_path`` does not exist, or if the
            vocabulary file cannot be read.
    """
    # make emoji unicode vocabulary: code -> line index
    code_path = Path('dataset/description/unicode.txt')
    code_vocabulary = {}
    for index, code in enumerate(code_path.read_text(encoding='utf-8').split('\n')):
        # Skip blank lines (e.g. the one produced by a trailing newline) so an
        # empty string never counts as a valid code; indices are unaffected.
        if code:
            code_vocabulary[code] = index

    # check dataset path
    image_path = Path(dset_path)
    if not image_path.exists():
        # Raise instead of calling the interactive ``exit`` helper: inside a
        # Jupyter kernel that helper does not stop the function.
        raise FileNotFoundError('Check your dataset path!')

    # The destination folder may not exist on a fresh checkout.
    output_dir = Path('dataset/edited')
    output_dir.mkdir(parents=True, exist_ok=True)

    # copy designated emoji images
    for filepath in image_path.glob("./**/64/**/*.png"):
        code = filepath.name.split(".")[0]
        if code in code_vocabulary:  # dict lookup instead of scanning a list
            shutil.copyfile(filepath, output_dir / (str(code_vocabulary[code]) + '.png'))

In [3]:
# Run the preprocessing step on the images under dataset/original; the two
# prints only mark the start and end of the call in the cell output.
print("Started pre-processing")
preprocess_emoji("dataset/original")
print("Completed pre-processing")

Started pre-processing
Completed pre-processing


## Constants

In [33]:
# Shape of one image sample; load_dataset uses the first two entries as the
# target size when loading images, build_discriminator uses it as input shape.
IMAGE_SHAPE = (64, 64, 3)
# Length of the word / sentence embedding vectors (matches the 300d GloVe file
# loaded further down).
EMBEDDING_DIM = 300
# Folder holding the preprocessed PNG images (written by preprocess_emoji).
IMAGE_DIR = "dataset/edited/"
# Folder holding one description .txt file per image.
TXT_DIR = "dataset/description/detailed/"

# Data Loading

In [34]:
from pathlib import Path
from sklearn.model_selection import train_test_split
from keras.preprocessing.image import img_to_array, load_img
from nltk.tokenize import sent_tokenize
import numpy as np
import sys
import re

In [35]:
def load_dataset(img_dir, txt_dir, img_shape, split_rate = 0.1, random_state = None):
    """Build (image, caption, label) samples and split them into train/test sets.

    Images (``*.png`` in ``img_dir``) and descriptions (``*.txt`` in
    ``txt_dir``) are paired by file stem. Each description is lower-cased,
    stripped of curly double quotes and split into sentences; every sentence
    becomes one sample that shares the image and the integer label parsed
    from the file stem.

    Args:
        img_dir: Directory containing the ``<number>.png`` images.
        txt_dir: Directory containing the ``<number>.txt`` descriptions.
        img_shape: Sequence whose first two items are passed as the image
            ``target_size`` when loading.
        split_rate: Fraction of samples placed in the test split.
        random_state: Optional seed forwarded to ``train_test_split`` to make
            the split reproducible. ``None`` keeps the previous unseeded
            behaviour.

    Returns:
        Tuple ``(image_train, caption_train, image_test, caption_test,
        numbers_train, numbers_test)`` of numpy arrays.

    Raises:
        ValueError: If a paired file stem is not an integer.
    """
    t_path = Path(txt_dir)
    i_path = Path(img_dir)

    # ``stem`` drops only the final suffix, unlike ``replace('.png', '')``
    # which would also remove the substring from the middle of a name.
    images = {path.stem: path.resolve() for path in i_path.glob("*.png")}
    texts = {path.stem: path.read_text(encoding='utf-8').lower()
             for path in t_path.glob("*.txt")}

    # Loop-invariant helpers, built once instead of once per sentence.
    regex_any_symbol = re.compile("[!-/:-@[-`{-~]")  # ASCII punctuation / symbols
    target_size = (img_shape[0], img_shape[1])

    image_list = []
    caption_list = []
    numbers = []

    for name, item_path in images.items():
        if name not in texts:
            continue

        # Curly quotes are not covered by the ASCII symbol regex, so strip
        # them explicitly.
        text = texts[name].replace("“", "").replace("”", "")
        tokenized = sent_tokenize(text)  # split the description into sentences
        label_number = int(name)
        if not tokenized:
            continue

        # Load and normalise the image once; all sentences of this
        # description share it. x / 127.5 - 1 maps 0..255 onto -1..1.
        image = img_to_array(load_img(item_path, target_size=target_size))
        image = (image.astype(np.float32) / 127.5) - 1.

        for sentence in tokenized:
            filtered_sentence = regex_any_symbol.sub("", sentence)  # removes any symbol from description
            image_list.append(image)
            caption_list.append(filtered_sentence)
            numbers.append(label_number)

    image_list = np.array(image_list)
    caption_list = np.array(caption_list)
    numbers = np.array(numbers)

    print('Dataset Size: %s' % len(image_list))
    image_train, image_test, caption_train, caption_test, numbers_train, numbers_test = train_test_split(
        image_list, caption_list, numbers, test_size=split_rate, random_state=random_state)

    return image_train, caption_train, image_test, caption_test, numbers_train, numbers_test

In [36]:
# Build the train/test arrays from the preprocessed images and descriptions;
# the unpacking order mirrors load_dataset's return order.
image_train, caption_train, image_test, caption_test, numbers_train, numbers_test = \
    load_dataset(IMAGE_DIR,TXT_DIR, IMAGE_SHAPE)

Dataset Size: 260


# Word2Em

In [26]:
import urllib.request
import os
import zipfile
import numpy as np

In [29]:
def load_glove(glove_file_path, embedding_dim = EMBEDDING_DIM):
    """Read a GloVe text file into a ``{word: vector}`` dictionary.

    Each non-blank line is split on whitespace; the first field is the word
    and the remaining fields are parsed as a float32 numpy vector.

    Args:
        glove_file_path: Path to the UTF-8 GloVe text file.
        embedding_dim: Kept for interface compatibility; the vector length is
            taken from the file itself and this value is not used for parsing.

    Returns:
        dict mapping each word to its ``np.float32`` embedding array.
    """
    print("Loading glove file, please wait...")
    _word2em = {}
    # ``with`` guarantees the handle is closed even if a line fails to parse.
    with open(glove_file_path, mode='rt', encoding='utf8') as file:
        for line in file:
            words = line.strip().split()
            if not words:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            _word2em[words[0]] = np.array(words[1:], dtype=np.float32)
    print("Finished.")
    return _word2em

In [30]:
# Location of the GloVe vectors. Set the GLOVE_PATH environment variable to
# point at your own copy; the original author's absolute path stays as the
# fallback so existing setups keep working.
word2em = load_glove(os.environ.get("GLOVE_PATH", "/home/kaustubh/emotigan/utils/glove.6B.300d.txt"))

Loading glove file, please wait...
Finished.


# Sentence2Em

In [31]:
def vectorize_sentence(sentence, embedding_dim = EMBEDDING_DIM):
    """Embed a sentence as the sum of the embeddings of its words.

    Words are looked up in the module-level ``word2em`` dictionary. Words
    that are missing from it are reported on stdout and contribute nothing
    to the sum.

    Args:
        sentence: Text to embed.
        embedding_dim: Length of the returned vector; must match the length
            of the vectors stored in ``word2em``.

    Returns:
        numpy array of shape ``(embedding_dim,)``; all zeros when no word is
        found.
    """
    em = np.zeros(shape=(embedding_dim, ))
    # Split on any whitespace run: ``split(" ")`` produced empty tokens for
    # repeated/leading spaces (left behind when symbols are stripped) and
    # kept newlines glued to words, causing spurious "Not found" messages.
    for word in sentence.split():
        try:
            em = np.add(em, word2em[word])
        except KeyError:
            print("Error: Not found \""+word+"\"")
    return em

In [32]:
# Sanity check: the vector built for one test caption should have
# EMBEDDING_DIM entries.
len(vectorize_sentence(caption_test[0]))

300

# Model

In [14]:
# GPU setting: create a session restricted to one GPU with on-demand memory
# growth and register it with the Keras backend.
# NOTE(review): tf.ConfigProto / tf.Session look like the TensorFlow 1.x API —
# confirm the installed TensorFlow version still exposes them.
# NOTE(review): the session is registered on tensorflow.keras.backend while the
# model cells import the standalone `keras` package — verify that this session
# is the one the models actually use.
import tensorflow as tf
from tensorflow.keras.backend import set_session

config = tf.ConfigProto(
            gpu_options = tf.GPUOptions(
                visible_device_list="0", # specify GPU number
                allow_growth=True)
        )

# Show the effective configuration in the cell output.
print(config)

set_session(tf.Session(config=config))

gpu_options {
  allow_growth: true
  visible_device_list: "0"
}



In [37]:
from __future__ import print_function, division

from keras.layers import Input, Dense, Reshape, Flatten, Dropout, concatenate
from keras.layers import BatchNormalization, Activation, ZeroPadding2D
from keras.layers.advanced_activations import LeakyReLU
from keras.layers.convolutional import UpSampling2D, Conv2D
from keras.models import Sequential, Model
from keras.optimizers import Adam
from keras import backend as kb
from keras.layers import Lambda

import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt
import numpy as np

import math
import pandas as pd
import sys
import time
from PIL import Image

In [38]:
def build_generator(latent_dim = 100, embedding_dim = EMBEDDING_DIM, channels = IMAGE_SHAPE[2]):
    """Build the conditional generator network.

    The condition vector is projected to 100 units, concatenated with the
    noise vector, expanded to an 8x8x256 feature map and passed through three
    upsample + convolution stages before a final ``tanh`` convolution that
    emits ``channels`` feature maps.

    Args:
        latent_dim: Length of the noise input vector (``g_input``).
        embedding_dim: Length of the condition vector (``cond_g_input``).
        channels: Number of channels of the generated image.

    Returns:
        Keras ``Model`` taking ``[noise, condition]`` and returning the
        generated image tensor. The model summary is printed as a side effect.
    """
    # The original body read these values from ``self``, which does not exist
    # in a module-level function and raised NameError on every call.
    generator_input = Input(shape=(latent_dim, ), name="g_input")
    cond_input = Input(shape=(embedding_dim, ), name="cond_g_input")
    cond_output = Dense(100)(cond_input)

    G = concatenate([generator_input, cond_output])
    G = Dense(256 * 8 * 8, activation="relu")(G)
    G = Reshape((8, 8, 256))(G)

    # Three upsample -> conv -> batch-norm -> relu stages with shrinking width.
    for filters in (256, 128, 64):
        G = UpSampling2D()(G)
        G = Conv2D(filters, kernel_size=3, padding="same")(G)
        G = BatchNormalization(momentum=0.8)(G)
        G = Activation("relu")(G)

    G = Conv2D(channels, kernel_size=3, padding="same")(G)
    generator_output = Activation("tanh")(G)

    generator = Model([generator_input, cond_input], generator_output)
    generator.summary()

    return generator

In [39]:
def build_discriminator(img_shape = IMAGE_SHAPE, embedding_dim = EMBEDDING_DIM):
    """Build the conditional discriminator network.

    The image goes through four strided/padded convolution stages. The
    condition vector is projected to 100 units and tiled over the spatial
    grid of the resulting feature map, then concatenated with it along the
    channel axis before a final convolution and a sigmoid output unit.

    Args:
        img_shape: Shape of the image input (``d_input``), e.g. ``(64, 64, 3)``.
        embedding_dim: Length of the condition vector (``cond_d_input``).

    Returns:
        Keras ``Model`` taking ``[image, condition]`` and returning a single
        sigmoid score. The model summary is printed as a side effect.
    """
    # The original body read these values from ``self``, which does not exist
    # in a module-level function and raised NameError on every call.
    discriminator_input = Input(shape=img_shape, name="d_input")
    cond_input = Input(shape=(embedding_dim, ), name="cond_d_input")

    D = Conv2D(64, kernel_size=3, strides=2, padding="same")(discriminator_input)
    D = LeakyReLU(alpha=0.2)(D)
    D = Dropout(0.25)(D)
    D = Conv2D(128, kernel_size=3, strides=2, padding="same")(D)
    D = ZeroPadding2D(padding=((0,1),(0,1)))(D)
    D = BatchNormalization(momentum=0.8)(D)
    D = LeakyReLU(alpha=0.2)(D)
    D = Dropout(0.25)(D)
    D = Conv2D(256, kernel_size=3, strides=1, padding="same")(D)
    D = BatchNormalization(momentum=0.8)(D)
    D = LeakyReLU(alpha=0.2)(D)
    D = Dropout(0.25)(D)
    D = Conv2D(512, kernel_size=3, strides=2, padding="same")(D)
    D = BatchNormalization(momentum=0.8)(D)
    D = LeakyReLU(alpha=0.2)(D)

    # Spatial size of the image feature map. For the default 64x64 input this
    # is 9x9 (64 -> 32 -> 16 -> padded 17 -> 17 -> 9), the value that used to
    # be hard-coded in the tile call; deriving it keeps the concatenate valid
    # for other input sizes.
    rows, cols = kb.int_shape(D)[1:3]

    cond_d_hidden = Dense(100)(cond_input)
    cond_d_hidden = Reshape((1, 1, 100))(cond_d_hidden)
    cond_d_output = Lambda(lambda x: kb.tile(x, [1, rows, cols, 1]))(cond_d_hidden)

    D = concatenate([D, cond_d_output], axis=-1)
    D = Conv2D(512, kernel_size=3, strides=1, padding='same')(D)
    D = BatchNormalization(momentum=0.8)(D)
    D = LeakyReLU(alpha=0.1)(D)
    D = Dropout(0.25)(D)
    D = Flatten()(D)
    discriminator_output = Dense(1, activation='sigmoid')(D)

    discriminator = Model([discriminator_input, cond_input], discriminator_output)
    discriminator.summary()

    return discriminator