## Skill label prediction with image-based features (text and color with descriptive tags)
## Text recognition only

In [1]:
import pandas as pd 
import random
import numpy as np
import copy
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
kfold=KFold(n_splits=10)

# for LSTM (keras; the backend is selected through KERAS_BACKEND below, here CNTK)
import gzip
import os
import pickle
import requests
import time
import re
os.environ['KERAS_BACKEND']='cntk'
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.models import Sequential, load_model
from keras import regularizers
from keras.optimizers import SGD, Adam
from keras.initializers import he_normal
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional
from keras.callbacks import History, CSVLogger
from keras.utils import to_categorical
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
nltk.download('wordnet')
nltk.download('punkt')

# Passed as `maxlen` to pad_sequences and as `input_length` to the Embedding layer.
MAX_DOC_LEN = 40
# Passed as `num_words` to the Tokenizer; also the number of rows of the embedding matrix.
VOCAB_SIZE = 3000
# Passed as `size` to Word2Vec; also the number of columns of the embedding matrix.
EMBEDDING_DIM = 100

  from ._conv import register_converters as _register_converters
Using CNTK backend


[nltk_data] Downloading package wordnet to
[nltk_data]     /home/edithzeng/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/edithzeng/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Preprocessing

In [None]:
# The twelve CSV reads below differ only in path and in which columns are
# declared as `list`, so the shared read_csv arguments live in two small loaders.

def _read_feature_csv(path, list_columns):
    """Read one ';'-delimited Azure feature CSV.

    `list_columns` names the columns (besides 'descriptions') that are
    declared with dtype `list`; 'qid' and 'question' are read as str.
    """
    column_types = {'qid': str, 'question': str, 'descriptions': list}
    for column in list_columns:
        column_types[column] = list
    return pd.read_csv(path,
                       delimiter=';', engine='python',
                       dtype=column_types,
                       quotechar='"', error_bad_lines=False)


def _read_target_csv(path):
    """Read one ','-delimited skill-label CSV with 'QID' kept as str."""
    return pd.read_csv(path, dtype={'QID': str},
                       delimiter=',', quotechar='"',
                       engine='python', error_bad_lines=False)


_COLOR_LIST_COLUMNS = ('tags', 'dominant_colors')
_TEXT_LIST_COLUMNS = ('ocr_text', 'handwritten_text')

# VizWiz features
vizwiz_features_train_color = _read_feature_csv(
    'azure_features_images/data/vizwiz_train_color_recognition.csv', _COLOR_LIST_COLUMNS)
vizwiz_features_train_text = _read_feature_csv(
    'azure_features_images/data/vizwiz_train_text_recognition.csv', _TEXT_LIST_COLUMNS)
vizwiz_features_val_color = _read_feature_csv(
    'azure_features_images/data/vizwiz_val_color_recognition.csv', _COLOR_LIST_COLUMNS)
vizwiz_features_val_text = _read_feature_csv(
    'azure_features_images/data/vizwiz_val_text_recognition.csv', _TEXT_LIST_COLUMNS)

# VQA features
vqa_features_train_color = _read_feature_csv(
    'azure_features_images/data/vqa_train_color_recognition.csv', _COLOR_LIST_COLUMNS)
vqa_features_train_text = _read_feature_csv(
    'azure_features_images/data/vqa_train_text_recognition.csv', _TEXT_LIST_COLUMNS)
vqa_features_val_color = _read_feature_csv(
    'azure_features_images/data/vqa_val_color_recognition.csv', _COLOR_LIST_COLUMNS)
vqa_features_val_text = _read_feature_csv(
    'azure_features_images/data/vqa_val_text_recognition.csv', _TEXT_LIST_COLUMNS)

# Skill labels (targets)
vizwiz_targets_train = _read_target_csv('../vizwiz_skill_typ_train.csv')
vizwiz_targets_val = _read_target_csv('../vizwiz_skill_typ_val.csv')
vqa_targets_train = _read_target_csv('../vqa_skill_typ_train.csv')
vqa_targets_val = _read_target_csv('../vqa_skill_typ_val.csv')

In [None]:
# Preview the first two rows of the VizWiz training text-recognition features.
vizwiz_features_train_text.head(2)

In [406]:
# Show one random row of the joined VizWiz training frame.
# NOTE(review): `vizwiz_train` is first assigned in a later cell (the
# join_feature_target calls), so on a fresh top-to-bottom run this line raises
# NameError; the execution count suggests the cell was run out of order.
vizwiz_train.sample(1)

Unnamed: 0_level_0,IMG,QSN,TXT,OBJ,COL,CNT,OTH,question,ocr_text,handwritten_text,descriptions,tags,dominant_colors
QID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
VizWiz_train_000000017125.jpg,VizWiz_train_000000017125.jpg,What is this?,1,1,0,0,0,What is this?,"['\\einz', '•heart', '&', 'st', 'rofk', 'du', ...","['HEINZ', '- CANADA FANCY -', 'ATO JUICE', '-C...","[{'text': 'a bottle of beer on a table', 'conf...","['indoor', 'bottle', 'table', 'sitting', 'coff...","['Grey', 'White', 'Black']"


In [14]:
def join_feature_target(feature_df_text, feature_df_color, target_df):
    """Join text features, color features and skill labels into one frame keyed by QID.

    Parameters
    ----------
    feature_df_text : pandas.DataFrame
        Text-recognition features with a 'qid' column.
    feature_df_color : pandas.DataFrame
        Color-recognition features with a 'qid' column and the columns
        'descriptions', 'tags' and 'dominant_colors'.
    target_df : pandas.DataFrame
        Labels with the columns 'QID', 'IMG', 'QSN', 'TXT', 'OBJ', 'COL',
        'CNT' and 'OTH'; any other column is ignored.

    Returns
    -------
    pandas.DataFrame
        Indexed by QID. The label columns come first (all cast to str),
        followed by the text-feature columns and the three color-feature
        columns. Only QIDs present in both the labels and the features are kept.

    The input frames are not modified.
    """
    # rename()/set_index() return new frames, so the caller's inputs stay untouched
    # without needing an explicit deep copy.
    feature_text = feature_df_text.rename({'qid': 'QID'}, axis=1).set_index('QID')
    feature_color = feature_df_color.rename({'qid': 'QID'}, axis=1).set_index('QID')

    # Outer join: keep a QID even when only one of the two feature files has it.
    features = feature_text.join(feature_color[['descriptions', 'tags', 'dominant_colors']],
                                 on='QID',
                                 how='outer')

    # Labels are cast to str as a whole; downstream code converts them as needed.
    target = (target_df[['QID', 'IMG', 'QSN', 'TXT', 'OBJ', 'COL', 'CNT', 'OTH']]
              .set_index('QID')
              .astype(dtype=str))

    # Inner join: drop labelled questions for which no features exist.
    # (The former `df['descriptions'].astype(list)` discarded its result and was removed.)
    return target.join(features, on='QID', how='inner')

def lem(s):
    """Lemmatize each space-separated token of `s` with the WordNet lemmatizer.

    Every lemmatized token is followed by one space, so the returned string
    ends with a space.
    """
    lemmatizer = WordNetLemmatizer()
    return "".join(lemmatizer.lemmatize(token) + ' ' for token in s.split(" "))

def preprocess_text(feature_columns):
    """Collapse the given feature columns into one cleaned text document per row.

    For every cell the value is stringified, the punctuation characters listed
    below are removed, the text is lower-cased and lemmatized with `lem`, and
    the per-column results are concatenated in column order.

    Parameters
    ----------
    feature_columns : pandas.DataFrame
        Two-dimensional selection of columns. Column index 1 is treated as the
        image descriptions (callers pass ['QSN', 'descriptions', ...]).

    Returns
    -------
    numpy.ndarray
        One document string per input row.

    Missing cells (None or float NaN) contribute an empty string instead of
    the literal token "nan".
    """
    # Built once instead of once per cell. The raw string holds the same
    # characters as before without the invalid "\|" escape sequence.
    strip_table = {ord(c): "" for c in r"!@#$%^&*()[]{};:,./<>?\|`~-=_+'"}

    ip = feature_columns.values          # only read, never modified
    n_rows, n_cols = ip.shape            # fails early if the input is not 2-D

    op = []
    for i in range(n_rows):
        parts = []
        for j in range(n_cols):
            value = ip[i][j]
            # NaN is the only float that differs from itself.
            missing = value is None or (isinstance(value, float) and value != value)
            s = "" if missing else str(value)
            # clean up chars
            s = s.translate(strip_table).lower() + " "
            if j == 1:             # clean descriptions
                # drop the stringified dict keys: "confidence <digits>" and "text"
                # NOTE(review): the second pattern removes "text" anywhere,
                # including inside longer words - confirm this is intended.
                s = re.sub(r'confidence\s+\d+', '', s)
                s = re.sub(r'text', '', s)
            # lexicon normalize
            parts.append(lem(s))
        op.append("".join(parts))
    return np.asarray(op)

In [16]:
# Build one labelled feature frame per dataset (VizWiz, VQA) and split (train, val).
vizwiz_train   = join_feature_target(vizwiz_features_train_text,
                                     vizwiz_features_train_color,
                                     vizwiz_targets_train)
vizwiz_val     = join_feature_target(vizwiz_features_val_text,
                                     vizwiz_features_val_color,
                                     vizwiz_targets_val)
vqa_train      = join_feature_target(vqa_features_train_text,
                                     vqa_features_train_color,
                                     vqa_targets_train)
# Fixed: the VQA validation frame was previously built from the VizWiz
# validation text features, which left the VQA rows without their own
# question / OCR / handwriting features.
vqa_val        = join_feature_target(vqa_features_val_text,
                                     vqa_features_val_color,
                                     vqa_targets_val)
print(vizwiz_train.shape, vqa_train.shape, vizwiz_train.shape[0] + vqa_train.shape[0])
print(vizwiz_val.shape, vqa_val.shape, vizwiz_val.shape[0] + vqa_val.shape[0])

(14257, 13) (3230, 13) 17487
(2247, 13) (513, 13) 2760


In [21]:
# Stack both datasets, then derive the text documents (X) and label vectors (Y).

train = pd.concat([vizwiz_train, vqa_train], axis=0)
val   = pd.concat([vizwiz_val, vqa_val], axis=0)
print("Training: {}\nValidation: {}".format(train.shape, val.shape))

# Columns merged into one document per sample, in this order.
document_columns = ['QSN', 'descriptions', 'tags', 'dominant_colors',
                    'handwritten_text', 'ocr_text']
label_columns = ('TXT', 'COL', 'CNT')

# Training labels are kept exactly as stored in the frame.
features_train = preprocess_text(train[document_columns])
txt_train, col_train, cnt_train = [train[name].values for name in label_columns]

# Validation labels are cast to float32.
features_val = preprocess_text(val[document_columns])
txt_val, col_val, cnt_val = [val[name].values.astype('float32') for name in label_columns]

Training: (17487, 13)
Validation: (2760, 13)


In [22]:
# Show one random training document. randrange excludes its upper bound, and
# the bound is the length of the array actually being indexed (it used to be
# len(features_val) with the inclusive random.randint).
features_train[random.randrange(len(features_train))]

'what kind of pudding is this   a person holding a remote control   person indoor remote sitting holding man orange control red television boy girl young bed playing food table room video game white standing shirt  black      '

In [51]:
# Show one random validation document. random.randint includes its upper
# bound, so the old call could index one past the end; randrange cannot.
features_val[random.randrange(len(features_val))]

'i know this is healthy choice but what dinner is it   a close up of food on a table   indoor table food sitting green plate restaurant wooden sandwich white  black brown  healthy choice complete meall with dessert solwin healthy  complete meal healthy with dessert •hoice 9g fiber 310 calorie homestyle bury salis steak salisbury steak in sautéed onion red apple ca crisp multigrain  '

In [52]:
# Fit the vocabulary on the training documents only.
tok = Tokenizer(num_words=VOCAB_SIZE,
                filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                lower=True,
                split=" ")
tok.fit_on_texts(features_train)


def _to_padded_sequences(documents):
    """Map documents to integer sequences and pad them to MAX_DOC_LEN."""
    return sequence.pad_sequences(tok.texts_to_sequences(documents), maxlen=MAX_DOC_LEN)


# Integer-encoded, fixed-length inputs for the network.
train_seq = _to_padded_sequences(features_train)
val_seq   = _to_padded_sequences(features_val)

In [55]:
# One-hot encode the training labels and report how many samples fall in each class.
text_recognition_labels = to_categorical(np.asarray(txt_train)).astype('float32')
color_recognition_labels = to_categorical(np.asarray(col_train)).astype('float32')

print('Number of samples each class: ')
for task_name, one_hot in (('Text recognition', text_recognition_labels),
                           ('Color recognition', color_recognition_labels)):
    print(task_name, one_hot.sum(axis=0))

# Width of the softmax output layer built in lstm_create_train.
n_classes = 2

Number of samples each class: 
Text recognition [9119. 8368.]
Color recognition [10926.  6561.]


## LSTM model (skip-gram word2vec)
config for CPU only

In [57]:
import numpy as np
import pandas as pd
import os
import pickle
import requests
import time
import gzip

os.environ['KERAS_BACKEND'] = 'cntk'

from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.models import Sequential, load_model
from keras import regularizers
from keras.optimizers import SGD
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional
from keras.callbacks import History, CSVLogger
from keras.utils import to_categorical

import nltk 
import gensim
import logging

from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve

# These three constants repeat, with identical values, the definitions made
# in the first import cell of this notebook.
MAX_DOC_LEN = 40
VOCAB_SIZE = 3000
EMBEDDING_DIM = 100

In [58]:
# Split every training document into sentences (punkt), then into words,
# keeping only alphanumeric tokens. The result is one word list per sentence.
sent_lst = [
    [token for token in nltk.tokenize.word_tokenize(sentence) if token.isalnum()]
    for document in features_train
    for sentence in nltk.tokenize.sent_tokenize(document)
]

In [59]:
# Log at INFO level so that gensim's training progress is printed below the cell.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# Train word vectors on the tokenized training sentences.
#   min_count=6  - the log below reports "effective_min_count=6 retains 3819 unique words"
#   size         - vector width, shared with the Keras embedding layer via EMBEDDING_DIM
#   sg=1         - skip-gram, as stated in this section's heading
#   workers      - one worker per CPU reported by os.cpu_count()
# NOTE(review): no seed is passed, so the vectors presumably differ between runs - confirm.
word2vec_model = gensim.models.Word2Vec(sentences=sent_lst,
                                        min_count=6,
                                        size=EMBEDDING_DIM,
                                        sg=1,
                                        workers=os.cpu_count())

2019-02-28 02:08:24,304 : INFO : collecting all words and their counts
2019-02-28 02:08:24,305 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-02-28 02:08:24,370 : INFO : PROGRESS: at sentence #10000, processed 383294 words, keeping 24880 word types
2019-02-28 02:08:24,417 : INFO : collected 33634 word types from a corpus of 671659 raw words and 17487 sentences
2019-02-28 02:08:24,418 : INFO : Loading a fresh vocabulary
2019-02-28 02:08:24,440 : INFO : effective_min_count=6 retains 3819 unique words (11% of original 33634, drops 29815)
2019-02-28 02:08:24,441 : INFO : effective_min_count=6 leaves 629542 word corpus (93% of original 671659, drops 42117)
2019-02-28 02:08:24,450 : INFO : deleting the raw counts dictionary of 33634 items
2019-02-28 02:08:24,452 : INFO : sample=0.001 downsamples 68 most-common words
2019-02-28 02:08:24,452 : INFO : downsampling leaves estimated 412555 word corpus (65.5% of prior 629542)
2019-02-28 02:08:24,461 : INFO : estima

In [60]:
# word -> float32 vector, for every word kept by word2vec
embeddings_index = {
    word: np.asarray(word2vec_model.wv[word], dtype='float32')
    for word in word2vec_model.wv.vocab
}

print('Total %s word vectors' % len(embeddings_index))

# Initial weights for the embedding layer: row i holds the vector of the word
# with tokenizer index i. Rows without a word2vec vector stay all-zero.
embedding_matrix = np.zeros((VOCAB_SIZE, EMBEDDING_DIM))

for word, row in tok.word_index.items():
    if row >= VOCAB_SIZE:
        continue                      # outside the matrix
    vector = embeddings_index.get(word)
    if vector is None:
        continue                      # word not present in the word2vec vocabulary
    embedding_matrix[row] = vector

Total 3819 word vectors


In [72]:
def lstm_create_train(labels, learning_rate, lstm_dim, batch_size, num_epochs, optimizer_param, regularization=1e-7):
    """Build, train and save a bidirectional LSTM classifier.

    The network is Embedding -> Bidirectional(LSTM) -> Dropout(0.5) -> Dense(softmax).
    It is fitted on the module-level `train_seq`; the embedding layer starts
    from the module-level `embedding_matrix` and remains trainable.

    Parameters
    ----------
    labels : array
        One-hot training labels; cast to float32 before fitting.
    learning_rate : float
        Used ONLY to name the output files. The optimizer actually applied is
        `optimizer_param`, so pass the same rate there.
    lstm_dim : int
        Number of units of the LSTM layer that is wrapped in Bidirectional.
    batch_size, num_epochs : int
        Passed to `model.fit`.
    optimizer_param : keras optimizer
        Optimizer given to `model.compile`.
    regularization : float, default 1e-7
        L2 factor applied to the embedding, LSTM kernel and dense kernel.

    Returns
    -------
    None. Everything is written under ./LSTM/text/ using the prefix
    "<learning_rate>_<regularization>_<batch_size>_<num_epochs>":
    ".log" (CSVLogger, appended), "_model.h5", "_time.txt"
    (regularization and training time in hours) and "_history.txt".

    Also reads the module-level names VOCAB_SIZE, EMBEDDING_DIM, MAX_DOC_LEN
    and n_classes.
    """
    l2_reg = regularizers.l2(regularization)

    # All artefacts of one run share a directory and a file-name prefix.
    # The directory is created up front so a fresh checkout does not fail
    # when the first file is opened.
    out_dir = './LSTM/text'
    os.makedirs(out_dir, exist_ok=True)
    run_prefix = '{}/{}_{}_{}_{}'.format(out_dir, learning_rate, regularization, batch_size, num_epochs)

    # init model
    embedding_layer = Embedding(VOCAB_SIZE,
                                EMBEDDING_DIM,
                                input_length=MAX_DOC_LEN,
                                trainable=True,
                                mask_zero=False,
                                embeddings_regularizer=l2_reg,
                                weights=[embedding_matrix])
    lstm_layer = LSTM(units=lstm_dim, kernel_regularizer=l2_reg)
    dense_layer = Dense(n_classes,
                        activation='softmax',
                        kernel_regularizer=l2_reg)

    model = Sequential()
    model.add(embedding_layer)
    model.add(Bidirectional(lstm_layer))
    model.add(Dropout(0.5))
    model.add(dense_layer)

    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer_param,
                  metrics=['acc'])
    history = History()
    csv_logger = CSVLogger(run_prefix + '.log',
                           separator=',',
                           append=True)
    t1 = time.time()
    # model fit
    model.fit(train_seq,
              labels.astype('float32'),
              batch_size=batch_size,
              epochs=num_epochs,
              callbacks=[history, csv_logger],
              verbose=2)
    t2 = time.time()
    # save hdf5
    model.save(run_prefix + '_model.h5')
    # second value: wall-clock training time in hours
    np.savetxt(run_prefix + '_time.txt',
               [regularization, (t2-t1) / 3600])
    with open(run_prefix + '_history.txt', "w") as res_file:
        res_file.write(str(history.history))

In [78]:
# Hyper-parameters of this run: learning rate, L2 factor, batch size, epochs.
# They are also used to locate the model file written by lstm_create_train.
L, R, B, E = 2e-1, 1e-14, 32, 30

lstm_create_train(labels=text_recognition_labels,
                  learning_rate=L,
                  lstm_dim=150,
                  batch_size=B,
                  num_epochs=E,
                  optimizer_param=SGD(lr=L, nesterov=True),
                  regularization=R)

# Reload the saved model and score it on the validation sequences.
model = load_model('./LSTM/text/{}_{}_{}_{}_model.h5'.format(L,R,B,E))
preds = model.predict(val_seq, verbose=0)
print("Learning rate: {} Regularization: {} Batch size: {} Epoch: {}".format(L,R,B,E))
val_accuracy = accuracy_score(txt_val, preds.argmax(axis=1))
val_auc = roc_auc_score(txt_val, preds[:,1])
print(("Accuracy = {0} \t AUC = {1}".format(val_accuracy, val_auc)))

Epoch 1/30
 - 31s - loss: 0.4646 - acc: 0.7892
Epoch 2/30
 - 31s - loss: 0.4124 - acc: 0.8254
Epoch 3/30
 - 31s - loss: 0.3873 - acc: 0.8391
Epoch 4/30
 - 31s - loss: 0.3728 - acc: 0.8457
Epoch 5/30
 - 31s - loss: 0.3622 - acc: 0.8525
Epoch 6/30
 - 31s - loss: 0.3563 - acc: 0.8542
Epoch 7/30
 - 31s - loss: 0.3511 - acc: 0.8585
Epoch 8/30
 - 31s - loss: 0.3448 - acc: 0.8593
Epoch 9/30
 - 31s - loss: 0.3416 - acc: 0.8619
Epoch 10/30
 - 31s - loss: 0.3355 - acc: 0.8674
Epoch 11/30
 - 31s - loss: 0.3305 - acc: 0.8671
Epoch 12/30
 - 31s - loss: 0.3310 - acc: 0.8672
Epoch 13/30
 - 31s - loss: 0.3264 - acc: 0.8707
Epoch 14/30
 - 31s - loss: 0.3200 - acc: 0.8730
Epoch 15/30
 - 31s - loss: 0.3205 - acc: 0.8727
Epoch 16/30
 - 31s - loss: 0.3173 - acc: 0.8753
Epoch 17/30
 - 31s - loss: 0.3141 - acc: 0.8760
Epoch 18/30
 - 31s - loss: 0.3098 - acc: 0.8788
Epoch 19/30
 - 31s - loss: 0.3080 - acc: 0.8775
Epoch 20/30
 - 31s - loss: 0.3061 - acc: 0.8792
Epoch 21/30
 - 31s - loss: 0.2993 - acc: 0.8809
E

In [None]:
# Hyper-parameters of this run: learning rate, L2 factor, batch size, epochs.
# They are also used to locate the model file written by lstm_create_train.
L, R, B, E = 2e-1, 0, 32, 30

lstm_create_train(labels=text_recognition_labels,
                  learning_rate=L,
                  lstm_dim=250,
                  batch_size=B,
                  num_epochs=E,
                  optimizer_param=SGD(lr=L, nesterov=True),
                  regularization=R)

# Reload the saved model and score it on the validation sequences.
model = load_model('./LSTM/text/{}_{}_{}_{}_model.h5'.format(L,R,B,E))
preds = model.predict(val_seq, verbose=0)
print("Learning rate: {} Regularization: {} Batch size: {} Epoch: {}".format(L,R,B,E))
val_accuracy = accuracy_score(txt_val, preds.argmax(axis=1))
val_auc = roc_auc_score(txt_val, preds[:,1])
print(("Accuracy = {0} \t AUC = {1}".format(val_accuracy, val_auc)))
# todo

### Training and validation on VizWiz only

In [388]:
# Hyper-parameters of this run: learning rate, L2 factor, batch size, epochs.
# They are also used to locate the model file written by lstm_create_train.
# NOTE(review): the heading above says "VizWiz only", but this cell uses the
# same text_recognition_labels / train_seq / val_seq / txt_val names as the
# earlier runs, which the visible cells build from the concatenated
# VizWiz + VQA frames - confirm which data this run is meant to use.
L, R, B, E = 1e-1, 0, 32, 30

lstm_create_train(labels=text_recognition_labels,
                  learning_rate=L,
                  lstm_dim=100,
                  batch_size=B,
                  num_epochs=E,
                  optimizer_param=SGD(lr=L, nesterov=True),
                  regularization=R)

# Reload the saved model and score it on the validation sequences.
model = load_model('./LSTM/text/{}_{}_{}_{}_model.h5'.format(L,R,B,E))
preds = model.predict(val_seq, verbose=0)
print("Learning rate: {} Regularization: {} Batch size: {} Epoch: {}".format(L,R,B,E))
val_accuracy = accuracy_score(txt_val, preds.argmax(axis=1))
val_auc = roc_auc_score(txt_val, preds[:,1])
print(("Accuracy = {0} \t AUC = {1}".format(val_accuracy, val_auc)))

Epoch 1/30
 - 23s - loss: 0.4962 - acc: 0.7631
Epoch 2/30
 - 23s - loss: 0.4303 - acc: 0.8098
Epoch 3/30
 - 23s - loss: 0.4079 - acc: 0.8227
Epoch 4/30
 - 23s - loss: 0.3894 - acc: 0.8347
Epoch 5/30
 - 23s - loss: 0.3854 - acc: 0.8377
Epoch 6/30
 - 23s - loss: 0.3775 - acc: 0.8409
Epoch 7/30
 - 23s - loss: 0.3691 - acc: 0.8448
Epoch 8/30
 - 23s - loss: 0.3630 - acc: 0.8495
Epoch 9/30
 - 23s - loss: 0.3604 - acc: 0.8510
Epoch 10/30
 - 23s - loss: 0.3538 - acc: 0.8542
Epoch 11/30
 - 23s - loss: 0.3521 - acc: 0.8548
Epoch 12/30
 - 23s - loss: 0.3485 - acc: 0.8579
Epoch 13/30
 - 23s - loss: 0.3430 - acc: 0.8620
Epoch 14/30
 - 23s - loss: 0.3418 - acc: 0.8604
Epoch 15/30
 - 23s - loss: 0.3409 - acc: 0.8587
Epoch 16/30
 - 23s - loss: 0.3366 - acc: 0.8641
Epoch 17/30
 - 23s - loss: 0.3327 - acc: 0.8657
Epoch 18/30
 - 23s - loss: 0.3311 - acc: 0.8687
Epoch 19/30
 - 23s - loss: 0.3316 - acc: 0.8641
Epoch 20/30
 - 23s - loss: 0.3307 - acc: 0.8673
Epoch 21/30
 - 23s - loss: 0.3254 - acc: 0.8710
E