## Skill label prediction with image-based features (text and color with descriptive tags)

In [189]:
import pandas as pd 
import random
import numpy as np
import copy
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
kfold=KFold(n_splits=10)

# for LSTM (Keras; KERAS_BACKEND is set to 'cntk' below, before Keras is imported)
import gzip
import os
import pickle
import requests
import time
import re
os.environ['KERAS_BACKEND']='cntk'
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.models import Sequential, load_model
from keras import regularizers
from keras.optimizers import SGD
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional
from keras.callbacks import History, CSVLogger
from keras.utils import to_categorical
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
nltk.download('wordnet')

# Shared hyper-parameters of the text pipeline.
# Length every token sequence is padded/truncated to (pad_sequences maxlen, Embedding input_length).
MAX_DOC_LEN = 40
# Tokenizer num_words and number of rows of the embedding matrix.
VOCAB_SIZE = 3000
# word2vec vector size and Embedding output dimension.
EMBEDDING_DIM = 100

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/edithzeng/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


## Preprocessing

In [None]:
# All Azure feature exports share one CSV dialect; only the dataset, split and
# feature kind in the file name differ, so the reads go through small helpers.
FEATURE_DIR = 'azure_features_images/data'

# Columns that are read as raw text for each feature kind. They were previously
# requested as dtype `list`, which is not a dtype pandas can parse into (recent
# pandas raises TypeError for it); downstream code stringifies these cells anyway.
FEATURE_TEXT_COLUMNS = {
    'color': ('descriptions', 'tags', 'dominant_colors'),
    'text': ('descriptions', 'ocr_text', 'handwritten_text'),
}


def _read_csv_skip_bad_lines(path, **options):
    """Read a CSV with pandas, silently dropping malformed rows.

    Uses `on_bad_lines='skip'` where pandas supports it and falls back to the
    older `error_bad_lines=False` spelling when the installed pandas does not
    know that keyword. Any other TypeError is re-raised unchanged.
    """
    try:
        return pd.read_csv(path, on_bad_lines='skip', **options)
    except TypeError as err:
        if 'on_bad_lines' not in str(err):
            raise
        return pd.read_csv(path, error_bad_lines=False, **options)


def _read_features(dataset, split, kind):
    """Load one Azure feature export.

    dataset: 'vizwiz' or 'vqa'; split: 'train' or 'val'; kind: 'color' or 'text'.
    Returns the DataFrame read from
    '<FEATURE_DIR>/<dataset>_<split>_<kind>_recognition.csv'.
    """
    dtype = {'qid': str, 'question': str}
    dtype.update({column: str for column in FEATURE_TEXT_COLUMNS[kind]})
    path = '{0}/{1}_{2}_{3}_recognition.csv'.format(FEATURE_DIR, dataset, split, kind)
    return _read_csv_skip_bad_lines(path, delimiter=';', engine='python',
                                    dtype=dtype, quotechar='"')


def _read_targets(dataset, split):
    """Load the skill-label CSV '../<dataset>_skill_typ_<split>.csv'."""
    path = '../{0}_skill_typ_{1}.csv'.format(dataset, split)
    return _read_csv_skip_bad_lines(path, delimiter=',', quotechar='"', engine='python')


vizwiz_features_train_color = _read_features('vizwiz', 'train', 'color')
vizwiz_features_train_text = _read_features('vizwiz', 'train', 'text')

vizwiz_features_val_color = _read_features('vizwiz', 'val', 'color')
vizwiz_features_val_text = _read_features('vizwiz', 'val', 'text')

vqa_features_train_color = _read_features('vqa', 'train', 'color')
vqa_features_train_text = _read_features('vqa', 'train', 'text')
vqa_features_val_color = _read_features('vqa', 'val', 'color')
vqa_features_val_text = _read_features('vqa', 'val', 'text')

vizwiz_targets_train = _read_targets('vizwiz', 'train')
vizwiz_targets_val = _read_targets('vizwiz', 'val')
vqa_targets_train = _read_targets('vqa', 'train')
vqa_targets_val = _read_targets('vqa', 'val')

In [241]:
# Preview the first two rows of the VizWiz training text-recognition features.
vizwiz_features_train_text.head(2)

Unnamed: 0,qid,question,ocr_text,handwritten_text
0,VizWiz_train_000000000000.jpg,What's the name of this product?,"['b', 'sil', 'leaves', '0.62', 'oz', '(170)']",['NET WT O. 62 02 ( 179)']
1,VizWiz_train_000000000001.jpg,Can you tell me what is in this can please?,[],['^TAKE Three 1^']


In [242]:
# NOTE(review): `vizwiz_train` is only assigned further down, in the cell that calls
# join_feature_target; on a fresh kernel this cell must be run after that one.
vizwiz_train.sample(1)

Unnamed: 0_level_0,IMG,QSN,TXT,OBJ,COL,CNT,OTH,question,ocr_text,handwritten_text,descriptions,tags,dominant_colors
QID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
VizWiz_train_000000009759.jpg,VizWiz_train_000000009759.jpg,what's this?,0,1,0,0,0,what's this?,[],[],"[{'text': 'a close up of a computer', 'confide...","['sitting', 'man', 'computer', 'black', 'stand...","['Grey', 'White']"


In [243]:
def join_feature_target(feature_df_text, feature_df_color, target_df):
    """Combine text features, colour features and skill labels into one frame.

    Parameters
    ----------
    feature_df_text : DataFrame with a 'qid' column plus the text-recognition
        feature columns.
    feature_df_color : DataFrame with a 'qid' column and the columns
        'descriptions', 'tags' and 'dominant_colors'.
    target_df : DataFrame with the columns 'QID', 'IMG', 'QSN', 'TXT', 'OBJ',
        'COL', 'CNT' and 'OTH'.

    Returns
    -------
    DataFrame indexed by QID holding the target columns (all cast to str)
    followed by the joined feature columns. Only QIDs present in both the
    targets and the features are kept. The inputs are not modified.
    """
    # rename()/set_index() return new frames, so no defensive deep copy is needed.
    feature_text = feature_df_text.rename({'qid': 'QID'}, axis=1).set_index('QID')
    feature_color = feature_df_color.rename({'qid': 'QID'}, axis=1).set_index('QID')

    # Outer join keeps a question even when one of the two feature files lacks it.
    features = feature_text.join(feature_color[['descriptions', 'tags', 'dominant_colors']],
                                 on='QID',
                                 how='outer')

    target = (target_df[['QID', 'IMG', 'QSN', 'TXT', 'OBJ', 'COL', 'CNT', 'OTH']]
              .set_index('QID')
              .astype(dtype=str))

    # The former trailing `df['descriptions'].astype(list)` was dropped: its result
    # was discarded, and pandas versions that reject `list` as a dtype raise on it.
    return target.join(features, on='QID', how='inner')

def lem(s):
    """Lemmatize every space-separated token of `s` with WordNet.

    Each token (including the empty tokens produced by consecutive spaces) is
    followed by one space in the result, so the returned string ends with a space.
    """
    lemmatizer = WordNetLemmatizer()
    return "".join(lemmatizer.lemmatize(token) + ' ' for token in s.split(" "))

def preprocess_text(feature_columns):
    """Flatten the selected feature columns into one cleaned document per row.

    Parameters
    ----------
    feature_columns : DataFrame whose cells are stringified, stripped of
        punctuation, lower-cased and lemmatized. The column at position 1 is
        treated as the Azure 'descriptions' column and additionally has its
        'confidence <digits>' pairs and standalone 'text' keys removed.

    Returns
    -------
    np.ndarray of str with one entry per input row: the cleaned columns joined
    by a single space.
    """
    # Built once rather than once per cell; the backslash is an explicit escape.
    strip_chars = {ord(c): "" for c in "!@#$%^&*()[]{};:,./<>?\\|`~-=_+'"}

    docs = []
    for row in feature_columns.values:
        parts = []
        for j, cell in enumerate(row):
            # clean up chars
            s = str(cell).translate(strip_chars).lower()
            if j == 1:             # clean descriptions
                s = re.sub(r'confidence\s+\d+', '', s)
                # Word boundaries keep words such as "texture" or "context" intact.
                s = re.sub(r'\btext\b', '', s)
            # lexicon normalize
            parts.append(lem(s).strip())
        # Join with a space: plain concatenation of the stripped columns fused the
        # last word of one column with the first word of the next.
        docs.append(" ".join(parts))
    return np.asarray(docs)

In [244]:
# Join skill labels with the text and colour features, one frame per VizWiz split.
vizwiz_train = join_feature_target(feature_df_text=vizwiz_features_train_text,
                                   feature_df_color=vizwiz_features_train_color,
                                   target_df=vizwiz_targets_train)
vizwiz_val = join_feature_target(feature_df_text=vizwiz_features_val_text,
                                 feature_df_color=vizwiz_features_val_color,
                                 target_df=vizwiz_targets_val)

In [247]:
# create X and Y
# Column order matters: preprocess_text applies its description clean-up to the
# column at position 1.
FEATURE_COLUMNS = ['QSN',
                   'descriptions', 'tags', 'dominant_colors',
                   'handwritten_text', 'ocr_text']

# `cleanse` is not defined anywhere in this notebook; the cleaning function that
# is defined here is `preprocess_text`.
features_train = preprocess_text(vizwiz_train[FEATURE_COLUMNS])
txt_train      = vizwiz_train["TXT"].values
obj_train      = vizwiz_train["OBJ"].values
col_train      = vizwiz_train["COL"].values
cnt_train      = vizwiz_train["CNT"].values

features_val   = preprocess_text(vizwiz_val[FEATURE_COLUMNS])
txt_val        = vizwiz_val["TXT"].values
obj_val        = vizwiz_val["OBJ"].values
col_val        = vizwiz_val["COL"].values
cnt_val        = vizwiz_val["CNT"].values

In [246]:
# Show one random training document. randrange excludes its upper bound, and the
# bound is taken from features_train itself, so the index is always valid.
features_train[random.randrange(len(features_train))]

'testing what is this  floor indoor sitting room standing cat black living woman white young playing kitchen laying table man night game holding sink oven grey   '

In [108]:
# Show one random validation document. randrange excludes its upper bound, so
# the index can never equal len(features_val).
features_val[random.randrange(len(features_val))]



In [248]:
# tokenize
# Vocabulary is fitted on the training documents only.
tok = Tokenizer(num_words=VOCAB_SIZE,
                filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                lower=True,
                split=" ")
tok.fit_on_texts(features_train)

# Turn each document into a sequence of word ids, padded/truncated to MAX_DOC_LEN.
train_seq = sequence.pad_sequences(tok.texts_to_sequences(features_train),
                                   maxlen=MAX_DOC_LEN)
val_seq = sequence.pad_sequences(tok.texts_to_sequences(features_val),
                                 maxlen=MAX_DOC_LEN)

In [254]:
# check class distribution
# Number of output classes of the classifier (each skill label is binary).
n_classes = 2

# One-hot encode the skill labels and report how many samples fall in each class.
text_recognition_labels = to_categorical(np.asarray(txt_train)).astype('float32')
color_recognition_labels = to_categorical(np.asarray(col_train)).astype('float32')

print('Number of samples each class - Vizwiz - train')
print('Text recognition', np.sum(text_recognition_labels, axis=0))
print('Color recognition', np.sum(color_recognition_labels, axis=0))

Number of samples each class - Vizwiz - train
Text recognition [6247. 8010.]
Color recognition [8844. 5413.]


## LSTM model (skip-gram word2vec)
Configured to run on CPU only.

In [255]:
import numpy as np
import pandas as pd
import os
import pickle
import requests
import time
import gzip

os.environ['KERAS_BACKEND'] = 'cntk'

from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.models import Sequential, load_model
from keras import regularizers
from keras.optimizers import SGD
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional
from keras.callbacks import History, CSVLogger
from keras.utils import to_categorical

import nltk 
import gensim
import logging

from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve

# NOTE(review): these repeat, with identical values, the constants already
# defined in the notebook's first code cell.
MAX_DOC_LEN = 40
VOCAB_SIZE = 3000
EMBEDDING_DIM = 100

In [256]:
# punkt sentence level tokenizer
nltk.download('punkt')

# Split every training document into sentences (punkt), then into word tokens,
# keeping only alphanumeric tokens. One inner list per sentence.
sent_lst = [
    [w for w in nltk.tokenize.word_tokenize(sent) if w.isalnum()]
    for doc in features_train
    for sent in nltk.tokenize.sent_tokenize(doc)
]

[nltk_data] Downloading package punkt to /home/edithzeng/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [257]:
# Emit gensim's progress messages at INFO level.
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s : %(levelname)s : %(message)s')

# Skip-gram (sg=1) word2vec over the tokenised training sentences; words seen
# fewer than 6 times are dropped, and one worker thread is used per CPU.
word2vec_model = gensim.models.Word2Vec(sentences=sent_lst,
                                        size=EMBEDDING_DIM,
                                        sg=1,
                                        min_count=6,
                                        workers=os.cpu_count())

2019-02-26 18:40:10,420 : INFO : collecting all words and their counts
2019-02-26 18:40:10,421 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-02-26 18:40:10,484 : INFO : PROGRESS: at sentence #10000, processed 392086 words, keeping 26072 word types
2019-02-26 18:40:10,510 : INFO : collected 33236 word types from a corpus of 554278 raw words and 14257 sentences
2019-02-26 18:40:10,511 : INFO : Loading a fresh vocabulary
2019-02-26 18:40:10,534 : INFO : effective_min_count=6 retains 3750 unique words (11% of original 33236, drops 29486)
2019-02-26 18:40:10,535 : INFO : effective_min_count=6 leaves 511934 word corpus (92% of original 554278, drops 42344)
2019-02-26 18:40:10,544 : INFO : deleting the raw counts dictionary of 33236 items
2019-02-26 18:40:10,545 : INFO : sample=0.001 downsamples 66 most-common words
2019-02-26 18:40:10,545 : INFO : downsampling leaves estimated 321841 word corpus (62.9% of prior 511934)
2019-02-26 18:40:10,553 : INFO : estima

In [258]:
# word -> float32 vector for every word retained by word2vec
embeddings_index = {
    word: np.asarray(word2vec_model.wv[word], dtype='float32')
    for word in word2vec_model.wv.vocab
}

print('Total %s word vectors' % len(embeddings_index))

# Initial word embedding: row i holds the word2vec vector of the tokenizer's
# word i; rows without a trained vector stay all-zero.
embedding_matrix = np.zeros((VOCAB_SIZE, EMBEDDING_DIM))

for word, i in tok.word_index.items():
    if i < VOCAB_SIZE and word in embeddings_index:
        embedding_matrix[i] = embeddings_index[word]

Total 3750 word vectors


In [259]:
def lstm_create_train(labels, lstm_dim, batch_size, num_epochs, optimizer_param,
                      regularization=1e-7, fname="lstm"):
    """Build, train and persist a bidirectional-LSTM classifier on `train_seq`.

    The network is Embedding (initialised from the notebook-level
    `embedding_matrix`, trainable) -> Bidirectional(LSTM) -> Dropout(0.2) ->
    Dense(n_classes, softmax). It also reads the notebook-level names
    `train_seq`, `n_classes`, `VOCAB_SIZE`, `EMBEDDING_DIM` and `MAX_DOC_LEN`.

    Parameters
    ----------
    labels : one-hot label array passed to `model.fit` (cast to float32).
    lstm_dim : number of units of the LSTM layer.
    batch_size, num_epochs : forwarded to `model.fit`.
    optimizer_param : optimizer passed to `model.compile`.
    regularization : L2 factor applied to the embedding, LSTM and dense kernels;
        it is also part of every output file name.
    fname : prefix of every output file name (default "lstm"). Pass a distinct
        prefix per task so runs with the same `regularization` do not overwrite
        each other's saved model.

    Side effects
    ------------
    Appends per-epoch metrics to './<fname>_<regularization>.log' and writes
    '<fname>_<regularization>_model.h5', '..._time.txt' (regularization and
    training time in hours) and '..._history.txt' into './LSTM/'.
    Returns None.
    """
    # Create the output directory up front so a missing folder cannot throw away
    # a finished training run at save time.
    os.makedirs('./LSTM', exist_ok=True)

    l2_reg = regularizers.l2(regularization)

    # init model
    embedding_layer = Embedding(VOCAB_SIZE,
                                EMBEDDING_DIM,
                                input_length=MAX_DOC_LEN,
                                trainable=True,
                                mask_zero=False,
                                embeddings_regularizer=l2_reg,
                                weights=[embedding_matrix])
    lstm_layer = LSTM(units=lstm_dim, kernel_regularizer=l2_reg)
    dense_layer = Dense(n_classes, activation='softmax', kernel_regularizer=l2_reg)

    model = Sequential()
    model.add(embedding_layer)
    model.add(Bidirectional(lstm_layer))
    model.add(Dropout(0.2))  # todo
    model.add(dense_layer)

    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer_param,
                  metrics=['acc'])

    history = History()
    csv_logger = CSVLogger('./{0}_{1}.log'.format(fname, regularization),
                           separator=',',
                           append=True)
    t1 = time.time()
    # model fit
    model.fit(train_seq,
              labels.astype('float32'),
              batch_size=batch_size,
              epochs=num_epochs,
              callbacks=[history, csv_logger],
              verbose=2)
    t2 = time.time()
    # save h5
    model.save('./LSTM/{0}_{1}_model.h5'.format(fname, regularization))
    np.savetxt('./LSTM/{0}_{1}_time.txt'.format(fname, regularization),
               [regularization, (t2-t1) / 3600])
    with open('./LSTM/{0}_{1}_history.txt'.format(fname, regularization), "w") as res_file:
        res_file.write(str(history.history))

In [260]:
# reduce learning rate for larger batch GD - 85%
# Text recognition, run 1: SGD lr=0.05, batch size 50, 30 epochs, L2 factor 1e-3.
lstm_create_train(labels=text_recognition_labels,
                  lstm_dim=100, 
                  batch_size=50,
                  num_epochs=30, 
                  optimizer_param=SGD(lr=0.05, nesterov=True),
                  regularization=1e-3)

Epoch 1/30
 - 15s - loss: 13.2357 - acc: 0.7572
Epoch 2/30
 - 15s - loss: 12.4616 - acc: 0.7983
Epoch 3/30
 - 15s - loss: 11.7785 - acc: 0.8045
Epoch 4/30
 - 15s - loss: 11.1353 - acc: 0.8167
Epoch 5/30
 - 15s - loss: 10.5292 - acc: 0.8219
Epoch 6/30
 - 15s - loss: 9.9641 - acc: 0.8230
Epoch 7/30
 - 15s - loss: 9.4277 - acc: 0.8294
Epoch 8/30
 - 15s - loss: 8.9240 - acc: 0.8287
Epoch 9/30
 - 15s - loss: 8.4473 - acc: 0.8350
Epoch 10/30
 - 15s - loss: 7.9911 - acc: 0.8381
Epoch 11/30
 - 15s - loss: 7.5703 - acc: 0.8397
Epoch 12/30
 - 15s - loss: 7.1678 - acc: 0.8421
Epoch 13/30
 - 15s - loss: 6.7909 - acc: 0.8418
Epoch 14/30
 - 15s - loss: 6.4314 - acc: 0.8452
Epoch 15/30
 - 15s - loss: 6.0942 - acc: 0.8453
Epoch 16/30
 - 15s - loss: 5.7795 - acc: 0.8451
Epoch 17/30
 - 15s - loss: 5.4707 - acc: 0.8482
Epoch 18/30
 - 15s - loss: 5.1938 - acc: 0.8475
Epoch 19/30
 - 15s - loss: 4.9234 - acc: 0.8509
Epoch 20/30
 - 15s - loss: 4.6680 - acc: 0.8500
Epoch 21/30
 - 15s - loss: 4.4286 - acc: 0.8

In [None]:
# Text recognition, run 2: compared with the previous run the learning rate is
# raised (0.05 -> 0.06) and training is longer (30 -> 50 epochs).
# NOTE(review): same regularization as run 1, so lstm_create_train saves to the
# same model/time/history files and appends to the same CSV log.
lstm_create_train(labels=text_recognition_labels,
                  lstm_dim=100, 
                  batch_size=50,
                  num_epochs=50, 
                  optimizer_param=SGD(lr=0.06, nesterov=True),
                  regularization=1e-3)

Epoch 1/50
 - 15s - loss: 13.1635 - acc: 0.7596
Epoch 2/50
 - 15s - loss: 12.2640 - acc: 0.7968
Epoch 3/50
 - 15s - loss: 11.4644 - acc: 0.8063
Epoch 4/50
 - 15s - loss: 10.7211 - acc: 0.8159
Epoch 5/50
 - 15s - loss: 10.0295 - acc: 0.8230
Epoch 6/50
 - 15s - loss: 9.3818 - acc: 0.8275
Epoch 7/50
 - 15s - loss: 8.7857 - acc: 0.8271
Epoch 8/50
 - 15s - loss: 8.2246 - acc: 0.8319
Epoch 9/50
 - 15s - loss: 7.6998 - acc: 0.8378
Epoch 10/50
 - 15s - loss: 7.2135 - acc: 0.8396
Epoch 11/50
 - 15s - loss: 6.7595 - acc: 0.8407
Epoch 12/50
 - 15s - loss: 6.3373 - acc: 0.8409
Epoch 13/50
 - 15s - loss: 5.9380 - acc: 0.8444
Epoch 14/50
 - 15s - loss: 5.5710 - acc: 0.8445
Epoch 15/50
 - 15s - loss: 5.2227 - acc: 0.8477
Epoch 16/50
 - 15s - loss: 4.8992 - acc: 0.8512
Epoch 17/50
 - 15s - loss: 4.5982 - acc: 0.8507
Epoch 18/50
 - 15s - loss: 4.3205 - acc: 0.8526
Epoch 19/50
 - 15s - loss: 4.0563 - acc: 0.8507
Epoch 20/50
 - 15s - loss: 3.8103 - acc: 0.8528
Epoch 21/50
 - 15s - loss: 3.5844 - acc: 0.8

In [None]:
# Text recognition, run 3: same optimizer settings as run 2 with a much stronger
# L2 factor (0.1 instead of 1e-3).
lstm_create_train(labels=text_recognition_labels,
                  lstm_dim=100,
                  batch_size=50,
                  num_epochs=50,
                  optimizer_param=SGD(lr=0.06, nesterov=True),
                  regularization=0.1)

In [None]:
# Evaluate the saved regularization=1e-3 model on the VizWiz validation split.
model = load_model('./LSTM/lstm_{0}_model.h5'.format(1e-3))
preds = model.predict(val_seq, verbose=0)
# txt_val holds the labels as strings (join_feature_target casts the targets to
# str) while argmax yields integers; cast so both sides are compared as ints.
txt_val_int = txt_val.astype(int)
print(("Accuracy = {0} \t AUC = {1}".format(accuracy_score(txt_val_int, preds.argmax(axis=1)), 
       roc_auc_score(txt_val_int, preds[:,1]))))

## Color recognition

In [None]:
# Colour recognition, run 1: SGD lr=0.05, batch size 50, 30 epochs, L2 factor 1e-3.
# NOTE(review): with regularization=1e-3 this saves to './LSTM/lstm_0.001_model.h5',
# the same file the text-recognition runs above write and the evaluation cell loads.
lstm_create_train(labels=color_recognition_labels,
                  lstm_dim=100, 
                  batch_size=50,
                  num_epochs=30, 
                  optimizer_param=SGD(lr=0.05, nesterov=True),
                  regularization=1e-3)

In [None]:
# Colour recognition, run 2: batch size doubled to 100, other settings as in run 1.
lstm_create_train(labels=color_recognition_labels,
                  lstm_dim=100, 
                  batch_size=100,
                  num_epochs=30, 
                  optimizer_param=SGD(lr=0.05, nesterov=True),
                  regularization=1e-3)

In [None]:
# Colour recognition, run 3: batch size 100 with the learning rate lowered to 0.03.
lstm_create_train(labels=color_recognition_labels,
                  lstm_dim=100, 
                  batch_size=100,
                  num_epochs=30, 
                  optimizer_param=SGD(lr=0.03, nesterov=True),
                  regularization=1e-3)

In [None]:
# Colour recognition, run 4: batch size raised to 200, learning rate kept at 0.03.
lstm_create_train(labels=color_recognition_labels,
                  lstm_dim=100, 
                  batch_size=200,
                  num_epochs=30, 
                  optimizer_param=SGD(lr=0.03, nesterov=True),
                  regularization=1e-3)