## Skill label prediction with image-based features (text and color with descriptive tags)

In [189]:
import pandas as pd 
import random
import numpy as np
import copy
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
kfold=KFold(n_splits=10)

# for LSTM (Keras; the backend is chosen via KERAS_BACKEND below, which selects CNTK rather than TensorFlow)
import gzip
import os
import pickle
import requests
import time
import re
os.environ['KERAS_BACKEND']='cntk'
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.models import Sequential, load_model
from keras import regularizers
from keras.optimizers import SGD
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional
from keras.callbacks import History, CSVLogger
from keras.utils import to_categorical
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
nltk.download('wordnet')
nltk.download('punkt')

MAX_DOC_LEN = 40
VOCAB_SIZE = 3000
EMBEDDING_DIM = 100

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/edithzeng/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


## Preprocessing

In [None]:
# Parser options shared by every Azure image-feature export (';'-separated, '"'-quoted).
feature_csv_options = dict(delimiter=';', engine='python', quotechar='"', error_bad_lines=False)

# Column dtypes requested for the two kinds of feature file.
# NOTE(review): `list` is not a parsing dtype; presumably these columns arrive as the string
# form of a Python list (later code calls str() on them) -- confirm.
color_feature_dtypes = {'qid':str, 'question':str, 'descriptions':list,
                        'tags':list, 'dominant_colors':list}
text_feature_dtypes = {'qid':str, 'question':str, 'descriptions':list,
                       'ocr_text':list, 'handwritten_text':list}

# --- VizWiz image features (train / val) ---
vizwiz_features_train_color = pd.read_csv(
    'azure_features_images/data/vizwiz_train_color_recognition.csv',
    dtype=dict(color_feature_dtypes), **feature_csv_options)
vizwiz_features_train_text = pd.read_csv(
    'azure_features_images/data/vizwiz_train_text_recognition.csv',
    dtype=dict(text_feature_dtypes), **feature_csv_options)

vizwiz_features_val_color = pd.read_csv(
    'azure_features_images/data/vizwiz_val_color_recognition.csv',
    dtype=dict(color_feature_dtypes), **feature_csv_options)
vizwiz_features_val_text = pd.read_csv(
    'azure_features_images/data/vizwiz_val_text_recognition.csv',
    dtype=dict(text_feature_dtypes), **feature_csv_options)

# --- VQA image features (train / val) ---
vqa_features_train_color = pd.read_csv(
    'azure_features_images/data/vqa_train_color_recognition.csv',
    dtype=dict(color_feature_dtypes), **feature_csv_options)
vqa_features_train_text = pd.read_csv(
    'azure_features_images/data/vqa_train_text_recognition.csv',
    dtype=dict(text_feature_dtypes), **feature_csv_options)
vqa_features_val_color = pd.read_csv(
    'azure_features_images/data/vqa_val_color_recognition.csv',
    dtype=dict(color_feature_dtypes), **feature_csv_options)
vqa_features_val_text = pd.read_csv(
    'azure_features_images/data/vqa_val_text_recognition.csv',
    dtype=dict(text_feature_dtypes), **feature_csv_options)

# --- Skill-label targets (comma-separated) ---
target_csv_options = dict(quotechar='"', engine='python', error_bad_lines=False)

vizwiz_targets_train = pd.read_csv('../vizwiz_skill_typ_train.csv',
                                   delimiter=',', **target_csv_options)
vizwiz_targets_val = pd.read_csv('../vizwiz_skill_typ_val.csv',
                                 delimiter=',', **target_csv_options)
vqa_targets_train = pd.read_csv('../vqa_skill_typ_train.csv', **target_csv_options)
vqa_targets_val = pd.read_csv('../vqa_skill_typ_val.csv', **target_csv_options)

In [241]:
# Peek at the first two rows of the VizWiz training text-recognition features.
vizwiz_features_train_text.head(2)

Unnamed: 0,qid,question,ocr_text,handwritten_text
0,VizWiz_train_000000000000.jpg,What's the name of this product?,"['b', 'sil', 'leaves', '0.62', 'oz', '(170)']",['NET WT O. 62 02 ( 179)']
1,VizWiz_train_000000000001.jpg,Can you tell me what is in this can please?,[],['^TAKE Three 1^']


In [242]:
# Show one randomly chosen row of the joined training frame.
# NOTE(review): `vizwiz_train` is only assigned in a later cell of this notebook
# (the one calling join_feature_target), so this cell fails on a fresh
# Restart & Run All -- consider moving it below that cell.
vizwiz_train.sample(1)

Unnamed: 0_level_0,IMG,QSN,TXT,OBJ,COL,CNT,OTH,question,ocr_text,handwritten_text,descriptions,tags,dominant_colors
QID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
VizWiz_train_000000009759.jpg,VizWiz_train_000000009759.jpg,what's this?,0,1,0,0,0,what's this?,[],[],"[{'text': 'a close up of a computer', 'confide...","['sitting', 'man', 'computer', 'black', 'stand...","['Grey', 'White']"


In [290]:
def join_feature_target(feature_df_text, feature_df_color, target_df):
    """Join text features, color features and skill labels on the question id.

    Parameters
    ----------
    feature_df_text : DataFrame with a `qid` column plus the text-recognition columns.
    feature_df_color : DataFrame with a `qid` column and at least the
        `descriptions`, `tags` and `dominant_colors` columns.
    target_df : DataFrame with the columns
        `QID, IMG, QSN, TXT, OBJ, COL, CNT, OTH`.

    Returns
    -------
    DataFrame indexed by `QID` containing the target columns (all cast to `str`)
    followed by the joined feature columns. Only question ids present in both
    the targets and the features are kept (inner join).

    The input frames are not modified.
    """
    # rename()/set_index() return new frames, so no explicit copies are required
    # to keep the caller's data untouched.
    feature_text = feature_df_text.rename({'qid': 'QID'}, axis=1).set_index('QID')
    feature_color = feature_df_color.rename({'qid': 'QID'}, axis=1).set_index('QID')

    # Outer join: keep a question even when only one of the two feature files has it.
    features = feature_text.join(feature_color[['descriptions', 'tags', 'dominant_colors']],
                                 on='QID',
                                 how='outer')

    # Labels are cast to str here; downstream code converts them back as needed.
    target = (target_df[['QID', 'IMG', 'QSN', 'TXT', 'OBJ', 'COL', 'CNT', 'OTH']]
              .set_index('QID')
              .astype(dtype=str))

    # Inner join: drop questions that have labels but no features (and vice versa).
    return target.join(features, on='QID', how='inner')

def lem(s):
    """Lemmatize every space-separated token of `s` with WordNet.

    Each lemma is followed by a single space, so the returned string always
    ends with a space (an empty input yields " ").
    """
    lemmatizer = WordNetLemmatizer()
    return "".join(lemmatizer.lemmatize(token) + ' ' for token in s.split(" "))

def preprocess_text(feature_columns):
    """ output an nparray with single document per data point

    Every cell of a row is converted to `str`, stripped of punctuation,
    lower-cased, lemmatized with `lem` and concatenated (in column order)
    into one document string.

    Parameters
    ----------
    feature_columns : DataFrame whose second column (position 1) holds the
        image descriptions; only that column gets the extra clean-up of
        `confidence <digits>` fragments and of the substring `text`.

    Returns
    -------
    numpy array with one document string per input row.
    """
    # Characters to delete. The backslash is escaped explicitly: a bare "\|" is an
    # invalid escape sequence and triggers a warning on current Python versions.
    strip_chars = "!@#$%^&*()[]{};:,./<>?\\|`~-=_+'"
    # Built once instead of once per cell.
    strip_table = str.maketrans('', '', strip_chars)

    docs = []
    # The values are only read, never modified, so no defensive copy is needed.
    for row in feature_columns.values:
        parts = []
        for j, cell in enumerate(row):
            # clean up chars
            s = str(cell).translate(strip_table).lower() + " "
            if j == 1:             # clean descriptions
                s = re.sub(r'confidence\s+\d+', '', s)
                # NOTE(review): this removes every occurrence of "text", including
                # inside longer words -- confirm that is intended.
                s = re.sub(r'text', '', s)
            # lexicon normalize
            parts.append(lem(s))
        docs.append("".join(parts))
    return np.asarray(docs)

In [291]:
# Join labels with text and color features for the VizWiz train and validation splits.
vizwiz_train = join_feature_target(feature_df_text=vizwiz_features_train_text,
                                   feature_df_color=vizwiz_features_train_color,
                                   target_df=vizwiz_targets_train)
vizwiz_val = join_feature_target(feature_df_text=vizwiz_features_val_text,
                                 feature_df_color=vizwiz_features_val_color,
                                 target_df=vizwiz_targets_val)

In [292]:
# create X and Y
# Columns merged into one document per question. `descriptions` must stay in
# second position: preprocess_text applies its description clean-up to column 1.
text_feature_columns = ['QSN',
                        'descriptions', 'tags', 'dominant_colors',
                        'handwritten_text', 'ocr_text']
skill_label_columns = ("TXT", "OBJ", "COL", "CNT")

# training split: documents and one label array per skill
features_train = preprocess_text(vizwiz_train[text_feature_columns])
txt_train, obj_train, col_train, cnt_train = (
    vizwiz_train[label].values for label in skill_label_columns)

# validation split: same layout
features_val = preprocess_text(vizwiz_val[text_feature_columns])
txt_val, obj_val, col_val, cnt_val = (
    vizwiz_val[label].values for label in skill_label_columns)

In [297]:
# Show one random training document. randrange() excludes the upper bound, so the
# index is always valid, and it is sized by the array actually being indexed.
features_train[random.randrange(len(features_train))]

'is it possible to tell the ingredient of this pizza and the brand  thanks   a pizza sitting on top of a table   indoor table sitting food pizza plate red phone eating cat laying  brown red  stone tombsto original    '

In [108]:
# Show one random validation document. randrange() excludes the upper bound,
# whereas randint(0, len(...)) could return len(...) and raise IndexError.
features_val[random.randrange(len(features_val))]



In [298]:
# tokenize
# Fit the word index on the training documents only; num_words=VOCAB_SIZE caps
# the vocabulary used when texts are converted to integer sequences.
tok = Tokenizer(split=" ",
                lower=True,
                filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                num_words=VOCAB_SIZE)
tok.fit_on_texts(features_train)

# Integer-encode each split and pad/truncate every document to MAX_DOC_LEN tokens.
train_seq = sequence.pad_sequences(tok.texts_to_sequences(features_train),
                                   maxlen=MAX_DOC_LEN)
val_seq = sequence.pad_sequences(tok.texts_to_sequences(features_val),
                                 maxlen=MAX_DOC_LEN)

In [299]:
# check class distribution
# One-hot encode the text- and color-recognition labels as float32.
text_recognition_labels, color_recognition_labels = (
    to_categorical(np.asarray(raw_labels)).astype('float32')
    for raw_labels in (txt_train, col_train))

# Column sums of a one-hot matrix are the per-class sample counts.
print('Number of samples each class - Vizwiz - train')
for title, one_hot in (('Text recognition', text_recognition_labels),
                       ('Color recognition', color_recognition_labels)):
    print(title, one_hot.sum(axis=0))

# Every skill is a binary (absent / present) classification problem.
n_classes = 2

Number of samples each class - Vizwiz - train
Text recognition [6247. 8010.]
Color recognition [8844. 5413.]


## LSTM model (skip-gram word2vec)
config for cpu only

In [255]:
import numpy as np
import pandas as pd
import os
import pickle
import requests
import time
import gzip

os.environ['KERAS_BACKEND'] = 'cntk'

from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.models import Sequential, load_model
from keras import regularizers
from keras.optimizers import SGD
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional
from keras.callbacks import History, CSVLogger
from keras.utils import to_categorical

import nltk 
import gensim
import logging

from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve

MAX_DOC_LEN = 40
VOCAB_SIZE = 3000
EMBEDDING_DIM = 100

#MAX_DOC_LEN = 40
#VOCAB_SIZE = 7000
#EMBEDDING_DIM = 150

In [300]:
# punkt sentence level tokenizer
# One list of alphanumeric word tokens per punkt-detected sentence, across all
# training documents (this is the corpus word2vec is trained on).
sent_lst = [
    [token for token in nltk.tokenize.word_tokenize(sentence) if token.isalnum()]
    for document in features_train
    for sentence in nltk.tokenize.sent_tokenize(document)
]

In [301]:
# Route gensim's training progress messages to the cell output.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# Train word vectors of width EMBEDDING_DIM on the tokenized training sentences.
# min_count=6 drops words seen fewer than 6 times; sg=1 selects the skip-gram variant.
# NOTE(review): `size=` is the keyword used by older gensim releases (newer ones call it
# `vector_size`) -- confirm the installed version before re-running.
# NOTE(review): no seed is set, so the learned vectors presumably differ between runs.
word2vec_model = gensim.models.Word2Vec(sentences=sent_lst,
                                        min_count=6,
                                        size=EMBEDDING_DIM,
                                        sg=1,
                                        workers=os.cpu_count())

2019-02-26 22:45:30,009 : INFO : collecting all words and their counts
2019-02-26 22:45:30,009 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-02-26 22:45:30,072 : INFO : PROGRESS: at sentence #10000, processed 383294 words, keeping 24880 word types
2019-02-26 22:45:30,099 : INFO : collected 31762 word types from a corpus of 541792 raw words and 14257 sentences
2019-02-26 22:45:30,100 : INFO : Loading a fresh vocabulary
2019-02-26 22:45:30,126 : INFO : effective_min_count=6 retains 3565 unique words (11% of original 31762, drops 28197)
2019-02-26 22:45:30,126 : INFO : effective_min_count=6 leaves 501856 word corpus (92% of original 541792, drops 39936)
2019-02-26 22:45:30,135 : INFO : deleting the raw counts dictionary of 31762 items
2019-02-26 22:45:30,136 : INFO : sample=0.001 downsamples 67 most-common words
2019-02-26 22:45:30,136 : INFO : downsampling leaves estimated 318802 word corpus (63.5% of prior 501856)
2019-02-26 22:45:30,143 : INFO : estima

In [302]:
# word -> float32 vector for every word kept in the word2vec vocabulary
embeddings_index = {
    word: np.asarray(word2vec_model.wv[word], dtype='float32')
    for word in word2vec_model.wv.vocab
}

print('Total %s word vectors' % len(embeddings_index))

# Initial word embedding: row i holds the vector of the tokenizer's word with index i.
# Rows stay all-zero for words without a word2vec vector.
embedding_matrix = np.zeros((VOCAB_SIZE, EMBEDDING_DIM))

for word, i in tok.word_index.items():
    if i >= VOCAB_SIZE:
        # index falls outside the matrix
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        continue
    embedding_matrix[i] = embedding_vector

Total 3565 word vectors


In [303]:
def lstm_create_train(labels, lstm_dim, batch_size, num_epochs, optimizer_param, regularization=1e-7,
                      fname="lstm"):
    """Build a bidirectional-LSTM classifier, train it and save the artifacts.

    The network is Embedding -> Bidirectional(LSTM) -> Dropout(0.2) -> Dense(softmax).
    It reads the notebook-level names `train_seq` (training inputs),
    `embedding_matrix` (initial embedding weights), `n_classes`, `VOCAB_SIZE`,
    `EMBEDDING_DIM` and `MAX_DOC_LEN`.

    Parameters
    ----------
    labels : array of training labels, cast to float32 before fitting.
    lstm_dim : number of units of the LSTM layer.
    batch_size : mini-batch size passed to `model.fit`.
    num_epochs : number of training epochs.
    optimizer_param : Keras optimizer (instance or name) passed to `compile`.
    regularization : L2 factor applied to the embedding, LSTM kernel and dense kernel.
    fname : prefix of the output file names; defaults to "lstm" (the previous
        hard-coded value). Pass a distinct prefix per task to keep runs that share
        a regularization value from overwriting each other's files.

    Side effects
    ------------
    Creates `./LSTM` if needed and writes there, with `<fname>_<regularization>`
    as stem: `.log` (per-epoch CSV log, appended to across runs), `_model.h5`,
    `_time.txt` (regularization and training time in hours) and `_history.txt`.
    Returns None.
    """
    l2_reg = regularizers.l2(regularization)

    # The logger and the save calls below need the directory to exist already.
    output_dir = './LSTM'
    os.makedirs(output_dir, exist_ok=True)
    run_prefix = '{0}/{1}_{2}'.format(output_dir, fname, regularization)

    # init model
    embedding_layer = Embedding(VOCAB_SIZE,
                                EMBEDDING_DIM,
                                input_length=MAX_DOC_LEN,
                                trainable=True,   # pretrained vectors are fine-tuned
                                mask_zero=False,
                                embeddings_regularizer=l2_reg,
                                weights=[embedding_matrix])
    lstm_layer = LSTM(units=lstm_dim, kernel_regularizer=l2_reg)
    dense_layer = Dense(n_classes, activation='softmax', kernel_regularizer=l2_reg)

    model = Sequential()
    model.add(embedding_layer)
    model.add(Bidirectional(lstm_layer))
    model.add(Dropout(0.2))  # todo
    model.add(dense_layer)

    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer_param,
                  metrics=['acc'])

    history = History()
    # append=True: repeated runs with the same prefix accumulate in one log file.
    csv_logger = CSVLogger(run_prefix + '.log',
                           separator=',',
                           append=True)
    t1 = time.time()
    # model fit
    model.fit(train_seq,
              labels.astype('float32'),
              batch_size=batch_size,
              epochs=num_epochs,
              callbacks=[history, csv_logger],
              verbose=2)
    t2 = time.time()
    # save h5
    model.save(run_prefix + '_model.h5')
    # second value is the wall-clock training time in hours
    np.savetxt(run_prefix + '_time.txt',
               [regularization, (t2-t1) / 3600])
    with open(run_prefix + '_history.txt', "w") as res_file:
        res_file.write(str(history.history))

In [None]:
# Train the text-recognition classifier; artifacts are written under ./LSTM with the
# regularization value (1e-8) in their file names.
# NOTE(review): nesterov=True is passed without a momentum argument; presumably it has no
# effect at the optimizer's default momentum -- confirm against the Keras SGD docs.
lstm_create_train(labels=text_recognition_labels,
                  lstm_dim=100, 
                  batch_size=60,
                  num_epochs=80, 
                  optimizer_param=SGD(lr=0.06, nesterov=True),
                  regularization=1e-8)

Epoch 1/80
 - 14s - loss: 0.5271 - acc: 0.7457
Epoch 2/80
 - 14s - loss: 0.4520 - acc: 0.7934
Epoch 3/80
 - 14s - loss: 0.4299 - acc: 0.8062
Epoch 4/80
 - 15s - loss: 0.4132 - acc: 0.8157
Epoch 5/80
 - 15s - loss: 0.4032 - acc: 0.8249
Epoch 6/80
 - 15s - loss: 0.3910 - acc: 0.8317
Epoch 7/80
 - 15s - loss: 0.3867 - acc: 0.8316
Epoch 8/80
 - 15s - loss: 0.3776 - acc: 0.8374
Epoch 9/80
 - 15s - loss: 0.3740 - acc: 0.8410
Epoch 10/80
 - 15s - loss: 0.3706 - acc: 0.8419
Epoch 11/80
 - 15s - loss: 0.3627 - acc: 0.8504
Epoch 12/80
 - 15s - loss: 0.3628 - acc: 0.8470
Epoch 13/80
 - 15s - loss: 0.3565 - acc: 0.8502
Epoch 14/80
 - 15s - loss: 0.3538 - acc: 0.8538
Epoch 15/80
 - 16s - loss: 0.3519 - acc: 0.8551
Epoch 16/80
 - 15s - loss: 0.3514 - acc: 0.8548
Epoch 17/80
 - 15s - loss: 0.3483 - acc: 0.8564
Epoch 18/80
 - 15s - loss: 0.3461 - acc: 0.8583
Epoch 19/80
 - 15s - loss: 0.3451 - acc: 0.8599
Epoch 20/80
 - 16s - loss: 0.3425 - acc: 0.8616
Epoch 21/80
 - 15s - loss: 0.3415 - acc: 0.8612
E

In [280]:
# lemmatization
lstm_create_train(labels=text_recognition_labels,
                  lstm_dim=100, 
                  batch_size=60,
                  num_epochs=80, 
                  optimizer_param=SGD(lr=0.06, nesterov=True),
                  regularization=1e-8)

Epoch 1/80
 - 14s - loss: 0.5209 - acc: 0.7502
Epoch 2/80
 - 14s - loss: 0.4539 - acc: 0.7946
Epoch 3/80
 - 14s - loss: 0.4314 - acc: 0.8098
Epoch 4/80
 - 14s - loss: 0.4201 - acc: 0.8127
Epoch 5/80
 - 14s - loss: 0.4067 - acc: 0.8211
Epoch 6/80
 - 14s - loss: 0.3972 - acc: 0.8267
Epoch 7/80
 - 14s - loss: 0.3925 - acc: 0.8312
Epoch 8/80
 - 14s - loss: 0.3863 - acc: 0.8345
Epoch 9/80
 - 14s - loss: 0.3792 - acc: 0.8366
Epoch 10/80
 - 14s - loss: 0.3787 - acc: 0.8401
Epoch 11/80
 - 14s - loss: 0.3725 - acc: 0.8436
Epoch 12/80
 - 14s - loss: 0.3658 - acc: 0.8456
Epoch 13/80
 - 14s - loss: 0.3628 - acc: 0.8458
Epoch 14/80
 - 14s - loss: 0.3584 - acc: 0.8502
Epoch 15/80
 - 14s - loss: 0.3623 - acc: 0.8490
Epoch 16/80
 - 14s - loss: 0.3549 - acc: 0.8530
Epoch 17/80
 - 14s - loss: 0.3542 - acc: 0.8540
Epoch 18/80
 - 14s - loss: 0.3510 - acc: 0.8554
Epoch 19/80
 - 14s - loss: 0.3489 - acc: 0.8572
Epoch 20/80
 - 14s - loss: 0.3464 - acc: 0.8557
Epoch 21/80
 - 14s - loss: 0.3435 - acc: 0.8576
E

In [None]:
# Evaluate the text-recognition model on the validation split.
# The text-recognition runs above save their model with regularization=1e-8, so that
# is the file to load; the 1e-3 file is written by the color-recognition runs.
model = load_model('./LSTM/lstm_{0}_model.h5'.format(1e-8))
preds = model.predict(val_seq, verbose=0)
# Labels were cast to str when features and targets were joined; convert them back
# to integers so the metric receives numeric binary labels.
roc_auc_score(txt_val.astype(int), preds[:,1])
#print(("Accuracy = {0} \t AUC = {1}".format(accuracy_score(txt_val, preds.argmax(axis=1)), 
#       roc_auc_score(txt_val, preds[:,1]))))

## Color recognition

In [None]:
# Color-recognition run: batch size 50, 30 epochs, learning rate 0.05.
# NOTE(review): all color runs in this notebook use regularization=1e-3 and output file
# names depend only on that value, so each run overwrites the previous model/history
# files and appends to the same log.
lstm_create_train(labels=color_recognition_labels,
                  lstm_dim=100, 
                  batch_size=50,
                  num_epochs=30, 
                  optimizer_param=SGD(lr=0.05, nesterov=True),
                  regularization=1e-3)

In [None]:
# Color-recognition run: batch size 100, 30 epochs, learning rate 0.05.
# NOTE(review): writes to the same ./LSTM files as the other regularization=1e-3 runs.
lstm_create_train(labels=color_recognition_labels,
                  lstm_dim=100, 
                  batch_size=100,
                  num_epochs=30, 
                  optimizer_param=SGD(lr=0.05, nesterov=True),
                  regularization=1e-3)

In [None]:
# Color-recognition run: batch size 100, 50 epochs, learning rate 0.03.
# NOTE(review): writes to the same ./LSTM files as the other regularization=1e-3 runs.
lstm_create_train(labels=color_recognition_labels,
                  lstm_dim=100, 
                  batch_size=100,
                  num_epochs=50, 
                  optimizer_param=SGD(lr=0.03, nesterov=True),
                  regularization=1e-3)

In [None]:
# Color-recognition run: batch size 200, 30 epochs, learning rate 0.02.
# NOTE(review): writes to the same ./LSTM files as the other regularization=1e-3 runs.
lstm_create_train(labels=color_recognition_labels,
                  lstm_dim=100, 
                  batch_size=200,
                  num_epochs=30, 
                  optimizer_param=SGD(lr=0.02, nesterov=True),
                  regularization=1e-3)