In [5]:
from tensorflow.python.client import device_lib
import tensorflow as tf
from skill_label_classifier import *

In [None]:
experiment = SkillClassifier()
experiment.import_data()
experiment.create_df()

experiment.choose_dataset('vizwiz')
# experiment.set_features(['QSN', 'descriptions', 'tags', 'dominant_colors','handwritten_text', 'ocr_text'])
experiment.set_features(['descriptions', 'tags', 'dominant_colors', 'handwritten_text', 'ocr_text'])
experiment.set_targets()

features_train = experiment.features_train
features_val   = experiment.features_val

# check class distribution
text_recognition_labels = to_categorical(np.asarray(experiment.txt_train)).astype('float32')
color_recognition_labels = to_categorical(np.asarray(experiment.col_train)).astype('float32')
print('Number of samples each class: ')
print('Text recognition', text_recognition_labels.sum(axis=0))
print('Color recognition', color_recognition_labels.sum(axis=0))
n_classes = 2

# get targets
txt_train      = experiment.txt_train
col_train      = experiment.col_train
txt_val        = experiment.txt_val
col_val        = experiment.col_val

In [7]:
# tokenize
tok        = Tokenizer(num_words=VOCAB_SIZE, 
                       filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                       lower=True,
                       split=" ")
tok.fit_on_texts(features_train)

# create sequences & pad
train_seq  = tok.texts_to_sequences(features_train)
train_seq  = sequence.pad_sequences(train_seq, maxlen=MAX_DOC_LEN)
val_seq    = tok.texts_to_sequences(features_val)
val_seq    = sequence.pad_sequences(val_seq, maxlen=MAX_DOC_LEN)

# punkt sentence level tokenizer
sent_lst = [] 
for doc in features_train:
    sentences = nltk.tokenize.sent_tokenize(doc)
    for sent in sentences:
        word_lst = [w for w in nltk.tokenize.word_tokenize(sent) if w.isalnum()]
        sent_lst.append(word_lst)

# word2vec
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
word2vec_model = gensim.models.Word2Vec(sentences=sent_lst,
                                        min_count=6,
                                        size=EMBEDDING_DIM,
                                        sg=1,
                                        workers=os.cpu_count())

embeddings_index = {}
for word in word2vec_model.wv.vocab:
    coefs = np.asarray(word2vec_model.wv[word], dtype='float32')
    embeddings_index[word] = coefs
print('Total %s word vectors' % len(embeddings_index))

# Initial word embedding
embedding_matrix = np.zeros((VOCAB_SIZE, EMBEDDING_DIM))

for word, i in tok.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None and i < VOCAB_SIZE:
        embedding_matrix[i] = embedding_vector

2019-03-07 00:43:30,214 : INFO : collecting all words and their counts
2019-03-07 00:43:30,215 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-03-07 00:43:30,274 : INFO : PROGRESS: at sentence #10000, processed 346668 words, keeping 32892 word types
2019-03-07 00:43:30,299 : INFO : collected 43239 word types from a corpus of 489472 raw words and 14257 sentences
2019-03-07 00:43:30,299 : INFO : Loading a fresh vocabulary
2019-03-07 00:43:30,324 : INFO : effective_min_count=6 retains 3312 unique words (7% of original 43239, drops 39927)
2019-03-07 00:43:30,324 : INFO : effective_min_count=6 leaves 437841 word corpus (89% of original 489472, drops 51631)
2019-03-07 00:43:30,334 : INFO : deleting the raw counts dictionary of 43239 items
2019-03-07 00:43:30,335 : INFO : sample=0.001 downsamples 63 most-common words
2019-03-07 00:43:30,336 : INFO : downsampling leaves estimated 265959 word corpus (60.7% of prior 437841)
2019-03-07 00:43:30,343 : INFO : estimat

Total 3312 word vectors


In [None]:
L = 1e-2
R = 0
B = 32
E = 30
start = time.time()
lstm_create_train(train_seq, embedding_matrix,
                 labels=text_recognition_labels, skill='text',
                 learning_rate=L,
                 lstm_dim=100,
                 batch_size=B,
                 num_epochs=E,
                 optimizer_param=SGD(lr=L, nesterov=True),
                  regularization=R)
end = time.time()
model = load_model('./LSTM/text/{}_{}_{}_{}_model.h5'.format(L,R,B,E))
preds = model.predict(val_seq, verbose=0)
print("Learning rate: {} Regularization: {} Batch size: {} Epoch: {}".format(L,R,B,E))
print(("Accuracy = {0} \t AUC = {1}".format(accuracy_score(txt_val, preds.argmax(axis=1)), 
                                                            roc_auc_score(txt_val, preds[:,1]))))
print("----- Total training: {} seconds -----".format(end-start))

Epoch 1/30
 - 46s - loss: 0.6098 - acc: 0.6770
Epoch 2/30
 - 44s - loss: 0.5028 - acc: 0.7667
Epoch 3/30
 - 43s - loss: 0.4723 - acc: 0.7823
Epoch 4/30
 - 43s - loss: 0.4584 - acc: 0.7933
Epoch 5/30
 - 43s - loss: 0.4517 - acc: 0.7956
Epoch 6/30
 - 43s - loss: 0.4457 - acc: 0.7978
Epoch 7/30
 - 44s - loss: 0.4401 - acc: 0.8052
Epoch 8/30
 - 44s - loss: 0.4343 - acc: 0.8063
Epoch 9/30
 - 43s - loss: 0.4338 - acc: 0.8074
Epoch 10/30
 - 43s - loss: 0.4305 - acc: 0.8120
Epoch 11/30
 - 43s - loss: 0.4278 - acc: 0.8093
Epoch 12/30
 - 43s - loss: 0.4250 - acc: 0.8152
Epoch 13/30
 - 44s - loss: 0.4227 - acc: 0.8153
Epoch 14/30
 - 43s - loss: 0.4191 - acc: 0.8180
Epoch 15/30
 - 43s - loss: 0.4189 - acc: 0.8171
Epoch 16/30
 - 43s - loss: 0.4179 - acc: 0.8167
Epoch 17/30
 - 43s - loss: 0.4154 - acc: 0.8188
Epoch 18/30
 - 43s - loss: 0.4112 - acc: 0.8236
Epoch 19/30
 - 44s - loss: 0.4098 - acc: 0.8241
Epoch 20/30
 - 44s - loss: 0.4093 - acc: 0.8207
Epoch 21/30
