In [69]:
import numpy as np
import pandas as pd
import pickle

In [70]:
import glob

data_dir = './MLDS_hw2_data/'
test_out_dir = './test_out_dir/'
peer_out_dir = './peer_out_dir/'

train_feat_list = glob.glob(data_dir+'training_data/feat/*')
test_feat_list = glob.glob(data_dir+'testing_data/feat/*')
print(len(train_feat_list), 'training set,', len(test_feat_list), 'testing set')

1450 training set, 100 testing set


In [71]:
print('loading training features ...')
train_data = list()
for train_feat in train_feat_list:
    train_data.append(np.load(train_feat))
else:
    train_data = np.array(train_data)
    print('train_data.shape:', train_data.shape, 'train_data.dtype:', train_data.dtype)
    print('pickling train_data')
    pickle.dump(train_data, open('train_data.pkl', 'wb'))

loading training features ...
train_data.shape: (1450, 80, 4096) train_data.dtype: float64
pickling train_data


In [72]:
print('loading testing features ...')
test_data = list()
for test_feat in test_feat_list:
    test_data.append(np.load(test_feat))
else:
    test_data = np.array(test_data)
    print('test_data.shape:', test_data.shape, 'test_data.dtype:', test_data.dtype)
    print('pickling test_data')
    pickle.dump(test_data, open('test_data.pkl', 'wb'))

loading testing features ...
test_data.shape: (100, 80, 4096) test_data.dtype: float64
pickling test_data


In [73]:
import pickle
print('loading train_data, test_data from .pkl ...')
train_data = pickle.load(open('train_data.pkl', 'rb'))
test_data = pickle.load(open('test_data.pkl', 'rb'))

loading train_data, test_data from .pkl ...


In [74]:
import json
print('loading training label ...')
train_label_json = json.load(open(data_dir+'training_label.json'))
print('loading training id ...')
training_id = [x.split('/')[-1].replace('.npy','') for x in train_feat_list]

print('loading testing label ...')
test_label_json = json.load(open(data_dir+'testing_label.json'))
print('loading testing id ...')
testing_id = [x.split('/')[-1].replace('.npy','') for x in test_feat_list]

loading training label ...
loading training id ...
loading testing label ...
loading testing id ...


In [75]:
import re
caption_words = set()
caption_dict = {'2':'two', '3':'three', '4':'four', '5':'five', '6':'six'}
for tj in train_label_json + test_label_json:
    for c in tj['caption']:
        c = c.replace('one hundred', '100')
        c = c.replace('two hundred', '200')
        s = [re.sub('[\",.;?!%”“]', '', s.lower()) for s in c.split(' ')]
        
        s[-1] = s[-1].replace('.', '')
        caption_words |= set(s)
else:
    caption_words.add('<BOS>')
    caption_words.add('<EOS>')
    caption_words.add('<NULL>')
    
    caption_words.remove('2')
    caption_words.remove('3')
    caption_words.remove('4')
    caption_words.remove('5')
    caption_words.remove('6')
            
    print('there are', len(caption_words), 'words in training/testing label')

there are 6346 words in training/testing label


In [76]:
from sklearn import preprocessing
print('caption_words label encoding...')
le = preprocessing.LabelEncoder()
le.fit(list(caption_words))
print('encoding', len(le.classes_), 'classes')
#le.transform([1, 1, 2, 6]) 
#le.inverse_transform([0, 0, 1, 2])

caption_words label encoding...
encoding 6346 classes


In [77]:
import copy
train_label_json_split = copy.deepcopy(train_label_json)
for tid in training_id:
    for i, tj in enumerate(train_label_json_split):
        if tid == tj['id']:
            caption = list()
            caption_one_hot = list()
            caption_mask = list()
            for c in tj['caption']:
                c = c.replace('one hundred', '100')
                c = c.replace('two hundred', '200')
                s = [re.sub('[\",.;?!%”“]', '', s.lower()) for s in c.split(' ')]
                
                # 將2, 3, 4, 5, 6用two, three, four, five, six取代
                for si, ss in enumerate(s):
                    if ss in caption_dict:
                        print(ss, 'hit at i:', i, 'replaced by', caption_dict[ss])
                        s[si] = caption_dict[ss]
                        
                s.insert(0, '<BOS>')
                s.append('<EOS>')
                for si in range(len(s), 37):
                    s.append('<NULL>')
                caption.append(s)
                s_one_hot = le.transform(s)
                caption_one_hot.append(s_one_hot)
                caption_mask.append(s_one_hot == le.transform(['<NULL>']))
            else:
                train_label_json_split[i]['caption'] = caption
                train_label_json_split[i]['one_hot'] = caption_one_hot
                train_label_json_split[i]['mask'] = caption_mask

2 hit at i: 1069 replaced by two
2 hit at i: 1449 replaced by two
4 hit at i: 659 replaced by four
4 hit at i: 659 replaced by four
3 hit at i: 25 replaced by three
6 hit at i: 931 replaced by six
5 hit at i: 1429 replaced by five


In [78]:
import copy
test_label_json_split = copy.deepcopy(test_label_json)
for tid in testing_id:
    for i, tj in enumerate(test_label_json_split):
        if tid == tj['id']:
            caption = list()
            caption_one_hot = list()
            caption_mask = list()
            for c in tj['caption']:
                c = c.replace('one hundred', '100')
                c = c.replace('two hundred', '200')
                s = [re.sub('[\",.;?!%”“]', '', s.lower()) for s in c.split(' ')]
                
                # 將2, 3, 4, 5, 6用two, three, four, five, six取代
                for si, ss in enumerate(s):
                    if ss in caption_dict:
                        print(ss, 'hit at i:', i, 'replaced by', caption_dict[ss])
                        s[si] = caption_dict[ss]
                        
                s.insert(0, '<BOS>')
                s.append('<EOS>')
                for si in range(len(s), 37):
                    s.append('<NULL>')
                caption.append(s)
                s_one_hot = le.transform(s)
                caption_one_hot.append(s_one_hot)
                caption_mask.append(s_one_hot == le.transform(['<NULL>']))
            else:
                test_label_json_split[i]['caption'] = caption
                test_label_json_split[i]['one_hot'] = caption_one_hot
                test_label_json_split[i]['mask'] = caption_mask

In [79]:
print('max length of caption is', 37)
#max_ = 0
#for t in train_label_json_split+test_label_json_split:
#    if len(t['caption']) > max_:
#        max_ = len(t['caption'])
#else:
#    print('max length of caption is', max_)

max length of caption is 37


In [80]:
print('encoding phase length:', train_data.shape[0], ' decoding phase length:', 37)
print('pad train_data from', train_data.shape[1:], 'to (', train_data.shape[1]+37, ',', train_data.shape[2],')') 

train_data_padding = list()
for t in train_data:
    train_data_padding.append(np.concatenate([t, np.zeros((37, 4096))]))
else:
    train_data_padding = np.array(train_data_padding)
    print('shape after padding:', train_data_padding.shape)

encoding phase length: 1450  decoding phase length: 37
pad train_data from (80, 4096) to ( 117 , 4096 )
shape after padding: (1450, 117, 4096)


In [81]:
print('encoding phase length:', test_data.shape[0], ' decoding phase length:', 37)
print('pad test_data from', test_data.shape[1:], 'to (', test_data.shape[1]+37, ',', test_data.shape[2],')') 

test_data_padding = list()
for t in test_data:
    test_data_padding.append(np.concatenate([t, np.zeros((37, 4096))]))
else:
    test_data_padding = np.array(test_data_padding)
    print('shape after padding:', test_data_padding.shape)

encoding phase length: 100  decoding phase length: 37
pad test_data from (80, 4096) to ( 117 , 4096 )
shape after padding: (100, 117, 4096)


In [104]:
import random
train_label_padding = list()
train_label_mask = list()

test_label_padding = list()
test_label_mask = list()
def random_pick_gen_label():
    global train_label_padding
    global train_label_mask
    global test_label_padding
    global test_label_mask
#if True:    
    for i, tid in enumerate(training_id):
        for tj in train_label_json_split:
            if tid == tj['id']:
                k = random.randint(0, len(tj['one_hot'])-1)
                
                pad80_label37 = np.concatenate((np.full((80, ), le.transform(['<NULL>'])), tj['one_hot'][k]))
                pad80_label37_bool = np.concatenate((np.full((80, ), False), tj['mask'][k]))
                
                train_label_padding.append(pad80_label37.reshape((117,1)))
                train_label_mask.append(pad80_label37_bool.reshape((117,1)))
    else:
        train_label_padding = np.array(train_label_padding)
        train_label_padding = train_label_padding.reshape((1450, 117, 1))
        train_label_mask = np.array(train_label_mask)
        train_label_mask = train_label_mask.reshape((1450, 117, 1))
        
                

    for i, tid in enumerate(testing_id):
        for tj in test_label_json_split:
            if tid == tj['id']:
                k = random.randint(0, len(tj['one_hot'])-1)
                
                pad80_label37 = np.concatenate((np.full((80, ), le.transform(['<NULL>'])), tj['one_hot'][k]))
                pad80_label37_bool = np.concatenate((np.full((80, ), False), tj['mask'][k]))
                
                test_label_padding.append(pad80_label37.reshape((117,1)))
                test_label_mask.append(pad80_label37_bool.reshape((117,1)))
    else:
        test_label_padding = np.array(test_label_padding)
        test_label_padding = test_label_padding.reshape((100, 117, 1))
        test_label_mask = np.array(test_label_mask)
        test_label_mask = test_label_mask.reshape((100, 117, 1))
        

random_pick_gen_label()
print('train_label_padding.shape:', train_label_padding.shape, 'train_label_mask.shape:', train_label_mask.shape)
print('test_label_padding.shape:', test_label_padding.shape, 'test_label_mask.shape:', test_label_mask.shape)


train_label_padding.shape: (1450, 117, 1) train_label_mask.shape: (1450, 117, 1)
test_label_padding.shape: (100, 117, 1) test_label_mask.shape: (100, 117, 1)


In [139]:
import keras

import numpy as np
import keras.backend as K 
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense,Dropout
from keras.layers import GRU
from keras.layers import LSTM, Bidirectional
from keras.layers.embeddings import Embedding
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint

from keras import metrics
from sklearn.model_selection import StratifiedKFold

from pylab import *

Using TensorFlow backend.


In [None]:
model = Sequential()
model.add(LSTM(256, return_sequences=True, activation='sigmoid', dropout=0.25), input_shape=(sample_length, dim))
model.add(LSTM(256, return_sequences=True, activation='sigmoid', dropout=0.25), input_shape=(sample_length, dim))
                     
            model.add(Dense(256,activation='sigmoid'))
            model.add(Dropout(0.25))
            model.add(Dense(64,activation='sigmoid'))
            model.add(Dropout(0.25))
            model.add(Dense(48,activation='softmax'))
            model.summary()  