# Features loading

In [11]:
import numpy as np
import random
import pickle
import math
random.seed(1)
import time
import pandas as pd
import h5py
import os

In [2]:
# Load the tag table: every non-empty line of label.txt is "<int id> <tag>".
tag_dict = {}
with open("label.txt") as f:
    for line in f:
        line = line.rstrip('\n')
        if not line:
            # Tolerate blank lines (e.g. a trailing empty line at end of file)
            # instead of failing on the tuple unpacking below.
            continue
        # Split on the first space only, so the integer id is separated from
        # the remainder of the line even if that remainder contains a space.
        key, val = line.split(' ', 1)
        tag_dict[int(key)] = val

In [3]:
# Reverse lookup of tag_dict: tag string -> integer id.
tag2index = {tag: idx for idx, tag in tag_dict.items()}

In [4]:
# Load the word table: every non-empty line of vocab.txt is "<int id> <word>".
vocab_dict = {}
with open("vocab.txt") as f:
    for line in f:
        line = line.rstrip('\n')
        if not line:
            # Tolerate blank lines (e.g. a trailing empty line at end of file)
            # instead of failing on the tuple unpacking below.
            continue
        # Split on the first space only, so the integer id is separated from
        # the remainder of the line even if that remainder contains a space.
        key, val = line.split(' ', 1)
        vocab_dict[int(key)] = val

In [5]:
# Reverse lookup of vocab_dict: word string -> integer id.
vocab2index = {word: idx for idx, word in vocab_dict.items()}

In [9]:
### image feature
# Open the image-feature store read-only. The batch generators below look up
# one entry per post id with h5.get(<id>). The handle stays open for the whole
# run because the generators read from it lazily.
# NOTE(review): presumably each entry is a 7x7x512 CNN feature map (that is the
# image input shape the model declares) — confirm against the file.
h5 = h5py.File('insta_imgFeat__.h5', 'r')

In [10]:
from keras.models import Model
from keras.layers import Input, Reshape, Dense, Embedding, Dropout, LSTM, Lambda, Concatenate, \
    Multiply, RepeatVector, Permute, Flatten, Activation
import keras.backend as K
from keras import optimizers

from selfDef import myLossFunc, Attention, coAttention_para, zero_padding, tagOffSet

# ---- Global hyper-parameters / dataset constants used by the cells below ----
num_tags = len(tag_dict)    # number of entries read from label.txt; width of the multi-hot label vectors
num_words = len(vocab_dict)    # number of entries read from vocab.txt
# Offsets added to the vocabulary sizes when sizing the Embedding layers.
# NOTE(review): presumably they reserve the low indices (padding / special
# tokens) — confirm against the preprocessing code.
index_from_text = 3
index_from_tag = 2
seq_length = 328    # max length of text sequence
batch_size = 5    # samples per training batch (batchMaker)
embedding_size = 300    # width of the embeddings, LSTM units and dense projections
attention_size = 200    # argument passed to the Attention layer
dim_k = 100    # argument passed to the coAttention_para layer
num_region = 7*7    # image regions: the 7x7 feature grid flattened by the Reshape layer
drop_rate = 0.75    # rate passed to the Dropout layer in front of the output layer
maxTagLen = 186    # max length of tag sequence
num_epoch = 10    # number of train-then-evaluate rounds in the main loop
numHist = 1    # historical posts number for each user
numTestInst = 1280    # if you're going to use predict_generator, modify this parameter as your testSet size.
testBatchSize = 11    # samples per evaluation batch (batchMakerTest)

In [12]:
# orig set : post_id, words, hashtags, user_id, post_num, follower, following, like_num, comment_num
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load these .pkl files when they come from a trusted source.
# The files are opened with `with` so the handles are closed right after
# loading instead of being left to the garbage collector.
with open("trainSet2_20_%d.pkl"%numHist, "rb") as train_file:
    text_train, tag_train, ids_train, user_train, demo_train = pickle.load(train_file)
text_train = zero_padding(text_train, seq_length)

with open("testSet2_20_%d.pkl"%numHist, "rb") as test_file:
    text_test, tag_test, ids_test, user_test, demo_test = pickle.load(test_file)
text_test = zero_padding(text_test, seq_length)
tag_test = list(tag_test)

# Multi-hot ground truth for evaluation. The evaluation generator only yields
# complete batches of testBatchSize, so the trailing
# (numTestInst % testBatchSize) posts are never predicted and are left out
# here as well.
# NOTE(review): this row count matches the number of predictions only when
# len(tag_test) == numTestInst and no test post is missing from the image
# feature file — confirm for your dataset.
tmp_test_tag = []
for index in range(len(tag_test)-(numTestInst % testBatchSize)):
    tmpArray = np.zeros(num_tags)
    tmpArray[np.array(tag_test[index], dtype=np.int32)] = 1
    tmp_test_tag.append(tmpArray)
test_tag = np.array(tmp_test_tag)

# DemoHash model

In [13]:
def batchMaker(texts, tags, ids, users, demo):
    """Yield shuffled training batches forever, reshuffling after each pass.

    Each yielded item is ``([images, texts, demographics], tags)``: the
    three model inputs followed by the multi-hot target matrix of width
    ``num_tags``. Batches are cut from a random permutation of the samples;
    samples left over after the last complete batch of ``batch_size`` are
    dropped for that pass, then a new permutation is drawn.

    Args:
        texts: array of padded word-id sequences, one row per post
            (``texts.shape[0]`` is the number of posts).
        tags: per-post collections of tag indices (each index < num_tags).
        ids: per-post keys used to look up image features in the global
            ``h5`` store.
        users: accepted for interface compatibility; not used.
        demo: per-post numeric feature vectors.

    Yields:
        ``([img_batch, text_batch, demo_batch], tag_batch)``. Posts whose
        image feature is missing from ``h5`` are skipped, so a batch can
        hold fewer than ``batch_size`` rows.

    Raises:
        ValueError: if there are fewer posts than ``batch_size`` (no batch
            could ever be produced; previously this spun forever).
    """
    shape = texts.shape[0]
    if shape < batch_size:
        raise ValueError(
            "batchMaker needs at least batch_size=%d samples, got %d" % (batch_size, shape))

    # Index through a permutation instead of physically reordering copies of
    # every input. This also avoids rebuilding an array from a list of
    # variable-length tag lists on each reshuffle, which recent NumPy rejects.
    ids = np.array(ids)
    order = np.arange(shape)
    np.random.shuffle(order)

    i = 0
    while True:
        if i + batch_size > shape:
            # Pass finished: draw a fresh permutation and start over.
            order = np.arange(shape)
            np.random.shuffle(order)
            i = 0
            continue

        img_rows = []
        text_rows = []
        tag_rows = []
        demo_rows = []
        for row in order[i:i + batch_size]:
            np_data = np.array(h5.get(ids[row]))
            if np_data.shape == ():
                # 0-d result: no usable image feature for this id (e.g. the
                # lookup returned None) -> skip the whole sample.
                continue
            img_rows.append(np_data)
            text_rows.append(np.array(texts[row], dtype=np.int32))
            one_hot = np.zeros(num_tags)
            one_hot[np.array(tags[row], dtype=np.int32)] = 1
            tag_rows.append(one_hot)
            demo_rows.append(np.array(demo[row], dtype=np.float32))

        img_batch = np.array(img_rows)
        # Remove singleton axes of the stored features but never the batch
        # axis: a plain np.squeeze would also drop the batch dimension when
        # only one sample of the batch survived the filter above.
        extra_axes = tuple(ax for ax in range(1, img_batch.ndim) if img_batch.shape[ax] == 1)
        if extra_axes:
            img_batch = np.squeeze(img_batch, axis=extra_axes)

        yield [img_batch, np.array(text_rows), np.array(demo_rows)], np.array(tag_rows)
        i += batch_size

In [14]:
def batchMakerTest(texts, tags, ids, users, demo):
    """Yield evaluation batches in the original sample order, cycling forever.

    Each yielded item is ``([images, texts, demographics], tags)``, built
    exactly like the training batches but without shuffling. Only complete
    batches of ``testBatchSize`` are produced; the trailing remainder is
    skipped and iteration restarts from the first sample.

    Args:
        texts: array of padded word-id sequences, one row per post
            (``texts.shape[0]`` is the number of posts).
        tags: per-post collections of tag indices (each index < num_tags).
        ids: per-post keys used to look up image features in the global
            ``h5`` store.
        users: accepted for interface compatibility; not used.
        demo: per-post numeric feature vectors.

    Yields:
        ``([img_batch, text_batch, demo_batch], tag_batch)``. Posts whose
        image feature is missing from ``h5`` are skipped, so a batch can
        hold fewer than ``testBatchSize`` rows.
        NOTE(review): a skipped post shifts every later prediction relative
        to a ground-truth matrix built independently of this generator.

    Raises:
        ValueError: if there are fewer posts than ``testBatchSize`` (no
            batch could ever be produced; previously this spun forever).
    """
    shape = texts.shape[0]
    if shape < testBatchSize:
        raise ValueError(
            "batchMakerTest needs at least testBatchSize=%d samples, got %d" % (testBatchSize, shape))

    i = 0
    while True:
        if i + testBatchSize > shape:
            # Not enough samples left for a complete batch: wrap around.
            i = 0
            continue

        img_rows = []
        text_rows = []
        tag_rows = []
        demo_rows = []
        for index in range(i, i + testBatchSize):
            np_data = np.array(h5.get(ids[index]))
            if np_data.shape == ():
                # 0-d result: no usable image feature for this id (e.g. the
                # lookup returned None) -> skip the whole sample.
                continue
            img_rows.append(np_data)
            text_rows.append(np.array(texts[index], dtype=np.int32))
            one_hot = np.zeros(num_tags)
            one_hot[np.array(tags[index], dtype=np.int32)] = 1
            tag_rows.append(one_hot)
            demo_rows.append(np.array(demo[index], dtype=np.float32))

        img_batch = np.array(img_rows)
        # Remove singleton axes of the stored features but never the batch
        # axis: a plain np.squeeze would also drop the batch dimension when
        # only one sample of the batch survived the filter above.
        extra_axes = tuple(ax for ax in range(1, img_batch.ndim) if img_batch.shape[ax] == 1)
        if extra_axes:
            img_batch = np.squeeze(img_batch, axis=extra_axes)

        yield [img_batch, np.array(text_rows), np.array(demo_rows)], np.array(tag_rows)
        i += testBatchSize


In [15]:
def modelDef():
    """Build and compile the Keras model.

    Inputs (in this order): an image feature map of shape (7, 7, 512), a
    padded text sequence of length ``seq_length`` and a 133-dimensional
    demographic/numeric vector. The single output is a softmax over
    ``num_tags``. The model is compiled with the Adam optimizer and the
    project-defined ``myLossFunc``.

    Returns:
        The compiled ``keras.models.Model``.
    """
    inputs_img = Input(shape=(7, 7, 512))
    inputs_text = Input(shape=(seq_length,))
    inputs_demo = Input(shape=(133,))

    # shared layers
    # NOTE(review): tagEmbeddings, tag_att and dense_numeric are instantiated
    # here but never applied to any tensor in this function, so they are not
    # part of the returned model.
    tagEmbeddings = Embedding(input_dim=num_tags + index_from_tag, output_dim=embedding_size,
                              mask_zero=True, input_length=maxTagLen)
    textEmbeddings = Embedding(input_dim=num_words + index_from_text, output_dim=embedding_size,
                               mask_zero=True, input_length=seq_length)
    lstm = LSTM(units=embedding_size, return_sequences=True)
    dense = Dense(embedding_size, activation="tanh", use_bias=False)
    # Flattens the 7x7 grid into num_region rows of 512 features each.
    reshape = Reshape(target_shape=(num_region, 512))
    coAtt_layer = coAttention_para(dim_k=dim_k)
    tag_att = Attention(attention_size)  
    # Numerical value projection (unused, see note above)
    dense_numeric = Dense(32, activation = 'relu')

    # query post representation
    text_embeddings = textEmbeddings(inputs_text)
    tFeature = lstm(text_embeddings)    # one embedding_size vector per time step (return_sequences=True)
    iFeature = reshape(inputs_img)
    iFeature = dense(iFeature)    # project every image region to embedding_size

    # Project the demographic vector into the same embedding_size space.
    demoFeature = Dense(embedding_size, activation='tanh')(inputs_demo)
    
    # Fuse text and image features with the project-defined co-attention layer.
    # NOTE(review): the output shape of coAttention_para is not visible here;
    # presumably (batch, embedding_size) — confirm in selfDef.
    co_feature = coAtt_layer([tFeature, iFeature])
    
    # calculating similarity between demoFeature and co_feature of the post
    sim = Multiply()([demoFeature, co_feature])
    att = Dense(1, activation='tanh')(sim)
    att = Flatten()(att)
    
    # compute the weights of each demographic feature 
    attention = Activation('softmax')(att)
    
    # Repeat the weights embedding_size times, then swap the two non-batch axes
    # so the weights can be multiplied element-wise with sim below.
    attention = RepeatVector(embedding_size)(attention)
    attention = Permute([2, 1])(attention)  
    
    # Multiply similarity and weights (weighted sum)
    influence = Multiply()([sim, attention])
    influence = Lambda(lambda x: K.sum(x, axis=1))(influence)
    # the most important demographic feature of using hashtag
    influence = Dense(embedding_size)(influence)
    
    # Final representation: co-attended post feature concatenated with the
    # demographic influence vector, regularised with dropout.
    h = Concatenate()([co_feature, influence])
    dropout = Dropout(drop_rate)(h)
    
    Softmax = Dense(num_tags, activation="softmax", use_bias=True)(dropout)
    model = Model(inputs=[inputs_img, inputs_text, inputs_demo],
                  outputs=[Softmax])

    model.compile(optimizer="adam", loss=myLossFunc)
    return model

In [16]:
def evaluator(y_true, y_pred, top_K):
    """Compute top-K accuracy, precision, recall and F1 averaged over samples.

    For every row of ``y_pred`` the ``top_K`` highest-scoring columns are
    taken as the predicted tags and compared with the multi-hot row of
    ``y_true`` at the same index.

    Args:
        y_true: 2-D multi-hot array, one row per sample.
        y_pred: 2-D score array; its row count determines how many samples
            are evaluated.
        top_K: number of highest-scoring tags counted as predictions.

    Returns:
        ``(accuracy, precision, recall, f1)`` where accuracy is the fraction
        of samples with at least one correct tag among the top K and the
        other three are per-sample means. All four are 0.0 when ``y_pred``
        has no rows.
    """
    num_samples = y_pred.shape[0]
    if num_samples == 0:
        # Nothing to evaluate: avoid dividing by zero / averaging empty lists.
        return 0.0, 0.0, 0.0, 0.0

    acc_count = 0
    precision_K = []
    recall_K = []
    f1_K = []

    for i in range(num_samples):
        top_indices = y_pred[i].argsort()[-top_K:]
        hits = np.sum(y_true[i, top_indices])
        num_relevant = np.sum(y_true[i, :])
        if hits >= 1:
            acc_count += 1
        p = hits / top_K
        # A sample without any true tag has no recall to measure; count it as
        # 0 instead of 0/0 = NaN, which would poison the averaged recall and F1.
        r = hits / num_relevant if num_relevant > 0 else 0.0
        precision_K.append(p)
        recall_K.append(r)
        if p != 0 or r != 0:
            f1_K.append(2 * p * r / (p + r))
        else:
            f1_K.append(0)
    acc_K = acc_count * 1.0 / num_samples

    return acc_K, np.mean(np.array(precision_K)), np.mean(np.array(recall_K)), np.mean(np.array(f1_K))

In [None]:
if __name__ == "__main__":
    # Train for num_epoch rounds; after every round predict on the test set,
    # score the top-K predictions and keep the weights of the best F1 so far.
    for top_K in [7]:
        model = modelDef()

        F = 0.0    # best F1 seen so far for this top_K
        # Write the run header and close the file immediately. Leaving this
        # handle open (as before) could flush the buffered header after the
        # per-epoch lines that are appended through a second handle below.
        string = "Embedding_size = %d \t Top- %d\n" % (embedding_size, top_K)
        with open("record_userHist_%d_%d.txt"%(embedding_size, numHist), "a") as res_file:
            res_file.write(string)
        print("Start Training...")
        start = time.time()
        for epoch in range(num_epoch):
            # One pass over the training data per outer iteration so that the
            # model can be evaluated after every epoch.
            history = model.fit_generator(
                generator=batchMaker(text_train, tag_train, ids_train, user_train, demo_train),
                steps_per_epoch=text_train.shape[0] // batch_size,
                epochs=1,
                verbose=1,)
            y_pred = model.predict_generator(generator=batchMakerTest(text_test, tag_test, ids_test, user_test, demo_test),
                                             steps=numTestInst // testBatchSize,
                                             verbose=1)
            # NOTE(review): test_tag and y_pred are aligned by row index only;
            # this holds when every test post has an image feature and
            # numTestInst equals the test-set size — confirm for your data.
            acc, precision, recall, f1 = evaluator(test_tag, y_pred, top_K)

            print("Top %d, Epoch: %d,accuracy: %.6f, precision: %.6f, recall: %.6f, f1: %.6f" %
                  (top_K, epoch, acc, float(precision), float(recall), float(f1)))
            if f1 >= F:
                # New best (or equal) F1: checkpoint the weights and log it.
                model.save_weights("model_best_%d_%d.h5"%(embedding_size, numHist))
                string = "Epoch: %d,accuracy: %.6f, precision: %.6f, recall: %.6f, f1: %.6f \n" % (
                epoch, acc, float(precision), float(recall), float(f1))
                with open("record_userHist_%d_%d.txt"%(embedding_size, numHist), "a") as res_file:
                    res_file.write(string)
                F = f1
    print("time :", time.time() - start)
    print("Training Process Completed.")