In [4]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Hyperparameters for the text branch of the model.
num_of_samples = 10
training_samples = num_of_samples        # alias of num_of_samples
maxlen = 30                              # length passed to pad_sequences
text_vocabulary_size = 10000
max_words = text_vocabulary_size         # vocabulary cap used by the tokenizer and embedding

Using TensorFlow backend.


In [1]:
# loading data
# X_image_train : image input
# X_text_train  : text input
# texts         : raw text
# y_train_like  : nb of likes output
# y_train_comment : nb of comments output
# y_train_share   : nb of shares output
import os
import numpy as np
import cv2
import pandas as pd
import pickle
# Load the image / text / engagement-count training data.
#
# For every username listed in page_username.csv, the matching entry of the
# pickled post list is scanned and each post that has both an image flag and a
# post id contributes one sample: the resized photo, the raw text, and the
# like / comment / share counts.

# Size every photo is resized to before it enters the model.
pixel_x = 100
pixel_y = 100
dim = (pixel_x, pixel_y)

# Parallel lists: index i of every list must describe the same post.
X_image_train = []
y_train_like = []
y_train_comment = []
y_train_share = []
texts = []

data_raw = pd.read_csv('page_username.csv')
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
with open("fb_post_list_1000pages.p", "rb") as post_list_file:
    post_list_all_reload = pickle.load(post_list_file)

skipped_posts = 0  # posts dropped because the photo or a field could not be read
for item in data_raw['page_username']:
    for movie in post_list_all_reload:
        if movie['username'] != item:
            continue
        print('Loading data for movie: ', item)
        for post in movie['posts']:
            if not (post['image'] and post['post_id']):
                continue
            fname = movie['username'] + '_' + post['post_id']
            fpath = './photos/' + fname + '.jpg'
            try:
                im = cv2.imread(fpath)
                if im is None:
                    # cv2.imread signals a missing/unreadable file by returning
                    # None instead of raising.
                    skipped_posts += 1
                    continue
                im_resized = cv2.resize(im, dim, interpolation=cv2.INTER_AREA)
                text = post['text']
                # The 0.1 offset presumably keeps targets non-zero for the
                # percentage-error loss used later -- TODO confirm.
                likes = post['likes'] + 0.1
                comments = post['comments'] + 0.1
                shares = post['shares'] + 0.1
            except Exception as e:
                # Best-effort loading: skip posts with broken data, but count them.
                skipped_posts += 1
                continue
            # Append only once every field was read successfully, so a failure
            # half-way through can never leave the parallel lists misaligned.
            X_image_train.append(im_resized)   # image data
            texts.append(text)                 # text data
            y_train_like.append(likes)         # nb of likes output
            y_train_comment.append(comments)   # nb of comments output
            y_train_share.append(shares)       # nb of shares output

print('Skipped {} posts with unreadable image or missing fields'.format(skipped_posts))

Loading data for movie:  ActOfValorMovie
Loading data for movie:  Agneepath
Loading data for movie:  ALateQuartet
Loading data for movie:  AlbertNobbs
Loading data for movie:  AlexCrossMovie
Loading data for movie:  AmourFilm
Loading data for movie:  AnnaKareninaTheMovie
Loading data for movie:  arbitragemovie
Loading data for movie:  ARoyalAffairMovie
Loading data for movie:  avengers
Loading data for movie:  Battleship
Loading data for movie:  BeastsoftheSouthernWild
Loading data for movie:  bigmiraclemovie
Loading data for movie:  bobmarleymovie
Loading data for movie:  boom
Loading data for movie:  CASAmovie
Loading data for movie:  celesteandjesseforever
Loading data for movie:  chasingice
Loading data for movie:  ChasingMavericks
Loading data for movie:  chernobyldiaries
Loading data for movie:  Chronicle
Loading data for movie:  Contraband
Loading data for movie:  crookedarrows
Loading data for movie:  darkshadowsmovie
Loading data for movie:  DianaVreelandFilm
Loading data for 

In [5]:
# Report how many samples were loaded (one like-count per sample).
n_loaded_samples = len(y_train_like)
print('traing set length: {}'.format(n_loaded_samples))

traing set length: 28858


In [6]:
# Text preprocessing: turn the raw post texts into the integer matrix
# X_text_train consumed by the text branch of the model.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)

# Vocabulary learned from the texts (word -> integer index).
word_index = tokenizer.word_index
print('Found {} unique tokens'.format(len(word_index)))

# Encode every text as a sequence of word indices, then pad to maxlen.
sequences = tokenizer.texts_to_sequences(texts)
X_text_train = pad_sequences(sequences, maxlen=maxlen)

Found 40879 unique tokens


In [7]:
# NLP part: use a pretrained word-embedding model.
# Parse the pretrained GloVe vectors into a dict, then build the
# (max_words, embedding_dim) matrix whose row i holds the vector of the word
# that the tokenizer mapped to index i.
glove_dir = './'
embedding_dim = 50  # must match the dimensionality of the GloVe file below

embeddings_index = {}
# `with` guarantees the file is closed even if a line fails to parse.
with open(os.path.join(glove_dir, 'glove.6B.50d.txt'), encoding="utf8") as glove_file:
    for line in glove_file:
        values = line.split()
        if not values:
            continue  # tolerate blank lines instead of failing on values[0]
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Found %s word vectors.' % len(embeddings_index))

# Rows for words without a GloVe vector are left as zeros.
embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in word_index.items():
    if i >= max_words:
        continue  # index outside the matrix; skip before doing the lookup
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

Found 400000 word vectors.


In [8]:
# Build the two-input regression model:
#   * text branch  : frozen GloVe embedding -> LSTM
#   * image branch : frozen ImageNet VGG16 (no top) -> Flatten -> Dense
# Both branches are concatenated and fed to a single linear output unit.
from keras import layers
from keras import Input
from keras.models import Model
from keras import models
from keras.applications import VGG16
from keras.optimizers import Adam

# NLP part: build the deep learning model for text input
text_input = Input(shape=(None,), dtype='int32', name='text')
# Keep a handle on the layer object so it can be configured directly instead
# of through a positional index into model.layers.
embedding_layer = layers.Embedding(max_words, embedding_dim)
embedded_text = embedding_layer(text_input)
encoded_text = layers.LSTM(8)(embedded_text)
# The layer is built once it has been called, so its weights can now be
# replaced by the pretrained GloVe matrix and frozen.
embedding_layer.set_weights([embedding_matrix])
embedding_layer.trainable = False  # freeze GloVe word embedding

# Computer vision part: build the deep learning model for image input
image_input = Input(shape=(pixel_x, pixel_y, 3), name='image')
vgg16_base = VGG16(weights='imagenet',
                   include_top=False,
                   input_shape=(pixel_x, pixel_y, 3))
vgg16_base.trainable = False  # freeze VGG16 coefficients
vgg16 = vgg16_base(image_input)
x = layers.Flatten()(vgg16)
x = layers.Dense(16, activation='relu')(x)

# Concatenate NLP output and computer vision output
# build the output layer for regression
concatenated = layers.concatenate([x, encoded_text], axis=-1)
output = layers.Dense(1, activation="linear")(concatenated)
opt = Adam(lr=1e-3, decay=1e-3 / 200)
model = Model([image_input, text_input], output)
# Compile only after the pretrained parts are frozen: a change to `trainable`
# made after compile() is not picked up until the model is compiled again.
model.compile(loss="mean_absolute_percentage_error", optimizer=opt)

In [12]:
# Print the layer-by-layer summary (output shapes, parameter counts, connections).
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
image (InputLayer)              (None, 100, 100, 3)  0                                            
__________________________________________________________________________________________________
vgg16 (Model)                   (None, 3, 3, 512)    14714688    image[0][0]                      
__________________________________________________________________________________________________
text (InputLayer)               (None, None)         0                                            
__________________________________________________________________________________________________
flatten_1 (Flatten)             (None, 4608)         0           vgg16[1][0]                      
____________________________________________________________________________________________

In [None]:
# train and save model
model_path = "./models/"
# NOTE(review): the file name says "like" but the model below is fitted on
# y_train_comment -- confirm which target is intended.
filename = 'deep_learning_like_model.sav'
# Create the output directory up front so a missing folder cannot make the
# save fail after training has already finished.
os.makedirs(model_path, exist_ok=True)

# The loaders collect samples in Python lists; convert them to arrays so the
# model receives one array per input and one array of targets.
X_image_array = np.asarray(X_image_train)
y_comment_array = np.asarray(y_train_comment)
model.fit([X_image_array, X_text_train], y_comment_array, epochs=2, batch_size=4)

# NOTE(review): Keras documents model.save() as the supported way to persist a
# model; pickle is kept here so existing loaders of this file keep working.
with open(model_path + filename, 'wb') as model_file:
    pickle.dump(model, model_file)