In [11]:
import datetime
import os
import pickle
import time
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from nltk.translate.bleu_score import corpus_bleu

from keras.models import Model
from keras.models import load_model
from keras.layers import Input
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import Embedding
from keras.layers import LSTM
from keras.layers.merge import add
from keras.layers.merge import concatenate
from keras.callbacks import ModelCheckpoint

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

# Load Data from GDrive


In [2]:
# Mount Google Drive at /content/drive; the CSV, word2vec model and output
# paths used further down in this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Read and Prepare Data for Model Input
I read the data from GDrive and add placeholder tokens to denote the start and end of a sequence. These are added to every tweet in the dataset.

The placeholders are required for the RNN.

In [12]:
# Load the training tweets; the first CSV column is used as the index
full_train_df = pd.read_csv('/content/drive/My Drive/Capstone/training_tweets.csv', index_col=0)
# Surround every tweet with the start/end placeholder tokens
full_train_df['tweet_text'] = full_train_df['tweet_text'].map(
    lambda text: ' '.join(('startseq', text, 'endseq'))
)
# The two placeholders count as tokens, so bump n_tokens accordingly
full_train_df['n_tokens'] = full_train_df['n_tokens'].add(2)
full_train_df

Unnamed: 0,tweet_text,emojis,emojis_unq,n_tokens,n_emojis,n_emojis_unq
26760,startseq Smh 🤦🏽‍♂️ things have got to change e...,🤦🏽‍♂️,🤦🏽‍♂️,9,1,1
150342,startseq He’ll probably just take it in stride...,🤣,🤣,13,1,1
616357,startseq God Bless America 😞 endseq,😞,😞,6,1,1
343728,startseq I don’t have Venmo ☹ ️ I have cashapp...,☹,☹,23,1,1
66344,startseq A little wip I’m working on ~ can fin...,👀,👀,24,1,1
...,...,...,...,...,...,...
51304,startseq All it takes is ONE google search The...,🤡,🤡,17,1,1
446871,startseq To the nail shop I go 🥰 endseq,🥰,🥰,9,1,1
538154,startseq Thank you for sharing your story It i...,💪🏽,💪🏽,23,1,1
48438,startseq 🚨 NEW VIDEOS Alert 🚨 Had a very 🔥 🔥 🔥...,🚨 🚨 🔥 🔥 🔥,🔥 🚨,22,5,2


I have to reduce the dataset to reduce the training time. I use the first 5000 tweets.

In [13]:
# Upper bound on the number of tweets used for training
n_samples = 5000
# Keep only the leading n_samples rows of the full dataset
train_df = full_train_df.iloc[:n_samples]

I need to keep track of the maximum number of tokens in a tweet, and the size of the vocabulary, as they dictate the size of vectors used in the model.


In [14]:
# Longest tweet (in tokens); required to define the model because it fixes
# the length every input sequence is padded to.
# The n_tokens column already includes the two startseq/endseq placeholders.
# NOTE(review): n_tokens is computed outside this notebook and may not match
# the number of tokens the Keras tokenizer produces -- confirm.
n_max_tokens = train_df['n_tokens'].max()

tokenizer = Tokenizer(oov_token='<unk>')
tokenizer.fit_on_texts(train_df['tweet_text'])
# word_index always contains every word seen in the corpus; a num_words
# limit (not used here) would only restrict what texts_to_sequences emits
# see: https://stackoverflow.com/questions/46202519/keras-tokenizer-num-words-doesnt-seem-to-work
# word_index is 1-based and already contains the '<unk>' oov token;
# add 1 for index 0, which is reserved and never assigned to a word
# (it is the value pad_sequences pads with)
vocab_size = len(tokenizer.word_index) + 1

print(f'Maximum number of tokens per tweet: {n_max_tokens}')
print(f'Vocabulary size: {vocab_size}')

Maximum number of tokens per tweet: 32
Vocabulary size: 10036


In [19]:
#pickle.dump(tokenizer, open('/content/drive/My Drive/Capstone/emoji2tweet_tokenizer_n5000.pkl', 'wb'))

## Load Word2Vec Model
I load the word2vec model that will be used to create the feature vector. Each emoji is converted to a vector using this model. I create a helper function to quickly convert a string of emojis (as it is stored in the dataframe), into the resulting feature vector. The feature vector is the sum of the vector representations of each emoji in the string.


In [15]:
# w2v_model is loaded once, outside of vectorize_emojis, so that the model
# is not reloaded from disk on every function call
w2v_model = Word2Vec.load('/content/drive/My Drive/Capstone/w2v.model')
def vectorize_emojis(emoji_str):
    """Convert a space-separated string of emojis into one feature vector.

    Each emoji is looked up in the module-level ``w2v_model`` and the
    resulting vectors are summed. Emojis that are not in the word2vec
    vocabulary are ignored.

    Parameters
    ----------
    emoji_str : str
        Emojis separated by single spaces, as stored in the dataframe.

    Returns
    -------
    numpy.ndarray
        1-D float array with one entry per word2vec dimension; all zeros
        when none of the emojis is known to the model.
    """
    # Size the accumulator from the model itself instead of hard-coding 300,
    # so the function keeps working if the word2vec model is retrained with
    # a different dimensionality
    vec_sum = np.zeros(w2v_model.wv.vector_size)
    for emj in emoji_str.split(' '):
        try:
            vec_sum += w2v_model.wv[emj]
        except KeyError:
            # emoji (or an empty token produced by consecutive spaces) is
            # not in the word2vec vocabulary: skip it
            continue
    return vec_sum

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


## Creating the Input to the Model
The tweet text has to be expanded into partial sequences so that it can be fed into the RNN. The partial sequences are essentially concatenated with the feature vector.

The `create_sequences` function generates this data for the whole dataframe at once. However, that is very memory intensive, so I implemented a data generator instead.

In [None]:
# function taken and adapted from: https://machinelearningmastery.com/develop-a-deep-learning-caption-generation-model-in-python/
def create_sequences(df, n_max_tokens, vocab_size):
  """Build the complete training set for ``df`` in memory.

  Every tweet is expanded by ``create_sequences_gen`` (defined in a later
  cell of this notebook) into its (emoji vector, padded prefix, one-hot
  next word) rows, and the rows of all tweets are stacked in dataframe
  order.

  Parameters
  ----------
  df : DataFrame with a 'tweet_text' and an 'emojis' column.
  n_max_tokens : length every input sequence is padded to.
  vocab_size : number of classes of the one-hot encoded targets.

  Returns
  -------
  (emoji_vecs, rnn_seqs, y) : three numpy arrays with one row per training
  pair; three empty arrays when ``df`` yields no pair at all.
  """
  emoji_parts, seq_parts, y_parts = [], [], []
  # Walk the two columns side by side; this avoids df.iterrows(), which
  # builds a Series for every row
  for tweet_text, emoji_str in zip(df['tweet_text'], df['emojis']):
    emoji_vecs, rnn_seqs, y = create_sequences_gen(tweet_text, emoji_str, n_max_tokens, vocab_size)
    if len(y) == 0:
      # tweet has fewer than two tokens: it contributes no training pair
      continue
    emoji_parts.append(emoji_vecs)
    seq_parts.append(rnn_seqs)
    y_parts.append(y)
  if not y_parts:
    # nothing to stack: return three empty arrays, as before
    return np.array([]), np.array([]), np.array([])
  return np.concatenate(emoji_parts), np.concatenate(seq_parts), np.concatenate(y_parts)

In [16]:
# functions taken and adapted from: https://machinelearningmastery.com/develop-a-deep-learning-caption-generation-model-in-python/
def model_data_generator(df, n_max_tokens, vocab_size):
  """Endlessly yield one training batch per tweet of ``df``.

  Each batch holds all (emoji vector, padded prefix) -> next-word pairs of a
  single tweet, as produced by ``create_sequences_gen``. After the last row
  the generator starts over from the first row, which is what
  ``fit_generator`` expects.

  Parameters
  ----------
  df : DataFrame with a 'tweet_text' and an 'emojis' column.
  n_max_tokens : length every input sequence is padded to.
  vocab_size : number of classes of the one-hot encoded targets.

  Yields
  ------
  ([emoji_vecs, rnn_seqs], y) : model inputs and targets for one tweet.

  Raises
  ------
  ValueError : if ``df`` has no rows (the loop would otherwise spin forever
  without ever yielding).
  """
  if len(df) == 0:
    raise ValueError('model_data_generator needs a dataframe with at least one row')
  while True:
    # zip over the two columns instead of df.iterrows(): same values,
    # without building a Series per row
    for tweet_text, emoji_str in zip(df['tweet_text'], df['emojis']):
      emoji_vecs, rnn_seqs, y = create_sequences_gen(tweet_text, emoji_str, n_max_tokens, vocab_size)
      # Keras documents the generator output as a tuple (inputs, targets)
      yield ([emoji_vecs, rnn_seqs], y)

def create_sequences_gen(tweet_text, emoji_str, n_max_tokens, vocab_size):
  """Expand one tweet into its training pairs.

  The tweet is converted to a sequence of word indices with the module-level
  ``tokenizer``. Every proper prefix of that sequence becomes one input
  (padded/truncated to ``n_max_tokens``) and the token following the prefix
  becomes its one-hot encoded target. The emoji feature vector is repeated
  once per pair.

  Parameters
  ----------
  tweet_text : text of the tweet.
  emoji_str : space-separated emojis of the tweet (see vectorize_emojis).
  n_max_tokens : length every input sequence is padded to.
  vocab_size : number of classes of the one-hot encoded targets.

  Returns
  -------
  (emoji_vecs, rnn_seqs, y) : numpy arrays with one row per pair; three
  empty arrays when the tweet has fewer than two tokens.
  """
  emoji_vec = vectorize_emojis(emoji_str)
  seq = tokenizer.texts_to_sequences([tweet_text])[0]
  n_pairs = len(seq) - 1
  if n_pairs < 1:
    # no (prefix, next word) pair can be formed
    return np.array([]), np.array([]), np.array([])
  # Pad all prefixes and encode all targets with a single library call each,
  # instead of one pad_sequences/to_categorical call per token
  prefixes = [seq[:end] for end in range(1, len(seq))]
  rnn_seqs = pad_sequences(prefixes, maxlen=n_max_tokens)
  y = to_categorical(seq[1:], num_classes=vocab_size)
  # the same emoji vector accompanies every prefix of the tweet
  emoji_vecs = np.tile(emoji_vec, (n_pairs, 1))
  return emoji_vecs, rnn_seqs, y

## Defining the Model


Here I create two functions that define the model. The first function merges the feature vector and the result of the RNN through addition. The second function concatenates along axis=1.

In [None]:
# function taken and adapted from: https://machinelearningmastery.com/develop-a-deep-learning-caption-generation-model-in-python/
def define_model_add(n_max_tokens, vocab_size, emoji_vec_size=300, n_units=256):
  """Define and compile the model that merges its two branches by addition.

  The model has two inputs: the emoji feature vector and the padded partial
  tweet. The emoji vector goes through dropout and a dense layer; the
  partial tweet goes through an embedding, dropout and an LSTM. Both
  branches are added element-wise and decoded into a softmax over the
  vocabulary (the next word).

  Parameters
  ----------
  n_max_tokens : length of the padded input sequences.
  vocab_size : size of the vocabulary (embedding input and softmax output).
  emoji_vec_size : length of the emoji feature vector (default 300, the
    size of the word2vec vectors).
  n_units : width of the dense, embedding and LSTM layers (default 256).

  Returns
  -------
  The compiled Keras model (categorical cross-entropy loss, adam optimizer).
  A summary of the model is printed as a side effect.
  """
  # word2vec vector as input
  inputs1 = Input(shape=(emoji_vec_size,))
  fe1 = Dropout(0.5)(inputs1)
  # Use dense layer to bring the emoji vector to the same size as the LSTM
  # output, so that the two can be added
  fe2 = Dense(n_units, activation='relu')(fe1)
  # sequence model
  inputs2 = Input(shape=(n_max_tokens,))
  se1 = Embedding(vocab_size, n_units, mask_zero=True)(inputs2)
  se2 = Dropout(0.5)(se1)
  se3 = LSTM(n_units)(se2)
  # decoder model
  decoder1 = add([fe2, se3])
  decoder2 = Dense(n_units, activation='relu')(decoder1)
  outputs = Dense(vocab_size, activation='softmax')(decoder2)
  model = Model(inputs=[inputs1, inputs2], outputs=outputs)
  model.compile(loss='categorical_crossentropy', optimizer='adam')
  # summarize model; summary() prints by itself and returns None, so it is
  # not wrapped in print() (that would add a stray "None" to the output)
  model.summary()
  return model

In [19]:
# function taken and adapted from: https://machinelearningmastery.com/develop-a-deep-learning-caption-generation-model-in-python/
def define_model_conc(n_max_tokens, vocab_size, emoji_vec_size=300, n_units=256):
  """Define and compile the model that merges its two branches by concatenation.

  The model has two inputs: the emoji feature vector and the padded partial
  tweet. The emoji vector goes through dropout and a dense layer; the
  partial tweet goes through an embedding, dropout and an LSTM. Both
  branches are concatenated along axis 1 and decoded into a softmax over
  the vocabulary (the next word).

  Parameters
  ----------
  n_max_tokens : length of the padded input sequences.
  vocab_size : size of the vocabulary (embedding input and softmax output).
  emoji_vec_size : length of the emoji feature vector (default 300, the
    size of the word2vec vectors).
  n_units : width of the dense, embedding and LSTM layers (default 256).

  Returns
  -------
  The compiled Keras model (categorical cross-entropy loss, adam optimizer).
  A summary of the model is printed as a side effect.
  """
  # feature extractor model
  inputs1 = Input(shape=(emoji_vec_size,))
  fe1 = Dropout(0.5)(inputs1)
  fe2 = Dense(n_units, activation='relu')(fe1)
  # sequence model
  inputs2 = Input(shape=(n_max_tokens,))
  se1 = Embedding(vocab_size, n_units, mask_zero=True)(inputs2)
  se2 = Dropout(0.5)(se1)
  se3 = LSTM(n_units)(se2)
  # decoder model
  decoder1 = concatenate(inputs=[fe2, se3], axis=1)
  decoder2 = Dense(n_units, activation='relu')(decoder1)
  outputs = Dense(vocab_size, activation='softmax')(decoder2)
  model = Model(inputs=[inputs1, inputs2], outputs=outputs)
  model.compile(loss='categorical_crossentropy', optimizer='adam')
  # summarize model; summary() prints by itself and returns None, so it is
  # not wrapped in print() (that would add a stray "None" to the output)
  model.summary()
  return model

The model is instantiated here. Initially I saved an empty model in order to have the same starting condition for different test instances, but it was hard to keep track of as I changed my model parameters (token limit, vocab size).


In [20]:
# Build the variant of the model that merges its two branches by concatenation
model = define_model_conc(n_max_tokens, vocab_size)
# Save an empty model that I can use to "reset" my weights
#model.save('/content/drive/My Drive/Capstone/tweetmoji_empty_model.h5')

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_6 (InputLayer)            (None, 32)           0                                            
__________________________________________________________________________________________________
input_5 (InputLayer)            (None, 300)          0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 32, 256)      2569216     input_6[0][0]                    
__________________________________________________________________________________________________
dropout_5 (Dropout)             (None, 300)          0           input_5[0][0]                    
____________________________________________________________________________________________

## Training the Model



The model is trained over 10 epochs. I create an output folder where the resulting models will be saved. I save the model after every epoch. Additionally, I save the training loss.

I trained the model using 2 different inputs, and 2 different methods of merging.

For the first set of inputs I used the list of emojis extracted from the tweet. For the second set I used a unique list of emojis extracted from the tweet.

I used the unique list to remove the influence of repeated emojis. 

In [21]:
n_epochs = 10
input_df = train_df[['tweet_text', 'emojis']]

# To train on unique emojis uncomment the lines below
# input_df = train_df[['tweet_text', 'emojis_unq']]
# input_df.columns = ['tweet_text', 'emojis']

# Create output folder
# Changing TZ in os.environ is not reliably picked up by the running process;
# time.tzset() applies it so that date_str really is Toronto local time
# (tzset only exists on Unix, hence the guard)
os.environ['TZ'] = 'America/Toronto'
if hasattr(time, 'tzset'):
    time.tzset()
date_str = datetime.datetime.now().strftime('%Y%m%d_%H%M')
# Use the samples and number of epochs to differentiate model outputs
# Use the date_str to avoid overwriting existing outputs
output_dir_name = f'n_samples{n_samples}_n_epoch{n_epochs}_{date_str}'
output_dir = f"/content/drive/My Drive/Capstone/model_output/{output_dir_name}"
print(f'OUTPUT: {output_dir}')
# Create the directory
Path(output_dir).mkdir(parents=True, exist_ok=True)

# Define naming scheme for model output
output_filename=os.path.join(output_dir, "tweetmoji-epoch{epoch}.h5")
# Use ModelCheckpoint to save the model after each epoch
checkpoint = ModelCheckpoint(output_filename, monitor='loss', period=1)
callbacks_list = [checkpoint]

# reset model weights by loading an "empty" model
# using the same base model to compare results of different training runs
# model = load_model('/content/drive/My Drive/Capstone/tweetmoji_empty_model.h5')

# Fit the model using the data generator
data_gen = model_data_generator(input_df, n_max_tokens, vocab_size)
# The generator yields one batch per tweet, so one pass over the data is
# len(input_df) steps; this equals n_samples unless the dataset has fewer
# rows than n_samples
steps_per_epoch = len(input_df)
# the model history is saved as training_loss because that's the only value tracked
training_loss = model.fit_generator(data_gen, epochs=n_epochs, steps_per_epoch=steps_per_epoch, verbose=1, callbacks=callbacks_list)

# Save history of the model
# This contains the training loss per epoch
training_loss_df = pd.DataFrame(training_loss.history)
training_loss_df.index.name = 'epoch'
training_loss_df.to_csv(os.path.join(output_dir, 'training_loss.csv'))

OUTPUT: /content/drive/My Drive/Capstone/model_output/n_samples5000_n_epoch10_20200627_2127


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


## Done Training!

Evaluation and generation of tweets is done in `emoji2tweet_evaluation.ipynb`.

