In [12]:
import os
from pathlib import Path
import datetime
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from gensim.models import Word2Vec
from nltk.translate.bleu_score import corpus_bleu

from keras.models import load_model

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

# Load Data from GDrive

In [13]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Read and Prepare Data for Model Input
I read the data from GDrive and add placeholder tokens to denote the start and end of a sequence. These are added to every tweet in the dataset.

The placeholders are required for the RNN.

In [21]:
full_test_df = pd.read_csv('/content/drive/My Drive/Capstone/testing_tweets.csv', index_col=0)

# evaluate on training dataset to determine if the model is even learning anything
# full_test_df = pd.read_csv('/content/drive/My Drive/Capstone/training_tweets.csv', index_col=0)

# Add placeholder to start and end of tweet
full_test_df['tweet_text'] = full_test_df['tweet_text'].apply(lambda x: ' '.join(['startseq', x, 'endseq']))
#update n_tokens
full_test_df['n_tokens'] = full_test_df['n_tokens'] + 2
full_test_df

Unnamed: 0,tweet_text,emojis,emojis_unq,n_tokens,n_emojis,n_emojis_unq
602356,startseq 🌟 Out now 🌟 How to identify & explain...,🌟 🌟 🌧 ☀,☀ 🌟 🌧,23,4,3
563134,startseq Another sleeper 😡 Esper Orders Nation...,😡,😡,18,1,1
191830,startseq gratitude bloodline 🖤 u next endseq,🖤,🖤,7,1,1
443691,startseq Cross the map 🗺 endseq,🗺,🗺,6,1,1
359484,startseq Always remember to NEVER QUIT 💪 🏠 @ C...,💪 🏠,🏠 💪,17,2,2
...,...,...,...,...,...,...
342845,startseq thank you lexo ❤ ️ ❤ ️ endseq,❤ ❤,❤,9,2,1
640052,startseq Oh yes naman syempre 👍 endseq,👍,👍,7,1,1
583289,startseq Thank you ❤ ️ ❤ ️ ❤ ️ endseq,❤ ❤ ❤,❤,10,3,1
496851,startseq 😖 😖 😖 them some fuckin talons lady Id...,😖 😖 😖,😖,18,3,1


# Load word2vec Model

In [22]:
# w2v_model loaded outside of the function to prevent loading the model on every function call
w2v_model = Word2Vec.load('/content/drive/My Drive/Capstone/w2v.model')
def vectorize_emojis(emoji_str):
    emojis = emoji_str.split(' ')
    vec_sum = np.zeros(300)
    for emj in emojis:
        try:
            vec = w2v_model.wv[emj]
            vec_sum += vec
        except KeyError:
            pass
            #print(f'Emoji not in w2v: {emj}')
            #ignoring emojis not in the list
    return vec_sum

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


## Defining Some Parameters

I limit the number of tweets used for testing in order to reduce execution time


In [23]:
# token limit of tweets to be generated
n_max_tokens = 32

# number of tweets used for testing
n_samples = 1000
# Arbitrary choice, but doesn't matter since the model is not training on testing data 
# The validation set is the first n_sample tweets from the test set
valid_df = full_test_df.head(n_samples)
# The testing set is the last n_sample tweets from the test set
test_df = full_test_df.tail(n_samples)

## Loading the Tokenizer

Here I load the tokenizer that was created when training the data.

In [24]:
tokenizer = pickle.load(open('/content/drive/My Drive/Capstone/emoji2tweet_tokenizer_n5000.pkl', 'rb'))

## Generating Tweets

Here I create a helper function that will be used to generate tweets. The tweets are generated token-by-token. The token with the highest probability is chosen as the next token.

I want to explore different methods of choosing the next token. To introduce variability in the tweets generated, I could choose 1 of the top 5 tokens.

In [25]:
# This function is used to generate a tweet token-by-token
# function taken and adapted from: https://machinelearningmastery.com/develop-a-deep-learning-caption-generation-model-in-python/
def generate_tweet(tweetmoji_model, tokenizer, n_max_tokens, in_emojis, in_text = 'startseq'):
  # Create feature vector using word2vec
  emoji_vec = vectorize_emojis(in_emojis)
  # Convert the currently generate tweet into a sequence 
  seq = tokenizer.texts_to_sequences([in_text])[0]
  # Store the index of 'enqseq' to use as stopping condition
  endseq = tokenizer.texts_to_sequences(['endseq'])[0]
  # Generate token-by-token up to the endseq or the token limit
  for i in range(n_max_tokens):
    # input sequences must be padded as input for the model
    seq_pad = pad_sequences([seq],maxlen=n_max_tokens)
    # The token predicted is the token with the highst probability
    y_pred = np.argmax(tweetmoji_model.predict([[emoji_vec], seq_pad]))
    seq.append(y_pred)
    if y_pred == endseq[0]:
      break
  tweet = tokenizer.sequences_to_texts([seq])[0]
  return tweet

## Calculating BLEU Score

Here I create a helper function that is used to calculate the BLEU score for a generated tweet. 

From the test set I grab a tweet, extract its emojis and use it as input for the model. I compare the generated tweet with the original tweet. The BLEU scores are calculated for 1- to 4-grams.

Ideally I would compare the generated tweet with a set of reference tweets that are representative of the concept/idea the emojis are meant to represent. This remains an area I want to explore further. Can I cluster tweets based on their similarity scores and used those as reference tweets?

In [26]:
def bleu_eval(model, df, tokenizer, n_max_tokens):
  # lists that contain the generated and reference tweets
  actual, predicted = [], []
  bleu_scores = []
  for _, row in df.iterrows():
    # Curently comparing to the tweet from the testing set
    reference_tweets = [row['tweet_text']]
    # generate a full tweet
    gen_tweet = generate_tweet(model, tokenizer, n_max_tokens, row['emojis'])
    # add the tweets to the appropriate lists
    actual.append(reference_tweets)
    predicted.append(gen_tweet.split())
  # Compute BLEU score
  bleu1 = corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0))
  bleu2 = corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0))
  bleu3 = corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0))
  bleu4 = corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25))
  return [bleu1, bleu2, bleu3, bleu4]

# Validating the Model

First I define where the models are located, and which models I want to test.

In [9]:
# The model names follow a standard naming scheme
model_names = [f'tweetmoji-epoch{n}.h5' for n in range(1,11)]

# The directory where the models are found
#model_dir = '/content/drive/My Drive/Capstone/model_output/n_samples5000_n_epoch10_add'
model_dir = '/content/drive/My Drive/Capstone/model_output/n_samples5000_n_epoch10_conc'
#model_dir = '/content/drive/My Drive/Capstone/model_output/n_samples5000_n_epoch10_unq_add'
#model_dir = '/content/drive/My Drive/Capstone/model_output/n_samples5000_n_epoch10_unq_conc'

In [10]:
input_df = test_df[['tweet_text', 'emojis']]

bleu_scores = []
for n, model_name in enumerate(model_names):
  epoch = n+1
  model_filename = os.path.join(model_dir, model_name)
  model = load_model(model_filename)
  bleu_scores.append(bleu_eval(model, input_df, tokenizer, n_max_tokens))

bleu_df = pd.DataFrame(bleu_scores, columns=['bleu-1', 'bleu-2', 'bleu-3', 'bleu-4'])

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 3-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 4-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


In [11]:
bleu_df.to_csv(f'{model_dir}/bleu_scores_test.csv')

# Evaluating the Testing Set

After analyzing the plots for the BLEU Scores, which were created in a separate notebook. The model that had the best results used concatenation for merging, and used the verbatim emojis extracted from the tweet. The best results were found at epoch 2.

In [30]:
# Grab the right model
model_dir = '/content/drive/My Drive/Capstone/model_output/n_samples5000_n_epoch10_conc'
model_name = 'tweetmoji-epoch2.h5' # grabbing model at 2nd epoch
model_filepath = os.path.join(model_dir, model_name)
# load the model
model = load_model(model_filepath)
bleu_scores = []
bleu_scores.append(bleu_eval(model, input_df, tokenizer, n_max_tokens))
bleu_scores

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


[[0.018914905099723208,
  0.07179588532972878,
  0.12241022907824407,
  0.1398773028468326]]

In [34]:
bleu_scores_df = pd.DataFrame(bleu_scores[0], index=['bleu-1', 'bleu-2', 'bleu-3', 'bleu-4'], columns=['score'])
bleu_scores_df

Unnamed: 0,score
bleu-1,0.018915
bleu-2,0.071796
bleu-3,0.12241
bleu-4,0.139877


## Generating Sample Tweets

To generate sample tweets I first load a specific model. I then create a helper function to remove the start and end sequences tokens. Finally I use the previous `generte_tweet` function.


In [None]:
# Load a specific Model
model_filename = '/content/drive/My Drive/Capstone/model_output/20200624_2257/tweetmoji-epoch6.h5'
model = load_model(model_filename)

In [None]:
# Helper function to remove the start and end sequence tokens
def remove_seq_tokens(tweet_str):
  tweet_tokens = tweet_str.split(' ')
  tweet_tokens = tweet_tokens[1:-1]
  return ' '.join(tweet_tokens)

emoji_inputs = ['💗 🎄 👪', '🎁 🎂 🎈', '😂 😭', '❤ 🎁', '🏀 👑 🔥', '😅 ❤ 🤗']

generated_tweets = []
for emojis in emoji_inputs:
  gen_tweet = generate_tweet(model, tokenizer, n_max_tokens, emojis)
  stripped_tweet = remove_seq_tokens(gen_tweet)
  #generated_tweets.append(gen_tweet)
  print (f'{emojis} => "{remove_seq_tokens(gen_tweet)}"')

💗 🎄 👪 => "i love you 💖"
🎁 🎂 🎈 => "happy birthday 🎂 🎉 🎉"
😂 😭 => "i need a <unk> 😂 😭"
❤ 🎁 => "i need to be <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk>"
🏀 👑 🔥 => "<unk> 🔥 🔥"
😅 ❤ 🤗 => "i need to be <unk> 😘"
