### Generating and using embeddings

We will use the pre-trained GloVe data set to convert some sentences into N-dimensional vectors.

First, we will use the "gensim" package to load the dataset into an easily consumable model.

In [None]:
# First, some imports

from importlib import reload
import shutil
import time
import csv

import numpy as np
import pandas as pd
import sklearn
import tensorflow  as tf

from w266_common import utils, vocabulary, tf_embed_viz
import rnnlm_deepak as rnnlm; reload(rnnlm)

In [None]:
# Load pre-trained GloVe vectors

def load_glove_model_v1(dim=50):
    """Read the GloVe text file for the given dimension into a Pandas dataframe.

    The file ``glove.6B.<dim>d.txt`` is opened relative to the current working
    directory.  The first column of each row becomes the dataframe index and
    the remaining columns are kept as data columns.

    Returns: the embedding dataframe"""
    reader_options = dict(
        sep=" ",
        index_col=0,
        header=None,
        # Treat quote characters as ordinary text instead of field delimiters.
        quoting=csv.QUOTE_NONE,
        # Disable the default NaN markers so no entry is parsed as missing.
        na_values=None,
        keep_default_na=False,
    )
    return pd.read_table('glove.6B.%dd.txt' % dim, **reader_options)

def load_glove_model_v2(dim=50):
    """Return the GloVe vectors of the given dimension as a gensim model.

    The raw file ``glove.6B.<dim>d.txt`` is converted to word2vec text format
    and written to ``glove.6B.<dim>d.txt.w2v``; the conversion is skipped when
    that output file already exists.  The word2vec file is then loaded with
    KeyedVectors and the loaded model is returned.
    Adapted from: https://stackoverflow.com/a/47465278
    """
    from pathlib import Path
    from gensim.models.keyedvectors import KeyedVectors
    from gensim.scripts.glove2word2vec import glove2word2vec

    glove_path = 'glove.6B.%dd.txt' % dim
    w2v_path = '%s.w2v' % glove_path

    # Convert only when no previously converted copy is on disk.
    already_converted = Path(w2v_path).exists()
    if not already_converted:
        glove2word2vec(glove_input_file=glove_path, word2vec_output_file=w2v_path)

    return KeyedVectors.load_word2vec_format(w2v_path, binary=False)

# We will use v2, because it is more versatile.
# `dim` selects which GloVe file is loaded (glove.6B.<dim>d.txt); it is also
# used further down as the number of columns of the embedding matrix.
dim = 50
model = load_glove_model_v2(dim)

Let's build the embedding matrix for this model.

In [None]:
# Build an embedding matrix out of embedding vectors.
# Row r holds the vector of the r-th word in the model's index2word order,
# so a word's position in that list is also its row in the matrix.
vocab_size = len(model.vocab)
embedding_matrix = np.zeros((vocab_size, dim))
for row, word in enumerate(model.index2word[:vocab_size]):
    vector = model[word]
    if vector is not None:
        embedding_matrix[row] = vector

Let's try to run this embedding model on a sample sentence within TensorFlow.

In [None]:
# Adapted from: http://adventuresinmachinelearning.com/gensim-word2vec-tutorial/

# Demo: fetch and print the embedding rows for one sample sentence made of
# the word IDs [1, 5, 10].

tf.reset_default_graph()

# Placeholder for a 2-D batch of integer word IDs.
sentences = tf.placeholder(tf.int32, shape=[None, None])

# Non-trainable variable initialised from the embedding matrix built above.
saved_embeddings = tf.constant(embedding_matrix)
embedding = tf.Variable(initial_value=saved_embeddings, trainable=False)
embedding_lookup = tf.nn.embedding_lookup(embedding, sentences)

sample_batch = [[1, 5, 10]]
with tf.Session() as sess:
    init_op = tf.global_variables_initializer()
    sess.run(init_op)
    looked_up = sess.run(embedding_lookup, feed_dict={sentences: sample_batch})
    print(looked_up)

It works.  Next, we will convert our dataset into a form suitable for this training.  We should also experiment with [GoogleNews word2vec dataset](http://mccormickml.com/2016/04/12/googles-pretrained-word2vec-model-in-python/), because that is more relevant to a Fake News project.

### Look up embeddings for our dataset

We first load our data into a dataframe:

In [None]:
import pandas as pd
import re
import os

# Matches every run of characters that is neither an ASCII letter/digit nor
# whitespace.  Compiled once at import time instead of on every call.
_NON_WORD_CHARS = re.compile(r"[^A-Za-z0-9\s]+")


def get_word_list(article):
    """Split an article into a list of lower-case alphanumeric words.

    The text is lower-cased, every character other than ASCII letters, digits
    and whitespace is deleted (so "don't" becomes "dont"), and the remainder
    is split on whitespace.

    All whitespace is preserved as a word separator.  Keeping only the space
    character would delete tabs and newlines and fuse the words on either
    side of them into one token.

    @param article: the article text
    @returns list of words; empty for an empty or symbol-only article
    """
    cleaned = _NON_WORD_CHARS.sub("", article.lower())
    return cleaned.split()

def tabulate_data(dataset_name, data_dir='../../data/fakeNewsDatasets_Perez-Rosas2018'):
    """Create a Pandas dataframe out of input Perez-Rosas dataset files

    Reads every file under ``<data_dir>/<dataset_name>/fake`` and
    ``<data_dir>/<dataset_name>/legit`` and produces one row per file.  Files
    are visited in sorted name order ('fake' folder first) so that the row
    order is the same on every machine.

    @param dataset_name: Name of the dataset sub-folder (e.g. 'fakeNewsDataset')
    @param data_dir: Root folder holding the dataset sub-folders
    @returns Pandas dataframe with columns:
        dataset_name, news_type, is_fake (1 for 'fake', 0 for 'legit'),
        news_category (only for 'fakeNewsDataset': the file name up to the
        first '.', with digits removed), file_name, news_headline,
        news_content, news_all (all lines joined by spaces),
        news_word_list, num_words
    """
    rows = []
    for news_type in ['fake', 'legit']:
        folder = '%s/%s/%s' % (data_dir, dataset_name, news_type)
        # os.listdir() returns names in arbitrary order; sorting keeps the row
        # order (and any seeded shuffle applied to it later) reproducible.
        for fname in sorted(os.listdir(folder)):
            row = {
                'dataset_name': dataset_name,
                'news_type': news_type,
                'is_fake': 1 if news_type == 'fake' else 0,
            }
            if dataset_name == 'fakeNewsDataset':
                row['news_category'] = re.sub(r'[0-9]+', '', fname.split('.')[0])
            row['file_name'] = fname

            with open(os.path.join(folder, fname), 'r', encoding="utf8") as f:
                file_data = f.read().split('\n')

            # Some articles don't have a headline, but only article body.
            # With more than one line, line 0 is the headline and the body
            # starts at line 2 (line 1 is skipped -- presumably a blank
            # separator; verify against the data files).
            # NOTE(review): a one-line file that ends with a newline splits
            # into two entries and is treated as headline-only -- confirm.
            if len(file_data) > 1:
                row['news_headline'] = file_data[0]
                news_content_data = ' '.join(file_data[2:])
            else:
                row['news_headline'] = ''
                news_content_data = file_data[0]

            row['news_content'] = news_content_data
            row['news_all'] = ' '.join(file_data)
            row['news_word_list'] = get_word_list(news_content_data)
            row['num_words'] = len(row['news_word_list'])
            rows.append(row)
    return pd.DataFrame(rows)

In [None]:
# Load the 'fakeNewsDataset' articles (both fake and legit) into one
# dataframe and preview its first rows.
fakenews_df = tabulate_data('fakeNewsDataset')
fakenews_df.head()

Next, we look up each word in the vocabulary of our embedding model.  If found, we note its index within the vocabulary; otherwise we record the placeholder index `ID_UNKNOWN`.

To keep this concise, we use the pandas "apply" function, which runs the lookup once per article.

In [None]:
# Index recorded for words that are missing from the embedding vocabulary.
# NOTE(review): 99 is presumably also a valid vocabulary index, in which case
# unknown words share the embedding row of the word stored there -- confirm
# this is intended.
ID_UNKNOWN = 99
def word_to_id(word_list):
    """Map each word to its index in the embedding model's vocabulary.

    Words that are not in the vocabulary are mapped to ID_UNKNOWN.  The
    result has one entry per input word, in the same order.
    """
    vocab = model.vocab
    return [vocab[token].index if token in vocab else ID_UNKNOWN
            for token in word_list]
# Per-article list of vocabulary indices, and the length of that list.
fakenews_df['embedding_ids'] = fakenews_df['news_word_list'].apply(word_to_id)
fakenews_df['num_embedding_ids'] = fakenews_df['embedding_ids'].apply(len)

In [None]:
# Display the dataframe as the cell output.
fakenews_df

Let's try to print the embeddings for one of the articles.

In [None]:
# Embedding IDs of the first article in the dataframe.
sample_article = fakenews_df['embedding_ids'].iloc[0]
print(sample_article)

import tensorflow as tf

# Start from an empty default graph and rebuild the lookup ops.
tf.reset_default_graph()
sentences = tf.placeholder(tf.int32, shape=[None, None])

saved_embeddings = tf.constant(embedding_matrix)
embedding = tf.Variable(initial_value=saved_embeddings, trainable=False)
embedding_lookup = tf.nn.embedding_lookup(embedding, sentences)

# The placeholder is 2-D, so the single article is wrapped in a batch of one.
article_batch = [sample_article]
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    article_vectors = sess.run(embedding_lookup,
                               feed_dict={sentences: article_batch})
    print(article_vectors)

It works!  Let's try to train a neural net with the data set.  We will reuse some of the code from W266 Assignment 3.

First, a couple of utility functions.

In [None]:
def run_epoch(lm, session, batch_iterator,
              train=False, verbose=False,
              tick_s=10, learning_rate=0.01):
    """Run one pass over `batch_iterator` and return the mean per-batch loss.

    The recurrent state is initialised from `lm.initial_h_` on the first batch
    and then carried forward: the `lm.final_h_` value produced by one batch is
    fed as `lm.initial_h_` of the next.

    Args:
        lm: model object exposing the tensors/ops used here (input_w_,
            target_y_, initial_h_, final_h_, learning_rate_, use_dropout_,
            loss_, and for training train_loss_ and train_step_).
        session: session whose run() evaluates those tensors.
        batch_iterator: iterable yielding (w, y) batches; `w.size` is used
            for the word count.
        train: if True, run `lm.train_step_` with dropout enabled and report
            `lm.train_loss_`; otherwise only evaluate `lm.loss_`.
        verbose: if True, print a progress line at most every `tick_s` seconds.
        tick_s: minimum number of seconds between progress lines.
        learning_rate: value fed to `lm.learning_rate_`; must not be None.

    Returns:
        The sum of the per-batch loss values divided by the number of batches.

    Raises:
        ZeroDivisionError: if `batch_iterator` yields no batches.
    """
    assert learning_rate is not None
    start_time = time.time()
    tick_time = start_time  # for showing status
    total_cost = 0.0  # sum of the per-batch loss values
    total_batches = 0
    total_words = 0

    if train:
        train_op = lm.train_step_
        use_dropout = True
        loss = lm.train_loss_
    else:
        # Created once per call, not once per batch, so evaluation does not
        # keep adding new ops to the graph.
        train_op = tf.no_op()
        use_dropout = False  # no dropout at test time
        loss = lm.loss_  # true loss, if train_loss is an approximation

    # The fetch list is the same for every batch.
    ops = [loss, lm.final_h_, train_op]

    for i, (w, y) in enumerate(batch_iterator):
        # At first batch in epoch, get a clean initial state.
        if i == 0:
            h = session.run(lm.initial_h_, {lm.input_w_: w})

        feed_dict = {
            lm.input_w_: w,
            lm.target_y_: y,
            lm.initial_h_: h,
            lm.learning_rate_: learning_rate,
            lm.use_dropout_: use_dropout
        }
        # Rebind h to the final state so the next batch continues from it.
        cost, h, _ = session.run(ops, feed_dict)

        total_cost += cost
        total_batches = i + 1
        total_words += w.size  # w.size = batch_size * max_time

        ##
        # Print average loss-so-far for epoch
        # If using train_loss_, this may be an underestimate.
        if verbose and (time.time() - tick_time >= tick_s):
            avg_cost = total_cost / total_batches
            avg_wps = total_words / (time.time() - start_time)
            print("[batch {:d}]: seen {:d} words at {:.1f} wps, loss = {:.3f}".format(
                i, total_words, avg_wps, avg_cost))
            tick_time = time.time()  # reset time ticker

    return total_cost / total_batches

def score_dataset(lm, session, ids, name="Data"):
    """Evaluate `lm` on `ids`, print the result and return the average loss.

    One non-training epoch is run over batches generated from `ids`.  The
    printed line shows `name`, the average loss and the perplexity, computed
    as exp(average loss).
    """
    # For scoring, we can use larger batches to speed things up.
    batches = utils.rnnlm_batch_generator(ids, batch_size=100, max_time=100)
    avg_loss = run_epoch(lm, session, batches,
                         train=False, verbose=False,
                         tick_s=3600, learning_rate=0.01)
    report = "{:s}: avg. loss: {:.03f}  (perplexity: {:.02f})".format(
        name, avg_loss, np.exp(avg_loss))
    print(report)
    return avg_loss

def article_to_ids(articles):
    """Concatenate per-article ID sequences into one flat NumPy array.

    @param articles: iterable of ID sequences, one per article
    @returns 1-D array holding all IDs in article order
    """
    return np.array([token_id for article in articles for token_id in article])

def train_test_split(df, id_list_column, split_frac=0.8, do_shuffle=True):
    """Split the articles of `df` into flat train and test ID arrays.

    The split is made at article granularity: the first
    int(split_frac * number_of_articles) articles (after the optional
    shuffle) go to the train set, the rest to the test set.  Each set is then
    flattened into one array of IDs.

    @param df: dataframe with one article per row
    @param id_list_column: name of the column holding each article's ID list
    @param split_frac: fraction of the articles assigned to the train set
    @param do_shuffle: if truthy, shuffle the articles before splitting; the
        value itself is passed to np.random.RandomState as the seed, so the
        same value always yields the same order
    @returns (train_ids, test_ids) as NumPy arrays
    """
    sequences = list(df[id_list_column])
    # Fill a 1-D object array element by element.  np.array(..., dtype=object)
    # would instead build a 2-D array whenever all articles have the same
    # length, and the split would then count tokens instead of articles.
    articles = np.empty(len(sequences), dtype=object)
    for pos, seq in enumerate(sequences):
        articles[pos] = seq

    if do_shuffle:
        rng = np.random.RandomState(do_shuffle)
        rng.shuffle(articles)  # in-place

    train_split = int(split_frac * len(articles))
    train_ids = article_to_ids(articles[:train_split])
    test_ids = article_to_ids(articles[train_split:])
    return train_ids, test_ids

We're now ready to train the model.  We will use the embeddings instead of the words themselves.  Let's set up some parameters, and then run the training.

In [None]:
# Training parameters
# max_time and batch_size are handed to utils.rnnlm_batch_generator when the
# training batches are built; num_epochs is the number of training passes.
# learning_rate only takes effect if it is passed on to run_epoch.
max_time = 25
batch_size = 100
learning_rate = 0.01
num_epochs = 10

# Model parameters, forwarded as keyword arguments to rnnlm.RNNLM.
# V is the number of words in the embedding model's vocabulary.
# NOTE(review): H, softmax_ns and num_layers are presumably the hidden size,
# the sampled-softmax sample count and the number of RNN layers -- confirm
# against rnnlm.RNNLM.
model_params = dict(V=len(model.vocab), 
                    H=200, 
                    softmax_ns=200,
                    num_layers=2)

# Checkpoint locations: one checkpoint per epoch is saved under
# checkpoint_filename, and the final model under trained_filename.
TF_SAVEDIR = "/tmp/w266/final_project"
checkpoint_filename = os.path.join(TF_SAVEDIR, "rnnlm")
trained_filename = os.path.join(TF_SAVEDIR, "rnnlm_trained")

# Flat ID arrays for training and evaluation, split at article granularity.
train_ids, test_ids = train_test_split(fakenews_df, 'embedding_ids')    

In [None]:
# Uncomment if you want to debug upon exception
# %pdb

# Pick up any edits made to the rnnlm module since it was first imported.
reload(rnnlm)

# Minimum number of seconds between progress lines printed while training.
print_interval = 5

lm = rnnlm.RNNLM(**model_params)
lm.BuildCoreGraph()
lm.BuildTrainGraph()

# Explicitly add global initializer and variable saver to LM graph
with lm.graph.as_default():
    initializer = tf.global_variables_initializer()
    saver = tf.train.Saver()

# Clear old log directory and start from an empty one
shutil.rmtree(TF_SAVEDIR, ignore_errors=True)
os.makedirs(TF_SAVEDIR, exist_ok=True)

with tf.Session(graph=lm.graph) as session:
    # Seed RNG for repeatability
    # NOTE(review): the seed is set after the graph and its initializer have
    # been built; a graph-level seed may only apply to ops created afterwards,
    # so consider seeding lm.graph before BuildCoreGraph() -- confirm.
    tf.set_random_seed(42)

    session.run(initializer)

    for epoch in range(1, num_epochs + 1):
        t0_epoch = time.time()
        bi = utils.rnnlm_batch_generator(train_ids, batch_size, max_time)
        print("[epoch {:d}] Starting epoch {:d}".format(epoch, epoch))

        # Run a training epoch.  The configured learning rate and progress
        # interval are passed explicitly so that editing them has an effect.
        run_epoch(lm, session, bi, train=True,
                  verbose=True, tick_s=print_interval,
                  learning_rate=learning_rate)

        print("[epoch {:d}] Completed in {:s}".format(epoch, utils.pretty_timedelta(since=t0_epoch)))

        # Save a checkpoint
        saver.save(session, checkpoint_filename, global_step=epoch)

        ##
        # score_dataset will run a forward pass over the entire dataset
        # and report perplexity scores. This can be slow (around 1/2 to
        # 1/4 as long as a full epoch), so it is only run after the last
        # epoch.
        if epoch == num_epochs:
            print("[epoch {:d}]".format(epoch), end=" ")
            score_dataset(lm, session, train_ids, name="Train set")
            print("[epoch {:d}]".format(epoch), end=" ")
            score_dataset(lm, session, test_ids, name="Test set")
            print("")

    # Save final model
    saver.save(session, trained_filename)

TODO: Run predictions, look at perplexity.