### Generating and using embeddings

We will use the pre-trained GloVe dataset to convert some sentences into N-dimensional vectors.

First, we will use the "gensim" package to load the dataset into an easily consumable model.

In [None]:
# Load pre-trained GloVe vectors
import numpy as np

import pandas as pd
import csv

def load_glove_model_v1(dim=50):
    """Read a GloVe text file into a Pandas dataframe.

    The file ``glove.6B.<dim>d.txt`` is looked up relative to the current
    working directory and parsed as a space-separated table without a header
    row.  The first column (the word) becomes the index; the remaining
    columns hold the vector components.  Quote handling and NaN detection
    are both switched off, so that tokens such as a quote character or the
    word "null" are read as ordinary words.

    Returns: embedding dataframe indexed by word
    """
    parse_options = dict(
        sep=" ",
        index_col=0,
        header=None,
        quoting=csv.QUOTE_NONE,
        na_values=None,
        keep_default_na=False,
    )
    return pd.read_table('glove.6B.%dd.txt' % dim, **parse_options)

def load_glove_model_v2(dim=50):
    """Load GloVe vectors as a gensim KeyedVectors model.

    The model is loaded through gensim's word2vec text-format reader, so the
    GloVe file ``glove.6B.<dim>d.txt`` is first converted into
    ``glove.6B.<dim>d.txt.w2v``.  The converted file stays on disk and is
    reused: the conversion only runs when that file does not exist yet.
    Both files are looked up relative to the current working directory.
    Adapted from: https://stackoverflow.com/a/47465278
    """
    from pathlib import Path
    from gensim.models.keyedvectors import KeyedVectors
    from gensim.scripts.glove2word2vec import glove2word2vec

    source_name = 'glove.6B.%dd.txt' % dim
    converted_name = '%s.w2v' % source_name

    already_converted = Path(converted_name).exists()
    if not already_converted:
        glove2word2vec(glove_input_file=source_name,
                       word2vec_output_file=converted_name)

    return KeyedVectors.load_word2vec_format(converted_name, binary=False)

# We will use v2, because it is more versatile
# `dim` selects which glove.6B.<dim>d.txt file is loaded.  Both `dim` and
# `model` are module-level names that the following cells read directly.
dim = 50
model = load_glove_model_v2(dim)

Let's build the embedding matrix for this model.

In [None]:
# Build an embedding matrix out of embedding vectors.
# Row i holds the vector of the word whose vocabulary index is i, so the
# indices taken from model.vocab[word].index can be used as row numbers.
embedding_matrix = np.zeros((len(model.vocab), dim))
for word_index, word in enumerate(model.index2word):
    # Every entry of index2word is a vocabulary word, and model[word] raises
    # KeyError for an unknown word rather than returning None, so no
    # "is not None" guard is needed here.
    embedding_matrix[word_index] = model[word]

Let's try to run this embedding model on a sample sentence within TensorFlow.

In [None]:
# Adapted from: http://adventuresinmachinelearning.com/gensim-word2vec-tutorial/

# Print a lookup for a sample sentence with word IDs [1, 5, 10]
import tensorflow  as tf

# NOTE(review): reset_default_graph / placeholder / Session look like the
# TensorFlow 1.x graph-mode API -- presumably this notebook targets TF 1.x;
# confirm before running on a newer release.
tf.reset_default_graph()
# A batch of sentences given as int32 word IDs; both dimensions are left
# unspecified (shape=[None, None]).
sentences = tf.placeholder(tf.int32, shape=[None,None])

# Hold the pre-built matrix in a variable marked trainable=False, so the
# pre-trained vectors are not treated as trainable parameters.
saved_embeddings = tf.constant(embedding_matrix)
embedding = tf.Variable(initial_value=saved_embeddings, trainable=False)
# Pick the rows of `embedding` addressed by the IDs fed into `sentences`.
embedding_lookup = tf.nn.embedding_lookup(embedding, sentences)
with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # Feed one sentence made of the word IDs 1, 5 and 10.
        print(sess.run(embedding_lookup,
                       feed_dict={sentences:[[1, 5, 10]]}))

It works.  Next, we will convert our dataset into a form suitable for this training.  We should also experiment with the [GoogleNews word2vec dataset](http://mccormickml.com/2016/04/12/googles-pretrained-word2vec-model-in-python/), because it is more relevant to a Fake News project.

### Look up embeddings for our dataset

We first load our data into a dataframe:

In [None]:
import pandas as pd
import re
import os

# Matches runs of characters that are neither ASCII letters, digits nor
# whitespace.  Whitespace is deliberately kept: tabs, newlines and
# non-breaking spaces separate words, and deleting them would glue the
# neighbouring words together.  Compiled once instead of on every call.
_NON_WORD_CHARS = re.compile(r"[^A-Za-z0-9\s]+")


def get_word_list(article):
    """Split an article into lower-case alphanumeric words.

    The text is lower-cased, every character that is not an ASCII letter,
    a digit or whitespace is deleted (so "don't" becomes "dont" and
    "stop-me" becomes "stopme"), and the result is split on whitespace.

    @param article: Article text as a string.
    @returns List of words; empty if the article contains no letters or
        digits.
    """
    words_only = _NON_WORD_CHARS.sub("", article.lower())
    return words_only.split()

def tabulate_data(dataset_name,
                  data_dir='../../data/fakeNewsDatasets_Perez-Rosas2018'):
    """Create a Pandas dataframe out of input Perez-Rosas dataset files.

    Every regular file in ``<data_dir>/<dataset_name>/fake`` and
    ``<data_dir>/<dataset_name>/legit`` becomes one row.  Files are visited
    in sorted name order so that the row order is reproducible.

    A file is expected to hold the headline on its first line, optionally
    followed by a blank separator line, and then the article body.  A file
    with text on its first line only (with or without trailing newlines) is
    treated as an article body without a headline.

    @param dataset_name: Name of the dataset sub-folder, e.g.
        'fakeNewsDataset'.  Only for 'fakeNewsDataset' is news_category
        derived from the file name; for any other dataset it is None.
    @param data_dir: Root folder that contains the dataset sub-folders.
    @returns Pandas dataframe with columns:
        dataset_name, news_type, is_fake (1 for fake, 0 for legit),
        news_category, file_name, news_headline, news_content,
        news_all (all lines of the file joined by spaces),
        news_word_list, num_words
    """
    def remove_numbers(in_str):
        return re.sub(r'[0-9]+', '', in_str)

    result_data_list = []
    for news_type in ['fake', 'legit']:
        folder = os.path.join(data_dir, dataset_name, news_type)
        # os.listdir returns entries in arbitrary order; sort them so that
        # positional access to the dataframe is stable between runs.
        for fname in sorted(os.listdir(folder)):
            filepath = os.path.join(folder, fname)
            if not os.path.isfile(filepath):
                # Sub-directories cannot be read as articles.
                continue

            with open(filepath, 'r', encoding="utf8") as f:
                file_data = f.read().split('\n')

            # Some articles don't have a headline, but only article body.
            # Decide on the actual text after the first line rather than on
            # the raw line count: a trailing newline alone must not turn a
            # body-only file into "headline with empty body".
            has_body = any(line.strip() for line in file_data[1:])
            if has_body:
                news_headline = file_data[0]
                # Skip the separator line only when it really is blank, so a
                # body that starts right after the headline is not lost.
                body_start = 1 if file_data[1].strip() else 2
                news_content_data = ' '.join(file_data[body_start:])
            else:
                news_headline = ''
                news_content_data = file_data[0]

            if dataset_name == 'fakeNewsDataset':
                # File names start with the category followed by a number;
                # keep the part before the first dot and drop the digits.
                news_category = remove_numbers(fname.split('.')[0])
            else:
                # Keep the column present for every dataset.
                news_category = None

            news_word_list = get_word_list(news_content_data)
            result_data_list.append({
                'dataset_name': dataset_name,
                'news_type': news_type,
                'is_fake': 1 if news_type == 'fake' else 0,
                'news_category': news_category,
                'file_name': fname,
                'news_headline': news_headline,
                'news_content': news_content_data,
                'news_all': ' '.join(file_data),
                'news_word_list': news_word_list,
                'num_words': len(news_word_list),
            })
    df = pd.DataFrame(result_data_list)
    return df

In [None]:
# Load the 'fakeNewsDataset' part of the corpus and preview its first rows.
fakenews_df = tabulate_data('fakeNewsDataset')
fakenews_df.head()

Next, we look up each word of every article in the vocabulary of our embedding model.  If the word is found, we record its index within the vocabulary; words that are not in the vocabulary are skipped.

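To run this over every article, we use the pandas "apply" function (note that it calls the lookup once per row, so it is a convenience rather than a truly vectorized operation).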

In [None]:
def word_to_id(word_list, vocab=None):
    """Map words to their indices in an embedding vocabulary.

    @param word_list: Iterable of words (strings).
    @param vocab: Mapping from a word to an entry that has an ``index``
        attribute.  Defaults to the vocabulary of the module-level ``model``.
    @returns List with the index of every word that is in ``vocab``, in
        input order.  Words missing from the vocabulary are dropped, so the
        result may be shorter than ``word_list``.
    """
    if vocab is None:
        vocab = model.vocab
    # One lookup per word instead of a membership test followed by a lookup.
    entries = (vocab.get(word) for word in word_list)
    return [entry.index for entry in entries if entry is not None]
# Translate every article's word list into vocabulary indices, then record
# how many of its words were found in the vocabulary.
fakenews_df['embedding_ids'] = fakenews_df['news_word_list'].apply(word_to_id)
fakenews_df['num_embedding_ids'] = fakenews_df['embedding_ids'].apply(len)

In [None]:
# Display the dataframe, including the embedding_ids and num_embedding_ids
# columns.
fakenews_df

Let's try to print the embeddings for one of the articles.

In [None]:
# Use the embedding IDs of the first row of the dataframe as the sample.
sample_article = fakenews_df['embedding_ids'].tolist()[0]
print(sample_article)

import tensorflow  as tf

# Start from a fresh default graph and rebuild the lookup: an int32
# placeholder with both dimensions unspecified, and the embedding matrix held
# in a variable marked trainable=False.
tf.reset_default_graph()
sentences = tf.placeholder(tf.int32, shape=[None,None])

saved_embeddings = tf.constant(embedding_matrix)
embedding = tf.Variable(initial_value=saved_embeddings, trainable=False)
embedding_lookup = tf.nn.embedding_lookup(embedding, sentences)

# Broken at the moment
# NOTE(review): the failure is not described here and cannot be determined
# from this cell alone -- reproduce it and record the error before relying
# on this output.
with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # The article is wrapped in a list so that it is fed as a batch
        # containing a single sentence.
        print(sess.run(embedding_lookup,
                       feed_dict={sentences: [sample_article]}))