This workbook converts all of the words in the news data to vectors using pre-trained GloVe vectors. We will start with the 50-dimensional embeddings trained on Wikipedia 2014 + Gigaword 5 (6B tokens, 400K vocab, uncased). The file (glove.6B.50d.txt) can be downloaded here: https://nlp.stanford.edu/projects/glove/

In [1]:
# Configuration: location of the GloVe text file, location of the news data,
# dimension of the embedding used, and the maximum article length in words.
# We want all of our articles to be the same number of words long: shorter articles are
# padded with 0 at the end, and longer ones are cut off at maxSeqLength.
glove_filepath = 'glove.6B/glove.6B.50d.txt'  # relative path to the 50-dimension GloVe file
datapath = 'C:/Users/mpowers/w266/w266finalproject/data/fakeNewsDatasets_Perez-Rosas2018'  # root folder of the news datasets (machine-specific absolute path)
dim = 50  # embedding dimension of the GloVe file above (not referenced by the code shown in this notebook section)
maxSeqLength = 200  # number of word positions kept per article

In [2]:
# Load pre-trained GloVe vectors
import numpy as np

def loadGloveModel(gloveFile):
    """Load pre-trained GloVe vectors from a whitespace-separated text file.

    Each non-blank line is expected to hold a word followed by its embedding
    values. Blank lines (for example an extra newline at the end of the file)
    are skipped instead of raising IndexError.

    Args:
        gloveFile: Path to the GloVe text file; it is read as UTF-8.

    Returns:
        A tuple (wordsList, embeddings, model):
        wordsList -- the words, in file order;
        embeddings -- one numpy array per word, in the same order as wordsList;
        model -- dictionary mapping each word to its embedding array.

    Raises:
        ValueError: if an embedding value cannot be parsed as a float.
    """
    print("Loading Glove Model")
    model = {}
    wordsList = []
    embeddings = []
    # The context manager closes the file even if a line fails to parse.
    with open(gloveFile, 'r', encoding="utf8") as f:
        for line in f:
            splitLine = line.split()
            if not splitLine:
                # Nothing to parse on a blank line.
                continue
            word = splitLine[0]
            embedding = np.array([float(val) for val in splitLine[1:]])
            # Append to both lists together so their positions stay aligned.
            wordsList.append(word)
            embeddings.append(embedding)
            model[word] = embedding
    print("Done.", len(model), " words loaded!")
    return wordsList, embeddings, model

# "wordsList" gives the position of a word in the embedding file, e.g. wordsList.index(word).
# "embeddings" holds the vectors in the same order, so embeddings[i] is the vector for wordsList[i].
# "model" is a dictionary from word to embedding. We will not actually use this to encode the articles, but it is useful to have.
wordsList, embeddings, model = loadGloveModel(glove_filepath)
print(model['hello'])
print(wordsList[0:5])

Loading Glove Model
Done. 400000  words loaded!
[-0.38497   0.80092   0.064106 -0.28355  -0.026759 -0.34532  -0.64253
 -0.11729  -0.33257   0.55243  -0.087813  0.9035    0.47102   0.56657
  0.6985   -0.35229  -0.86542   0.90573   0.03576  -0.071705 -0.12327
  0.54923   0.47005   0.35572   1.2611   -0.67581  -0.94983   0.68666
  0.3871   -1.3492    0.63512   0.46416  -0.48814   0.83827  -0.9246
 -0.33722   0.53741  -1.0616   -0.081403 -0.67111   0.30923  -0.3923
 -0.55002  -0.68827   0.58049  -0.11626   0.013139 -0.57654   0.048833
  0.67204 ]
['the', ',', '.', 'of', 'to']


In [33]:
# The GloVe embeddings are all lowercase. The lookup does not find an embedding if you accidentally use an uppercase letter.
# print(model['Hello'])

In [3]:
# Function that lowercases the text, replaces "<br />" with a space, and removes punctuation, parentheses, question marks, etc., leaving only ASCII letters, digits, and spaces
import re
# Matches every run of characters that is not an ASCII letter, a digit, or a space
strip_special_chars = re.compile("[^A-Za-z0-9 ]+")

def cleanArticle(string):
    """Lowercase *string*, turn "<br />" tags into spaces, and drop special characters.

    Only ASCII letters, digits, and spaces survive; every other character is
    deleted without inserting a replacement.
    """
    lowered = string.lower()
    without_breaks = lowered.replace("<br />", " ")
    return strip_special_chars.sub("", without_breaks)

# Cache for the word -> position lookup used by getArticleMatrix, so the vocabulary
# list is scanned once instead of once for every word of every article.
_wordIndexCache = {'source': None, 'size': -1, 'lookup': {}}

def _getWordIndexLookup(words):
    """Return a dict mapping each word in *words* to its first position in the list."""
    cache = _wordIndexCache
    # Rebuild when a different list object is passed or its length has changed.
    # An in-place edit that keeps the same length is not detected.
    if cache['source'] is not words or cache['size'] != len(words):
        lookup = {}
        for position, word in enumerate(words):
            # setdefault keeps the first position, matching list.index()
            lookup.setdefault(word, position)
        cache['source'] = words
        cache['size'] = len(words)
        cache['lookup'] = lookup
    return cache['lookup']

# Function that generates fixed sequences of integers corresponding to the embeddings in the embedding lookup
def getArticleMatrix(article, unknownIndex=399999):
    """Encode an article as a fixed-length array of vocabulary positions.

    The article is cleaned with cleanArticle, split on whitespace, and each of
    the first maxSeqLength words is replaced by its position in wordsList.

    Args:
        article: Raw article text.
        unknownIndex: Position stored for words that are not in wordsList.
            The default of 399999 is the last row when the vocabulary holds
            400000 words.

    Returns:
        An int32 numpy array of length maxSeqLength. Positions past the end of
        the article stay 0; note that 0 is also the position of the first
        vocabulary word, so padding cannot be told apart from that word.
    """
    articleMatrix = np.zeros(maxSeqLength, dtype='int32')
    wordIndex = _getWordIndexLookup(wordsList)
    words = cleanArticle(article).split()
    # Slicing drops everything past maxSeqLength, so no explicit break is needed.
    for position, word in enumerate(words[:maxSeqLength]):
        articleMatrix[position] = wordIndex.get(word, unknownIndex)
    return articleMatrix

In [4]:
# Loop through each news file and generate a list with each word replaced with its corresponding index in the GloVe embedding file
import os

# One fixed-length index array per article, appended in the order the files are read
embedded_news = []

for dataset_name in ('fakeNewsDataset',):
    for news_type in ('fake', 'legit'):
        folder = '%s/%s/%s' % (datapath, dataset_name, news_type)
        # NOTE: os.listdir returns the entries in arbitrary order
        for fname in os.listdir(folder):
            filepath = os.path.join(folder, fname)
            with open(filepath, 'r', encoding="utf8") as f:
                file_data = f.read().split('\n')
            # Put the lines back together with spaces so the article is one string
            article_text = ' '.join(file_data)
            embedded_news.append(getArticleMatrix(article_text))

In [6]:
# Here is an example of one of our embedded news articles: a fixed-length array of
# word positions in the GloVe vocabulary.
# We can feed these into tensorflow, along with "embeddings" as the embedding lookup
embedded_news[0]

array([  3791,   1155,  25058,      6, 399999,   3187,   3791,   1155,
        49070,      3,      0,    920,   8775,    172,   2557, 399999,
            5,   1551,      3,      0,   3791,   1155,    273,     31,
           51,  25058,      6,     26,   1267,   3336,      0, 284841,
       399999,   3187,   1155,      5,    423,   8323,    906,     76,
           62,     12,    220,    590,    165,    148,     35,    791,
            6,      7,   9520,  27478,    964,   1739,   6123,      5,
        10711,   2930,    622,      0,    289,   8038,   9388,  20594,
        14805,  13376,  29948,   9388,     19,      7,    669,      0,
         2196,   8395,      0,    590,    165,      5,    107,     33,
         1098,   1034,   1441,      4,      0,   4539,    443,    763,
          398,    220,     95,    347,   2255,     33,    114,   1311,
           12,     39,     33,  12784,    109,      3,    158,   1267,
           49,  16379,   4815,     13,      0,   2087,      3,    201,
      

In [71]:
# Here is an overview of how to use the embeddings.

# A short made-up article to show how the embedding lookup works
test_news = "This is some test news"

# getArticleMatrix returns one number per word position. Each number is the
# position of that word's embedding in the embedding lookup
print(getArticleMatrix(test_news))

# Print the position of each word so it can be compared with the array above
# ("some" is listed twice here, so its position is printed twice)
for demo_word in ("this", "is", "some", "some", "test", "news"):
    print(wordsList.index(demo_word))

# The dictionary lookup gives the same vector as the positional lookup
print(model["this"])
print(embeddings[wordsList.index("this")])

[ 37  14  77 728 172   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0]
37
14
77
77
728
172
[ 5.3074e-01  4.0117e-01 -4.0785e-01  1.5444e-01  4.7782e-01  2.0754e-01
 -2.6951e-01 -3.4023e-01 -1.0879e-01  1.0563e-01 -1.0289e-01  1.0849e-01
 -4.9681e-01 -2.5128e

In [72]:
# Here is the resulting embedded news: the first article as an array of
# word positions in the GloVe vocabulary
embedded_news[0]

array([  3791,   1155,  25058,      6, 399999,   3187,   3791,   1155,
        49070,      3,      0,    920,   8775,    172,   2557, 399999,
            5,   1551,      3,      0,   3791,   1155,    273,     31,
           51,  25058,      6,     26,   1267,   3336,      0, 284841,
       399999,   3187,   1155,      5,    423,   8323,    906,     76,
           62,     12,    220,    590,    165,    148,     35,    791,
            6,      7,   9520,  27478,    964,   1739,   6123,      5,
        10711,   2930,    622,      0,    289,   8038,   9388,  20594,
        14805,  13376,  29948,   9388,     19,      7,    669,      0,
         2196,   8395,      0,    590,    165,      5,    107,     33,
         1098,   1034,   1441,      4,      0,   4539,    443,    763,
          398,    220,     95,    347,   2255,     33,    114,   1311,
           12,     39,     33,  12784,    109,      3,    158,   1267,
           49,  16379,   4815,     13,      0,   2087,      3,    201,
      