In [1]:
## Run this if you need to download the GloVe embeddings
#! wget -nc http://nlp.stanford.edu/data/glove.6B.zip && unzip -n glove.6B.zip

In [2]:
import os
import re
import numpy as np

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Sequential
from keras.initializers import Constant

# Directory in which the GloVe text file is looked up; here it is the
# notebook's current working directory.
BASE_DIR = os.getcwd()
GLOVE_DIR = BASE_DIR
# Length every tokenized document is padded to (also the Embedding layer's
# input_length).
MAX_SEQUENCE_LENGTH = 500
# Vocabulary cap handed to the Tokenizer and used to bound the rows of the
# embedding matrix.
MAX_NUM_WORDS = 20000
# Width of each word vector; the notebook loads the 100-dimensional GloVe file.
EMBEDDING_DIM = 100

Using TensorFlow backend.


### Sample doc

In [3]:
# Two sample passages of English text; together they form the small corpus
# that the tokenizer is fitted on below.
doc1 = '''Preamble
1.We, the Heads of State and Government and high-level representatives,having met in Apia from 1 to 4September 2014 at the third International Conference on Small Island Developing States, with the full participation of civil society and relevant stakeholders, reaffirm our commitment to the sustainable development of small island developing  States.  This  can  be  achieved  only  with  a  broad  alliance  of  people, governments, civil society and the private sector all working together to achieve the future we want for present and future generations.
2.We reaffirm the commitments we made at United Nations conferences and summits on sustainable development: the Rio Declaration on Environment and Development,1Agenda21,2the Programme for the Further Implementation of Agenda21,3the Plan of Implementation of the World Summit on Sustainable Development (Johannesburg Plan of Implementation),4including chapter VII, on thesustainable development of small island developing States, and the Johannesburg Declaration on Sustainable Development,5the Programme of Action for the Sustainable Development of Small Island Developing States (Barbados Programme of Action)6and the Mauritius Strategy for the Further Implementation of the Programme of Action for the Sustainable Development of Small Island  Developing  States  (Mauritius  Strategy),7and  the  outcome  document  of  the United  Nations  Conference  on  Sustainable  Development,  entitled “The  future  we want”.8We further underscore that these processes are still being implemented and that there is a need for a more integrated approach to the sustainable development of small island developing States, with the support of the international community and all stakeholders.
'''
doc2 = '''To that end, declare the following:
1.We acknowledge the 60-year legacy and continuing significant role of the United Nations congresses on crime prevention and criminal justice as one of the largest  and  most  diverse  international  forumsfor  the  exchange  of  views  and experiences in research, law and policy and programme development between States, intergovernmental  organizations  and  individual  experts  representing  various professions and disciplines in order to identify emerging trends and issues in the field of crime prevention and criminal justice. We recognize the unique and important contributions of the congresses to law and policy development, as well as to the identification of emerging trends and issues in crime prevention and criminal justice. 
2.We reaffirm  the cross-cutting  nature  of  crime  prevention and  criminal justice issues and the consequent need to integrate those issues into the wider agenda of the United Nations in order to enhance system-wide coordination. We look forward to the future contributions of the Commission on Crime Prevention and Criminal Justice with regard to designing and implementing national and international crime prevention and criminal justice policies and programmes, taking into account and building upon the recommendations of the congresses. 
'''
# Corpus list consumed by the tokenizer.
docs = [doc1, doc2]

### Tokenize and vectorize

In [4]:
# Fit a tokenizer, capped at MAX_NUM_WORDS, on the sample documents.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(docs)

# Vocabulary learned during fitting: word -> integer index.
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Encode each document as a list of word indices.
sequences = tokenizer.texts_to_sequences(docs)

Found 191 unique tokens.


### Pad docs for consistent length

In [5]:
# Pad every index sequence to a common length of MAX_SEQUENCE_LENGTH.
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

# Check the padded array against the expected shape. Comparing two rows of a
# rectangular array is always true and fails outright with a single document.
data.shape == (len(sequences), MAX_SEQUENCE_LENGTH)

True

### Build GloVe index mapping words to vectors

In [6]:
def glove_indx(gloveFile: str, glove_dir: str = None):
    """Load a GloVe text file into a ``{word: vector}`` dictionary.

    Every non-blank line is split into a leading token and the remaining
    space-separated numbers, which are parsed into a float32 array.

    Args:
        gloveFile: Name of the GloVe text file (joined onto ``glove_dir``).
        glove_dir: Directory containing ``gloveFile``. When omitted, the
            notebook-level ``GLOVE_DIR`` is used.

    Returns:
        dict mapping each word to a 1-D float32 ``numpy`` array.
    """
    if glove_dir is None:
        glove_dir = GLOVE_DIR
    embeddings_index = {}
    # Read as UTF-8 explicitly so that non-ASCII tokens load the same way on
    # every platform instead of depending on the locale's default encoding.
    with open(os.path.join(glove_dir, gloveFile), encoding='utf-8') as f:
        for line in f:
            # A blank line has no token to unpack; skip it instead of raising.
            if not line.strip():
                continue
            word, coefs = line.split(maxsplit=1)
            coefs = np.fromstring(coefs, 'f', sep=' ')
            embeddings_index[word] = coefs
    return embeddings_index

# Derive the GloVe file name from EMBEDDING_DIM so the loaded vector width and
# the embedding matrix width cannot drift apart.
embeddings_index = glove_indx('glove.6B.%dd.txt' % EMBEDDING_DIM)
sample_word = 'million'
print('Total words in Glove:', len(embeddings_index))
print('Sample word `%s` dim:' % (sample_word) , len(embeddings_index[sample_word]))

# Fail here, rather than later during matrix assignment, if the widths differ.
assert len(embeddings_index[sample_word]) == EMBEDDING_DIM

Total words in Glove: 400000
Sample word `million` dim: 100


### Map GloVe to word_index

In [7]:
# Row i of the matrix receives the GloVe vector of the word whose tokenizer
# index is i; rows start out as zeros.
num_words = min(MAX_NUM_WORDS, len(word_index) + 1)
embedding_matrix = np.zeros(shape=(num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    # Indices at or beyond the vocabulary cap have no row in the matrix.
    if i < MAX_NUM_WORDS:
        embedding_vector = embeddings_index.get(word)
        # Words missing from the GloVe index keep their all-zero row.
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

# Drop the GloVe lookup table now that the matrix has been filled.
del embeddings_index

### Create Embedding layer

In [8]:
# Wrap the pre-trained embedding matrix in an Embedding layer.
# trainable=False keeps the embeddings fixed.
embedding_layer = Embedding(
    input_dim=num_words,
    output_dim=EMBEDDING_DIM,
    embeddings_initializer=Constant(embedding_matrix),
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)

### Use Keras to embed docs

In [9]:
# Build a model that consists solely of the embedding layer.
model = Sequential([embedding_layer])

# continue to add layers to model as needed
# model.add(Dense(32, input_dim=784))
# model.add(Activation('relu'))

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 500, 100)          19200     
Total params: 19,200
Trainable params: 0
Non-trainable params: 19,200
_________________________________________________________________


In [10]:
# Run the padded index sequences through the embedding-only model.
embedded_doc = model.predict(data)

# Sanity check: one EMBEDDING_DIM-wide vector per padded position per document.
assert embedded_doc.shape == (len(data), MAX_SEQUENCE_LENGTH, EMBEDDING_DIM)
embedded_doc.shape

(2, 500, 100)