In [28]:
import pandas as pd
import numpy as np
import re
import nltk
import matplotlib.pyplot as plt

# Raise pandas' display limit for column contents to 200 characters.
pd.options.display.max_colwidth = 200
# IPython line magic selecting matplotlib's inline backend for this notebook.
%matplotlib inline

# Topic label of each toy document, listed in the same order as the documents.
labels = ['weather', 'weather', 'animals', 'food', 'food', 'animals', 'weather', 'animals']

# Eight-sentence toy corpus, held as a NumPy string array so it can be fed
# straight into vectorized functions.
corpus = np.array([
    'The sky is blue and beautiful.',
    'Love this blue and beautiful sky!',
    'The quick brown fox jumps over the lazy dog.',
    "A king's breakfast has sausages, ham, bacon, eggs, toast and beans",
    'I love green eggs, ham, sausages and bacon!',
    'The brown fox is quick and the blue dog is lazy!',
    'The sky is very blue and the sky is very beautiful today',
    'The dog is lazy but the brown fox is quick!',
])

# `columns=` pins the column order to Document, Category regardless of how the
# dict keys happen to be ordered (this is what the explicit re-selection
# `corpus_df[['Document', 'Category']]` was for).
corpus_df = pd.DataFrame(
    {'Document': corpus, 'Category': labels},
    columns=['Document', 'Category'],
)
print(corpus_df)

Unnamed: 0,Document,Category
0,The sky is blue and beautiful.,weather
1,Love this blue and beautiful sky!,weather
2,The quick brown fox jumps over the lazy dog.,animals
3,"A king's breakfast has sausages, ham, bacon, eggs, toast and beans",food
4,"I love green eggs, ham, sausages and bacon!",food
5,The brown fox is quick and the blue dog is lazy!,animals
6,The sky is very blue and the sky is very beautiful today,weather
7,The dog is lazy but the brown fox is quick!,animals


In [27]:
# Shared tokenizer instance; normalize_document() calls wpt.tokenize() on each document.
wpt = nltk.WordPunctTokenizer()
# English stop-word list; normalize_document() drops every token found in it.
# NOTE(review): presumably needs the NLTK 'stopwords' data to be available locally — confirm.
stop_words = nltk.corpus.stopwords.words('english')

def normalize_document(doc):
    """Normalize one document into a space-joined string of stemmed tokens.

    Pipeline: remove every character that is not an ASCII letter or
    whitespace, lower-case, strip, tokenize with the module-level ``wpt``
    tokenizer, drop tokens listed in the module-level ``stop_words``,
    lemmatize with WordNetLemmatizer and finally stem with PorterStemmer.

    Args:
        doc: Text of a single document (a ``str``).

    Returns:
        The surviving tokens joined by single spaces; an empty string when
        no token survives.
    """
    # The flags must be passed by keyword: the fourth positional parameter of
    # re.sub is `count`, so passing `re.I|re.A` positionally never applied the
    # flags and instead capped the number of substitutions per document.
    doc = re.sub(r'[^a-zA-Z\s]', '', doc, flags=re.I | re.A)
    doc = doc.lower()
    doc = doc.strip()
    tokens = wpt.tokenize(doc)
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # Lemmatize first, then stem the lemmas.
    lemmatizer = nltk.wordnet.WordNetLemmatizer()
    filtered_lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
    ps = nltk.porter.PorterStemmer()
    filtered_lemma_stem_tokens = [ps.stem(token) for token in filtered_lemmatized_tokens]

    return ' '.join(filtered_lemma_stem_tokens)

# Element-wise wrapper so a whole list/array of documents is normalized in one call.
# `otypes=[str]` declares the output type up front: without it np.vectorize makes an
# extra call on the first element just to infer the type, and raises ValueError
# when given an empty input.
normalize_corpus = np.vectorize(normalize_document, otypes=[str])
norm_corpus = normalize_corpus(corpus)
norm_corpus

array(['sky blue beautiful', 'love blue beautiful sky',
       'quick brown fox jumps lazy dog',
       'kings breakfast sausages ham bacon eggs toast beans',
       'love green eggs ham sausages bacon',
       'brown fox quick blue dog lazy', 'sky blue sky beautiful today',
       'dog lazy brown fox quick'], dtype='<U51')

In [42]:
from nltk.corpus import gutenberg
from string import punctuation

# King James Bible as a sequence of tokenized sentences (each one a list of tokens).
bible = gutenberg.sents('bible-kjv.txt')

# normalize_corpus() operates on strings, so every tokenized sentence is first
# glued back together with single spaces. No separate punctuation/digit removal
# is done here; that is left to normalize_corpus().
norm_bible = normalize_corpus([' '.join(tok_sent) for tok_sent in bible])
# Keep only normalized sentences with more than two tokens; this also discards
# the sentences that became empty after normalization.
norm_bible = [sentence for sentence in norm_bible if len(sentence.split()) > 2]

# Show entries 10..13 of the raw sentences, then entries 10..13 of the normalized ones.
for sentences in (bible, norm_bible):
    for position in range(10, 14):
        print(sentences[position])

['1', ':', '6', 'And', 'God', 'said', ',', 'Let', 'there', 'be', 'a', 'firmament', 'in', 'the', 'midst', 'of', 'the', 'waters', ',', 'and', 'let', 'it', 'divide', 'the', 'waters', 'from', 'the', 'waters', '.']
['1', ':', '7', 'And', 'God', 'made', 'the', 'firmament', ',', 'and', 'divided', 'the', 'waters', 'which', 'were', 'under', 'the', 'firmament', 'from', 'the', 'waters', 'which', 'were', 'above', 'the', 'firmament', ':', 'and', 'it', 'was', 'so', '.']
['1', ':', '8', 'And', 'God', 'called', 'the', 'firmament', 'Heaven', '.']
['And', 'the', 'evening', 'and', 'the', 'morning', 'were', 'the', 'second', 'day', '.']
god said let firmament midst waters let divide waters waters
god made firmament divided waters firmament waters firmament
god called firmament heaven
evening morning second day


In [47]:
# keras requires the tensorflow module. Must pip install it. 
from keras.preprocessing import text
from keras.utils import np_utils
from keras.preprocessing import sequence


# "The CBOW model architecture tries to predict the current target word (the center word) 
#  based on the source context words (surrounding words)."
# To implement the Continuous Bag of Words (CBOW) Model, we need to:
"""
- Build the corpus vocabulary
- Build a CBOW (context, target) generator
- Build the CBOW model architecture
- Train the Model
- Get Word Embeddings
"""

# Model hyper-parameters.
# NOTE(review): presumably the embedding dimensionality and the context window
# size used by the CBOW model — confirm against the model-building cell.
embed_size = 100
window_size = 2

# Fit a Keras tokenizer on the normalized sentences to build the vocabulary.
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(norm_bible)

# word -> id mapping. This is the tokenizer's own word_index object (not a copy),
# so the 'PAD' entry added below is inserted into that same dict.
word2id = tokenizer.word_index
word2id['PAD'] = 0
# id -> word: the inverse of word2id.
id2word = dict(zip(word2id.values(), word2id.keys()))
# Every sentence re-expressed as a list of word ids.
wids = [[word2id[token] for token in text.text_to_word_sequence(sentence)]
        for sentence in norm_bible]

# Vocabulary size counts the 'PAD' entry as well.
vocab_size = len(word2id)
print(vocab_size)
print(len(wids))
print(list(word2id.items())[:10])
print(list(id2word.items())[:10])


12438
29251
[('shall', 1), ('unto', 2), ('lord', 3), ('thou', 4), ('thy', 5), ('god', 6), ('ye', 7), ('said', 8), ('thee', 9), ('upon', 10)]
[(1, 'shall'), (2, 'unto'), (3, 'lord'), (4, 'thou'), (5, 'thy'), (6, 'god'), (7, 'ye'), (8, 'said'), (9, 'thee'), (10, 'upon')]
