In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

from IPython.display import HTML

In [2]:
corpus = ['The cat sat on the mat', 'The dog sat on the mat', 'The goat sat on the mat']

In [3]:
#This countvectorizer takes care of a lot of preprocessing under the hood
# stop_words, tokenizing, lowercasing everything
vectorizer = CountVectorizer(lowercase=True, analyzer='word', binary=False)

In [12]:
#Fitting and transforming model
representation = vectorizer.fit_transform(corpus)

print(vectorizer.vocabulary_.keys())

df_rep = pd.DataFrame(data=representation.toarray(), columns=sorted(vectorizer.vocabulary_.keys()))

#Shows the frequency that each word shows up in the sentence
df_rep

dict_keys(['goat', 'sat', 'cat', 'mat', 'on', 'dog', 'the'])


Unnamed: 0,cat,dog,goat,mat,on,sat,the
0,1,0,0,1,1,1,2
1,0,1,0,1,1,1,2
2,0,0,1,1,1,1,2


In [23]:
#This also gets rid of stopwords
vectorizer = CountVectorizer(lowercase=True, analyzer='word', binary=True,
                             stop_words='english')

In [24]:
rep = vectorizer.fit_transform(corpus)

df_rep = pd.DataFrame(data=rep.toarray(), columns=vectorizer.vocabulary_.keys())

In [25]:
#This is a binary version, so no longer number of times the word shows up in sentence,
# but binary variable of whether or not sentence appears in each sentence
df_rep

Unnamed: 0,sat,mat,goat,dog,cat
0,1,0,0,1,1
1,0,1,0,1,1
2,0,0,1,1,1


In [27]:
vectorizer.vocabulary_

{'cat': 0, 'dog': 1, 'goat': 2, 'mat': 3, 'sat': 4}

In [31]:
training_corpus = ['The cat sat on the mat', 'The dog sat on the mat', 'The goat sat on the mat', 'The elephant sat on the mat', 
          'The plane sat on the mat', 'The apple sat on the mat', 'The pen sat on the mat', 'The notebook sat on the mat']

allowed = [1,1,1,1,   # Objects that are allowed on the mat
           0,0,0,0]   # Objects that are not allowed on the mat

# Make sure that words we'll use in the test set are considered
for other_object in ['keyboard', 'bird']:
    training_corpus.append(other_object)   
    
vectorizer = CountVectorizer(lowercase=True, analyzer='word', binary=True, stop_words='english')
representation = vectorizer.fit_transform(training_corpus)
representation_df = pd.DataFrame(data = representation.toarray(), columns=sorted(vectorizer.vocabulary_.keys()))
representation_df

Unnamed: 0,apple,bird,cat,dog,elephant,goat,keyboard,mat,notebook,pen,plane,sat
0,0,0,1,0,0,0,0,1,0,0,0,1
1,0,0,0,1,0,0,0,1,0,0,0,1
2,0,0,0,0,0,1,0,1,0,0,0,1
3,0,0,0,0,1,0,0,1,0,0,0,1
4,0,0,0,0,0,0,0,1,0,0,1,1
5,1,0,0,0,0,0,0,1,0,0,0,1
6,0,0,0,0,0,0,0,1,0,1,0,1
7,0,0,0,0,0,0,0,1,1,0,0,1
8,0,0,0,0,0,0,1,0,0,0,0,0
9,0,1,0,0,0,0,0,0,0,0,0,0


In [33]:
#Text Classification Model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

logr = LogisticRegression()

y = allowed
X = representation_df[:len(y)]

logr.fit(X,y)

print("Training Accuracy Score:{} %".format(accuracy_score(logr.predict(X), y)*100))

Training Accuracy Score:100.0 %


In [35]:
#Now that we've fit our model, we want to test it on new words.
#Since we've already fit our vectorizer, we now just use .transform 
#to convert the new strings to a one-hot encoded matrix

test_corpus = ['The keyboard sat on the mat', 'The bird sat on the mat']

rep = vectorizer.transform(test_corpus)

X_test =  rep
y_test = [0,1]
print("Expected Results for (keyboard, bird):  {}".format(y_test))
print("Actual   Results for (keyboard, bird):  {}".format(logr.predict(X_test)))

Expected Results for (keyboard, bird):  [0, 1]
Actual   Results for (keyboard, bird):  [0 0]


In [38]:
from itertools import product

In [42]:
animals = ['cat','dog','goat','elephant','eagle','zebra','rhino', 'hippo']
actions = ['sat','stood','jumped','slept']
furniture = ['mat','rug','sofa','bed']

# Generate all combinations of animal, action and furniture
animal_corpus = ['The {} {} on the {}'.format(x[0], x[1], x[2]) for x in itertools.product(animals, actions, furniture)]
vocabulary_size = len(animals) + len(actions) + len(furniture) + 2

print("There are {} sentences in the corpus, with a vocabulary of {} words".format(len(animal_corpus), vocabulary_size))

#So you can use product to get every single combination of three words
list(product(animals, actions, furniture))

There are 128 sentences in the corpus, with a vocabulary of 18 words


[('cat', 'sat', 'mat'),
 ('cat', 'sat', 'rug'),
 ('cat', 'sat', 'sofa'),
 ('cat', 'sat', 'bed'),
 ('cat', 'stood', 'mat'),
 ('cat', 'stood', 'rug'),
 ('cat', 'stood', 'sofa'),
 ('cat', 'stood', 'bed'),
 ('cat', 'jumped', 'mat'),
 ('cat', 'jumped', 'rug'),
 ('cat', 'jumped', 'sofa'),
 ('cat', 'jumped', 'bed'),
 ('cat', 'slept', 'mat'),
 ('cat', 'slept', 'rug'),
 ('cat', 'slept', 'sofa'),
 ('cat', 'slept', 'bed'),
 ('dog', 'sat', 'mat'),
 ('dog', 'sat', 'rug'),
 ('dog', 'sat', 'sofa'),
 ('dog', 'sat', 'bed'),
 ('dog', 'stood', 'mat'),
 ('dog', 'stood', 'rug'),
 ('dog', 'stood', 'sofa'),
 ('dog', 'stood', 'bed'),
 ('dog', 'jumped', 'mat'),
 ('dog', 'jumped', 'rug'),
 ('dog', 'jumped', 'sofa'),
 ('dog', 'jumped', 'bed'),
 ('dog', 'slept', 'mat'),
 ('dog', 'slept', 'rug'),
 ('dog', 'slept', 'sofa'),
 ('dog', 'slept', 'bed'),
 ('goat', 'sat', 'mat'),
 ('goat', 'sat', 'rug'),
 ('goat', 'sat', 'sofa'),
 ('goat', 'sat', 'bed'),
 ('goat', 'stood', 'mat'),
 ('goat', 'stood', 'rug'),
 ('goat', 'st

In [43]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import skipgrams
import numpy as np

# Hyper-parameters

EMBEDDING_SIZE = 7  # Small corpus, so we're using a small dimension
WINDOW_SIZE = 4     # Empirically found to work well

# Convert text to numerical sequences

# Note that the Tokenizer starts numbering words with 1.  So we have vocabulary_size+1 words.  The 0-th word
# is considered to be the 'Out-of-vocabulary' token.
tokenizer = Tokenizer(num_words=vocabulary_size+1, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~ ', lower=True, split=' ',)
tokenizer.fit_on_texts(animal_corpus)
sequences = tokenizer.texts_to_sequences(animal_corpus)

In [45]:
sequences

[[1, 11, 3, 2, 1, 4],
 [1, 11, 3, 2, 1, 5],
 [1, 11, 3, 2, 1, 6],
 [1, 11, 3, 2, 1, 7],
 [1, 11, 8, 2, 1, 4],
 [1, 11, 8, 2, 1, 5],
 [1, 11, 8, 2, 1, 6],
 [1, 11, 8, 2, 1, 7],
 [1, 11, 9, 2, 1, 4],
 [1, 11, 9, 2, 1, 5],
 [1, 11, 9, 2, 1, 6],
 [1, 11, 9, 2, 1, 7],
 [1, 11, 10, 2, 1, 4],
 [1, 11, 10, 2, 1, 5],
 [1, 11, 10, 2, 1, 6],
 [1, 11, 10, 2, 1, 7],
 [1, 12, 3, 2, 1, 4],
 [1, 12, 3, 2, 1, 5],
 [1, 12, 3, 2, 1, 6],
 [1, 12, 3, 2, 1, 7],
 [1, 12, 8, 2, 1, 4],
 [1, 12, 8, 2, 1, 5],
 [1, 12, 8, 2, 1, 6],
 [1, 12, 8, 2, 1, 7],
 [1, 12, 9, 2, 1, 4],
 [1, 12, 9, 2, 1, 5],
 [1, 12, 9, 2, 1, 6],
 [1, 12, 9, 2, 1, 7],
 [1, 12, 10, 2, 1, 4],
 [1, 12, 10, 2, 1, 5],
 [1, 12, 10, 2, 1, 6],
 [1, 12, 10, 2, 1, 7],
 [1, 13, 3, 2, 1, 4],
 [1, 13, 3, 2, 1, 5],
 [1, 13, 3, 2, 1, 6],
 [1, 13, 3, 2, 1, 7],
 [1, 13, 8, 2, 1, 4],
 [1, 13, 8, 2, 1, 5],
 [1, 13, 8, 2, 1, 6],
 [1, 13, 8, 2, 1, 7],
 [1, 13, 9, 2, 1, 4],
 [1, 13, 9, 2, 1, 5],
 [1, 13, 9, 2, 1, 6],
 [1, 13, 9, 2, 1, 7],
 [1, 13, 10, 2, 1, 4],
 

In [46]:
np.hstack(sequences)

array([ 1, 11,  3,  2,  1,  4,  1, 11,  3,  2,  1,  5,  1, 11,  3,  2,  1,
        6,  1, 11,  3,  2,  1,  7,  1, 11,  8,  2,  1,  4,  1, 11,  8,  2,
        1,  5,  1, 11,  8,  2,  1,  6,  1, 11,  8,  2,  1,  7,  1, 11,  9,
        2,  1,  4,  1, 11,  9,  2,  1,  5,  1, 11,  9,  2,  1,  6,  1, 11,
        9,  2,  1,  7,  1, 11, 10,  2,  1,  4,  1, 11, 10,  2,  1,  5,  1,
       11, 10,  2,  1,  6,  1, 11, 10,  2,  1,  7,  1, 12,  3,  2,  1,  4,
        1, 12,  3,  2,  1,  5,  1, 12,  3,  2,  1,  6,  1, 12,  3,  2,  1,
        7,  1, 12,  8,  2,  1,  4,  1, 12,  8,  2,  1,  5,  1, 12,  8,  2,
        1,  6,  1, 12,  8,  2,  1,  7,  1, 12,  9,  2,  1,  4,  1, 12,  9,
        2,  1,  5,  1, 12,  9,  2,  1,  6,  1, 12,  9,  2,  1,  7,  1, 12,
       10,  2,  1,  4,  1, 12, 10,  2,  1,  5,  1, 12, 10,  2,  1,  6,  1,
       12, 10,  2,  1,  7,  1, 13,  3,  2,  1,  4,  1, 13,  3,  2,  1,  5,
        1, 13,  3,  2,  1,  6,  1, 13,  3,  2,  1,  7,  1, 13,  8,  2,  1,
        4,  1, 13,  8,  2