In [1]:
# Shell escape: add the executable bit to every *.py file matched in the
# directory the notebook's shell command runs in.
!chmod +x *.py

In [2]:
#!/usr/bin/env python3
""" Bag Of Words """
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

def bag_of_words(sentences, vocab=None):
    """
    Creates a bag of words embedding matrix.

    @sentences: is a list of sentences to analyze
    @vocab: is a list of the vocabulary words to use for the analysis
            **If None: all words within sentences are used
    Returns: embeddings, features
             embeddings: is a numpy.ndarray of shape (s, f)
             containing the per-sentence word counts
                 s is the number of sentences in sentences
                 f is the number of features analyzed
             features: is a list of the features used for embeddings,
             in the same order as the columns of embeddings
    """
    vectorizer = CountVectorizer(vocabulary=vocab)
    counts = vectorizer.fit_transform(sentences)

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2. Prefer get_feature_names_out() when it exists; it returns an
    # ndarray, so convert it to keep the documented plain-list return type.
    if hasattr(vectorizer, "get_feature_names_out"):
        features = vectorizer.get_feature_names_out().tolist()
    else:
        # Older scikit-learn releases only expose the legacy accessor.
        features = vectorizer.get_feature_names()

    return counts.toarray(), features

In [3]:
#!/usr/bin/env python3

#bag_of_words = __import__('0-bag_of_words').bag_of_words

# Small demo corpus for exercising bag_of_words.
sentences = [
    "Holberton school is Awesome!",
    "Machine learning is awesome",
    "NLP is the future!",
    "The children are our future",
    "Our children's children are our grandchildren",
    "The cake was not very good",
    "No one said that the cake was not very good",
    "Life is beautiful",
]

# No vocab is supplied, so the feature list is derived from the sentences.
E, F = bag_of_words(sentences)

# Show the count matrix first, then the feature names, one per print line.
print(E, F, sep="\n")

[[0 1 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0]
 [0 1 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 1 0 0]
 [1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0]
 [1 0 0 0 2 0 0 1 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0]
 [0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 1]
 [0 0 0 1 0 0 1 0 0 0 0 0 0 0 1 1 1 0 1 0 1 1 1 1]
 [0 0 1 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0]]
['are', 'awesome', 'beautiful', 'cake', 'children', 'future', 'good', 'grandchildren', 'holberton', 'is', 'learning', 'life', 'machine', 'nlp', 'no', 'not', 'one', 'our', 'said', 'school', 'that', 'the', 'very', 'was']
