In [3]:
# One-off scaffolding step, left commented out: copies 0-bag_of_words.py to
# 2-word2vec.py (presumably as a starting template -- confirm before re-enabling,
# since it would overwrite any existing 2-word2vec.py).
#!cp 0-bag_of_words.py 2-word2vec.py
# Mark every .py file in the current working directory as executable.
!chmod +x *.py

In [2]:
#!/usr/bin/env python3
""" Bag Of Words """
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

def bag_of_words(sentences, vocab=None):
    """
    ***********************************************
    *** creates a bag of words embedding matrix ***
    ***********************************************
    @sentences: is a list of sentences to analyze
    @vocab: is a list of the vocabulary words to use
            for the analysis
            **If None: all words within sentences
                       should be used
    Returns: embeddings, features
             embeddings: is a numpy.ndarray of shape (s, f)
             containing the embeddings
                 s is the number of sentences in sentences
                 f is the number of features analyzed
             features: is a list of the features used for embeddings
    """
    vect = CountVectorizer(vocabulary=vocab)
    data = vect.fit_transform(sentences)

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2. Prefer its replacement, and fall back to the old accessor so the
    # function keeps working on older installs.
    if hasattr(vect, "get_feature_names_out"):
        # The new accessor returns an ndarray; convert it so callers still
        # receive a plain list of str, as documented above.
        features = vect.get_feature_names_out().tolist()
    else:
        features = vect.get_feature_names()

    return data.toarray(), features

In [3]:
#!/usr/bin/env python3

# When this cell is run as a standalone script, bag_of_words would be loaded
# from the 0-bag_of_words.py module with the line below; it is commented out
# here, so the cell relies on bag_of_words already being defined in the kernel.
#bag_of_words = __import__('0-bag_of_words').bag_of_words

# Small demo corpus: eight short sentences with overlapping words.
sentences = ["Holberton school is Awesome!",
             "Machine learning is awesome",
             "NLP is the future!",
             "The children are our future",
             "Our children's children are our grandchildren",
             "The cake was not very good",
             "No one said that the cake was not very good",
             "Life is beautiful"]
# No vocab is passed, so the vocabulary is derived from the sentences.
# E is the (sentences x features) count matrix, F the matching feature names.
E, F = bag_of_words(sentences)
print(E)
print(F)

[[0 1 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0]
 [0 1 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 1 0 0]
 [1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0]
 [1 0 0 0 2 0 0 1 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0]
 [0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 1]
 [0 0 0 1 0 0 1 0 0 0 0 0 0 0 1 1 1 0 1 0 1 1 1 1]
 [0 0 1 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0]]
['are', 'awesome', 'beautiful', 'cake', 'children', 'future', 'good', 'grandchildren', 'holberton', 'is', 'learning', 'life', 'machine', 'nlp', 'no', 'not', 'one', 'our', 'said', 'school', 'that', 'the', 'very', 'was']


In [7]:
#!/usr/bin/env python3
""" Bag Of Words """
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
bag_of_words = __import__('0-bag_of_words').bag_of_words


def tf_idf(sentences, vocab=None):
    """
    Build a TF-IDF embedding for a collection of sentences.

    The sentences are first turned into a bag-of-words count matrix via
    bag_of_words(); those counts are then re-weighted with scikit-learn's
    TfidfTransformer using its default settings.

    @sentences: list of sentences to analyze
    @vocab: list of the vocabulary words to use for the analysis;
            if None, all words within sentences are used

    Returns: embeddings, features
             embeddings: numpy.ndarray of shape (s, f) holding the
                         TF-IDF weights
                 s is the number of sentences in sentences
                 f is the number of features analyzed
             features: the features used for embeddings, exactly as
                       returned by bag_of_words()
    """
    counts, features = bag_of_words(sentences, vocab)
    weighted = TfidfTransformer().fit_transform(counts)
    return weighted.toarray(), features

In [8]:
# Same eight-sentence demo corpus as the bag-of-words example.
sentences = ["Holberton school is Awesome!",
             "Machine learning is awesome",
             "NLP is the future!",
             "The children are our future",
             "Our children's children are our grandchildren",
             "The cake was not very good",
             "No one said that the cake was not very good",
             "Life is beautiful"]
# Restrict the analysis to a fixed vocabulary. "none" does not occur in any
# sentence, so its column is all zeros in the recorded output; sentences
# containing none of these words come out as all-zero rows.
vocab = ["awesome", "learning", "children", "cake", "good", "none", "machine"]
# E is the (sentences x vocab) TF-IDF matrix, F the features in vocab order.
E, F = tf_idf(sentences, vocab)
print(E)
print(F)

[[1.         0.         0.         0.         0.         0.
  0.        ]
 [0.5098139  0.60831315 0.         0.         0.         0.
  0.60831315]
 [0.         0.         0.         0.         0.         0.
  0.        ]
 [0.         0.         1.         0.         0.         0.
  0.        ]
 [0.         0.         1.         0.         0.         0.
  0.        ]
 [0.         0.         0.         0.70710678 0.70710678 0.
  0.        ]
 [0.         0.         0.         0.70710678 0.70710678 0.
  0.        ]
 [0.         0.         0.         0.         0.         0.
  0.        ]]
['awesome', 'learning', 'children', 'cake', 'good', 'none', 'machine']


In [1]:
#!/usr/bin/env python3
""" Train Word2Vec """
from gensim.models import Word2Vec


def word2vec_model(sentences, size=100, min_count=5, window=5,
                   negative=5, cbow=True, iterations=5, seed=0, workers=1):
    """
    ************************************************
    ** creates and trains a gensim word2vec model **
    ************************************************
    @sentences: is a list of sentences to be trained on
    @size: is the dimensionality of the embedding layer
    @min_count: is the minimum number of occurrences
                of a word for use in training
    @window: is the maximum distance between the current
             and predicted word within a sentence
    @negative: is the size of negative sampling
    @cbow: is a boolean to determine the training type;
           True is for CBOW; False is for Skip-gram
    @iterations: is the number of iterations to train over
    @seed: is the seed for the random number generator
    @workers: is the number of worker threads to train the model

    Returns: the trained model
    """
    import inspect

    # gensim 4 renamed two constructor keywords (size -> vector_size and
    # iter -> epochs). Choose whichever spelling the installed Word2Vec
    # accepts so the function works on both gensim 3.x and 4.x.
    accepted = inspect.signature(Word2Vec.__init__).parameters
    size_kw = "vector_size" if "vector_size" in accepted else "size"
    epochs_kw = "epochs" if "epochs" in accepted else "iter"
    versioned_kwargs = {size_kw: size, epochs_kw: iterations}

    # sg selects the training algorithm: 1 for Skip-gram, 0 for CBOW.
    return Word2Vec(sentences, min_count=min_count, window=window,
                    negative=negative, sg=int(not cbow),
                    seed=seed, workers=workers, **versioned_kwargs)

ModuleNotFoundError: No module named 'smart_open'

In [2]:
# common_texts is a sample corpus shipped with gensim's test utilities.
from gensim.test.utils import common_texts
# When run as a standalone script, word2vec_model would be loaded from the
# 2-word2vec.py module with the line below; it is commented out here, so the
# cell relies on word2vec_model already being defined in the kernel.
#word2vec_model = __import__('2-word2vec').word2vec_model
# Show the first two entries of the corpus.
print(common_texts[:2])
# min_count=1 overrides the function default of 5, so words that occur only
# once are still used in training.
w2v = word2vec_model(common_texts, min_count=1)
# Print the learned embedding vector for the word "computer".
print(w2v.wv["computer"])

ModuleNotFoundError: No module named 'smart_open'