**Bag of Words Model**

Bag of Word is a Natural Language Processing technique of text modeling . Whenever we apply any algorithm in NLP, 
we are working with numbers. We cannot directly feed our text into that algorithm. Hence, Bag of Words model 
is used to preprocess the text by converting it into a bag of words, which keeps a count of the total occurrences of most frequently used words

In [None]:
import os
import string

In [2]:

def preprocessing(docs):
    # creating one entire string
    docs = ' '.join(docs)

    # removing punctuation and normalizing string
    remove_punc = str.maketrans('', '', string.punctuation)
    cleaned_docs = docs.translate(remove_punc)
    cleaned_docs = cleaned_docs.lower()
    cleaned_docs = cleaned_docs.strip()
    cleaned_docs = cleaned_docs.split()

    # gets list of unique values
    doc_vocabulary = list(set(cleaned_docs))

    return doc_vocabulary

In [3]:
def get_sent_dictionary(sent):
    sent = sent.lower()
    sent_word_count = {}
    
    # counts the number of words
    for word in sent.split():
        if word in sent_word_count.keys():
            sent_word_count[word] += 1
        else:
            sent_word_count[word] = 1

    return sent_word_count

In [4]:

# builds a language dictionary based on the input doc and 
# generates embeddings for each sentence in doc
def bag_of_words_lm(docs):
    embeddings = []
    docs = docs.split('.')
    doc_vocabulary = preprocessing(docs)
    
    # genearte embedding per sentence
    for sentence in docs:
        sent_embed = []
        sent_word_count = get_sent_dictionary(sentence)

        for word in doc_vocabulary:
            # get numeric occurence of word
            word_count_value = sent_word_count[word] if word in sent_word_count.keys() else 0

            # place value at the index associated with the word in vector: sent_embed
            sent_embed.append(word_count_value)
        
        # add to all sentence embeddings
        embeddings.append(sent_embed)

    print(embeddings)

In [5]:
if __name__ == "__main__":
    docs_example = "Are we beginning to commend ourselves again? Or do we need, like some people, letters of recommendation to you or from you? 2 You yourselves are our letter, written on our hearts, known and read by everyone. 3 You show that you are a letter from Christ, the result of our ministry, written not with ink but with the Spirit of the living God, not on tablets of stone but on tablets of human hearts. 4 Such confidence we have through Christ before God. 5 Not that we are competent in ourselves to claim anything for ourselves, but our competence comes from God. 6 He has made us competent as ministers of a new covenantâ€”not of the letter but of the Spirit; for the letter kills, but the Spirit gives life."
    
    # call BoW language model
    bag_of_words_lm(docs_example)

[[2, 0, 0, 2, 0, 2, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 2, 2, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1], [0, 1, 0, 0, 2, 0, 0, 0, 0, 0, 1, 0, 1, 0, 4, 0, 0, 0, 1, 0, 0, 2, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 2, 0, 0, 0, 1, 2, 0, 1, 2, 1, 0, 0, 1, 3, 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0], [1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 