# Importing Libraries

In [1]:
import numpy as np
import nltk
from nltk.stem.porter import PorterStemmer

In [2]:
# Module-level Porter stemmer instance; stem() below calls its .stem() method.
stemmer = PorterStemmer()

### NLP techniques for preprocessing and preparing data

In [3]:
def tokenize(sentence):
    """
    Break a sentence into a list of tokens.

    The work is delegated to nltk.word_tokenize; a token may be a word,
    a punctuation mark, or a number.
    """
    tokens = nltk.word_tokenize(sentence)
    return tokens


def stem(word):
    """
    Reduce a word to its root form.

    The word is lowercased first and then passed to the module-level
    Porter stemmer.

    example:
    words = ["organize", "organizes", "organizing"]
    words = [stem(w) for w in words]
    -> ["organ", "organ", "organ"]
    """
    lowered = word.lower()
    return stemmer.stem(lowered)


def bag_of_words(tokenized_sentence, words):
    """
    Return the bag-of-words vector of a tokenized sentence.

    Every token of the sentence is passed through stem() (which lowercases
    and stems it) and then compared against the vocabulary.

    Parameters:
        tokenized_sentence: iterable of string tokens.
        words: vocabulary sequence. Its entries are compared verbatim with
            the stemmed tokens, so they must already be lowercased and
            stemmed for a match to occur.

    Returns:
        1-D float32 numpy array of length len(words): 1 at each index whose
        vocabulary entry appears among the stemmed tokens, 0 otherwise.

    example (assuming the stemmer leaves these short words unchanged):
    sentence = ["hello", "how", "are", "you"]
    words = ["hi", "hello", "i", "you", "bye", "thank", "cool"]
    bag   = [  0 ,    1 ,    0 ,   1 ,    0 ,    0 ,      0]
    """
    # Stem each token once and keep the results in a set, so that every
    # membership test in the loop below is O(1) rather than a list scan.
    sentence_stems = {stem(token) for token in tokenized_sentence}

    # Start from all zeros and flag the vocabulary positions that matched.
    bag = np.zeros(len(words), dtype=np.float32)
    for idx, w in enumerate(words):
        if w in sentence_stems:
            bag[idx] = 1

    return bag