# One-hot encoding of text

In [1]:
# Raw example sentences that make up the toy corpus.
documents = [
    "Dog bites man",
    "Man bites dog",
    "Dogs eat meat.",
    "Man eats food",
]

# Normalise the corpus: strip full stops and lowercase every sentence.
docs = [sentence.replace(".", "").lower() for sentence in documents]
docs

['dog bites man', 'man bites dog', 'dogs eat meat', 'man eats food']

In [3]:
# Build the vocabulary
# Every distinct token in the corpus is mapped to a 1-based integer id,
# assigned in order of first appearance.
vocab = {}
count = 0

for doc in docs:
    # split() with no argument splits on any run of whitespace and never
    # yields empty strings, which matches how get_onehot_vector tokenises its
    # input. split(" ") would add a '' entry to the vocabulary whenever a
    # sentence contains doubled, leading or trailing spaces.
    for word in doc.split():
        if word not in vocab:
            count += 1
            vocab[word] = count

print(vocab)

{'dog': 1, 'bites': 2, 'man': 3, 'dogs': 4, 'eat': 5, 'meat': 6, 'eats': 7, 'food': 8}


In [4]:
# Create one hot encoder function
def get_onehot_vector(str, vocabulary=None):
    """One-hot encode every whitespace-separated token of a sentence.

    Parameters
    ----------
    str : str
        Sentence to encode. It is tokenised with ``split()`` and used as-is:
        no lowercasing or punctuation stripping happens here, so apply the
        same normalisation that was used when the vocabulary was built.
        (The name shadows the builtin ``str``; it is kept so existing
        keyword callers keep working.)
    vocabulary : dict, optional
        Mapping of token -> 1-based index. When omitted, the notebook-level
        ``vocab`` dictionary is used, which is the original behaviour.

    Returns
    -------
    list[list[int]]
        One row per token, each of length ``len(vocabulary)``. A known token
        has a single 1 at position ``index - 1``; an unknown token is an
        all-zero row. An empty or whitespace-only sentence yields ``[]``.
    """
    if vocabulary is None:
        # Fall back to the vocabulary built earlier in the notebook.
        vocabulary = vocab

    size = len(vocabulary)
    onehot_encoded = []

    for word in str.split():
        row = [0] * size

        # Single lookup; None marks an out-of-vocabulary token.
        index = vocabulary.get(word)
        if index is not None:
            # Indices are 1-based, list positions are 0-based.
            row[index - 1] = 1

        onehot_encoded.append(row)

    return onehot_encoded

#### One-hot encode the first sentence

In [5]:
# Take the first preprocessed sentence of the corpus as the example input.
first_sentence = docs[0]
first_sentence

'dog bites man'

In [6]:
# One row per token; the single 1 in a row sits at position vocab[token] - 1.
get_onehot_vector(first_sentence)

[[1, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0]]

#### One-hot encode the second sentence

In [7]:
# Same three words as the first sentence in a different order, so the same
# rows come back in a different order.
print(docs[1])
get_onehot_vector(docs[1])

man bites dog


[[0, 0, 1, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0]]

In [12]:
# Mix of in-vocabulary and out-of-vocabulary words: "man" and "dog" get a
# one-hot row, while every unknown word is encoded as an all-zero row.
get_onehot_vector("man and dog are good")

[[0, 0, 1, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0]]

In [13]:
# Text made up entirely of out-of-vocabulary words: every row is all zeros.
get_onehot_vector("fires in volcanos are lava")

[[0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0]]