# One-Hot Encoding

## Manual One-Hot Encoding

In [None]:
def onehot_word(word):
    """One-hot encode every character of ``word``.

    Each distinct character is assigned a column index in order of first
    appearance, so the result is reproducible across interpreter runs
    (iterating a ``set`` of strings depends on hash randomisation, which
    could reorder the columns from one run to the next).

    Args:
        word: Sequence of hashable symbols, typically a string.

    Returns:
        A list with one row per element of ``word``. Each row is a list of
        ``0``/``1`` ints whose length equals the number of distinct symbols,
        with a single ``1`` at that symbol's index. Empty input yields ``[]``.
    """
    # dict.fromkeys de-duplicates while preserving first-seen order.
    lookup = {char: index for index, char in enumerate(dict.fromkeys(word))}
    width = len(lookup)

    word_vector = []
    for char in word:
        one_hot_vector = [0] * width
        one_hot_vector[lookup[char]] = 1
        word_vector.append(one_hot_vector)
    return word_vector

In [None]:
# Example: one row per character of 'data', one column per distinct character.
onehot_word('data')

## One-Hot Encoding Using Keras

In [None]:
import keras
from keras.preprocessing import text

# NOTE(review): despite the name, Keras documents `one_hot` as returning a list
# of hashed integer word indices (bounded here by n=5), not binary vectors, and
# different words may collide — confirm against the installed Keras version.
keras.preprocessing.text.one_hot('dawn of man', n=5)

In [None]:
import keras
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.preprocessing.text import one_hot
import pandas as pd


In [None]:
# Read the sample file and trim surrounding whitespace (including the trailing
# newline) from every line.
with open('../data/100lines.txt') as lines_file:
    movie_lines = list(map(str.strip, lines_file))

# Preview the first ten lines.
movie_lines[:10]

In [None]:
# Character-level tokenizer (char_level=True), fitted on the movie lines.
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(movie_lines)

In [None]:
# Convert every line into a sequence of integer token indices using the fitted tokenizer.
lines_as_integers = tokenizer.texts_to_sequences(movie_lines)

In [None]:
# Preview the integer sequences of the first four lines.
lines_as_integers[:4]

In [None]:
# Collect every distinct token index that occurs in any encoded line.
vocabulary = {token for sequence in lines_as_integers for token in sequence}

In [None]:
# Number of distinct token indices observed in the encoded lines.
vocabulary_size = len(vocabulary)

In [None]:
# One-hot encode each line separately (one matrix per line).
# The width is vocabulary_size+1 — presumably because Tokenizer indices start
# at 1, leaving column 0 unused; confirm against the Keras Tokenizer docs.
movie_lines_one_hot = [keras.utils.to_categorical(line, vocabulary_size+1) for line in lines_as_integers]

In [None]:
# Re-load the same file as a DataFrame with a single column named 'line'.
# NOTE: this rebinds `movie_lines` from the list of strings built earlier in
# the file to a DataFrame.
movie_lines = pd.read_csv('../data/100lines.txt', sep='\t', header=None)
movie_lines.columns = ['line']

In [None]:
# Preview the one-hot matrices of the first four lines.
movie_lines_one_hot[0:4]

In [None]:
# `movie_lines` is a DataFrame at this point; iterating a DataFrame yields its
# column labels (just 'line'), so pass the text values of the 'line' column.
tokenizer.fit_on_texts(movie_lines['line'].tolist())

In [None]:
# `movie_lines` is a DataFrame here, and iterating it yields column labels
# rather than rows, so encode the text values of the 'line' column instead.
int_sequence = tokenizer.texts_to_sequences(movie_lines['line'].tolist())

In [None]:
# Display the encoded integer sequences.
int_sequence 

In [None]:
# Collect every distinct token index that occurs in any encoded sequence.
vocabulary = {token for sequence in int_sequence for token in sequence}

In [None]:
# Number of distinct token indices observed in `int_sequence`.
vocabulary_size = len(vocabulary)

In [None]:
# One-hot encode the indices 1-3 using `vocabulary_size` columns.
# NOTE(review): the per-line encoding earlier in this file uses
# `vocabulary_size+1` columns; confirm which width is intended here.
keras.utils.to_categorical([1,2,3], vocabulary_size)

In [None]:
# Small standalone example: five class indices encoded with 5 classes.
to_categorical([1,2,3,1, 0], 5)

In [None]:
import numpy as np
# NOTE(review): if the sequences differ in length this is a ragged input, which
# recent NumPy versions reject unless dtype=object is given — confirm.
np.array(int_sequence)

In [None]:
# Display the current value of `movie_lines`.
movie_lines

### Character Level Encoding Using Keras

In [None]:
# NOTE: this rebinds `text`, shadowing the `keras.preprocessing.text` module
# that was imported under the same name earlier in the file.
text = 'One small step for man'

In [None]:
from keras.preprocessing.text import Tokenizer
import numpy as np
# Separate tokenizer for the character-level example (char_level=True).
char_tokenizer = Tokenizer(char_level=True)

In [None]:
# `text` is a single string, not a list of strings, so iterating it yields one
# character at a time — presumably intentional, so that each character is
# handled as its own one-character text (confirm).
char_tokenizer.fit_on_texts(text)

In [None]:
# Encode the string as integer indices; because `text` is a plain string it is
# consumed one character at a time.
char_tokenizer.texts_to_sequences(text)

In [None]:
# Inspect the tokenizer's index -> token mapping.
char_tokenizer.index_word

In [None]:
# Inspect the tokenizer's token -> index mapping.
char_tokenizer.word_index

In [None]:
# Build a matrix from the string; since `text` is iterated character by
# character, this presumably yields one row per character (confirm via .shape).
char_vectors = char_tokenizer.texts_to_matrix(text)
char_vectors

In [None]:
# Check the dimensions of the character matrix.
char_vectors.shape

In [None]:
# Decode the first row back to a token: argmax gives the column of the largest
# entry, which is then looked up in the index -> token mapping.
char_tokenizer.index_word[np.argmax(char_vectors[0])]

## One-Hot Encoding Words

In [None]:
# Load the movie lines again as a plain list of whitespace-trimmed strings.
with open('../data/100lines.txt') as lines_file:
    movie_lines = list(map(str.strip, lines_file))

# Preview the first ten lines.
movie_lines[:10]

In [None]:
# One array entry per movie line. `ndarray.reshape` returns a new array instead
# of modifying in place, so a bare `lines_array.reshape(-1,1)` statement has no
# effect; the array is kept 1-D, which is the shape this cell always produced.
lines_array = np.array(movie_lines)
lines_array.shape

In [None]:
from sklearn import preprocessing

In [None]:
# Create both encoders; only the label encoder is used in this cell, turning
# each entry of `lines_array` (one full movie line per entry) into an integer label.
wordOneHotEncoder = preprocessing.OneHotEncoder()
labelEncoder = preprocessing.LabelEncoder()
movie_labels = labelEncoder.fit_transform(lines_array)

In [None]:
# Display the integer label assigned to each line.
movie_labels

In [None]:
# View the labels as a single-column 2-D array (displayed only; the result is not stored).
movie_labels.reshape(-1,1)

In [None]:
# One-hot encode the integer labels, passed as a single-column 2-D array.
movie_onehot = wordOneHotEncoder.fit_transform(movie_labels.reshape(-1,1))

In [None]:
# Convert the encoder output to an array for display (presumably a sparse matrix — confirm).
movie_onehot.toarray()