# one-hot-encoding

In [43]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import numpy as np
import pprint 

# Shared pretty-printer (2-space indent) used below to display nested
# lists and dicts in a readable, multi-line form.
pp = pprint.PrettyPrinter(indent=2)

In [49]:
# Demo: tokenize two sentences, pad them to a fixed length, split off the
# last token of each sentence as a label, and one-hot encode that label.
doc = ['Positive thinking will let you do everthing', 'It always seems impossible until it is done']

# Length every sequence is padded to (zeros are added on the left).
max_len = 15

# 1. Tokenizer: build the vocabulary from `doc`, then convert each sentence
#    into a list of integer word indices.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(doc)
sequences = tokenizer.texts_to_sequences(doc)

print('\nsequence')
pp.pprint(sequences)

# 2. Get word index: mapping from (lower-cased) word to its integer index.
word_index = tokenizer.word_index
print('\nword_index')
pp.pprint(word_index)

# Word indices start at 1 and 0 is used for padding, so the number of
# distinct classes is the vocabulary size plus one. Deriving it here (instead
# of hard-coding 15) keeps the one-hot width correct if `doc` changes.
vocab_size = len(word_index) + 1

# 3. Pad every sequence to `max_len` so they stack into one 2-D array.
data = pad_sequences(sequences, maxlen=max_len)
print('\npaded data\n', data)

# 4. Inspect the tensor shape. pad_sequences already returns a NumPy array,
#    so no extra np.array() conversion is required.
print('\nsize of data tensor\n',data.shape)

# Last column is the target word; everything before it is the input.
label = data[:, -1]
data = data[:, :-1]
print('\ndata\n',data)
print('\nlabels\n',label)


# One-hot encode the label indices over the whole vocabulary (+ padding slot).
label = to_categorical(label, num_classes=vocab_size)
print('\none-hot encoded labels\n',label)

# ref: multi-hot ("binary") bag-of-words matrix of the same sentences, for
# comparison -- one row per sentence, 1 where a word index occurs.
results = tokenizer.texts_to_matrix(doc, mode='binary')
print('\nresult\n',results)


sequence
[[2, 3, 4, 5, 6, 7, 8], [1, 9, 10, 11, 12, 1, 13, 14]]

word_index
{ 'always': 9,
  'do': 7,
  'done': 14,
  'everthing': 8,
  'impossible': 11,
  'is': 13,
  'it': 1,
  'let': 5,
  'positive': 2,
  'seems': 10,
  'thinking': 3,
  'until': 12,
  'will': 4,
  'you': 6}

paded data
 [[ 0  0  0  0  0  0  0  0  2  3  4  5  6  7  8]
 [ 0  0  0  0  0  0  0  1  9 10 11 12  1 13 14]]

size of data tensor
 (2, 15)

data
 [[ 0  0  0  0  0  0  0  0  2  3  4  5  6  7]
 [ 0  0  0  0  0  0  0  1  9 10 11 12  1 13]]

labels
 [ 8 14]

one-hot encoded labels
 [[0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]]

result
 [[0. 0. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1.]]
