## Classic One-Hot Encoding

In [5]:
from numpy import array
from numpy import argmax
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

text = 'hello bitsians how are you doing today ?'

# Build the vocabulary: each distinct whitespace-separated token gets the next
# free integer id, in order of first appearance. setdefault() leaves tokens
# that were already seen untouched, so repeated tokens keep their first id.
text_label_map = {}
for word in text.split():
    text_label_map.setdefault(word, len(text_label_map))

# The same mapping as a list of [token, id] pairs.
text_label_list = list(map(list, text_label_map.items()))

print("number of features: ", len(text_label_list), '\n')
print("text_label_map: ", text_label_map, '\n')


# define example
# One array element per whitespace-separated token of `text`.
data = text.split()
values = array(data)
print(values, '\n')

# integer encode
# LabelEncoder assigns ids by sorted class order, so these ids are not the
# first-appearance ids stored in text_label_map.
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(values)
print(integer_encoded)

# binary encode
# scikit-learn 1.2 renamed OneHotEncoder's `sparse` keyword to `sparse_output`
# and 1.4 removed the old name. Try the current keyword first and fall back
# to the legacy one so the cell runs on both old and new releases.
try:
    onehot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(sparse=False)
# OneHotEncoder expects 2-D input: turn the id vector into a single column.
integer_encoded = integer_encoded.reshape(-1, 1)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
print("onehot_encoded shape: ", onehot_encoded.shape, '\n')
print(onehot_encoded, '\n')

# invert first example
# argmax recovers the integer id from the one-hot row; the label encoder then
# maps that id back to the original token.
inverted = label_encoder.inverse_transform([argmax(onehot_encoded[0, :])])
print(inverted)

number of features:  8 

text_label_map:  {'hello': 0, 'bitsians': 1, 'how': 2, 'are': 3, 'you': 4, 'doing': 5, 'today': 6, '?': 7} 

['hello' 'bitsians' 'how' 'are' 'you' 'doing' 'today' '?'] 

[4 2 5 1 7 3 6 0]
onehot_encoded shape:  (8, 8) 

[[0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0.]] 

['hello']


## Quiz: What's the problem with the one-hot encoding approach?

## Word Embedding Example

In [6]:
from gensim.models import Word2Vec
# define training data
# Each inner list is one pre-tokenized sentence.
sentences = [['this', 'is', 'the', 'first', 'sentence', 'for', 'word2vec'],
             ['this', 'is', 'the', 'second', 'sentence'],
             ['yet', 'another', 'sentence'],
             ['one', 'more', 'sentence'],
             ['and', 'the', 'final', 'sentence']]

# train model
# gensim 4.0 renamed the `size` keyword to `vector_size`. Try the current name
# first and fall back to the legacy one so the cell runs on 3.x and 4.x.
# min_count=1 keeps every word, including those that occur only once.
try:
    model = Word2Vec(sentences, min_count=1, vector_size=30)
except TypeError:
    model = Word2Vec(sentences, min_count=1, size=30)

# summarize the loaded model
print(model, '\n')

# summarize vocabulary
# gensim 4.0 replaced `wv.vocab` with `wv.key_to_index`; use whichever exists.
# NOTE(review): the order of the listed words may differ between gensim versions.
words = list(model.wv.key_to_index if hasattr(model.wv, 'key_to_index') else model.wv.vocab)
print(words, '\n')
# access vector for one word
# Index the KeyedVectors (`model.wv`): indexing the model object directly was
# deprecated and no longer works in gensim 4.
print(model.wv['sentence'], '\n')

# save model
model.save('model.bin')
# load model
new_model = Word2Vec.load('model.bin')
print(new_model, '\n')

Word2Vec(vocab=14, size=30, alpha=0.025) 

['this', 'is', 'the', 'first', 'sentence', 'for', 'word2vec', 'second', 'yet', 'another', 'one', 'more', 'and', 'final'] 

[ 0.00459062  0.01466651 -0.01566215  0.00108665 -0.00166228  0.01632862
  0.01651154  0.01658395  0.00235168  0.0003479  -0.00851604 -0.00742927
 -0.01394663 -0.00325313  0.00261753  0.00702077 -0.01426754 -0.01333659
 -0.01418663  0.00312294 -0.01149141  0.00047896  0.00776908 -0.0091708
  0.01649732  0.00229354  0.00434268  0.01160009  0.00704735 -0.006148  ] 

Word2Vec(vocab=14, size=30, alpha=0.025) 





In [9]:
from IPython.display import Image
from IPython.core.display import HTML 
# Display the image hosted at this URL as the cell's output (must stay the
# last expression of the cell for the rich display to render).
Image(url="https://miro.medium.com/max/1806/1*cuOmGT7NevP9oJFJfVpRKA.png")

## Quiz: What's the problem with the word embedding approach?

## Transformers: Contextual Embeddings

In [8]:
from sentence_transformers import SentenceTransformer

# Input: a list holding the single sentence to embed.
sentences = ['what is the best memory of your student journey?']

# Load the pretrained sentence encoder and embed every sentence in the list.
model = SentenceTransformer('bert-base-nli-cls-token')
sentence_embeddings = model.encode(sentences)

# Report a header, the shape of the first sentence's embedding, and then the
# full embedding array, each on its own line.
print(
    "Sentence embeddings:",
    sentence_embeddings[0].shape,
    sentence_embeddings,
    sep="\n",
)


HBox(children=(FloatProgress(value=0.0, max=405237015.0), HTML(value='')))


Sentence embeddings:
(768,)
[[-7.14924514e-01 -2.07899120e-02  7.84179688e-01  1.29526958e-01
   5.29034615e-01  4.57159817e-01  2.83099949e-01  4.97395337e-01
   2.03237265e-01 -6.33602917e-01  2.72991121e-01  4.97204274e-01
   6.71188593e-01 -2.32010335e-01 -1.50385983e-02  2.98834532e-01
   1.81426048e-01 -2.88195372e-01 -1.08266786e-01 -1.31664038e+00
  -8.00215840e-01  1.93803340e-01  7.88016170e-02  5.01182936e-02
   1.82802185e-01  7.22472191e-01  3.85005921e-01 -1.66672230e-01
   3.55073512e-01  7.97547400e-01  1.22434206e-01 -4.77154583e-01
  -2.36310065e-01 -1.02875069e-01 -8.56809199e-01  6.73236370e-01
   2.52073824e-01  1.74972892e-01  2.03100085e-01  7.42366463e-02
  -2.23949456e+00 -5.41451275e-01  6.62961245e-01  3.77157420e-01
  -9.88354862e-01 -6.30682334e-02 -3.04801369e+00  5.96240580e-01
   2.57308841e-01  2.77977437e-03 -4.93922979e-01  9.09532905e-01
   8.54490638e-01  8.92385244e-01 -7.30484426e-01 -6.52297854e-01
   6.49869978e-01 -1.74373436e+00 -3.85000348e-