## Classic One Hot Encoding

In [13]:
from numpy import array
from numpy import argmax
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# Sample sentence for the demo; tokens are produced by whitespace splitting.
text = 'hello bitsians how are you doing today ?'

# Vocabulary: each distinct token -> integer id, assigned in order of first appearance.
text_label_map = {}

for word in text.split():
    # setdefault only inserts unseen tokens; the next free id equals the
    # number of tokens registered so far.
    text_label_map.setdefault(word, len(text_label_map))

# The vocabulary size is the number of one-hot features.
# Fixed: this line previously read `text_label_list`, a name that is never
# defined, so the cell raised NameError on a fresh kernel.
print("number of features: ", len(text_label_map), '\n')
print("text_label_map: ", text_label_map, '\n')


# define example: one token per array element (uses `text` defined earlier in this cell)
data = text.split()
values = array(data)
print(values, '\n')

# integer encode: LabelEncoder learns its own token -> id mapping from `values`
# (the printed ids differ from the first-appearance ids in text_label_map)
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(values)
print(integer_encoded)

# binary encode
# The default encoder returns a sparse matrix; densify it with .toarray().
# This replaces the `sparse=False` keyword, which was deprecated in
# scikit-learn 1.2 and removed in 1.4, so the cell runs on old and new releases.
onehot_encoder = OneHotEncoder()
# OneHotEncoder expects 2-D input (n_samples, n_features): make a single column.
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded).toarray()
print("onehot_encoded shape: ", onehot_encoded.shape, '\n')
print(onehot_encoded, '\n')

# invert first example: position of the 1 in row 0 -> integer id -> original token
inverted = label_encoder.inverse_transform([argmax(onehot_encoded[0, :])])
print(inverted)

number of features:  8 

text_label_map:  {'hello': 0, 'bitsians': 1, 'how': 2, 'are': 3, 'you': 4, 'doing': 5, 'today': 6, '?': 7} 

['hello' 'bitsians' 'how' 'are' 'you' 'doing' 'today' '?'] 

[4 2 5 1 7 3 6 0]
onehot_encoded shape:  (8, 8) 

[[0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0.]] 

['hello']


## Quiz: What's the problem with the one-hot encoding approach?

## Word Embedding Example

In [17]:
from gensim.models import Word2Vec
# define training data: a tiny corpus of pre-tokenised sentences
sentences = [
    ['this', 'is', 'the', 'first', 'sentence', 'for', 'word2vec'],
    ['this', 'is', 'the', 'second', 'sentence'],
    ['yet', 'another', 'sentence'],
    ['one', 'more', 'sentence'],
    ['and', 'the', 'final', 'sentence'],
]

# train model
# min_count=1 keeps every word, even those seen once; size=30 sets the vector length.
# NOTE(review): `size=` and `wv.vocab` below are the gensim 3.x spellings
# (gensim 4 renamed them to `vector_size=` and `wv.key_to_index`) — confirm
# the installed gensim version before upgrading.
model = Word2Vec(sentences, min_count=1, size=30)

# summarize the loaded model
print(model, '\n')

# summarize vocabulary
words = list(model.wv.vocab)
print(words, '\n')
# access vector for one word
# Look the word up through the KeyedVectors (`model.wv`); indexing the model
# object directly is deprecated in gensim 3.x and removed in 4.0.
print(model.wv['sentence'], '\n')

# save model
model.save('model.bin')
# load model
new_model = Word2Vec.load('model.bin')
print(new_model, '\n')

Word2Vec(vocab=14, size=30, alpha=0.025) 

['this', 'is', 'the', 'first', 'sentence', 'for', 'word2vec', 'second', 'yet', 'another', 'one', 'more', 'and', 'final'] 

[-0.00168845 -0.00108926  0.01517474 -0.00147417  0.01347492 -0.00420771
  0.01537163  0.01103556 -0.00048048  0.0006031   0.00957203  0.00216474
  0.00289918  0.00857945 -0.01114961  0.00740007 -0.01172909 -0.00859065
 -0.00563982  0.00953312  0.01099747  0.01123064 -0.00702224  0.00411851
 -0.00225844  0.01257167  0.0041694   0.00280735 -0.00141178 -0.01482459] 

Word2Vec(vocab=14, size=30, alpha=0.025) 





## Quiz: What's the problem with the word embedding approach?

## Transformers: Contextual Embeddings

In [22]:
from sentence_transformers import SentenceTransformer

# Load the pretrained sentence encoder by name, then embed a one-sentence batch.
model = SentenceTransformer('bert-base-nli-cls-token')
sentences = ['what is the best memory of your student journey?']
sentence_embeddings = model.encode(sentences)

# Header, shape of the first embedding, then the full batch, one item per line.
print(
    "Sentence embeddings:",
    sentence_embeddings[0].shape,
    sentence_embeddings,
    sep="\n",
)


Sentence embeddings:
(768,)
[[-7.14925051e-01 -2.07887944e-02  7.84179688e-01  1.29526809e-01
   5.29033840e-01  4.57159728e-01  2.83099741e-01  4.97395456e-01
   2.03238055e-01 -6.33602917e-01  2.72990763e-01  4.97204632e-01
   6.71189189e-01 -2.32010230e-01 -1.50385797e-02  2.98834652e-01
   1.81426957e-01 -2.88195431e-01 -1.08267196e-01 -1.31664026e+00
  -8.00215483e-01  1.93802908e-01  7.88021386e-02  5.01182824e-02
   1.82802156e-01  7.22472250e-01  3.85006070e-01 -1.66672528e-01
   3.55073214e-01  7.97548234e-01  1.22434191e-01 -4.77153838e-01
  -2.36310422e-01 -1.02875173e-01 -8.56808901e-01  6.73237026e-01
   2.52073735e-01  1.74972966e-01  2.03100801e-01  7.42368177e-02
  -2.23949456e+00 -5.41451693e-01  6.62961662e-01  3.77157807e-01
  -9.88355696e-01 -6.30684271e-02 -3.04801536e+00  5.96240520e-01
   2.57309318e-01  2.77939532e-03 -4.93922681e-01  9.09533024e-01
   8.54490817e-01  8.92385364e-01 -7.30484307e-01 -6.52297258e-01
   6.49870038e-01 -1.74373424e+00 -3.85000736e-0