## Classic One-Hot Encoding

In [1]:
from numpy import array
from numpy import argmax
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

text = 'hello students how are you doing today ?'

# Give every distinct whitespace-separated token an integer id, numbered in
# order of first appearance. setdefault() leaves already-seen tokens untouched.
text_label_map = {}
for token in text.split():
    text_label_map.setdefault(token, len(text_label_map))

# Same mapping as a list of [token, id] pairs, in insertion order.
text_label_list = [list(pair) for pair in text_label_map.items()]

print("number of features: ", len(text_label_list), '\n')
print("text_label_map: ", text_label_map, '\n')


# define example: one array entry per whitespace-separated token of `text`
data = text.split()
values = array(data)
print(values, '\n')

# integer encode
# LabelEncoder numbers the classes in sorted order, so these ids differ from
# the first-appearance ids stored in text_label_map (e.g. 'hello' -> 3 here).
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(values)
print(integer_encoded)

# binary encode
# The encoder is left at its default (sparse) output and densified with
# .toarray(): the old `sparse=False` keyword was deprecated in scikit-learn 1.2
# and removed in 1.4, whereas this form runs on every release.
onehot_encoder = OneHotEncoder()
# OneHotEncoder expects a 2-D (n_samples, n_features) input: one column of ids.
integer_encoded = integer_encoded.reshape(-1, 1)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded).toarray()
print("onehot_encoded shape: ", onehot_encoded.shape, '\n')
print(onehot_encoded, '\n')

# invert first example: position of the 1 in the first row -> integer id -> token
inverted = label_encoder.inverse_transform([argmax(onehot_encoded[0, :])])
print(inverted)

number of features:  8 

text_label_map:  {'hello': 0, 'students': 1, 'how': 2, 'are': 3, 'you': 4, 'doing': 5, 'today': 6, '?': 7} 

['hello' 'students' 'how' 'are' 'you' 'doing' 'today' '?'] 

[3 5 4 1 7 2 6 0]
onehot_encoded shape:  (8, 8) 

[[0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0.]] 

['hello']


## Quiz: What's the problem with the One-Hot Encoding Approach?

## Word Embedding Example

In [2]:
from gensim.models import Word2Vec
# define training data: a toy corpus of already-tokenised sentences
sentences = [
    ['this', 'is', 'the', 'first', 'sentence', 'for', 'word2vec'],
    ['this', 'is', 'the', 'second', 'sentence'],
    ['yet', 'another', 'sentence'],
    ['one', 'more', 'sentence'],
    ['and', 'the', 'final', 'sentence'],
]

# train model
# min_count=1 keeps words that occur only once; size=30 asks for 30-dim vectors.
# NOTE(review): `size=` and `wv.vocab` below are the gensim 3.x spellings;
# gensim 4 renamed them to `vector_size=` and `wv.key_to_index` -- confirm the
# gensim version this notebook is pinned to before upgrading.
model = Word2Vec(sentences, min_count=1, size=30)

# summarize the loaded model
print(model, '\n')

# summarize vocabulary
words = list(model.wv.vocab)
print(words, '\n')

# access vector for one word
# Look the word up on the KeyedVectors object (`model.wv`): indexing the model
# itself (`model['sentence']`) is deprecated in gensim 3.x and removed in gensim 4.
print(model.wv['sentence'], '\n')

# save model
model.save('model.bin')
# load model
new_model = Word2Vec.load('model.bin')
print(new_model, '\n')

Word2Vec(vocab=14, size=30, alpha=0.025) 

['this', 'is', 'the', 'first', 'sentence', 'for', 'word2vec', 'second', 'yet', 'another', 'one', 'more', 'and', 'final'] 

[-0.00339331 -0.01598168  0.01094237 -0.01652149  0.00732072  0.00297741
 -0.0011619   0.01289032  0.01128422 -0.01021125  0.00082624  0.01288406
 -0.0101222  -0.00310323  0.00143963  0.00360074 -0.0078881   0.00547472
  0.00215671  0.00688834 -0.00950495  0.01161353 -0.0087323   0.0020835
  0.01209858 -0.00090453 -0.00133998 -0.01488148 -0.00893773  0.01471785] 

Word2Vec(vocab=14, size=30, alpha=0.025) 





In [7]:
# Fetch the learned vector for an in-vocabulary word. The lookup goes through
# `model.wv` because indexing the Word2Vec object directly is deprecated.
model.wv['yet']
# Out-of-vocabulary demo (expected to raise KeyError -- uncomment to try):
# model.wv['fdfdafasa']

  """Entry point for launching an IPython kernel.


array([-0.01560717,  0.0082434 , -0.00735913,  0.00655246, -0.00671566,
        0.00763811,  0.01606547,  0.00077643,  0.00534043, -0.01586732,
       -0.01073733,  0.00590457, -0.00816169, -0.01602001, -0.00316846,
       -0.0099422 , -0.00876949, -0.01413536, -0.01437334,  0.01084207,
       -0.01154254, -0.00867407, -0.00184571,  0.00155982, -0.00863484,
        0.00444709,  0.00361354,  0.01031046, -0.00672278,  0.00044784],
      dtype=float32)

In [9]:
# Both display helpers come from the public IPython.display module.
from IPython.display import HTML, Image

# The picture is referenced by URL; as the last expression of the cell it is
# rendered as the cell's output.
Image(url="https://miro.medium.com/max/1806/1*cuOmGT7NevP9oJFJfVpRKA.png")

## Quiz: What's the problem with the Word Embedding Approach?

## Transformers: Contextual Embeddings

In [8]:
from sentence_transformers import SentenceTransformer

# Load the pretrained sentence encoder by name.
model = SentenceTransformer('bert-base-nli-cls-token')

# encode() takes a list of sentences; here the batch holds a single question.
sentences = ['what is the best memory of your student journey?']
sentence_embeddings = model.encode(sentences)

# Header, shape of the first embedding, then the full result -- one per line.
print("Sentence embeddings:", sentence_embeddings[0].shape, sentence_embeddings, sep='\n')


Some weights of the model checkpoint at /Users/chandra/.cache/torch/sentence_transformers/sbert.net_models_bert-base-nli-cls-token/0_BERT were not used when initializing BertModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Sentence embeddings:
(768,)
[[-7.14924991e-01 -2.07896791e-02  7.84179449e-01  1.29526734e-01
   5.29034317e-01  4.57159460e-01  2.83099234e-01  4.97395754e-01
   2.03237683e-01 -6.33602500e-01  2.72990644e-01  4.97204661e-01
   6.71188831e-01 -2.32010692e-01 -1.50387483e-02  2.98834413e-01
   1.81426272e-01 -2.88195133e-01 -1.08267009e-01 -1.31664038e+00
  -8.00215304e-01  1.93802193e-01  7.88016990e-02  5.01181148e-02
   1.82801932e-01  7.22471893e-01  3.85006160e-01 -1.66672274e-01
   3.55073214e-01  7.97547758e-01  1.22434519e-01 -4.77153540e-01
  -2.36310363e-01 -1.02874719e-01 -8.56808901e-01  6.73237443e-01
   2.52073824e-01  1.74973100e-01  2.03100592e-01  7.42362738e-02
  -2.23949480e+00 -5.41451514e-01  6.62961483e-01  3.77157271e-01
  -9.88355815e-01 -6.30681813e-02 -3.04801464e+00  5.96241236e-01
   2.57308811e-01  2.77972966e-03 -4.93922889e-01  9.09532666e-01
   8.54490817e-01  8.92385721e-01 -7.30484784e-01 -6.52297676e-01
   6.49870276e-01 -1.74373448e+00 -3.85000646e-0

In [12]:
# Encode a made-up string that is not a real word; the encoder is called with
# a one-element list and the resulting embedding is printed.
s = "fdasfasfadsfads"
emb = model.encode([s])
print(emb)

[[ 6.06369227e-02  2.88879424e-01  8.54231834e-01 -2.07520694e-01
   9.63195086e-01 -1.31849036e-01 -7.06652582e-01 -2.02416196e-01
  -6.31682277e-01  2.86208481e-01  6.79688990e-01  1.30384350e+00
  -3.60871822e-01  8.08300674e-01  2.02285171e-01 -1.03987694e-01
  -6.54103756e-02  1.85006186e-01  1.68155682e+00 -7.26262033e-01
  -1.84736520e-01  2.17724860e-01  4.73328799e-01  4.11453575e-01
  -7.96028599e-02 -1.17921531e-01  1.30215958e-01 -8.42964530e-01
  -9.38609660e-01  6.80459619e-01 -2.98420817e-01 -5.27374074e-03
   2.57196605e-01 -1.54989108e-01 -4.19207782e-01 -1.95279121e-01
   1.44028161e-02 -4.65769544e-02 -1.33877909e+00 -2.60577798e-01
  -7.06566989e-01 -3.63171935e-01  4.91985619e-01  2.65001446e-01
  -1.33006465e+00  1.69471800e-01 -1.91534555e+00  8.07114661e-01
  -7.31507421e-01  8.81504193e-02 -4.68643159e-01 -7.56173193e-01
   6.11125886e-01  7.90206909e-01  7.87295699e-01 -6.94702789e-02
  -4.91978019e-01 -5.86362302e-01 -5.81007123e-01 -4.01108153e-02
   2.47322

In [13]:
# Same question as before with 'what' swapped for 'which', encoded as a
# single-sentence batch with the already-loaded encoder.
sentences = ['which is the best memory of your student journey?']
sentence_embeddings = model.encode(sentences)

# Header, shape of the first embedding, then the full result -- one per line.
print("Sentence embeddings:", sentence_embeddings[0].shape, sentence_embeddings, sep='\n')

Sentence embeddings:
(768,)
[[-8.18199635e-01 -2.82008611e-02  8.58250976e-01  1.22131273e-01
   4.49485391e-01  4.00017381e-01  2.88126647e-01  5.03261805e-01
   2.59323061e-01 -5.41435242e-01  3.10176253e-01  5.57084501e-01
   7.47518480e-01 -1.58451393e-01  4.47428301e-02  3.46482009e-01
   5.01102619e-02 -3.05123210e-01 -1.67337462e-01 -1.14255738e+00
  -7.32896984e-01  3.42328429e-01  4.43528630e-02  1.07326642e-01
   1.67283341e-01  7.07371652e-01  3.26046109e-01 -2.41490215e-01
   3.44227135e-01  7.59545445e-01  1.03299327e-01 -3.95222038e-01
  -2.42410064e-01 -1.23837143e-02 -6.82193279e-01  6.17374301e-01
   2.61127800e-01  1.16725489e-01  2.81482041e-01 -1.90263465e-02
  -2.29176378e+00 -5.57998598e-01  7.30817020e-01  3.14118832e-01
  -8.37596357e-01  1.34345785e-01 -3.18091536e+00  7.16151178e-01
   2.66260237e-01 -1.85214028e-01 -4.70405728e-01  8.11600506e-01
   8.80625486e-01  9.20101702e-01 -7.68406451e-01 -6.37185752e-01
   7.15440333e-01 -1.62982118e+00 -4.06814158e-0