<a href="https://colab.research.google.com/github/godpeny/laboratory/blob/master/Study/Deep_Learning_for_Everyone/Chapter_5/embedding_for_nlp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# install
%pip install pandas
%pip install numpy
%pip install tensorflow
%pip install sklearn
%pip install matplotlib

Collecting sklearn
  Downloading sklearn-0.0.post9.tar.gz (3.6 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: sklearn
  Building wheel for sklearn (setup.py) ... [?25l[?25hdone
  Created wheel for sklearn: filename=sklearn-0.0.post9-py3-none-any.whl size=2952 sha256=a460c9bf226d09e6d2201c7877d40684cca651c6b4a9e8f5f3b608222463833f
  Stored in directory: /root/.cache/pip/wheels/33/a3/d2/092b519e9522b4c91608b7dcec0dd9051fa1bff4c45f4502d1
Successfully built sklearn
Installing collected packages: sklearn
Successfully installed sklearn-0.0.post9


In [30]:
# import
import pandas as pd
import numpy as np

from google.colab import drive

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import text_to_word_sequence, Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt

In [16]:
"""
Tokenizer
"""
docs = [
    "Sometimes to understand a word's meaning you need more than a definition you need to see the word used in a sentence.",
    "At YourDictionary, we give you the tools to learn what a word means and how to use it correctly.",
    "With this sentence maker, simply type a word in the search bar and see a variety of sentences with that word used in its different ways.",
    "Our sentence generator can provide more context and relevance, ensuring you use a word the right way."
    ]

token = Tokenizer()
token.fit_on_texts(docs)
print(token.document_count)
print(token.word_counts)
print(token.word_docs) # count words on docs
print(token.word_index) # index

4
OrderedDict([('sometimes', 1), ('to', 4), ('understand', 1), ('a', 7), ("word's", 1), ('meaning', 1), ('you', 4), ('need', 2), ('more', 2), ('than', 1), ('definition', 1), ('see', 2), ('the', 4), ('word', 5), ('used', 2), ('in', 3), ('sentence', 3), ('at', 1), ('yourdictionary', 1), ('we', 1), ('give', 1), ('tools', 1), ('learn', 1), ('what', 1), ('means', 1), ('and', 3), ('how', 1), ('use', 2), ('it', 1), ('correctly', 1), ('with', 2), ('this', 1), ('maker', 1), ('simply', 1), ('type', 1), ('search', 1), ('bar', 1), ('variety', 1), ('of', 1), ('sentences', 1), ('that', 1), ('its', 1), ('different', 1), ('ways', 1), ('our', 1), ('generator', 1), ('can', 1), ('provide', 1), ('context', 1), ('relevance', 1), ('ensuring', 1), ('right', 1), ('way', 1)])
defaultdict(<class 'int'>, {'a': 4, 'you': 3, 'need': 1, 'used': 2, 'more': 2, 'than': 1, 'understand': 1, "word's": 1, 'definition': 1, 'the': 4, 'word': 4, 'to': 2, 'sentence': 3, 'in': 2, 'sometimes': 1, 'meaning': 1, 'see': 2, 'yourdi

In [20]:
"""
One-hot-encodding
"""
text = "Sometimes to understand a word's meaning you need more than a definition you need to see the word used in a sentence."
token = Tokenizer()
token.fit_on_texts([text])
print(token.word_index)

x = token.texts_to_sequences([text]) # text is converted into its word index list
print(x)

word_len = len(token.word_index) + 1 # 'token.word_index' begins with 1 so make 0 index.
x = to_categorical(x, num_classes=word_len)
print(x)

{'a': 1, 'to': 2, 'you': 3, 'need': 4, 'sometimes': 5, 'understand': 6, "word's": 7, 'meaning': 8, 'more': 9, 'than': 10, 'definition': 11, 'see': 12, 'the': 13, 'word': 14, 'used': 15, 'in': 16, 'sentence': 17}
[[5, 2, 6, 1, 7, 8, 3, 4, 9, 10, 1, 11, 3, 4, 2, 12, 13, 14, 15, 16, 1, 17]]
[[[0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
  [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 1. 

In [40]:
"""
Embedding
"""
# positive & negative reviews
reviews = [
  "stunning visuals and a gripping performance",
  "a cinematic masterpiece with a moving story",
  "intriguing plot twists and impeccable acting",
  "a visual treat with a compelling narrative",
  "innovative, memorable, and utterly captivating",
  "predictable plot and one-dimensional characters",
  "lacks depth, originality, and excitement",
  "unconvincing acting and poorly written scrip",
  "the pacing was slow and the story, uninspiring",
  "disappointing execution and lackluster performance"
]

# evals 1 is positive and 0 is negative
evals = np.array([1,1,1,1,1,0,0,0,0,0])

# tokenize
token = Tokenizer()
token.fit_on_texts(reviews)
print(token.word_index)

# convert into index slice
x = token.texts_to_sequences(reviews)
print(x)

# padding (data element length should be same to be processed)
longest = 0;
for i in x:
  if len(i) > longest:
    longest = len(i)

padded_x = pad_sequences(x, longest)
print(padded_x)

# use embedding for modeling
input_len = len(token.word_index) + 1
model = Sequential()
model.add(Embedding(input_dim=input_len, output_dim=8, input_length=longest)) # pick the value of output_dim.
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
model.summary()

model.compile(loss='binary_crossentropy', optimizer='adam', metrics='accuracy')

# train model
history = model.fit(padded_x, evals, epochs=20, batch_size=5)

# evaluate model
model_eval = model.evaluate(padded_x, evals)
print(model_eval)

{'and': 1, 'a': 2, 'performance': 3, 'with': 4, 'story': 5, 'plot': 6, 'acting': 7, 'the': 8, 'stunning': 9, 'visuals': 10, 'gripping': 11, 'cinematic': 12, 'masterpiece': 13, 'moving': 14, 'intriguing': 15, 'twists': 16, 'impeccable': 17, 'visual': 18, 'treat': 19, 'compelling': 20, 'narrative': 21, 'innovative': 22, 'memorable': 23, 'utterly': 24, 'captivating': 25, 'predictable': 26, 'one': 27, 'dimensional': 28, 'characters': 29, 'lacks': 30, 'depth': 31, 'originality': 32, 'excitement': 33, 'unconvincing': 34, 'poorly': 35, 'written': 36, 'scrip': 37, 'pacing': 38, 'was': 39, 'slow': 40, 'uninspiring': 41, 'disappointing': 42, 'execution': 43, 'lackluster': 44}
[[9, 10, 1, 2, 11, 3], [2, 12, 13, 4, 2, 14, 5], [15, 6, 16, 1, 17, 7], [2, 18, 19, 4, 2, 20, 21], [22, 23, 1, 24, 25], [26, 6, 1, 27, 28, 29], [30, 31, 32, 1, 33], [34, 7, 1, 35, 36, 37], [8, 38, 39, 40, 1, 8, 5, 41], [42, 43, 1, 44, 3]]
[[ 0  0  9 10  1  2 11  3]
 [ 0  2 12 13  4  2 14  5]
 [ 0  0 15  6 16  1 17  7]
 [ 0 

# Embedding
## Word embedding using ELMo
http://www.realworldnlpbook.com/blog/improving-sentiment-analyzer-using-elmo.html

https://wikidocs.net/33930
