<a href="https://colab.research.google.com/github/a-forty-two/DataSetsForML/blob/master/12_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# AGENDA FOR THE DAY 
# how exactly is NLP dictionaries and logic created?
# REST of the week is dedicated to NLP
# DataBricks -> Big Data -> how to access data using Apache Spark
# PyTorch -> another dl framework besides TensorFlow, its ops are very much like NumPy
# Dynamic Computing Graphs (ALSO A DAG, but better) in PyTorch rather than DAG
# As a result, graphs build on-the-go (dynamically)

In [0]:
# LISTEN and SILENT -> both have same spelling but opposite meanings!
# ASCII or UNICODE -> that would have been char by char encoding 
# WORDS are better 

In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
sentences = ['I am a good boy', 'today is a beautiful day', 'good is beautiful', 'today is good', "Today's milk is spoilt", "Spoilt, are you?"]


In [0]:
myDict = Tokenizer(num_words=100)
myDict.fit_on_texts(sentences) # LABEL ENCODING using FREQUENCY 
word_index = myDict.word_index
data = {'beautiful dog! Is day yours?', "milk is spoilt"}
text_encoded = myDict.texts_to_sequences(data)
text_encoded

In [0]:
word_index

{'a': 3,
 'am': 8,
 'are': 13,
 'beautiful': 5,
 'boy': 9,
 'day': 10,
 'good': 2,
 'i': 7,
 'is': 1,
 'milk': 12,
 'spoilt': 6,
 'today': 4,
 "today's": 11,
 'you': 14}

In [0]:
data = {'beautiful dog! Is day yours?', "milk is spoilt"}
text_encoded = myDict.texts_to_sequences(data)
text_encoded

[[12, 1, 6], [5, 1, 10]]

In [2]:
# LOSING LENGTH OF DATA CAN CHANGE THE MEANING OF DATA!!! 
myDict = Tokenizer(num_words=100, oov_token='<UNK>') # OUT OF VOCABULARY TOKEN 
myDict.fit_on_texts(sentences) # LABEL ENCODING using FREQUENCY 
word_index = myDict.word_index
data = {'beautiful dog! Is day yours?', "milk is spoilt"}
text_encoded = myDict.texts_to_sequences(data)
text_encoded
# WORD SEQUENCES 

[[13, 2, 7], [6, 1, 2, 11, 1]]

In [3]:
word_index

{'<UNK>': 1,
 'a': 4,
 'am': 9,
 'are': 14,
 'beautiful': 6,
 'boy': 10,
 'day': 11,
 'good': 3,
 'i': 8,
 'is': 2,
 'milk': 13,
 'spoilt': 7,
 'today': 5,
 "today's": 12,
 'you': 15}

In [4]:
# MAKE ALL SENTENCES EQUAL IN SIZE -> NORMALIZATION OF LENGTH -> PADDING
from tensorflow.keras.preprocessing.sequence import pad_sequences
sequences = pad_sequences(text_encoded)
sequences

array([[ 0,  0, 13,  2,  7],
       [ 6,  1,  2, 11,  1]], dtype=int32)

In [6]:
# MAKE ALL SENTENCES EQUAL IN SIZE -> NORMALIZATION OF LENGTH -> PADDING
# SHOULD BE ON end for easier and optimized training 
from tensorflow.keras.preprocessing.sequence import pad_sequences
sequences = pad_sequences(text_encoded, padding='post', maxlen=16, truncating='post') # truncating -> control how to del sentences
sequences

array([[13,  2,  7,  0,  0,  0,  0,  0,  0,  0],
       [ 6,  1,  2, 11,  1,  0,  0,  0,  0,  0]], dtype=int32)

In [9]:
# Embedding -> Global Average Pooling (Average sentiment) -> Pattern Detection
# Embedding -> Global Average Pooling (Average sentiment) -> Dense(ReLU) -> Dense(Sigmoid)
# Model 2
# EVERY WORD its own sentiment -> Instead of average, just look up vectors from embdedding layer, flatten them,
# detect pattern 

# Prev-> sentiment analysis of sentence
# New -> sentiment analysis per word (WORD2WORD SEQUENCES)

vocab_size = 100
embed_dim = 16
MAXLEN = 16

from tensorflow.keras.preprocessing.sequence import pad_sequences
sequences = pad_sequences(text_encoded, padding='post', maxlen=MAXLEN, truncating='post')
# 15,000 training samples -> imdb has 
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(100, 16, input_length=10))
model.add(tf.keras.layers.Flatten()) # 16 vectors will be flattened to 1 dimension
model.add(tf.keras.layers.Dense(10, activation=tf.nn.relu))
model.add(tf.keras.layers.Dense(1, activation=tf.nn.sigmoid))
model.summary()


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 10, 16)            1600      
_________________________________________________________________
flatten (Flatten)            (None, 160)               0         
_________________________________________________________________
dense (Dense)                (None, 10)                1610      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 11        
Total params: 3,221
Trainable params: 3,221
Non-trainable params: 0
_________________________________________________________________
