# Basic NLP
Natural Language Processing

## Todo
Read sentences and classify each one as negative or positive

```python
original_x = [
    '너 오늘 이뻐 보인다.',
    '나는 오늘 기분이 더러워',
    '끝내주는데, 좋은 일 있나봐?',
    '나 좋은 일 생겼어',
    '아 오늘 진짜 짜증나!',
    '환상적인데, 정말 좋은거 같아'    
]
original_y = [[1], [0], [1], [1], [0], [1]]

question_x = [
    '나 오늘 기분 좋아',
    '아 진짜 환상적이야'
]
question_y = [[?], [?]]
```

## Preparation

### Load the Library

In [1]:
import tensorflow as tf
import numpy as np
from datetime import datetime

tf.__version__

'2.7.4'

### Prepare Training Data

In [53]:
# Training sentences (Korean); each entry is one raw, untokenized sentence.
original_x = [
    '너 오늘 이뻐 보인다.',
    '나는 오늘 기분이 더러워',
    '끝내주는데, 좋은 일 있나봐?',
    '나 좋은 일 생겼어',
    '아 오늘 진짜 짜증나!',
    '환상적인데, 정말 좋은거 같아'    
]
# One label per sentence, in the same order as original_x.
# NOTE(review): 1 appears to mean positive and 0 negative, per the Todo section -- confirm.
original_y = [[1], [0], [1], [1], [0], [1]]

# Unlabeled sentences to classify after training.
question_x = [
    '나 오늘 아주 좋은',
    '아 너는 진짜 더러워'
]

#### Preprocessing

##### Tokenizer
Create a dictionary using words from the training data

In [3]:
# Build the word -> index dictionary from the training sentences only.
# No `oov_token` is configured, so words that were not seen here are silently
# dropped when other texts are later converted with this tokenizer.
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(original_x)

In [4]:
len(tokenizer.word_index)

20

In [5]:
tokenizer.word_index

{'오늘': 1,
 '좋은': 2,
 '일': 3,
 '너': 4,
 '이뻐': 5,
 '보인다': 6,
 '나는': 7,
 '기분이': 8,
 '더러워': 9,
 '끝내주는데': 10,
 '있나봐': 11,
 '나': 12,
 '생겼어': 13,
 '아': 14,
 '진짜': 15,
 '짜증나': 16,
 '환상적인데': 17,
 '정말': 18,
 '좋은거': 19,
 '같아': 20}

In [6]:
tokenizer.word_counts

OrderedDict([('너', 1),
             ('오늘', 3),
             ('이뻐', 1),
             ('보인다', 1),
             ('나는', 1),
             ('기분이', 1),
             ('더러워', 1),
             ('끝내주는데', 1),
             ('좋은', 2),
             ('일', 2),
             ('있나봐', 1),
             ('나', 1),
             ('생겼어', 1),
             ('아', 1),
             ('진짜', 1),
             ('짜증나', 1),
             ('환상적인데', 1),
             ('정말', 1),
             ('좋은거', 1),
             ('같아', 1)])

##### Text to Numeric
Convert each sentence into a sequence of integer word indices using the dictionary

In [7]:
# Replace every word with its dictionary index: one list of ints per sentence.
original_num = tokenizer.texts_to_sequences(original_x)
original_num

[[4, 1, 5, 6],
 [7, 1, 8, 9],
 [10, 2, 3, 11],
 [12, 2, 3, 13],
 [14, 1, 15, 16],
 [17, 18, 19, 20]]

In [8]:
# Pad to a rectangular integer matrix instead of calling np.array directly:
# np.array only produces a usable 2-D array when every sentence has the same
# number of tokens. Shorter sentences are right-padded with 0, which the
# tokenizer never assigns to a word (its indices start at 1).
train_x = tf.keras.preprocessing.sequence.pad_sequences(original_num, padding="post")
train_y = np.array(original_y)

##### Save Dictionary

In [9]:
# Copy the mapping so that adding the padding entry does not mutate the
# tokenizer's own `word_index`.
vocab_data = dict(tokenizer.word_index)
vocab_data['<PAD>'] = 0   # Index 0 is reserved for padding; the tokenizer's word indices start at 1

### Define Global Variables

In [10]:
# Training hyperparameters.
EPOCHS = 10
BATCH_SIZE = 2

# The embedding table needs (largest index + 1) rows. `vocab_data` already
# contains the '<PAD>' -> 0 entry, so `len(vocab_data) + 1` over-counts by one;
# derive the size from the largest index instead.
VOCAB_SIZE = max(vocab_data.values()) + 1

## Define Model

In [23]:
class SimpleNLP(tf.keras.Model):
    """Binary sentence classifier: embed tokens, average them, apply two dense layers."""

    def __init__(self):
        """Create the embedding, hidden and output layers."""
        super().__init__(name="simple_nlp")
        # Maps each token index to a 128-dimensional vector.
        self.i = tf.keras.layers.Embedding(input_dim=VOCAB_SIZE, output_dim=128)
        self.f1 = tf.keras.layers.Dense(256, activation="relu")
        # Single sigmoid unit: one score per input sequence.
        self.o = tf.keras.layers.Dense(1, activation="sigmoid")

    def call(self, inputs):
        """Run the forward pass and return the sigmoid output.

        In this notebook `inputs` is an integer array with one row of token
        indices per sentence.
        """
        embedded = self.i(inputs)
        # Average the embeddings along axis 1 to get one vector per sentence.
        pooled = tf.reduce_mean(embedded, axis=1)
        hidden = self.f1(pooled)
        return self.o(hidden)
    
# Instantiate the classifier and configure the optimizer, loss and metric.
model = SimpleNLP()
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

## Training Model

In [24]:
# Train on the labelled sentences using the global epoch and batch settings.
model.fit(x=train_x, y=train_y, batch_size=BATCH_SIZE, epochs=EPOCHS)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f8524375a60>

## Evaluate Model

In [58]:
question_x

['나 오늘 아주 좋은', '아 너는 진짜 더러워']

In [59]:
# Test sentences must be converted with the tokenizer that was fitted on the
# training data so that word indices match. Words it has never seen are
# dropped, so a converted sentence can be shorter than the original.
question_num = tokenizer.texts_to_sequences(question_x)
question_num

[[12, 1, 2], [14, 15, 9]]

In [60]:
# Pad instead of calling np.array directly: np.array only yields a rectangular
# integer array when all converted sentences have the same number of tokens,
# which is not guaranteed because unknown words are dropped during conversion.
test_x = tf.keras.preprocessing.sequence.pad_sequences(question_num, padding="post")
pred_y = model.predict(test_x)
pred_y

array([[0.58491594],
       [0.38955677]], dtype=float32)

: 