# Basic NLP (Natural Language Processing)

## Todo:
* Read sentences and classify each one as positive or negative

## Preparation

### Load the Library

In [1]:
import tensorflow as tf
import numpy as np

from utils.support_tf import LogLevelManager as llm

In [2]:
# Set the TensorFlow log verbosity through the project helper LogLevelManager.
# Level 2 displays warnings and errors (per the original author's note).
# NOTE(review): the exact level semantics live in utils.support_tf - confirm there.
llm.set(2)

### Prepare Training Data
> Caution:
* Each input sample is a sentence of exactly 4 words (separated by 3 spaces).

#### Get Source Data

In [3]:
# Labelled training sentences as (sentence, label) pairs: 1-positive, 0-negative.
# Keeping each sentence next to its label avoids the two lists drifting apart.
labelled_sentences = [
    ('너 오늘 이뻐 보인다.', 1),
    ('나는 오늘 기분이 더러워', 0),
    ('끝내주는데, 좋은 일 있나봐?', 1),
    ('나 좋은 일 생겼어', 1),
    ('아 오늘 진짜 짜증나!', 0),
    ('환상적인데, 정말 좋은거 같아', 1),
]
original_x = [sentence for sentence, _ in labelled_sentences]
original_y = [[label] for _, label in labelled_sentences]

# Unlabelled sentences the trained model is asked to classify later on.
question_x = ['나 오늘 아주 좋은', '아 너는 진짜 더러워']

#### Preprocessing

##### Tokenizing
* Create a dictionary using words from the training data.

In [4]:
# Build the word dictionary from the training sentences only;
# the question sentences are not part of the vocabulary.
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(original_x)

In [5]:
# Report how many distinct words the tokenizer counted while fitting.
distinct_word_total = len(tokenizer.word_counts)
print(f"Word Count: {distinct_word_total}")

Word Count: 20


In [6]:
# Display the word -> integer index dictionary built by the tokenizer.
tokenizer.word_index

{'오늘': 1,
 '좋은': 2,
 '일': 3,
 '너': 4,
 '이뻐': 5,
 '보인다': 6,
 '나는': 7,
 '기분이': 8,
 '더러워': 9,
 '끝내주는데': 10,
 '있나봐': 11,
 '나': 12,
 '생겼어': 13,
 '아': 14,
 '진짜': 15,
 '짜증나': 16,
 '환상적인데': 17,
 '정말': 18,
 '좋은거': 19,
 '같아': 20}

##### Text To Numeric

Convert each sentence into a sequence of integer word indices using the dictionary.

In [7]:
# Replace every word of each training sentence with its dictionary index,
# then display the resulting list of index sequences.
original_num = tokenizer.texts_to_sequences(original_x)
original_num

[[4, 1, 5, 6],
 [7, 1, 8, 9],
 [10, 2, 3, 11],
 [12, 2, 3, 13],
 [14, 1, 15, 16],
 [17, 18, 19, 20]]

In [9]:
# Inspect the container types of the encoded sentences and the labels
# before they are handed to NumPy.
type(original_num), type(original_y)

(list, list)

In [10]:
# Turn the index sequences and the labels into NumPy arrays for Keras,
# then display the resulting container types.
train_x = np.array(original_num)
train_y = np.array(original_y)
(type(train_x), type(train_y))

(numpy.ndarray, numpy.ndarray)

##### Save Dictionary & Append PAD
* `<PAD>` - padding token (index 0) used to fill sentences shorter than the fixed input length

In [11]:
# Save the dictionary together with the padding entry.
# Build a NEW dict instead of aliasing tokenizer.word_index: assigning
# vocab_data['<PAD>'] on the alias would silently insert '<PAD>' into the
# tokenizer's own dictionary and make earlier cells non-reproducible on re-run.
# Index 0 is free for padding because the tokenizer's indices start at 1.
vocab_data = {**tokenizer.word_index, '<PAD>': 0}

In [12]:
# Display the saved dictionary, including the '<PAD>' entry.
vocab_data

{'오늘': 1,
 '좋은': 2,
 '일': 3,
 '너': 4,
 '이뻐': 5,
 '보인다': 6,
 '나는': 7,
 '기분이': 8,
 '더러워': 9,
 '끝내주는데': 10,
 '있나봐': 11,
 '나': 12,
 '생겼어': 13,
 '아': 14,
 '진짜': 15,
 '짜증나': 16,
 '환상적인데': 17,
 '정말': 18,
 '좋은거': 19,
 '같아': 20,
 '<PAD>': 0}

### Define Global Variables

In [54]:
# Number of passes over the training data (passed to fit as `epochs`).
EPOCHS = 10
# Number of sentences per gradient update (passed to fit as `batch_size`).
BATCH_SIZE = 2
# Size of the embedding table: largest dictionary index plus one,
# so that every index from 0 (<PAD>) up to the maximum has its own row.
EMBEDDING_MAX_VALUE = max(vocab_data.values()) + 1

## Model 

### Define

In [70]:
# Sentence-level sentiment classifier.
#
# Embedding turns each word index into a 32-dim vector, giving a tensor of
# shape (batch, words, 32). Without a pooling step the Dense layers would be
# applied to every word separately and the model would emit one probability
# per word, which does not match the single label per sentence in train_y.
# GlobalAveragePooling1D averages the word vectors into one sentence vector
# of shape (batch, 32), so the final sigmoid produces exactly one
# positive/negative probability per sentence. Averaging over the word axis
# also keeps the model usable for sentences of any length.
nlp_model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(input_dim=EMBEDDING_MAX_VALUE, output_dim=32),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(units=32, activation='relu'),
    tf.keras.layers.Dense(units=1, activation='sigmoid')
])
# Binary cross-entropy matches the single sigmoid output and the 0/1 labels.
nlp_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

: 

### Training

In [None]:
# NOTE(review): bare reference to tf.reduce_mean - it is not called and nothing
# is assigned, so this cell has no effect; it looks like leftover scratch work
# and can be deleted.
tf.reduce_mean

In [57]:
# Train the model on the encoded training sentences and their labels,
# using the epoch count and batch size from the global variables.
nlp_model.fit(train_x, train_y, epochs=EPOCHS, batch_size=BATCH_SIZE)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f80d80d5f70>

### Evaluate

In [63]:
# Encode the question sentences with the dictionary learned from the training data.
question_num = tokenizer.texts_to_sequences(question_x)

# The question sentences contain words that are not in the training dictionary
# (e.g. '아주', '너는'); those words have no index, so the encoded sequences can
# be shorter than the training sentences and may differ in length from each
# other, in which case np.array() would not yield a rectangular matrix.
# Pad (or truncate) every sequence to the training sentence length using the
# <PAD> index so the model always receives input shaped like train_x.
test_x = tf.keras.preprocessing.sequence.pad_sequences(
    question_num,
    maxlen=train_x.shape[1],
    padding='post',
    truncating='post',
    value=vocab_data['<PAD>'],
)
pred_y = nlp_model.predict(test_x)

In [68]:
# Print each question sentence with its predicted sentiment.
# The mean of the prediction row is compared with 0.5: above it the sentence
# is reported as "긍정" (positive), otherwise as "부정" (negative).
for idx, scores in enumerate(pred_y):
    sentiment = "긍정" if np.mean(scores) > .5 else "부정"
    print(f'{question_x[idx]} => {sentiment}')

나 오늘 아주 좋은 => 긍정
아 너는 진짜 더러워 => 부정
