In [5]:
import csv
import json
import urllib
import urllib.request

import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [22]:
# Goal: decide from a headline whether it is sarcastic or not.
# Download the dataset next to the notebook. This needs `urllib.request` to be
# imported explicitly; a bare `import urllib` does not load the submodule.
url = 'https://storage.googleapis.com/download.tensorflow.org/data/sarcasm.json'
urllib.request.urlretrieve(url, 'sarcasm.json')

# Parameter settings shared by the tokenizer, padding and model cells.
vocab_size = 1000        # passed to Tokenizer(num_words=...) and used as the Embedding input size
embedding_dim = 100      # width of each embedding vector
max_length = 120         # every sequence is padded/truncated to this many tokens
trunc_type='post'        # cut tokens from the end of over-long sequences
padding_type='post'      # pad at the end of short sequences
oov_tok = "<OOV>"        # placeholder token for out-of-vocabulary words
training_size = 20000    # first N rows are used for training, the rest for validation

sentences = []
labels = []
# Be explicit about the encoding instead of relying on the platform default,
# which is not UTF-8 everywhere.
with open("sarcasm.json", encoding="utf-8") as file:
    data = json.load(file)

In [14]:
# Build the lists from `data` in one step instead of appending to lists created
# in another cell. Appending made this cell non-idempotent: running it twice
# duplicated every row and pushed copies of training headlines into the
# validation slice.
sentences = [row['headline'] for row in data]
labels = [row['is_sarcastic'] for row in data]

# Split into train and test (validation) sets by position.
training_sentences = sentences[0:training_size]
testing_sentences = sentences[training_size:]
training_labels = labels[0:training_size]
testing_labels = labels[training_size:]

# num_words limits how many words the tokenizer keeps when converting text:
# with vocab_size = 1000 only indices below 1000 are emitted.
# oov_token stands in for words outside that vocabulary; the Keras Tokenizer
# assigns it index 1 (index 0 is reserved for padding).
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
# Fit on the training split only so the vocabulary does not see validation text.
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index  # shape of word_index: {'word': index}

# texts_to_sequences converts each word to its integer index.
training_sequences = tokenizer.texts_to_sequences(training_sentences)
training_padded = pad_sequences(training_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [16]:
# word_index contains every word seen by fit_on_texts; num_words is only
# applied later, in texts_to_sequences. Show the total size and a short sample
# instead of dumping the whole dictionary into the notebook output.
print(len(word_index))
print(list(word_index.items())[:20])



In [8]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove*.zip

--2021-09-20 11:39:28--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2021-09-20 11:39:28--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2021-09-20 11:39:28--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2021-0

In [28]:
# Parse the GloVe text file into {word: coefficient vector}.
# Each line is the word followed by its space-separated coefficients
# (presumably 100 per line for the 100d file, matching embedding_dim).
embeddings_index = {}
# The file contains non-ASCII tokens, so do not rely on the platform default encoding.
with open('glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# Row i holds the pretrained vector for the word with tokenizer index i.
# Rows for words missing from GloVe (and row 0, the padding index) stay all-zero.
embeddings_matrix = np.zeros((vocab_size, embedding_dim))

for word, i in word_index.items():
    # word_index is numbered past num_words, so out-of-range indices must be
    # skipped. `continue` (rather than `break`) keeps this correct even if the
    # dictionary is not iterated in ascending index order.
    if i >= vocab_size:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embeddings_matrix[i] = embedding_vector

In [30]:
# Binary sarcasm classifier on top of frozen pretrained embeddings.
model = tf.keras.Sequential([
    # Look up each token index in the pretrained matrix; trainable=False keeps the vectors fixed.
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length, weights=[embeddings_matrix], trainable=False),
    # Bidirectional concatenates the forward and backward outputs by default:
    # 2 * 64 = 128 features, emitted for every timestep.
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
    # Only the final output is returned here: 2 * 32 = 64 features per sample.
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    # The input is already 2-D (batch, features), so this does not change the shape;
    # kept so the architecture is unchanged.
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(6, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    # Single sigmoid unit for the binary label.
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Keras expects arrays; the label lists in particular must be converted.
training_padded = np.array(training_padded)
training_labels = np.array(training_labels)
testing_padded = np.array(testing_padded)
testing_labels = np.array(testing_labels)

model.fit(training_padded, training_labels, epochs=30, validation_data=(testing_padded, testing_labels), verbose=2)

if __name__ == '__main__':
    # Save the model that was just trained. The previous code called
    # solution_model(), which is not defined in this notebook: it raised
    # NameError after training finished and would have replaced the trained model.
    model.save("mymodel.h5")


Epoch 1/30
625/625 - 208s - loss: 0.5264 - accuracy: 0.7302 - val_loss: 0.4278 - val_accuracy: 0.7932
Epoch 2/30
625/625 - 202s - loss: 0.4354 - accuracy: 0.7975 - val_loss: 0.3944 - val_accuracy: 0.8222
Epoch 3/30
625/625 - 202s - loss: 0.4025 - accuracy: 0.8138 - val_loss: 0.3843 - val_accuracy: 0.8211
Epoch 4/30
625/625 - 202s - loss: 0.3761 - accuracy: 0.8305 - val_loss: 0.3519 - val_accuracy: 0.8398
Epoch 5/30
625/625 - 202s - loss: 0.3586 - accuracy: 0.8397 - val_loss: 0.3530 - val_accuracy: 0.8351
Epoch 6/30
625/625 - 202s - loss: 0.3375 - accuracy: 0.8486 - val_loss: 0.3193 - val_accuracy: 0.8592
Epoch 7/30
625/625 - 202s - loss: 0.3186 - accuracy: 0.8580 - val_loss: 0.3434 - val_accuracy: 0.8456
Epoch 8/30
625/625 - 202s - loss: 0.3005 - accuracy: 0.8680 - val_loss: 0.2981 - val_accuracy: 0.8684
Epoch 9/30
625/625 - 202s - loss: 0.2790 - accuracy: 0.8777 - val_loss: 0.2905 - val_accuracy: 0.8756
Epoch 10/30
625/625 - 202s - loss: 0.2591 - accuracy: 0.8863 - val_loss: 0.2862 - 

SyntaxError: ignored