In [1]:
# Binary classification on the sarcasm.json headline dataset
# 26,709 headlines in total

In [3]:
import json
import urllib
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [4]:
# Download the sarcasm.json dataset file (written for Windows).
# A bare `import urllib` does not load the `request` submodule, so it is
# imported explicitly here rather than relying on some other package having
# imported it as a side effect.
import urllib.request

url = 'https://storage.googleapis.com/download.tensorflow.org/data/sarcasm.json'
urllib.request.urlretrieve(url, 'sarcasm.json')

('sarcasm.json', <http.client.HTTPMessage at 0x2ac07e18190>)

In [5]:
# Load the downloaded data file.
# The encoding is passed explicitly: without it open() falls back to the
# platform locale, which can fail on or mis-decode non-ASCII text.
with open('sarcasm.json', 'r', encoding='utf-8') as f:
    datastore = json.load(f)

# Collect the headline text and its sarcasm label into two parallel lists,
# one entry per record (26,709 records in this dataset).
sentences = [item['headline'] for item in datastore]
labels = [item['is_sarcastic'] for item in datastore]

In [16]:
# View the data as a DataFrame.
import pandas as pd

# Keep the two columns of interest by name instead of dropping the first
# column by position, so the result does not depend on the JSON key order.
df = pd.DataFrame(datastore)[['headline', 'is_sarcastic']]

print(type(datastore[0]))  # <class 'dict'>
print(df.shape)            # (26709, 2)
print(df['is_sarcastic'].value_counts())   # 0    14985 : not sarcastic
                                           # 1    11724 : sarcastic
df.head(10)

<class 'dict'>
(26709, 2)
is_sarcastic
0    14985
1    11724
Name: count, dtype: int64


Unnamed: 0,headline,is_sarcastic
0,former versace store clerk sues over secret 'b...,0
1,the 'roseanne' revival catches up to our thorn...,0
2,mom starting to fear son's web series closest ...,1
3,"boehner just wants wife to listen, not come up...",1
4,j.k. rowling wishes snape happy birthday in th...,0
5,advancing the world's women,0
6,the fascinating case for eating lab-grown meat,0
7,"this ceo will send your kids to school, if you...",0
8,top snake handler leaves sinking huckabee camp...,1
9,friday's morning email: inside trump's presser...,0


### 텍스트 전처리: Tokenizer

In [19]:
# Settings used by the preprocessing and model cells.

# --- data split ---
training_size = 20_000  # number of samples used for training

# --- tokenization ---
vocab_size = 10_000     # maximum number of words used for tokenization
oov_tok = "<OOV>"       # token for out-of-vocabulary words

# --- sequence shaping ---
max_length = 100        # length of one sequence (maxlen, T: sequence length)
padding_type = 'post'   # where zeros are added when shorter than maxlen; 'post' = at the end
trunc_type = 'post'     # where to cut when longer than maxlen; 'post' = at the end

# --- model ---
embedding_dim = 16      # output size of the Embedding layer

In [20]:
# Split into train (the first `training_size` records) and test (the rest):
# 20000 / 6709 for this dataset.
training_sentences, testing_sentences = (
    sentences[:training_size],
    sentences[training_size:],
)
training_labels, testing_labels = (
    labels[:training_size],
    labels[training_size:],
)

In [42]:
# Run the tokenizer to convert words into integer indices, then pad every
# headline to a fixed length.

# Tokenize with the vocabulary capped via num_words=vocab_size; oov_tok is
# the token used for out-of-vocabulary words.
tokenizer = Tokenizer(oov_token=oov_tok, num_words=vocab_size)

# Build the word index from the training sentences only.
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index


def _encode_and_pad(texts):
    """Integer-encode ``texts`` with the fitted tokenizer and pad the result.

    Returns a ``(sequences, padded)`` pair: the lists of integer indices and
    the array produced by ``pad_sequences`` with the notebook's
    ``max_length`` / ``padding_type`` / ``trunc_type`` settings.
    """
    encoded = tokenizer.texts_to_sequences(texts)
    padded_array = pad_sequences(
        encoded,
        maxlen=max_length,
        padding=padding_type,
        truncating=trunc_type,
    )
    return encoded, padded_array


# Training data: integer encoding, then padding -> the X fed to the network.
training_sequences, training_padded = _encode_and_pad(training_sentences)

# Test data: same encoding and padding, using the tokenizer fitted above.
testing_sequences, testing_padded = _encode_and_pad(testing_sentences)

print(training_padded.shape)  # (20000, 100)
print(testing_padded.shape)   # (6709, 100)

(20000, 100)
(6709, 100)


In [44]:
# Convert the label lists to NumPy arrays.
import numpy as np
training_labels, testing_labels = (
    np.array(training_labels),   # (20000,)
    np.array(testing_labels),    # (6709,)
)

#### 학습 모델

In [50]:
# Stacked-LSTM binary classifier over the padded index sequences.
model = tf.keras.Sequential([
    # X: (20000, 100) for the training set. The sequence length comes from
    # max_length (not a literal) so the input always matches the
    # pad_sequences(maxlen=max_length) output.
    tf.keras.layers.Input(shape=(max_length,)),

    # W: (vocab_size, embedding_dim) = (10000, 16), Param: 160,000
    tf.keras.layers.Embedding(vocab_size, embedding_dim),  # (N,T,D), (N,100,16)
    # return_sequences=True keeps the per-timestep outputs for the next LSTM.
    tf.keras.layers.LSTM(64, return_sequences=True),       # (N,T,H), (N,100,64)
    tf.keras.layers.LSTM(32),                              # (N,H)
    tf.keras.layers.Dense(64, activation='relu'),
    # Single sigmoid unit for the binary (sarcastic / not sarcastic) label.
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

model.summary()