In [1]:
"""
임포트
"""
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_datasets as tfds
import matplotlib.pyplot as plt
import re

"""
데이터 파싱
"""
values = []
def parse_file_content(root_path, is_summary, section, file_count):
    folder_sort = ["News Articles", "Summaries"]

    if is_summary:
        index = 1
    else:
        index = 0
   
    # 파일 열기
    file_name = ('%d' % file_count).zfill(3) + '.txt'
    path = "%s/%s/%s/%s" % (root_path, folder_sort[index], section, file_name)
    file = open(path, 'r')

    # 파일 읽어서 content에 넣기
    lines = file.readlines()
    content = ''
    for line in lines:
        content = content + ' ' + line.strip()

    # 파일 닫기
    file.close()

    # 파일 내용 반환
    return content
  
def parse_folder(root_path, section):
    folder_path = "%s/Summaries/%s/" % (root_path, section)
    file_list = os.listdir(folder_path)
    max_file_count = len(file_list)
  
    for i in range(1, max_file_count + 1):
        # Text 데이터 가져오기
        text = parse_file_content(root_path=root_path, is_summary=False, section=section, file_count=i)
    
        # Summary 데이터 가져오기
        summary = parse_file_content(root_path=root_path, is_summary=True, section=section, file_count=i)

        # values에 데이터 넣기
        values.append([text, summary])

# 모든 섹션들 데이터에 넣기
root_path = "./BBC News Summary"
parse_folder(root_path, "business")
parse_folder(root_path, "entertainment")
parse_folder(root_path, "politics")
parse_folder(root_path, "sport")
parse_folder(root_path, "tech")

# 데이터프레임 만들어 csv 파일로 저장
df = pd.DataFrame(values)
df.columns = ["Text", "Summary"]
df.to_csv("News.csv", index=False)

In [2]:
"""
전처리
"""
df = pd.read_csv("News.csv")

# print(df)
print("전체 데이터 갯수 :", len(df))
# print(df.isnull().sum())
print(df["Summary"][18]) # UTF-8 인코딩이 유로 표시를 '짙'으로 바꿈, 영어 빼고 제외시킬 예정

전체 데이터 갯수 : 2225
 "We need at least $20bn (짙10.6bn) in investment and part of this has to come as foreign direct investment," said Mr Maran.Potential foreign investors will however need government approval before they increase their stake beyond 49%, Mr Maran said.Communications Minister Dayanidhi Maran said that there is a need to fund the fast-growing mobile market.India has raised the limit for foreign direct investment in telecoms companies from 49% to 74%.Investment bank Morgan Stanley has forecast that India's mobile market is likely to grow by about 40% a year until 2007.


In [3]:
texts = []
for content in df['Text']:
    content = re.sub(r"([?.!,])", r" \1 ", content)
    content = re.sub("[^ A-Za-z?.!,$%]+", '', content)
    content = content.strip()
    texts.append(content)

summaries = []
for content in df['Summary']:
    content = re.sub(r"([?.!,])", r" \1 ", content)
    content = re.sub("[^ A-Za-z0-9?.!,$%]+", ' ', content)
    content = content.strip()
    summaries.append(content)
print(texts[0])
print(summaries[0])

tokenizer = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
    texts + summaries, target_vocab_size=2**13
)
START_TOKEN, END_TOKEN = [tokenizer.vocab_size], [tokenizer.vocab_size + 1]  # <sos> 와 <eos>
VOCAB_SIZE = tokenizer.vocab_size + 2

print("시작 토큰 번호 :", START_TOKEN)
print("종료 토큰 번호 :", END_TOKEN)
print("단어 집합의 크기 :", VOCAB_SIZE)


Ad sales boost Time Warner profit  Quarterly profits at US media giant TimeWarner jumped % to $ . bn m for the three months to December ,  from $m yearearlier .   The firm ,  which is now one of the biggest investors in Google ,  benefited from sales of highspeed internet connections and higher advert sales .  TimeWarner said fourth quarter sales rose % to $ . bn from $ . bn .  Its profits were buoyed by oneoff gains which offset a profit dip at Warner Bros ,  and less users for AOL .   Time Warner said on Friday that it now owns % of searchengine Google .  But its own internet business ,  AOL ,  had has mixed fortunes .  It lost  ,  subscribers in the fourth quarter profits were lower than in the preceding three quarters .  However ,  the company said AOLs underlying profit before exceptional items rose % on the back of stronger internet advertising revenues .  It hopes to increase subscribers by offering the online service free to TimeWarner internet customers and will try to sign up

In [4]:
# print('임의의 질문 샘플을 정수 인코딩 : {}'.format(tokenizer.encode(texts[20])))

sample_string = texts[20]

tokenized_string = tokenizer.encode(sample_string)
print('정수 인코딩 후의 문장 : {}'.format(tokenized_string))

original_string = tokenizer.decode(tokenized_string)
print('기존 문장 : {}'.format(original_string))

for ts in tokenized_string:
    print('{} ----> {}'.format(ts, tokenizer.decode([ts])))

정수 인코딩 후의 문장 : [5556, 169, 147, 3, 1236, 179, 161, 1420, 10, 4273, 720, 320, 5556, 169, 71, 4293, 7903, 254, 3, 3755, 7325, 7903, 47, 161, 382, 2510, 7, 1236, 47, 519, 1230, 2, 1048, 5663, 21, 5556, 7978, 2, 4157, 4144, 4713, 12, 1, 2538, 46, 443, 3322, 2, 28, 3948, 39, 1, 7247, 1646, 270, 30, 1, 3027, 5, 47, 1549, 16, 1384, 2, 1, 1150, 6965, 5608, 4, 5210, 262, 531, 4753, 274, 507, 613, 9, 32, 6304, 3, 26, 2163, 36, 47, 3755, 7325, 36, 2739, 7988, 3127, 7903, 1312, 2510, 30, 7980, 2, 1, 334, 363, 4, 4739, 2540, 143, 5, 6, 805, 7247, 1646, 270, 20, 4607, 27, 215, 5556, 169, 564, 6, 2952, 92, 6, 805, 3755, 7325, 90, 8, 1541, 4, 3409, 1310, 3646, 9, 7357, 7903, 25, 255, 467, 79, 63, 1, 320, 20, 5384, 16, 7319, 2693, 7, 2578, 2800, 16, 6, 435, 1446, 2765, 1127, 45, 20, 347, 22, 4336, 714, 39, 5, 6, 144, 5, 1230, 133, 2, 251, 1, 7950, 501, 16, 4641, 847, 8, 7, 1, 2014, 1921, 6496, 7903, 5595, 714, 4, 14, 325, 433, 1, 320, 23, 141, 3512, 5930, 4890, 6717, 2, 4549, 7903, 4967, 7, 2739, 7988,

In [5]:
MAX_INPUT_LENGTH = 1000
MAX_OUTPUT_LENGTH = 600
def tokenize_and_filter(inputs, outputs):
    tokenized_inputs, tokenized_outputs = [], []

    # 시작 토큰(sos)과 종료 토큰(eos) 포함
    for (content1, content2) in zip(inputs, outputs):
        content1 = START_TOKEN + tokenizer.encode(content1) + END_TOKEN
        content2 = START_TOKEN + tokenizer.encode(content2) + END_TOKEN

        tokenized_inputs.append(content1)
        tokenized_outputs.append(content2)
    
    # 길이를 1000, 600으로 맞춘다. 더 짧은 배열은 뒤에 0을 추가한다.
    tokenized_inputs = tf.keras.preprocessing.sequence.pad_sequences(tokenized_inputs, maxlen=MAX_INPUT_LENGTH, padding='post')
    tokenized_outputs = tf.keras.preprocessing.sequence.pad_sequences(tokenized_outputs, maxlen=MAX_OUTPUT_LENGTH, padding='post')

    return tokenized_inputs, tokenized_outputs

tok_texts, tok_summaries = tokenize_and_filter(texts, summaries)

In [7]:
print("기사 데이터의 크기(shape) :", tok_texts.shape)
print("요약 데이터의 크기(shape) :", tok_summaries.shape)

print(summaries[0])
print(tok_summaries[0])

기사 데이터의 크기(shape) : (2225, 1000)
요약 데이터의 크기(shape) : (2225, 600)
TimeWarner said fourth quarter sales rose 2% to $11 . 1bn from $10 . 9bn . For the full year ,  TimeWarner posted a profit of $3 . 36bn ,  up 27% from its 2003 performance ,  while revenues grew 6 . 4% to $42 . 09bn . Quarterly profits at US media giant TimeWarner jumped 76% to $1 . 13bn  600m  for the three months to December ,  from $639m year earlier . However ,  the company said AOL s underlying profit before exceptional items rose 8% on the back of stronger internet advertising revenues . Its profits were buoyed by one off gains which offset a profit dip at Warner Bros ,  and less users for AOL . For 2005 ,  TimeWarner is projecting operating earnings growth of around 5% ,  and also expects higher revenue and wider profit margins . It lost 464 , 000 subscribers in the fourth quarter profits were lower than in the preceding three quarters . Time Warner s fourth quarter profits were slightly better than analysts  expec

In [12]:
BATCH_SIZE = 64
BUFFER_SIZE = 20000

dataset = tf.data.Dataset.from_tensor_slices((
    { 
        'enc_inputs': tok_texts,
        'dec_inputs': tok_summaries[:, :-1] # 마지막 패딩 0 제거
    },
    {
        'outputs': tok_summaries[:, 1:] # 시작 토큰 제거
    }
))

dataset = dataset.cache()
dataset = dataset.shuffle(BUFFER_SIZE)

test_dataset = dataset.take(49)   # 앞에서 49개를 테스트 데이터로 뺌
dataset = dataset.skip(49)   # 나머지 2176개가 학습 데이터가 됨

dataset = dataset.batch(BATCH_SIZE)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)  # 메모리 사용을 위한 prefetch

## transformer 아키텍처 구현

In [14]:
# 인코더&디코더(포지셔널 인코딩)
class PositionalEncoding(tf.keras.layers.Layer):
    def __init__(self, position, d_model):
        super(PositionalEncoding, self).__init__()
        self.pos_encoding = self.positional_encoding(position, d_model)

    def get_angles(self, position, i, d_model):
        return position * (1 / tf.pow(10000, (2 * (i // 2)) / tf.cast(d_model, tf.float32)))

    def positional_encoding(self, position, d_model):
        angle_rads = self.get_angles(
            position = tf.range(position, dtype = tf.float32)[:, tf.newaxis],
            i = tf.range(d_model, dtype = tf.float32)[tf.newaxis, :],
            d_model = d_model
        )
        sines = tf.math.sin(angle_rads[:, 0::2])
        cosines = tf.math.cos(angle_rads[:, 1::2])
        angle_rads = np.zeros(angle_rads.shape)
        angle_rads[:, 0::2] = sines # even index -> sin
        angle_rads[:, 1::2] = cosines # odd index  -> cos
        pos_encoding = tf.constant(angle_rads)
        pos_encoding = pos_encoding[tf.newaxis, ...]
        return tf.cast(pos_encoding, tf.float32)
    
    def call(self, inputs):
        return inputs + self.pos_encoding[:, :tf.shape(inputs)[1], :]

In [15]:
#인코더(스케일드 닷 프로덕트 어텐션)
def scaled_dot_product_attention(query, key, value, mask) :
    
    # split head 상태로 진행
    
    multiple_QandK = tf.matmul(query, key, transpose_b=True)
    depth = tf.cast(tf.shape(key)[-1], tf.float32)
    logits = multiple_QandK / tf.math.sqrt(depth)

    if mask is not None :
        logits += (mask * -1e10)
    attention_weights = tf.nn.softmax(logits, axis = -1)
    output = tf.matmul(attention_weights, value)
    return output, attention_weights
        

In [24]:
# 패딩 마스크 생성
def create_padding_mask(x):
    mask = tf.cast(tf.math.equal(x,0), tf.float32) # [[1,2,0,2,1]] => [[0.,0.,1.,0.,0.]]
    return mask[:,tf.newaxis, tf.newaxis, :] # 차원 추가

In [25]:
def create_look_ahead_mask(x):
    seq_len = tf.shape(x)[1] # [[1,2,0]] => 3
    look_ahead_mask = 1 - tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0) # 모든 원소가 1인 하삼각행렬
    padding_mask = create_padding_mask(x) # x에서 0이었던 부분만 1로 바꿔진 행렬
    return tf.maximum(look_ahead_mask, padding_mask)

In [27]:
class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, name="multi_head_attention"):
        super(MultiHeadAttention, self).__init__(name=name)
        self.num_heads = num_heads
        self.d_model = d_model
        
        assert d_model % self.num_heads == 0 # d_model 사이즈의 행렬을 num_heads로 나눠야하기 때문
        
        self.depth = d_model // self.num_heads
        
        # Q, K, V 정의 : d_model 길이의 밀집층(가중치 행렬)
        self.query_dense = tf.keras.layers.Dense(units=d_model) # WQ
        self.key_dense = tf.keras.layers.Dense(units=d_model) # WK
        self.value_dense = tf.keras.layers.Dense(units=d_model) #WV
        
        # WO
        self.dense = tf.keras.layers.Dense(units=d_model)
    
    # num_heads 개수로 행렬 나누기
    def split_heads(self, inputs, batch_size):
        inputs = tf.reshape(inputs, shape=(batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(inputs, perm=[0,2,1,3])
    
    def call(self, inputs):
        query, key, value, mask = inputs['query'], inputs['key'], inputs['value'], inputs['mask']
        batch_size = tf.shape(query)[0]
        #print(batch_size)

        # 신경망 지나기
        query = self.query_dense(query)
        key = self.key_dense(key)
        value = self.value_dense(value)

        # 헤드 나누기
        query = self.split_heads(query, batch_size)
        key = self.split_heads(key, batch_size)
        value = self.split_heads(value, batch_size)

        #스케일 닷 프로덕트
        scaled_attention, attention_weight = scaled_dot_product_attention(query, key, value, mask)

        scaled_attention = tf.transpose(scaled_attention, perm=[0,2,1,3])

        # concat
        concat_attention = tf.reshape(scaled_attention, (batch_size, -1, self.d_model))

        # WO 밀집층 지나기
        outputs = self.dense(concat_attention)

        return outputs

In [17]:
# 인코더 레이어
def encoder_layer(dff, d_model, num_heads, dropout, name = "encoder_layer") :
    
    inputs = tf.keras.Input(shape = (None, d_model), name = "inputs")
    padding_mask = tf.keras.Input(shape = (1, 1, None), name = "padding_mask")
    attention = MultiHeadAttention(
        d_model, num_heads, name = "attention")({
            'query': inputs,
            'key': inputs,
            'value': inputs,
            'mask': padding_mask
        })
    attention = tf.keras.layers.Dropout(rate=dropout)(attention)
    attention = tf.keras.layers.LayerNormalization(epsilon=1e-6)(inputs + attention)
    outputs = tf.keras.layers.Dense(units=dff, activation='relu')(attention)
    outputs = tf.keras.layers.Dense(units=d_model)(outputs)
    outputs = tf.keras.layers.Dropout(rate=dropout)(outputs)
    outputs = tf.keras.layers.LayerNormalization(epsilon=1e-6)(attention + outputs)
    
    return tf.keras.Model(inputs=[inputs, padding_mask], outputs=outputs, name=name)

In [18]:
# real 인코더
def encoder(
    vocab_size, 
    num_layers, 
    dff, d_model, 
    num_heads, 
    dropout, 
    name="encoder"):

    inputs = tf.keras.Input(shape=(None,), name="inputs")
    padding_mask = tf.keras.Input(shape=(1, 1, None), name="padding_mask")
    embeddings = tf.keras.layers.Embedding(vocab_size, d_model)(inputs)
    embeddings *= tf.math.sqrt(tf.cast(d_model, tf.float32))
    embeddings = PositionalEncoding(vocab_size, d_model)(embeddings)
    outputs = tf.keras.layers.Dropout(rate=dropout)(embeddings)

    for i in range(num_layers):
        outputs = encoder_layer(
            dff=dff, 
            d_model=d_model, 
            num_heads=num_heads,
            dropout=dropout,
            name="encoder_layer_{}".format(i))([outputs, padding_mask])
            
    return tf.keras.Model(inputs=[inputs, padding_mask], outputs=outputs, name=name)


## 디코더 구현

In [19]:
def decoder_layer(dff, d_model, num_heads, dropout, name="decoder_layer"):
    inputs = tf.keras.Input(shape=(None, d_model), name="inputs")
    enc_outputs = tf.keras.Input(shape=(None, d_model), name="encoder_outputs")
    
    # mask
    look_ahead_mask = tf.keras.Input(shape=(1, None, None), name="look_ahead_mask")
    padding_mask = tf.keras.Input(shape=(1, 1, None), name='padding_mask')
    
    # masked multi-head attention
    attention1 = MultiHeadAttention(d_model, num_heads, name="attention_1").call(inputs={"query":inputs, "key":inputs, "value":inputs, "mask":look_ahead_mask})

    # add, normorlization
    attention1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)(attention1 + inputs)

    # multi-head attention (encoder-decoder attention)
    attention2 = MultiHeadAttention(d_model, num_heads, name="attention_2").call(inputs={"query":attention1, "key":enc_outputs, "value":enc_outputs, "mask":padding_mask})

    # dropout, add, normorlization
    attention2 = tf.keras.layers.Dropout(rate=dropout)(attention2)
    attention2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)(attention2 + attention1)
    
    # feed forward (dense layer)
    outputs = tf.keras.layers.Dense(units=dff, activation="relu")(attention2)
    outputs = tf.keras.layers.Dense(units=d_model)(outputs)

    return tf.keras.Model(inputs=[inputs, enc_outputs, look_ahead_mask, padding_mask], outputs=outputs, name=name)

In [20]:
def decoder(vocab_size, num_layers, dff, d_model, num_heads, dropout, name="decoder"):
    inputs = tf.keras.Input(shape=(None,), name="inputs")
    enc_outputs = tf.keras.Input(shape=(None, d_model), name="encoder_outputs")

    look_ahead_mask = tf.keras.Input(shape=(1, None, None), name="look_ahead_mask")
    padding_mask = tf.keras.Input(shape=(1, 1, None), name="padding_mask")
    
    # output embedding
    embeddings = tf.keras.layers.Embedding(vocab_size, d_model)(inputs)
    embeddings *= tf.math.sqrt(tf.cast(d_model, tf.float32))
    embeddings = PositionalEncoding(vocab_size, d_model)(embeddings)
    outputs = tf.keras.layers.Dropout(rate=dropout)(embeddings)

    # decoder layer * N
    for i in range(num_layers):
        outputs = decoder_layer(dff=dff, d_model=d_model, num_heads=num_heads, dropout=dropout, name="decoder_layer_{}".format(i),)(inputs=[outputs, enc_outputs, look_ahead_mask, padding_mask])
    
    return tf.keras.Model(inputs=[inputs, enc_outputs, look_ahead_mask, padding_mask], outputs=outputs, name=name)

## 트랜스포머

In [21]:
def transformer(vocab_size, num_layers, dff, d_model, num_heads, dropout, name="transformer"):
    
    # encoder input (type: keras tensor)
    enc_inputs = tf.keras.Input(shape=(None,), name="enc_inputs")

    # decoder input (type: keras tensor)
    dec_inputs = tf.keras.Input(shape=(None,), name="dec_inputs")

    # encoder padding mask (type: Lambda layer)
    enc_padding_mask = tf.keras.layers.Lambda(create_padding_mask, output_shape=(1, 1, None), name="enc_padding_mask")(enc_inputs)

    # decoder padding mask - first sub layer (type: Lambda layer)
    look_ahead_mask = tf.keras.layers.Lambda(create_look_ahead_mask, output_shape=(1, None, None), name="look_ahead_mask")(dec_inputs)

    # decoder padding mask - second sub layer (type: lambda layer)
    dec_padding_mask = tf.keras.layers.Lambda(create_padding_mask, output_shape=(1, 1, None), name="dec_padding_mask")(enc_inputs)

    # encoder * N (type: keras model)
    enc_outputs = encoder(vocab_size, num_layers, dff, d_model, num_heads, dropout,)(inputs=[enc_inputs, enc_padding_mask])

    # decoder * N (type: keras model)
    dec_outputs = decoder(vocab_size, num_layers, dff, d_model, num_heads, dropout,)(inputs=[dec_inputs, enc_outputs, look_ahead_mask, dec_padding_mask])

    # 다음 단어 예측 출력층(단어 개수만큼 출력 존재)
    outputs = tf.keras.layers.Dense(units=vocab_size, name="outputs")(dec_outputs)

    return tf.keras.Model(inputs=[enc_inputs, dec_inputs], outputs=outputs, name=name)

In [22]:
# 손실 함수 (cross entropy)

# 요약은 문장을 생성해내는 것이고, 이것은 단어 모음에 있는 단어 중
# 현재 문장 뒤에 올 단어 하나를 선택하는 다중 클래스 분류 문제이다.
# 따라서 cross entropy 함수를 손실함수로 사용한다.
def loss_function(y_true, y_pred):
    y_true = tf.reshape(y_true, shape=(-1, MAX_OUTPUT_LENGTH -1))

    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction="none")(y_true, y_pred)

    mask = tf.cast(tf.not_equal(y_true, 0), tf.float32)
    loss = tf.multiply(loss, mask)

    return tf.reduce_mean(loss)

## 데이터 정보
크기: 2176



In [28]:
tf.keras.backend.clear_session()

# hyper padameter (in paper)
D_MODEL = 256
NUM_LAYERS = 2
NUM_HEADS = 8
DFF = 512
DROPOUT = 0.1

model = transformer(
    vocab_size = VOCAB_SIZE,
    num_layers = NUM_LAYERS,
    dff = DFF,
    d_model = D_MODEL,
    num_heads = NUM_HEADS,
    dropout = DROPOUT
)

In [29]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps):
        super(CustomSchedule, self).__init__()
        self.d_model = tf.cast(d_model, tf.float32)
        self.warmup_steps = warmup_steps
    
    def __call__(self, step):
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** - 1.5)

        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

In [33]:
lr = CustomSchedule(d_model=D_MODEL, warmup_steps=4000) # 학습률

optimizer = tf.keras.optimizers.Adam(
    learning_rate=lr, beta_1=0.9, beta_2=0.98, epsilon=1e-9
)

def accuracy(y_true, y_pred):
    y_true = tf.reshape(y_true, shape=(BATCH_SIZE, MAX_OUTPUT_LENGTH -1))

    return tf.keras.metrics.sparse_categorical_accuracy(y_true, y_pred)

model.compile(optimizer=optimizer, loss=loss_function, metrics=[accuracy])

In [34]:
model.summary()

Model: "transformer"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 enc_inputs (InputLayer)        [(None, None)]       0           []                               
                                                                                                  
 dec_inputs (InputLayer)        [(None, None)]       0           []                               
                                                                                                  
 enc_padding_mask (Lambda)      (None, 1, 1, None)   0           ['enc_inputs[0][0]']             
                                                                                                  
 encoder (Functional)           (None, None, 256)    3135232     ['enc_inputs[0][0]',             
                                                                  'enc_padding_mask[0][0

In [None]:
checkpoint_path = "training/cp.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                save_weights_only=True,
                                                verbose=1)
EPOCHS=50
model.fit(dataset, epochs=EPOCHS, callbacks=[cp_callback], verbose=1)