In [12]:
from importlib.metadata import version
import nltk
import tensorflow as tf
import summa
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Attention
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping

In [2]:
import urllib.request
from pathlib import Path

# Remote location of the news summary CSV and the local file it is saved to.
DATA_URL = "https://raw.githubusercontent.com/sunnysai12345/News_Summary/master/news_summary_more.csv"
DATA_PATH = Path("news_summary_more.csv")

# Download only when no local copy exists, so re-running this cell
# (e.g. Restart Kernel -> Run All) does not hit the network again.
if not DATA_PATH.exists():
    urllib.request.urlretrieve(DATA_URL, filename=str(DATA_PATH))

# The file is read as ISO-8859-1, matching the original loading code.
data = pd.read_csv(DATA_PATH, encoding='iso-8859-1')

In [3]:
# Display 10 randomly chosen rows to get a feel for the headline/text pairs.
data.sample(n=10)

Unnamed: 0,headlines,text
37991,Peru lose to Denmark in their 1st World Cup ga...,Peru started their first FIFA World Cup campai...
66627,Hamilton wins Japanese GP to extend c'ship lea...,Mercedes' British Formula One driver Lewis Ham...
70482,Ojha dropped after CAB selectors 'fail to esta...,Indian spinner Pragyan Ojha has been dropped f...
24022,"Kohli, Dhawan do bhangra while entering field ...",Team India captain Virat Kohli and opener Shik...
37616,Lobbying in India done without unlawful paymen...,AirAsia Group has said that it lobbied for an ...
79642,Flyers can claim refund of excess levy at Delh...,Passengers flying to and from Delhi's Indira G...
19877,20-yr-old Neeraj gives India 1st javelin gold ...,Neeraj Chopra on Monday became the first-ever ...
72144,Kangana names Aditya Pancholi as her abuser: R...,"According to reports, actress Kangana Ranaut h..."
78653,Spike got stuck in the pitch: Raj on her WWC f...,Responding to criticism over her run-out dismi...
16620,Rashid Khan sets record for most int'l wickets...,"Afghanistan spinner Rashid Khan, who turned 20..."


In [4]:
# Step 2. Preprocess the data
def preprocess_text(text):
    """Return `text` converted to lower case."""
    return text.lower()

# Lower-case every article body element-wise.
data['text'] = data['text'].map(preprocess_text)

In [19]:
# Step 3. Tokenize and pad the data

# Wrap every headline in 'start' / 'end' marker words so the decoder can learn
# where a summary begins and when to stop. decode_sequence seeds generation
# with tokenizer.word_index['start'] and halts on the word 'end', so the
# training targets must contain the same markers.
# Plain words are used because the Tokenizer's default filters strip '<' and '>'.
# The wrapped copy is kept separate so data['headlines'] stays unmodified.
# NOTE(review): 'start' and 'end' may also occur as ordinary words in the
# corpus; switch to rarer marker words if that turns out to matter.
summary_texts = 'start ' + data['headlines'] + ' end'

# Tokenization: a single vocabulary shared by articles and headlines
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['text'])
tokenizer.fit_on_texts(summary_texts)

# Convert the texts into integer sequences
text_sequences = tokenizer.texts_to_sequences(data['text'])
summary_sequences = tokenizer.texts_to_sequences(summary_texts)

# Pad the sequences (summary_max_len includes the two marker tokens)
text_max_len = 100
summary_max_len = 20

x_train = pad_sequences(text_sequences, maxlen=text_max_len, padding='post')
y_train = pad_sequences(summary_sequences, maxlen=summary_max_len, padding='post')

In [20]:
# Step 4. Use an attention mechanism (define the Seq2Seq model)
vocab_size = len(tokenizer.word_index) + 1  # one extra slot for index 0 (padding, see mask_zero)
hidden_units = 128  # used for both the embedding width and the LSTM state size

# Encoder: embed the article tokens and run them through an LSTM,
# keeping the per-timestep outputs (for attention) and the final states.
encoder_inputs = Input(shape=(text_max_len,))
enc_emb = Embedding(vocab_size, hidden_units, mask_zero=True)(encoder_inputs)
encoder_lstm = LSTM(hidden_units, return_sequences=True, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)

# Decoder: embed the summary tokens and run an LSTM that starts
# from the encoder's final hidden and cell states.
decoder_inputs = Input(shape=(None,))
dec_emb_layer = Embedding(vocab_size, hidden_units, mask_zero=True)
dec_emb = dec_emb_layer(decoder_inputs)
decoder_lstm = LSTM(hidden_units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=[state_h, state_c])

# Attention: decoder outputs are the query, encoder outputs are the value
attn_layer = Attention()([decoder_outputs, encoder_outputs])

# Output projection: concatenate decoder outputs with the attention context
# and map them to a softmax distribution over the vocabulary.
decoder_concat_input = tf.keras.layers.Concatenate(axis=-1)([decoder_outputs, attn_layer])
decoder_dense = Dense(vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_concat_input)

In [7]:
# Build the training model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile the model
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy')

# Teacher forcing: the decoder is fed every token except the last one and is
# trained to predict the same sequence shifted one position to the left.
decoder_input_data = y_train[:, :-1]
decoder_target_data = y_train[:, 1:]

# Stop on the held-out loss rather than the training loss: training loss
# keeps improving while the model overfits, so monitoring it gives no
# useful stopping signal. validation_split holds out part of the data for this,
# and restore_best_weights rolls back to the best epoch seen.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=2,
                   restore_best_weights=True)
model.fit([x_train, decoder_input_data], decoder_target_data,
          epochs=50, callbacks=[es], batch_size=64, validation_split=0.1)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f98d04ef580>

In [8]:
from tensorflow.keras.models import load_model

# Once training has finished, write the trained model to an .h5 file
saved_model_path = "saved_model_fit.h5"
model.save(saved_model_path)

In [22]:
# Step 5. 추상적 요약 결과 비교 (추상적 요약 결과 생성 및 실제 요약과 비교)
def decode_sequence(input_seq):
    # 입력 시퀀스를 상태 벡터로 변환
    states_value = encoder_model.predict(input_seq)

    # 목표 시퀀스 초기화
    target_seq = np.zeros((1, 1))

    # 첫 번째 입력 단어를 <start>로 설정
    target_seq[0, 0] = tokenizer.word_index['start']

    # 디코더의 결과 시퀀스 생성
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # 사전 인덱스를 단어로 변환
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = tokenizer.index_word[sampled_token_index]
        decoded_sentence += ' ' + sampled_word

        # 종료 조건: 최대 길이에 도달하거나 <end>를 만날 때
        if sampled_word == 'end' or len(decoded_sentence) > summary_max_len:
            stop_condition = True

        # 목표 시퀀스 업데이트
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

        # 상태 업데이트
        states_value = [h, c]

    return decoded_sentence

In [23]:
# 인코더 모델과 디코더 모델 생성
encoder_model = Model(encoder_inputs, [encoder_outputs, state_h, state_c])
decoder_state_input_h = Input(shape=(128,))
decoder_state_input_c = Input(shape=(128,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = decoder_lstm(dec_emb_layer(decoder_inputs), initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states)


In [24]:
# Generate summaries and compare them with the reference headlines
num_samples = 5  # only inspect the first 5 samples
for idx in range(num_samples):
    article = data['text'][idx]
    reference = data['headlines'][idx]
    # decode_sequence is given a batch of one: shape (1, text_max_len)
    encoded_article = x_train[idx].reshape(1, text_max_len)

    print("원문: ", article)
    print("실제 요약: ", reference)
    print("추상적 요약: ", decode_sequence(encoded_article))
    print("\n")

원문:  saurav kant, an alumnus of upgrad and iiit-b's pg program in machine learning and artificial intelligence, was a sr systems engineer at infosys with almost 5 years of work experience. the program and upgrad's 360-degree career support helped him transition to a data scientist at tech mahindra with 90% salary hike. upgrad's online power learning has powered 3 lakh+ careers.
실제 요약:  upGrad learner switches to career in ML & Al with 90% salary hike


ValueError: in user code:

    /opt/conda/lib/python3.9/site-packages/keras/engine/training.py:1586 predict_function  *
        return step_function(self, iterator)
    /opt/conda/lib/python3.9/site-packages/keras/engine/training.py:1576 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    /opt/conda/lib/python3.9/site-packages/tensorflow/python/distribute/distribute_lib.py:1286 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    /opt/conda/lib/python3.9/site-packages/tensorflow/python/distribute/distribute_lib.py:2849 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    /opt/conda/lib/python3.9/site-packages/tensorflow/python/distribute/distribute_lib.py:3632 _call_for_each_replica
        return fn(*args, **kwargs)
    /opt/conda/lib/python3.9/site-packages/keras/engine/training.py:1569 run_step  **
        outputs = model.predict_step(data)
    /opt/conda/lib/python3.9/site-packages/keras/engine/training.py:1537 predict_step
        return self(x, training=False)
    /opt/conda/lib/python3.9/site-packages/keras/engine/base_layer.py:1020 __call__
        input_spec.assert_input_compatibility(self.input_spec, inputs, self.name)
    /opt/conda/lib/python3.9/site-packages/keras/engine/input_spec.py:199 assert_input_compatibility
        raise ValueError('Layer ' + layer_name + ' expects ' +

    ValueError: Layer model_7 expects 3 input(s), but it received 4 input tensors. Inputs received: [<tf.Tensor 'IteratorGetNext:0' shape=(None, 1) dtype=float32>, <tf.Tensor 'IteratorGetNext:1' shape=(None, 100, 128) dtype=float32>, <tf.Tensor 'IteratorGetNext:2' shape=(None, 128) dtype=float32>, <tf.Tensor 'IteratorGetNext:3' shape=(None, 128) dtype=float32>]
