In [3]:
import json

import numpy as np
import pandas as pd

# All dataset artefacts live in one folder; spell the prefix once.
dataset_dir = "E:\\MachineLearning\\Study\\RadiationReportSummarization\\Dataset"
data_path = dataset_dir + "\\train.csv"
train_path = dataset_dir + "\\train_set.csv"
valid_path = dataset_dir + "\\test_set.csv"
metadata_path = dataset_dir + "\\char_level.txt"

# Training hyper-parameters.
batch_size, epochs = 64, 100  # Batch size / number of epochs to train for.
latent_dim = 256              # Latent dimensionality of the encoding space.
num_samples = 100             # Number of samples to train on.


class Lang:
    """Character-level vocabulary metadata loaded from a JSON file.

    The file is expected to hold an object whose ``"metadata"`` key is a list;
    its first element supplies ``input_char``, ``target_char``,
    ``num_en_token``, ``num_de_token``, ``max_en_seq_len`` and
    ``max_de_seq_len``.

    Attributes set by :meth:`load`:
        metadata: the first entry of the ``"metadata"`` list.
        input_token_dict: input character -> position in ``input_char``.
        output_token_dict: target character -> position in ``target_char``.
    """

    def __init__(self, metadata_path):
        """Remember ``metadata_path`` and load the metadata immediately."""
        self.path = metadata_path
        self.load()

    def load(self):
        """(Re)read the metadata file and rebuild both lookup tables.

        Raises:
            OSError: if the file cannot be opened.
            json.JSONDecodeError: if the file is not valid JSON.
            KeyError / IndexError: if the expected structure is missing.
        """
        # JSON text is UTF-8 by specification; being explicit keeps the
        # vocabulary identical regardless of the platform's default encoding.
        with open(self.path, encoding='utf-8') as file:
            data = json.load(file)

        self.metadata = data['metadata'][0]
        self.input_token_dict = {
            char: index for index, char in enumerate(self.metadata['input_char'])
        }
        self.output_token_dict = {
            char: index for index, char in enumerate(self.metadata['target_char'])
        }

    def get_index(self, char):
        """Return the index of ``char`` in the INPUT vocabulary (KeyError if absent)."""
        return self.input_token_dict[char]

    def get_target_index(self, char):
        """Return the index of ``char`` in the TARGET vocabulary (KeyError if absent)."""
        return self.output_token_dict[char]

    def get_in_char_set(self):
        """Return the input character list as stored in the metadata."""
        return self.metadata['input_char']

    def get_ta_char_set(self):
        """Return the target character list as stored in the metadata."""
        return self.metadata['target_char']

    def get_num_en_token(self):
        """Return the ``num_en_token`` metadata value."""
        return self.metadata['num_en_token']

    def get_num_de_token(self):
        """Return the ``num_de_token`` metadata value."""
        return self.metadata['num_de_token']

    def get_max_en_len(self):
        """Return the ``max_en_seq_len`` metadata value."""
        return self.metadata['max_en_seq_len']

    def get_max_de_len(self):
        """Return the ``max_de_seq_len`` metadata value."""
        return self.metadata['max_de_seq_len']


class DataLoader:
    """Reads (input text, target text) pairs from CSV and one-hot encodes them.

    The CSV files are read without a header row and with the first column as
    the index; column ``3`` holds the input text and column ``4`` the target
    text. Targets are wrapped in ``'\\t'`` (start) and ``'\\n'`` (end) markers.
    """

    def __init__(self, train_path, valid_path, lang: "Lang"):
        """Store the two CSV paths and the vocabulary object used for encoding."""
        self.train_path = train_path
        self.valid_path = valid_path
        self.lang = lang

    @staticmethod
    def _text_pairs(df):
        """Return ``(inputs, targets)`` from a frame, wrapping targets in ``\\t``/``\\n``."""
        inputs = list(df[3])
        outputs = ['\t' + text + '\n' for text in df[4]]
        return (inputs, outputs)

    def embedding(self, input):
        """One-hot encode a batch of text pairs.

        Args:
            input: ``(input_texts, target_texts)``; two equally long sequences
                of strings. Target strings are expected to already carry the
                ``'\\t'`` / ``'\\n'`` markers.

        Returns:
            ``([encoder_input, decoder_input], decoder_target)`` as float32
            arrays. ``decoder_target`` is ``decoder_input`` shifted one step
            earlier in time (it omits the start character).

        Raises:
            KeyError: if a character is missing from the relevant vocabulary.
            IndexError: if a text is longer than the metadata's maximum length.
        """
        input_texts, target_texts = input[0], input[1]
        n_samples = len(input_texts)

        encoder_input_data = np.zeros(
            (n_samples, self.lang.get_max_en_len(), len(self.lang.get_in_char_set())),
            dtype='float32')
        decoder_input_data = np.zeros(
            (n_samples, self.lang.get_max_de_len(), len(self.lang.get_ta_char_set())),
            dtype='float32')
        decoder_target_data = np.zeros_like(decoder_input_data)

        # Target characters must be indexed with the TARGET vocabulary: the
        # decoder tensors are sized by the target character set, and the input
        # vocabulary may order characters differently or lack '\t' / '\n'.
        target_index = self.lang.output_token_dict

        for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
            for t, char in enumerate(input_text):
                encoder_input_data[i, t, self.lang.get_index(char)] = 1
            for t, char in enumerate(target_text):
                token = target_index[char]
                decoder_input_data[i, t, token] = 1
                if t > 0:
                    # The target is one step ahead of the decoder input.
                    decoder_target_data[i, t - 1, token] = 1

        return ([encoder_input_data, decoder_input_data], decoder_target_data)

    def load_data(self, is_train=True):
        """Load and encode the whole training (or validation) CSV at once."""
        path = self.train_path if is_train else self.valid_path
        df = pd.read_csv(path, index_col=0, header=None)
        return self.embedding(self._text_pairs(df))

    def generate_data(self, batch_size, step, is_train=True):
        """Yield encoded batches forever, cycling through ``step`` batches.

        Batch ``k`` (1-based) is read by skipping ``(k - 1) * batch_size`` rows
        and reading at most ``batch_size`` rows; after batch ``step`` the
        generator wraps around to the first batch.
        """
        path = self.train_path if is_train else self.valid_path
        idx = 1

        while True:
            df = pd.read_csv(path, skiprows=(idx - 1) * batch_size, nrows=batch_size,
                             header=None, index_col=0)
            yield self.embedding(self._text_pairs(df))

            idx = idx + 1 if idx < step else 1


In [4]:
from keras.layers import *
from keras.models import *

import numpy as np

# Same settings as the earlier configuration cell, repeated so this cell
# carries its own paths and hyper-parameters.
dataset_dir = "E:\\MachineLearning\\Study\\RadiationReportSummarization\\Dataset"
data_path = dataset_dir + "\\train.csv"
train_path = dataset_dir + "\\train_set.csv"
valid_path = dataset_dir + "\\test_set.csv"
metadata_path = dataset_dir + "\\char_level.txt"

# Training hyper-parameters.
batch_size, epochs = 64, 100  # Batch size / number of epochs to train for.
latent_dim = 256              # Latent dimensionality of the encoding space.
num_samples = 100             # Number of samples to train on.

# Vocabulary metadata and the loader that encodes CSV rows with it.
lang = Lang(metadata_path)
dataloader = DataLoader(train_path, valid_path, lang)

# Token counts and character sets, exactly as stored in the metadata file.
num_encoder_tokens, num_decoder_tokens = lang.get_num_en_token(), lang.get_num_de_token()
input_characters, target_characters = lang.get_in_char_set(), lang.get_ta_char_set()

# Maximum encoder / decoder sequence lengths stored in the metadata file.
max_encoder_seq_length, max_decoder_seq_length = lang.get_max_en_len(), lang.get_max_de_len()

# --- Training graph: encoder LSTM -> decoder LSTM -> softmax over target tokens ---
# Define an input sequence and process it.
# The time dimension is left as None, so the encoder accepts any sequence length.
encoder_inputs = Input(shape=(None, num_encoder_tokens))
encoder = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None, num_decoder_tokens))
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
                                     initial_state=encoder_states)
# Per-timestep softmax over the target vocabulary.
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)
# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

model.summary()
# Debug aid: show the output tensors of the third-from-last layer.
# NOTE(review): which layer that is depends on the layer order — confirm with the summary.
# print(model.layers[-1].input)
print(model.layers[-3].output)

# Row counts used to derive the number of batches per epoch.
# NOTE(review): presumably the number of rows in train_path / valid_path — confirm
# against the CSV files, or derive them from the data instead of hard-coding.
num_train_rows = 73236
num_valid_rows = 18308

# ceil() so that a final, partially filled batch still counts as a step.
step = int(np.ceil(num_train_rows / batch_size))
step_valid = int(np.ceil(num_valid_rows / batch_size))

# Run training
from keras.optimizers import *
# `learning_rate` is the current argument name; `lr` is only a deprecated alias.
# NOTE(review): `decay` is a legacy argument as well — confirm that the installed
# Keras version still accepts it.
model.compile(optimizer=Adam(learning_rate=0.01, beta_1=0.9, beta_2=0.999, decay=0.001),
              loss='categorical_crossentropy')
# Training is currently disabled; uncomment to fit the model.
# model.fit(x=dataloader.generate_data(batch_size, step),
#           epochs=epochs,
#           validation_data=dataloader.generate_data(batch_size, step_valid))


# Define sampling models
# These reuse the layers built for the training model (`decoder_lstm`,
# `decoder_dense`), so they share its weights.
# Encoder side: input sequence -> [state_h, state_c].
encoder_model = Model(encoder_inputs, encoder_states)
# Decoder side: the LSTM states become explicit inputs so they can be fed back
# in between successive predict() calls.
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
# NOTE: this rebinds `decoder_outputs`, `state_h` and `state_c` from the training graph above.
decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
# Inputs: [decoder tokens, h, c]; outputs: [token probabilities, new h, new c].
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states)


# Invert the char -> index tables so that predicted / encoded indices can be
# mapped back to readable characters.
reverse_input_char_index = {
    index: char for char, index in lang.input_token_dict.items()
}
reverse_target_char_index = {
    index: char for char, index in lang.output_token_dict.items()
}




Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None, 87)]   0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None, 87)]   0                                            
__________________________________________________________________________________________________
lstm (LSTM)                     [(None, 256), (None, 352256      input_1[0][0]                    
__________________________________________________________________________________________________
lstm_1 (LSTM)                   [(None, None, 256),  352256      input_2[0][0]                    
                                                                 lstm[0][1]            

In [16]:
def decode_sequence(input_seq):
    """Greedily decode one encoded input sequence into target text.

    Args:
        input_seq: one-hot encoder input for a single sample (the loop below
            assumes a batch of size 1).

    Returns:
        The decoded string. Decoding stops after the first ``'\\n'`` (which is
        included in the result) or once the string is longer than
        ``max_decoder_seq_length``.
    """
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)

    # Seed the decoder with the start character. It must be looked up in the
    # TARGET vocabulary: the decoder is one-hot over target tokens, and
    # `lang.get_index` would return the character's position in the INPUT
    # vocabulary instead.
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    target_seq[0, 0, lang.output_token_dict['\t']] = 1.

    decoded_sentence = ''
    while True:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value)

        # Greedy sampling: take the most probable token of the last time step.
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += sampled_char

        # Exit condition: either find the stop character or hit max length.
        if (sampled_char == '\n' or
                len(decoded_sentence) > max_decoder_seq_length):
            break

        # Feed the sampled token and the updated states into the next step.
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.
        states_value = [h, c]

    return decoded_sentence


def decode(input):
    """Turn the first one-hot encoded sample of ``input`` back into text.

    Every position whose value equals 1 contributes the matching character
    from ``reverse_input_char_index``; all-zero (padding) steps add nothing.
    """
    first_sample = input[0]
    chars = []

    for timestep in first_sample:
        for token_index, value in enumerate(timestep):
            if value == 1:
                chars.append(reverse_input_char_index[token_index])

    return "".join(chars)

# Generator of single-sample batches; is_train=False selects `valid_path`.
input_seq = dataloader.generate_data(1, 1, False)

# Take the first batch only and print its encoder input decoded back to text.
# NOTE: `input` shadows the builtin of the same name from here on.
(input, output), target = next(input_seq)
print("Decode sentence: ", decode(input))

Decode sentence:  Again seen is a opacity in the left mid lung which is mildly improved in appearance from the prior study. There is a consolidation at the base of the right lung which appears worse from the prior study. The cardiomediastinal silhouette and hilar contours are normal. There is no evidence of pneumothorax and there may be a small right pleural effusion.
