In [1]:
import os, re 
import numpy as np
import tensorflow as tf

In [2]:
# Location of the raw text file, relative to the notebook's working directory.
file_path = '../../data/exploration_06/data/shakespeare.txt'

# Decode with an explicit encoding. Without it, open() uses the platform's
# locale encoding, so the same notebook could decode the file differently
# (or fail) on another machine.
# NOTE(review): assumes the file is UTF-8 (plain ASCII is a subset) — confirm.
with open(file_path, "r", encoding="utf-8") as f:
    # One list entry per line of the file, with line terminators removed.
    raw_corpus = f.read().splitlines()

# Peek at the first few lines to sanity-check the load.
print(raw_corpus[:9])

['First Citizen:', 'Before we proceed any further, hear me speak.', '', 'All:', 'Speak, speak.', '', 'First Citizen:', 'You are all resolved rather to die than to famish?', '']


In [3]:
def preprocess_sentence(sentence):
    """Normalize one raw line into a space-separated, marker-wrapped string.

    The line is lower-cased and trimmed, then three regex passes run in order:
      1. surround each of ``? . ! , ¿`` with spaces so it stands alone,
      2. collapse every run of spaces and double quotes into one space,
      3. replace every run of characters other than ASCII letters and
         ``? . ! , ¿`` with one space (this also drops apostrophes/digits).
    The result is trimmed again and wrapped in ``<start>`` / ``<end>``.

    Args:
        sentence: Raw text line (str).

    Returns:
        The cleaned string, e.g. ``'<start> speak , speak . <end>'``.
    """
    substitutions = (
        (r"([?.!,¿])", r" \1 "),
        (r'[" "]+', " "),
        (r"[^a-zA-Z?.!,¿]+", " "),
    )

    cleaned = sentence.lower().strip()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return '<start> ' + cleaned.strip() + ' <end>'

In [6]:
# Keep only spoken lines: drop empty lines and lines ending in ":" (the
# speaker labels such as "First Citizen:"), then normalize what remains.
corpus = [
    preprocess_sentence(line)
    for line in raw_corpus
    if line and not line.endswith(":")
]

corpus[:10]

['<start> before we proceed any further , hear me speak . <end>',
 '<start> speak , speak . <end>',
 '<start> you are all resolved rather to die than to famish ? <end>',
 '<start> resolved . resolved . <end>',
 '<start> first , you know caius marcius is chief enemy to the people . <end>',
 '<start> we know t , we know t . <end>',
 '<start> let us kill him , and we ll have corn at our own price . <end>',
 '<start> is t a verdict ? <end>',
 '<start> no more talking on t let it be done away , away ! <end>',
 '<start> one word , good citizens . <end>']

In [7]:
# Number of preprocessed sentences collected in `corpus`.
len(corpus)

24015

In [22]:
# Word-level tokenizer limited by num_words=7000; words outside that limit
# are mapped to the "<unk>" token. filters=' ' replaces the default filter
# set so that only the space character is filtered, leaving the punctuation
# tokens and the <start>/<end> markers intact.
tokenizer = tf.keras.preprocessing.text.Tokenizer(
    num_words=7000, filters=' ', oov_token="<unk>")
tokenizer.fit_on_texts(corpus)

# Variable-length lists of token ids, one per sentence. Kept under its own
# name so `tensor` only ever refers to the padded array and the cell can be
# re-run safely.
sequences = tokenizer.texts_to_sequences(corpus)
print(type(sequences))
# The sequences are ragged, so an object array must be requested explicitly:
# building one implicitly is deprecated in NumPy and raises ValueError on
# recent releases.
print(np.array(sequences, dtype=object))
# Longest sentence, in tokens; this becomes the padded width below.
print(max(len(sequence) for sequence in sequences))

# Right-pad (padding='post') every sequence with zeros to a common length.
tensor = tf.keras.preprocessing.sequence.pad_sequences(sequences, padding='post')
print(type(tensor))
print(tensor.shape)

<class 'list'>
[list([2, 143, 40, 933, 140, 591, 4, 124, 24, 110, 5, 3])
 list([2, 110, 4, 110, 5, 3])
 list([2, 11, 50, 43, 1201, 316, 9, 201, 74, 9, 3034, 15, 3]) ...
 list([2, 149, 4553, 4, 3])
 list([2, 34, 71, 132, 39, 328, 390, 201, 4, 316, 2965, 132, 3])
 list([2, 945, 34, 134, 1787, 5, 3])]
21
<class 'numpy.ndarray'>
(24015, 21)


  print(np.array(tensor))


In [23]:
# Compare the first three sentences with their padded token-id rows.
print(len(corpus))
print(tensor.shape)  # (number of sentences, padded length): 24015 sentences, 21 ids each
print(*corpus[:3], sep="\n")
print(*tensor[:3], sep="\n")

24015
(24015, 21)
<start> before we proceed any further , hear me speak . <end>
<start> speak , speak . <end>
<start> you are all resolved rather to die than to famish ? <end>
[  2 143  40 933 140 591   4 124  24 110   5   3   0   0   0   0   0   0
   0   0   0]
[  2 110   4 110   5   3   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0]
[   2   11   50   43 1201  316    9  201   74    9 3034   15    3    0
    0    0    0    0    0    0    0]


![image.png](attachment:image.png)

In [25]:
# Offset the padded sequences by one position: the source drops the last
# column and the target drops the first, so tgt_input[i, t] is the token
# that follows src_input[i, t].
src_input, tgt_input = tensor[:, :-1], tensor[:, 1:]
print(src_input[0], tgt_input[0], sep="\n")

[  2 143  40 933 140 591   4 124  24 110   5   3   0   0   0   0   0   0
   0   0]
[143  40 933 140 591   4 124  24 110   5   3   0   0   0   0   0   0   0
   0   0]


In [26]:
# Batching constants: the buffer size equals the number of examples, and
# steps_per_epoch counts the full batches that fit in the data.
BATCH_SIZE = 256
BUFFER_SIZE = len(src_input)
steps_per_epoch = BUFFER_SIZE // BATCH_SIZE

print('BUFFER_SIZE :', BUFFER_SIZE)
print('BATCH_SIZE :', BATCH_SIZE)

BUFFER_SIZE : 24015
