In [1]:
import tensorflow as tf
import numpy as np
import re

class DataSet():
    def __init__(self, new_code=None):
        
        # user's customizing data
        self.new_code = new_code
        
        # built-in dataset
        self.seq_data = ['tuple', 'list', 'dic', 'return', 'print', 
                         'for', 'range', 'while', 'not', 'is', 'sort']
        
        # the longest word's length among dataset
        self.max_word_len = max([len(word) for word in self.seq_data])
        
        # update built-in dataset with given data from a user
        self.add_new_code2seq_data()
    
    # preprocess new codes(new data)
    def clean_new_code(self):
        if self.new_code:
            assert type(self.new_code) == str
            new_code_ls = re.sub('[ "\']+', ' ', re.sub('[\t]', ' ', self.new_code)).split('\n')
            new_code_ls = [word for sent in new_code_ls for word in sent.split(' ')]
            new_code_ = re.sub('[ ]+', ' ', re.sub('[\W]', ' ', self.new_code))
            new_code_ls2 = new_code_.split(' ')
            new_code_ls2 = [word for sent in new_code_ls2 for word in sent.split(' ')]
            new_code_ls.extend(new_code_ls2)
            return new_code_ls
        else:
            print('There is no new code to preprocess!')
            return None
    
    def add_new_code2seq_data(self):
        new_code_ls = self.clean_new_code()
        if new_code_ls:
            assert type(new_code_ls) == list
            self.seq_data.extend(new_code_ls)
            self.max_word_len = max([len(word) for word in self.seq_data])
    
    # make word dictionary
    def make_word2idx_idx2word(self, get_vocab_size=True):
        """
        return : (word2idx(dic type), idx2word(dic type), vocab_size(int type))
        """
        # base characters
        words_arr = set("")
        for i in range(len(self.seq_data)):
            x = [self.seq_data[i][:j] for j in range(1, len(self.seq_data[i]))]
            y = [self.seq_data[i][j:] for j in range(1, len(self.seq_data[i]))]
            words_arr.update(set(list(self.seq_data[i])+ x + y))

        word2idx = {word: i for i, word in enumerate(words_arr)}
        idx2word = {i: word for i, word in enumerate(words_arr)}
        
        # add '[UKN]' token in the both word dictionaries
        word2idx['[UKN]'] = len(word2idx)
        idx2word[word2idx['[UKN]']] = ' '
        
        if get_vocab_size:
            vocab_size = len(word2idx)
            return word2idx, idx2word, vocab_size
        else:
            return word2idx, idx2word
        
    def get_dataset(self):
        """
        return : x, y, max word length, vocabulary size
        x shape : (dataset length, max word length)
        y shape : (dataset length, length of word dictionary)
        """
        
        word2idx, idx2word, vocab_size = self.make_word2idx_idx2word()
        
        inputs = list(list(list(self.seq_data[i][:j])for j in range(1, len(self.seq_data[i]))) for i in range(len(self.seq_data)))
        outputs = list(list(self.seq_data[i][j:] for j in range(1, len(self.seq_data[i]))) for i in range(len(self.seq_data)))
        inputs_vec = [[word2idx[char_ls[i]] for i in range(len(char_ls))] for word_ls in inputs for char_ls in word_ls]
        x = tf.keras.preprocessing.sequence.pad_sequences(inputs_vec, maxlen=self.max_word_len, padding='pre')
        y = np.array([word2idx[word] for word_ls in outputs for word in word_ls])
        y = np.eye(vocab_size)[y]
        return x, y, self.max_word_len, vocab_size
    

In [2]:
tf_filepath = "../data/text_data/tf_all_symbols.txt"
with open(tf_filepath, 'r') as f:
    tf_code_text = f.read()

tf_code_text
dataset = DataSet(new_code = tf_code_text)
print('raw data : \n', dataset.seq_data)
print('max word length : ', dataset.max_word_len)

raw data : 
 ['tuple', 'list', 'dic', 'return', 'print', 'for', 'range', 'while', 'not', 'is', 'sort', 'tf', 'tf.AggregationMethod', 'tf.Assert', 'tf.CriticalSection', 'tf.DType', 'tf.DeviceSpec', 'tf.GradientTape', 'tf.Graph', 'tf.IndexedSlices', 'tf.IndexedSlicesSpec', 'tf.Module', 'tf.Operation', 'tf.OptionalSpec', 'tf.RaggedTensor', 'tf.RaggedTensorSpec', 'tf.RegisterGradient', 'tf.SparseTensor', 'tf.SparseTensorSpec', 'tf.Tensor', 'tf.TensorArray', 'tf.TensorArraySpec', 'tf.TensorShape', 'tf.TensorSpec', 'tf.TypeSpec', 'tf.UnconnectedGradients', 'tf.Variable', 'tf.Variable.SaveSliceInfo', 'tf.VariableAggregation', 'tf.VariableSynchronization', 'tf.abs', 'tf.acos', 'tf.acosh', 'tf.add', 'tf.add_n', 'tf.argmax', 'tf.argmin', 'tf.argsort', 'tf.as_dtype', 'tf.as_string', 'tf.asin', 'tf.asinh', 'tf.assert_equal', 'tf.assert_greater', 'tf.assert_less', 'tf.assert_rank', 'tf.atan', 'tf.atan2', 'tf.atanh', 'tf.audio', 'tf.audio.decode_wav', 'tf.audio.encode_wav', 'tf.autodiff', 'tf.autodi

In [3]:
word2idx, idx2word, vocab_size = dataset.make_word2idx_idx2word()
inputs = list(list(list(dataset.seq_data[i][:j])for j in range(1, len(dataset.seq_data[i]))) for i in range(len(dataset.seq_data)))
outputs = list(list(dataset.seq_data[i][j:] for j in range(1, len(dataset.seq_data[i]))) for i in range(len(dataset.seq_data)))
for i, (x, y) in enumerate(zip(inputs, outputs)):
    if i > 20 and i < 26:
        print(f'inputs{i} ---- outputs{i}')
        for j in range(len(x)):
            print(x[j], '----', y[j])
        print()

inputs21 ---- outputs21
['t'] ---- f.Module
['t', 'f'] ---- .Module
['t', 'f', '.'] ---- Module
['t', 'f', '.', 'M'] ---- odule
['t', 'f', '.', 'M', 'o'] ---- dule
['t', 'f', '.', 'M', 'o', 'd'] ---- ule
['t', 'f', '.', 'M', 'o', 'd', 'u'] ---- le
['t', 'f', '.', 'M', 'o', 'd', 'u', 'l'] ---- e

inputs22 ---- outputs22
['t'] ---- f.Operation
['t', 'f'] ---- .Operation
['t', 'f', '.'] ---- Operation
['t', 'f', '.', 'O'] ---- peration
['t', 'f', '.', 'O', 'p'] ---- eration
['t', 'f', '.', 'O', 'p', 'e'] ---- ration
['t', 'f', '.', 'O', 'p', 'e', 'r'] ---- ation
['t', 'f', '.', 'O', 'p', 'e', 'r', 'a'] ---- tion
['t', 'f', '.', 'O', 'p', 'e', 'r', 'a', 't'] ---- ion
['t', 'f', '.', 'O', 'p', 'e', 'r', 'a', 't', 'i'] ---- on
['t', 'f', '.', 'O', 'p', 'e', 'r', 'a', 't', 'i', 'o'] ---- n

inputs23 ---- outputs23
['t'] ---- f.OptionalSpec
['t', 'f'] ---- .OptionalSpec
['t', 'f', '.'] ---- OptionalSpec
['t', 'f', '.', 'O'] ---- ptionalSpec
['t', 'f', '.', 'O', 'p'] ---- tionalSpec
['t', 'f', 

In [4]:
x, y, max_word_len, vocab_size = dataset.get_dataset()

In [7]:
x.shape, y.shape

((181260, 66), (181260, 121924))

In [2]:
dataset = DataSet(new_code = "import tensorflow as tf \n tf.keras.layers.LSTM")
print('raw data : \n', dataset.seq_data)
print('max word length : ', dataset.max_word_len)

raw data : 
 ['tuple', 'list', 'dic', 'return', 'print', 'for', 'range', 'while', 'not', 'is', 'sort', 'import', 'tensorflow', 'as', 'tf', '', '', 'tf.keras.layers.LSTM', 'import', 'tensorflow', 'as', 'tf', 'tf', 'keras', 'layers', 'LSTM']
max word length :  20


In [21]:
word2idx, idx2word, vocab_size = dataset.make_word2idx_idx2word()

In [22]:
inputs = list(list(list(dataset.seq_data[i][:j])for j in range(1, len(dataset.seq_data[i]))) for i in range(len(dataset.seq_data)))
outputs = list(list(dataset.seq_data[i][j:] for j in range(1, len(dataset.seq_data[i]))) for i in range(len(dataset.seq_data)))
for i, (x, y) in enumerate(zip(inputs, outputs)):
    if i < 6:
        print(f'inputs{i} ---- outputs{i}')
        for j in range(len(x)):
            print(x[j], '----', y[j])
        print()

inputs0 ---- outputs0
['t'] ---- uple
['t', 'u'] ---- ple
['t', 'u', 'p'] ---- le
['t', 'u', 'p', 'l'] ---- e

inputs1 ---- outputs1
['l'] ---- ist
['l', 'i'] ---- st
['l', 'i', 's'] ---- t

inputs2 ---- outputs2
['d'] ---- ic
['d', 'i'] ---- c

inputs3 ---- outputs3
['r'] ---- eturn
['r', 'e'] ---- turn
['r', 'e', 't'] ---- urn
['r', 'e', 't', 'u'] ---- rn
['r', 'e', 't', 'u', 'r'] ---- n

inputs4 ---- outputs4
['p'] ---- rint
['p', 'r'] ---- int
['p', 'r', 'i'] ---- nt
['p', 'r', 'i', 'n'] ---- t

inputs5 ---- outputs5
['f'] ---- or
['f', 'o'] ---- r



In [25]:
inputs_vec = [[word2idx[char_ls[i]] for i in range(len(char_ls))] for word_ls in inputs for char_ls in word_ls]
inputs_vec[:10]

[[126],
 [126, 8],
 [126, 8, 84],
 [126, 8, 84, 31],
 [31],
 [31, 112],
 [31, 112, 63],
 [87],
 [87, 112],
 [92]]

In [27]:
x = tf.keras.preprocessing.sequence.pad_sequences(inputs_vec, maxlen=dataset.max_word_len, padding='pre')
y = np.array([word2idx[word] for word_ls in outputs for word in word_ls])
y = np.eye(vocab_size)[y]

In [30]:
x[:5]

array([[  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0, 126],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0, 126,   8],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0, 126,   8,  84],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0, 126,   8,  84,  31],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,  31]], dtype=int32)

In [34]:
assert vocab_size == len(y[0])
print(len(y[0]))
y[0]

145


array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0.])