## Task 1: Data and Tokenizer
1. Import the data
2. Create a tokenizer
3. Char to index and Index to char dictionaries


In [1]:
# Download the dataset repository into ./superhero (requires git and network access);
# the names file read below lives inside that directory.
!git clone https://github.com/am1tyadav/superhero

Cloning into 'superhero'...
remote: Enumerating objects: 8, done.[K
remote: Counting objects: 100% (8/8), done.[K
remote: Compressing objects: 100% (7/7), done.[K
remote: Total 8 (delta 0), reused 4 (delta 0), pack-reused 0[K
Receiving objects: 100% (8/8), 47.08 KiB | 1.88 MiB/s, done.


In [2]:
# Read the whole names file into a single string. The encoding is pinned so the
# result does not depend on the platform's default text encoding.
with open('superhero/superheroes.txt', 'r', encoding='utf-8') as f:
    data = f.read()
# Preview: names are lowercase and each line ends with a tab before the newline.
data[:100]

'jumpa\t\ndoctor fate\t\nstarlight\t\nisildur\t\nlasher\t\nvarvara\t\nthe target\t\naxel\t\nbattra\t\nchangeling\t\npyrrh'

In [3]:
import tensorflow as tf

In [4]:
# Newline separates the names, so it never becomes a token itself.
# The filter list removes punctuation but leaves out the tab character on
# purpose: '\t' stays in the vocabulary and marks the end of every name.
tokenizer = tf.keras.preprocessing.text.Tokenizer(
    split='\n',
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~',
)

In [5]:
# `data` is one long string rather than a list of texts, so it is consumed one
# character at a time and the fitted vocabulary consists of single characters.
tokenizer.fit_on_texts(data)

In [6]:
# Forward lookup comes straight from the fitted tokenizer (indices start at 1);
# the reverse lookup is the same mapping with keys and values swapped.
char_to_index = tokenizer.word_index
index_to_char = {index: char for char, index in char_to_index.items()}
print(index_to_char)

{1: '\t', 2: 'a', 3: 'e', 4: 'r', 5: 'o', 6: 'n', 7: 'i', 8: ' ', 9: 't', 10: 's', 11: 'l', 12: 'm', 13: 'h', 14: 'd', 15: 'c', 16: 'u', 17: 'g', 18: 'k', 19: 'b', 20: 'p', 21: 'y', 22: 'w', 23: 'f', 24: 'v', 25: 'j', 26: 'z', 27: 'x', 28: 'q'}


## Task 2: Names and Sequences
1. Converting between names and sequences


In [7]:
# One entry per line of the file. splitlines() drops the newline, but every
# name keeps its trailing '\t'.
names = data.splitlines()
names[:10]

['jumpa\t',
 'doctor fate\t',
 'starlight\t',
 'isildur\t',
 'lasher\t',
 'varvara\t',
 'the target\t',
 'axel\t',
 'battra\t',
 'changeling\t']

In [8]:
# Passing a bare string makes the tokenizer treat every character as its own
# text, so the result is one single-element list per character.
tokenizer.texts_to_sequences(names[0])

[[25], [16], [12], [20], [2], [1]]

In [9]:
def name_to_seq(name):
    """Encode a name as a flat list of character indices.

    Args:
        name: The name to encode, as a string.

    Returns:
        A list with one integer index per character that the tokenizer knows.
        Characters without an index (filtered punctuation, characters never
        seen while fitting) are skipped instead of raising ``IndexError``.
        An empty name gives an empty list.
    """
    # Given a string, texts_to_sequences yields one list per character:
    # [index] for a known character, [] for one it cannot encode. A single
    # call per name replaces the previous call per character.
    per_char = tokenizer.texts_to_sequences(name)
    return [index for indices in per_char for index in indices]

In [10]:
# Encode the first name ('jumpa\t') as a flat list of character indices.
name_to_seq(names[0])

[25, 16, 12, 20, 2, 1]

In [11]:
def seq_to_name(seq):
    """Decode a sequence of character indices back into a string.

    Index 0 is skipped because it has no entry in ``index_to_char``; it only
    occurs as padding. Every other index must be a key of ``index_to_char``.
    """
    decoded_chars = (index_to_char[index] for index in seq if index != 0)
    return ''.join(decoded_chars)

In [12]:
# Round trip: encoding and then decoding the first name returns it unchanged.
seq_to_name(name_to_seq(names[0]))

'jumpa\t'

In [13]:
# Walk the second name through every representation: raw text, per-character
# token lists, flat index sequence, and the decoded text.
sample_name = names[1]
sample_seq = name_to_seq(sample_name)
print(sample_name)
print(tokenizer.texts_to_sequences(sample_name))
print(sample_seq)
print(seq_to_name(sample_seq))

doctor fate	
[[14], [5], [15], [9], [5], [4], [8], [23], [2], [9], [3], [1]]
[14, 5, 15, 9, 5, 4, 8, 23, 2, 9, 3, 1]
doctor fate	


## Task 3: Creating Examples
1. Creating sequences
2. Padding all sequences

In [14]:
# Every prefix of length >= 2 of every encoded name becomes one example.
# range(2, n + 1) is empty for n < 2, so names shorter than two characters
# contribute nothing.
sequences = [
    encoded[:end]
    for encoded in map(name_to_seq, names)
    for end in range(2, len(encoded) + 1)
]

In [20]:
# Items 5-15 are the prefixes built from the second name ('doctor fate\t');
# items 0-4 came from the first, six-character name.
sequences[5:16]

[[14, 5],
 [14, 5, 15],
 [14, 5, 15, 9],
 [14, 5, 15, 9, 5],
 [14, 5, 15, 9, 5, 4],
 [14, 5, 15, 9, 5, 4, 8],
 [14, 5, 15, 9, 5, 4, 8, 23],
 [14, 5, 15, 9, 5, 4, 8, 23, 2],
 [14, 5, 15, 9, 5, 4, 8, 23, 2, 9],
 [14, 5, 15, 9, 5, 4, 8, 23, 2, 9, 3],
 [14, 5, 15, 9, 5, 4, 8, 23, 2, 9, 3, 1]]

In [21]:
# Length of the longest prefix sequence.
max_len = max(map(len, sequences))
print(max_len)

33


In [24]:
# Bring every sequence to max_len by putting zeros in front ('pre'), so the
# real characters always sit at the end of each row.
padded_sequences = tf.keras.preprocessing.sequence.pad_sequences(
    sequences, maxlen=max_len, padding='pre'
)
print(padded_sequences[1])

[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0 25 16 12]


In [25]:
# (number of prefix examples, max_len)
padded_sequences.shape

(88279, 33)

## Task 4: Training and Validation Sets
1. Creating training and validation sets

## Task 5: Creating the Model


## Task 6: Training the Model


## Task 7: Generating Names
