# 00 - Predicting letter by letter with One-Hot-Encoding

In [1]:
import tensorflow as tf
import numpy as np

In [2]:
def read_data(file_name):
    text = open(file_name, 'r').read()
    return text.lower()


class Alphabet:
    def __init__(self, text):
        from collections import Counter
        self._count = Counter(list(text))
        self._keys  = list(self._count.keys())
        self._dict  = {}
        for idx, key in enumerate(self._keys):
            self._dict[key] = idx
    
    def get_count(self):
        return self._count
    
    def get_size(self):
        return len(self._keys)
    
    def letter_to_index(self, letter):
        return self._dict.get( letter, 'err' )
    
    def index_to_letter(self, index):
        return self._keys[index]
    
    def one_hot(self, text):
        encoded = []
        for letter in text:
            one_hot = [0] * self.get_size()
            one_hot[self.letter_to_index(letter)] = 1
            encoded.append(one_hot)
        return np.array(encoded)
    
    def to_text(self, one_hots):
        indices = np.argmax( one_hots, axis=1 ).tolist()
        return "".join([self.index_to_letter(idx) for idx in indices])
    
    def indices_to_text(self, indices):
        print("shape")
        print(indices.shape)
        _indices = indices.tolist()
        print(_indices)
        return "".join([self.index_to_letter(idx) for idx in _indices])
        
    
text = read_data("data/cleaned-rap-lyrics/clean2_pac_.txt")
alphabet = Alphabet(text)

In [3]:
print(f"# unique characters: {alphabet.get_size()}")

# unique characters: 37


### One-Hot-Encoding

In [4]:
encoded = np.array(alphabet.one_hot(text))
encoded[:100]

array([[1, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [5]:
alphabet.to_text(encoded)[:100]

"as real as it seems the american dream\nain't nothing but another calculated schemes\nto get us locked"

In [6]:
def batch_data(num_data, batch_size):
    """ Yield batches with indices until epoch is over.
    
    Parameters
    ----------
    num_data: int
        The number of samples in the dataset.
    batch_size: int
        The batch size used using training.

    Returns
    -------
    batch_ixs: np.array of ints with shape [batch_size,]
        Yields arrays of indices of size of the batch size until the epoch is over.
    """
    
    # data_ixs = np.random.permutation(np.arange(num_data))
    data_ixs = np.arange(num_data)
    ix = 0
    while ix + batch_size < num_data:
        batch_ixs = data_ixs[ix:ix+batch_size]
        ix += batch_size
        yield batch_ixs

### Multiclass Classification

What we are building in this first instance is a 36-class classifier. Based on the previous characters, our model will predict the next upcoming character!

#### Network Architecture:
1. **RNN cell**: LSTM with hidden_layer_size
2. **linear output layer**: maps to scores for 36 classes

In [7]:
def sample(predicted, temperature=0.9):
    '''
     helper function to sample an index from a probability array
     our model will output scores for each class
     we normalize those outputs and create a probability distribution out of them to sample from
    '''
    exp_predicted = np.exp(predicted/temperature)
    predicted = exp_predicted / np.sum(exp_predicted)
    probabilities = np.random.multinomial(1, predicted, 1)
    return probabilities

In [8]:
class RNN:
    def __init__(self, name):
        self.name = name
        self.weights = []
        self.biases = []
        
    def build(self, hidden_layer_size, vocab_size, time_steps, l2_reg=0.0):
        self.time_steps = time_steps
        self.vocab_size = vocab_size
        
        self.X = tf.placeholder(tf.float32, shape=[None, time_steps, vocab_size], name="data")
        self.Y = tf.placeholder(tf.int16, shape=[None, vocab_size], name="labels")
        
        _X = tf.transpose(self.X, [1, 0, 2])
        _X = tf.reshape(_X, [-1, vocab_size])
        _X = tf.split(_X, time_steps, 0)
        
        with tf.variable_scope(self.name, reuse=tf.AUTO_REUSE):
            
            # 1x RNN LSTM Cell
            self.rnn_cell   = tf.nn.rnn_cell.LSTMCell(hidden_layer_size)
            
            self.outputs, _ = tf.contrib.rnn.static_rnn(self.rnn_cell, _X, dtype=tf.float32)
            
            # 1x linear output layer
            W_out = tf.Variable(tf.truncated_normal([hidden_layer_size, vocab_size], 
                                                 mean=0, stddev=.01))
            b_out = tf.Variable(tf.truncated_normal([vocab_size],
                                                mean=0, stddev=.01))
            self.weights.append(W_out)
            self.biases.append(b_out)
            
            self.last_rnn_output = self.outputs[-1]
            self.final_output    = self.last_rnn_output @ W_out + b_out
            
            # softmax cross entropy as our loss function (between 36 classes)
            self.softmax = tf.nn.softmax_cross_entropy_with_logits_v2(logits=self.final_output,
                                                                labels=self.Y)
            self.cross_entropy_loss = tf.reduce_mean(self.softmax)
            
            self.loss = self.cross_entropy_loss
            
            self.optimizer = tf.train.AdamOptimizer()
            self.train_step= self.optimizer.minimize(self.loss)
            
            self.correct_prediction = tf.equal(tf.argmax(self.Y,1), tf.argmax(self.final_output, 1))
            self.accuracy = tf.reduce_mean(tf.cast(self.correct_prediction, tf.float32))*100
    
    def train(self, train_data, train_labels, alphabet, epochs=20, batch_size=128):
        train_losses = []
        train_accs = []
        
        self.session = tf.Session()
        session = self.session
        
        with session.as_default():
            session.run(tf.global_variables_initializer())
            tr_loss, tr_acc = session.run([self.loss, self.accuracy],
                                          feed_dict={self.X: train_data,
                                                     self.Y: train_labels})
            train_losses.append(tr_loss)
            train_accs.append(tr_acc)
            
            for epoch in range(epochs):
                
                if(epoch + 1) % 1 == 0:
                    print(f"\n\nEpoch {epoch + 1}/{epochs}")
                    print(f"Loss:    \t {tr_loss}")
                    print(f"Accuracy:\t {tr_acc}")
                
                for batch_ixs in batch_data(len(train_data), batch_size):
                    _ = session.run(self.train_step,
                                   feed_dict={
                                       self.X: train_data[batch_ixs],
                                       self.Y: train_labels[batch_ixs],
                                   })
                tr_loss, tr_acc = session.run([self.loss, self.accuracy],
                                               feed_dict={self.X: train_data,
                                                          self.Y: train_labels
                                                         })
                train_losses.append(tr_loss)
                train_accs.append(tr_acc)
                
                #get on of training set as seed
                seed = train_data[:1:]
        
                #to print the seed 40 characters
                seed_chars = ''
                for each in seed[0]:
                    seed_chars += alphabet._keys[np.where(each == max(each))[0][0]]
                print ("Seed:" + seed_chars)
        
                #predict next 500 characters
                for i in range(500):
                    if i > 0:
                        remove_fist_char = seed[:,1:,:]
                        seed = np.append(remove_fist_char, np.reshape(probabilities, [1, 1, self.vocab_size]), axis=1)
                        
                    predicted = session.run([self.final_output], feed_dict = {self.X:seed})
                    predicted = np.asarray(predicted[0]).astype('float64')[0]
                    probabilities = sample(predicted)
                    predicted_chars = alphabet._keys[np.argmax(probabilities)]
                    seed_chars += predicted_chars
                print ('Result:'+ seed_chars)
        
        self.hist = {
            'train_losses': np.array(train_losses),
            'train_accuracy': np.array(train_accs)
        }

In [9]:
text = read_data('data/cleaned-rap-lyrics/clean2_pac_.txt')

In [10]:
step = 1

HIDDEN = 128
VOCAB_SIZE = 37
TIME_STEPS = 20
EPOCHS = 10

def making_one_hot(text, alphabet):
    '''
    '''
    unique_chars = alphabet._keys
    len_unique_chars = len(unique_chars)

    input_chars = []
    output_char = []
    
    for i in range(0, len(text) - TIME_STEPS, step):
        input_chars.append(text[i:i+TIME_STEPS])
        output_char.append(text[i+TIME_STEPS])
    train_data = np.zeros((len(input_chars), TIME_STEPS, len_unique_chars))
    target_data = np.zeros((len(input_chars), len_unique_chars))
    for i , each in enumerate(input_chars):
        for j, char in enumerate(each):
            train_data[i, j, unique_chars.index(char)] = 1
        target_data[i, unique_chars.index(output_char[i])] = 1
    return train_data, target_data, unique_chars, len_unique_chars

In [11]:
tr_data, tr_labels, unique_chars, len_unique = making_one_hot(text, alphabet)

In [12]:
basicRNN = RNN(name = "basic")
basicRNN.build(HIDDEN, VOCAB_SIZE, TIME_STEPS)
basicRNN.train(tr_data, tr_labels, alphabet, epochs=EPOCHS)



Epoch 1/10
Loss:    	 3.6113319396972656
Accuracy:	 5.893774032592773
Seed:as real as it seems 
Result:as real as it seems mte  mwe  ie  moom iia m spmorewiacs
feso
ee ii
ti se tdt a'paiar we tcw nedsgeto f csgedol   netde eeth' heey iu re h
nr adt entndeoe kee linod shpi'l r oyikse ajle

fe tstttond   bh noou tha tip  kaa ijog aibel
cutkrks cbownaa
'e
nlnzi usk s wrsett  wee  fueo niny dhoh
2g
 ogie ws t a ne vehggvttvt h'o nhculpsy wgwheat eim ftahetwr gkttt no r nptot  mertalosit stk cofwl shatereh ncnnnea ahs oio
fjlide rh c var ee lzo
toi romiand alnnt ten b btheuadst ellgnstow  uahn yerl'lwol iaiilte etbs  p
o


Epoch 2/10
Loss:    	 2.7001945972442627
Accuracy:	 26.048110961914062
Seed:as real as it seems 
Result:as real as it seems cifl bninos ak mya zi herad wane joyo anlith woiw hat man' lats y inew i ffi'  itkez'n cith t'utwritee if lrsrdtgtonn bewyd me waupatide tou loi thirk toucl ms eos il tacd gonf mo laye whene thit keret of hit ou  astha af on'd 'u f yome nout mapea


## Learnings
Some of words slowly begin to make sense, but it is mostly gibberish. Instead of predicting letters, we should try to predict word by word!

Have a look at **02-word-embedding** for that!

In [13]:
tr_data.shape

(60776, 20, 37)