In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf

In [2]:
# Load the pickled DataFrame; later cells read its `bioguide_id` and `speech` columns.
# NOTE(review): unpickling can execute arbitrary code, so only load this file
# if it was produced by a trusted pipeline.
processed_df = pd.read_pickle('../data/processed_df.pkl')

In [3]:
# Keep only the rows whose bioguide_id equals the chosen id.
temp_bioguide_id = 'H001055'
temp_df = processed_df.loc[processed_df['bioguide_id'] == temp_bioguide_id]

In [4]:
# Each speech becomes a list of lower-cased characters wrapped in [BOS] / [EOS] markers.
X_train = [
    ['[BOS]', *speech.lower(), '[EOS]']
    for speech in temp_df.speech.values
]

In [5]:
# Character -> integer index mapping. num_oov_indices=0 reserves no out-of-vocabulary slot,
# so every index corresponds to a token seen during adapt().
lookup_layer = tf.keras.layers.StringLookup(num_oov_indices=0,output_mode='int')

# Build the ragged tensor once and reuse it for both adapt() and the lookup itself,
# instead of converting the nested Python lists twice.
X_train_ragged = tf.ragged.constant(X_train)
lookup_layer.adapt(X_train_ragged)

X_train_idx = lookup_layer(X_train_ragged)

# One dataset element per speech; shuffled (buffer of 1000) and grouped into batches of 5.
train_ds = tf.data.Dataset.from_tensor_slices(X_train_idx)
train_ds = train_ds.shuffle(1000).batch(5).prefetch(tf.data.AUTOTUNE)

2022-04-22 11:14:39.061089: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [6]:
class RNN_LM(tf.keras.Model):
    """Recurrent language model: a single LSTM followed by a linear projection.

    The model returns one vector of unnormalised scores (logits) per timestep;
    no softmax is applied inside the model.

    Args:
        vocab_size: Number of logits produced per timestep. When omitted, the
            size of the vocabulary held by the notebook-level ``lookup_layer``
            is used.
        lstm_units: Number of units of the LSTM layer.
    """

    def __init__(self, vocab_size=None, lstm_units=256):
        super().__init__()
        if vocab_size is None:
            # Default kept for backward compatibility: read the size from the global lookup layer.
            vocab_size = len(lookup_layer.get_vocabulary())
        self.lstm2 = tf.keras.layers.LSTM(units=lstm_units,return_sequences=True)
        self.dense3 = tf.keras.layers.Dense(units=vocab_size)

    def call(self,x,mask=None):
        """Return per-timestep logits for the input sequence ``x``.

        Args:
            x: Input sequence fed to the LSTM.
            mask: Optional boolean mask forwarded to the LSTM so that masked
                timesteps are skipped.

        Returns:
            The output of the final dense layer applied to every LSTM output.
        """
        hiddens = self.lstm2(inputs=x,mask=mask)
        return self.dense3(hiddens)

In [7]:
model = RNN_LM()

optimizer = tf.keras.optimizers.Adam(learning_rate = 0.0001)

# The model outputs raw logits, hence from_logits=True.
loss_fn = tf.keras.losses.CategoricalCrossentropy(from_logits=True)

loss_tracker = tf.keras.metrics.Mean(name='categorical_loss')

# The vocabulary is fixed during training, so its size is looked up once
# rather than on every step.
one_hot_depth = len(lookup_layer.get_vocabulary())

epochs = 20
for epoch in range(epochs):

    for batch in train_ds:

        # Right-pad every sequence of the batch with -1. Without an explicit maxlen,
        # pad_sequences pads to the longest sequence in the batch.
        padded_batch = tf.convert_to_tensor(
            tf.keras.preprocessing.sequence.pad_sequences(list(batch), padding='post', value=-1)
        )
        # True for real characters, False for padding.
        rnn_mask = padded_batch != -1

        # tf.one_hot turns the out-of-range index -1 into an all-zero vector.
        padded_batch_one_hot_encoded = tf.one_hot(padded_batch, one_hot_depth)

        with tf.GradientTape() as tape:
            # Predictions at positions 0 .. T-2 are scored against the characters at 1 .. T-1.
            y_pred = model(padded_batch_one_hot_encoded, rnn_mask)[:, :-1, :]
            y = padded_batch_one_hot_encoded[:, 1:, :]

            # NOTE(review): padded targets are all-zero rows and therefore add 0 to the
            # cross-entropy sum, but the default mean reduction presumably still counts
            # them in the denominator -- confirm whether a mask-normalised loss is wanted.
            loss_value = loss_fn(y, y_pred)

        trainable_vars = model.trainable_variables
        gradients = tape.gradient(loss_value, trainable_vars)
        optimizer.apply_gradients(zip(gradients, trainable_vars))

        loss_tracker.update_state(loss_value)

    # Report the mean of the per-batch losses, then start the next epoch from a clean tracker.
    epoch_loss = loss_tracker.result()
    print(f'epoch: {epoch}, loss: {epoch_loss:.4f}')

    loss_tracker.reset_state()
        

epoch: 0, loss: 2.4143


KeyboardInterrupt: 

In [None]:
# Debug check: shape of the padding mask from the most recently processed training batch.
rnn_mask.shape

In [None]:
# NOTE(review): `tensor` is not assigned in any visible cell, so this raises NameError on a
# fresh kernel. The cell is also not idempotent: every re-run adds another leading axis.
tensor = tf.expand_dims(tensor,axis=0)
# Unpacking into three names only works if the expanded tensor has exactly three dimensions.
batch_size,seq_len,voc_size = tensor.shape

In [None]:
class RNN_LM(tf.keras.Model):
    """Recurrent language model with a wide LSTM followed by a linear projection.

    This redefinition replaces the earlier ``RNN_LM`` in the notebook namespace.
    The model returns one vector of unnormalised scores (logits) per timestep;
    no softmax is applied inside the model.

    Args:
        vocab_size: Number of logits produced per timestep. When omitted, the
            size of the vocabulary held by the notebook-level ``lookup_layer``
            is used.
        lstm_units: Number of units of the LSTM layer.
    """

    def __init__(self, vocab_size=None, lstm_units=1024):
        super().__init__()
        if vocab_size is None:
            # Default kept for backward compatibility: read the size from the global lookup layer.
            vocab_size = len(lookup_layer.get_vocabulary())
        self.lstm2 = tf.keras.layers.LSTM(units=lstm_units,return_sequences=True)
        self.dense3 = tf.keras.layers.Dense(units=vocab_size)

    def call(self,x,mask=None):
        """Return per-timestep logits for the input sequence ``x``.

        Args:
            x: Input sequence fed to the LSTM.
            mask: Optional boolean mask forwarded to the LSTM. It defaults to
                ``None`` so the model can be invoked with inputs only, which is
                how ``model(inputs)`` and ``model.fit`` call it.

        Returns:
            The output of the final dense layer applied to every LSTM output.
        """
        hiddens = self.lstm2(inputs=x,mask=mask)
        return self.dense3(hiddens)

In [None]:
model = RNN_LM()

optimizer = tf.keras.optimizers.Adam(learning_rate = 0.001)

# The model outputs raw logits, hence from_logits=True.
loss = tf.keras.losses.CategoricalCrossentropy(from_logits=True)

# Multiply the learning rate by 0.2 once the training loss has not improved for 5 epochs.
lr_scheduler = tf.keras.callbacks.ReduceLROnPlateau(monitor='loss', factor=0.2, patience=5, verbose=1)

model.compile(optimizer=optimizer, loss=loss)

# Next-character objective: the input at position t is scored against the character at t+1.
# fit() needs explicit targets; with inputs only, the compiled loss has nothing to compare to.
# NOTE(review): padded_batch_one_hot_encoded is the last batch left over from the manual
# training loop, so this fits on that single batch only -- confirm this is intended
# (e.g. as an overfitting sanity check).
# NOTE(review): fit() invokes the model with inputs only, so RNN_LM.call must accept
# being called without a mask.
lm_inputs = padded_batch_one_hot_encoded[:, :-1, :]
lm_targets = padded_batch_one_hot_encoded[:, 1:, :]

model.fit(x=lm_inputs, y=lm_targets, epochs=1000, verbose=2, callbacks=[lr_scheduler], shuffle=True)

In [None]:
# prediction IMPLEMENT SAMPLING

def sample(preds, temperature=1.0):
    """Draw one index from a probability vector after temperature scaling.

    The probabilities are converted to log-space, divided by ``temperature``,
    re-normalised with a softmax, and a single index is drawn from the result
    with ``np.random.multinomial``.

    Args:
        preds: 1-D array-like of probabilities.
        temperature: Scaling factor applied in log-space. Values below 1
            sharpen the distribution, values above 1 flatten it.

    Returns:
        The sampled index.

    Raises:
        ValueError: If ``temperature`` is zero.
    """
    if temperature == 0:
        raise ValueError('temperature must be non-zero')

    preds = np.asarray(preds).astype('float64')
    # log(0) = -inf is the intended score for impossible entries; silence the warning.
    with np.errstate(divide='ignore'):
        scaled = np.log(preds) / temperature
    # Shift by the maximum before exponentiating so that small temperatures do not
    # underflow every entry to 0 (which would yield NaN probabilities).
    scaled = scaled - np.max(scaled)
    exp_preds = np.exp(scaled)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

# Fetch the vocabulary once instead of on every decoding step.
vocabulary = lookup_layer.get_vocabulary()
vocab_size = len(vocabulary)

# Seed the model input with the one-hot vector of [BOS], shaped (1, 1, vocab_size).
# The index must be one-hot encoded: writing the raw integer index into every slot
# would feed the model a vector unlike the one-hot inputs used in training.
bos_index = lookup_layer('[BOS]')
preds_tensor = tf.reshape(tf.one_hot(bos_index, vocab_size), (1, 1, vocab_size))
decoded_str = ''

while len(decoded_str) < 200:
    # Only the logits of the last timestep are needed to pick the next character.
    last_of_preds = model(preds_tensor)[:,-1:,:]
    # Greedy decoding. TODO: draw from softmax(last_of_preds) with sample() instead of argmax.
    decoded_ch = vocabulary[np.argmax(last_of_preds)]
    if decoded_ch == '[EOS]':
        # The training targets end at [EOS], so stop instead of emitting the marker.
        break
    decoded_str += decoded_ch

    # Append the chosen character (one-hot) to the running input sequence.
    next_one_hot = tf.one_hot(lookup_layer(decoded_ch), vocab_size)
    preds_tensor = tf.concat([preds_tensor, tf.reshape(next_one_hot, (1, 1, vocab_size))], axis=1)

print(decoded_str)