In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf

In [2]:
# helper functions

def split_seq(seq):
    """Split a 2-D sequence into next-step-prediction (input, target) pairs.

    The input drops the last row of ``seq`` and the target drops the first
    row, so ``target[i]`` is the element that follows ``input[i]``.

    Args:
        seq: 2-D array/tensor indexed as ``[step, feature]``.

    Returns:
        Tuple ``(input_seq, target_seq)``, each with one row fewer than ``seq``.
    """
    # rows 0 .. n-2 feed the model; rows 1 .. n-1 are what it should predict
    return seq[:-1, :], seq[1:, :]

def sample(preds, temperature=1.0):
    """Sample an index from a probability array, with temperature scaling.

    Args:
        preds: 1-D array-like of probabilities (e.g. a softmax output).
        temperature: Values below 1 sharpen the distribution, values above 1
            flatten it. ``temperature <= 0`` selects the most likely index
            deterministically (greedy decoding) instead of dividing by zero.

    Returns:
        Integer index drawn from the re-scaled distribution.
    """
    preds = np.asarray(preds).astype('float64')

    # Zero (or negative) temperature is the greedy limit; avoid a division by zero.
    if temperature <= 0:
        return np.argmax(preds)

    # log(0) legitimately yields -inf (probability stays 0); silence the warning.
    with np.errstate(divide='ignore'):
        scaled = np.log(preds) / temperature

    # Subtract the max before exponentiating so that very small temperatures
    # do not underflow every entry to 0 (which would give 0/0 = NaN below).
    scaled = scaled - np.max(scaled)
    exp_preds = np.exp(scaled)
    preds = exp_preds / np.sum(exp_preds)

    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)


def create_speaker_dataset(temp_df,speaker_idx,lookup_layer,seq_len=100):
    """Build a tf.data.Dataset of fixed-length character windows for one speaker.

    Every speech is lower-cased, split into characters and wrapped in
    ``[BOS]`` / ``[EOS]`` tokens. All speeches are then concatenated end to
    end, encoded with ``lookup_layer`` and cut into windows of
    ``seq_len + 1`` tokens (the trailing partial window is dropped). Each
    window is turned into an (input, target) pair via ``split_seq``.

    Args:
        temp_df: DataFrame with a ``speech`` column of strings.
        speaker_idx: Speaker index attached to every element of the dataset.
        lookup_layer: Callable mapping an array of token strings to encodings.
        seq_len: Length of the input/target sequences. Defaults to 100, i.e.
            windows of 101 tokens before the input/target split.

    Returns:
        Dataset yielding ``((input_seq, target_seq), speaker_idx)``.
    """
    if seq_len < 1:
        raise ValueError('seq_len must be a positive integer')

    # add BOS and EOS tokens around the characters of every speech
    tokenized = [['[BOS]'] + list(text.lower()) + ['[EOS]'] for text in temp_df.speech.values]

    # merge all texts end to end into one long token stream
    merged_tokens = np.hstack(tokenized)

    # string to idx
    encoded = lookup_layer(merged_tokens)

    # one extra token per window so input and target both have seq_len steps
    train_ds = tf.data.Dataset.from_tensor_slices(encoded)
    train_ds = train_ds.batch(seq_len + 1,drop_remainder=True)
    train_ds = train_ds.map(lambda window: (split_seq(window),speaker_idx))

    return train_ds

In [3]:
# Load the full preprocessed corpus from disk.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
processed_df = pd.read_pickle('../data/processed_df.pkl')

# Restrict the experiment to a small set of speakers (bioguide ids).
# Alternative subset, kept for reference:
#temp_bioguides = ['S000033','L000480','W000791','D000388','S001176','E000285','B000944','N000002','C001070']
temp_bioguides = [
    'H001055', 'C001074', 'H001074', 'E000092', 'R000590',
    'G000544', 'O000107', 'S000672', 'M000687', 'H000712',
]
in_subset = processed_df['bioguide_id'].isin(temp_bioguides)
temp_df = processed_df[in_subset]

In [4]:
# Character lookup: token string -> one-hot vector over the saved vocabulary.
# num_oov_indices=0 means there is no out-of-vocabulary slot.
char_vocab = np.load('../data/vocab.npy')
lookup_layer = tf.keras.layers.StringLookup(
    vocabulary=char_vocab,
    num_oov_indices=0,
    output_mode='one_hot',
)

# Speaker lookup: bioguide id -> integer index, over the speakers present in temp_df.
bioguide_voc = np.unique(temp_df.bioguide_id.values)
bioguide_lookup_layer = tf.keras.layers.StringLookup(
    vocabulary=bioguide_voc,
    num_oov_indices=0,
    output_mode='int',
)

  return bool(asarray(a1 == a2).all())
2022-04-27 13:39:33.835007: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [5]:
# Build one tf Dataset per speaker in the subset.
# Each element is ((input_seq, target_seq), speaker_idx).
bioguide_processed_speech_dict = {}

for temp_bioguide_id in temp_bioguides:
    # rows spoken by the current speaker only
    speaker_df = temp_df[temp_df.bioguide_id == temp_bioguide_id]

    temp_train_ds = create_speaker_dataset(
        temp_df=speaker_df,
        speaker_idx=bioguide_lookup_layer(temp_bioguide_id),
        lookup_layer=lookup_layer,
    )

    bioguide_processed_speech_dict[temp_bioguide_id] = temp_train_ds

# Interleave the per-speaker datasets by random sampling, then batch.
per_speaker_datasets = list(bioguide_processed_speech_dict.values())
concat_ds = tf.data.Dataset.sample_from_datasets(per_speaker_datasets).batch(32)

In [6]:
class RNN_LM(tf.keras.Model):
    """Character-level LSTM language model conditioned on a speaker embedding.

    The model receives a pair ``(x, speaker_idx)``. The speaker index is
    embedded, repeated along the time axis and concatenated to every time
    step of ``x`` before being fed to the LSTM. The output layer has one
    unit per entry of ``lookup_layer``'s vocabulary and no activation
    (i.e. it produces logits).

    Relies on the module-level ``lookup_layer`` and ``bioguide_lookup_layer``
    for the output size and the number of speakers.
    """

    def __init__(self):
        super().__init__()
        # NOTE: lstm1 and dense1 are created but currently NOT used by call();
        # only lstm2 -> dense2 is active. They are kept so the deeper variant
        # (lstm1 -> lstm2 -> dense1 -> dense2) can be re-enabled in call().
        self.lstm1 = tf.keras.layers.LSTM(units=256,return_sequences=True)
        self.lstm2 = tf.keras.layers.LSTM(units=256,return_sequences=True)
        self.dense1 = tf.keras.layers.Dense(units=128,activation='relu')
        self.dense2 = tf.keras.layers.Dense(units=len(lookup_layer.get_vocabulary()))

        # one 3-dimensional embedding vector per speaker
        self.emb_layer = tf.keras.layers.Embedding(input_dim=len(bioguide_lookup_layer.get_vocabulary()),output_dim=3)

    def call(self,concat_x_speaker,training=None):
        """Forward pass.

        Args:
            concat_x_speaker: Tuple ``(x, speaker_idx)``; ``x`` is indexed as
                ``[batch, time, feature]`` and ``speaker_idx`` as ``[batch]``.
            training: Standard Keras training flag, forwarded to the LSTM.

        Returns:
            Logits for every time step, indexed as ``[batch, time, vocab]``.
        """
        x,speaker_idx = concat_x_speaker

        speaker_embedding = self.emb_layer(speaker_idx)

        # Use the static sequence length when known; fall back to the dynamic
        # shape so variable-length inputs also work inside a traced graph.
        seq_len = x.shape[1]
        if seq_len is None:
            seq_len = tf.shape(x)[1]

        # broadcast the per-example embedding to every time step
        speaker_embedding = tf.repeat(tf.expand_dims(speaker_embedding,axis=1),repeats=seq_len,axis=1)

        x = tf.concat([x,speaker_embedding],axis=2)

        # deeper variant: hiddens = self.lstm2(self.lstm1(x))
        hiddens = self.lstm2(x,training=training)
        # deeper variant: outputs = self.dense2(self.dense1(hiddens))
        outputs = self.dense2(hiddens)

        return outputs

    def train_step(self, data):
        """Run one optimisation step on a batch.

        Args:
            data: ``((input_seq, target_seq), speaker_idx)`` as yielded by the
                training dataset.

        Returns:
            Dict mapping metric names to their current values.
        """
        (input_seq,target_seq),speaker_idx = data

        with tf.GradientTape() as tape:
            # training=True so any train-time-only behaviour is enabled
            predicted_seq = self((input_seq,speaker_idx),training=True)

            loss = self.compiled_loss(target_seq, predicted_seq, regularization_losses=self.losses)

        # take gradients and apply them against the same variable list
        trainable_vars = self.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))
        # Update metrics (includes the metric that tracks the loss)
        self.compiled_metrics.update_state(target_seq, predicted_seq)
        # Return a dict mapping metric names to current value
        return {m.name: m.result() for m in self.metrics}

In [7]:
# Build and train the speaker-conditioned language model.
model = RNN_LM()

optimizer = tf.keras.optimizers.Adam(learning_rate = 0.001)

# the model's last Dense layer has no activation, hence from_logits=True
loss = tf.keras.losses.CategoricalCrossentropy(from_logits=True)

# both callbacks watch the training loss (no validation data is passed to fit)
lr_scheduler = tf.keras.callbacks.ReduceLROnPlateau(monitor='loss', factor=0.2, patience=4, verbose=1)

early_stopper = tf.keras.callbacks.EarlyStopping(monitor='loss',min_delta=0, patience=5, verbose=2)

model.compile(optimizer=optimizer, loss=loss)

# `shuffle` is ignored by fit() when x is a tf.data.Dataset, so it is not
# passed here; any shuffling has to happen in the dataset pipeline itself.
# The History object is kept so the loss curve can be inspected afterwards.
history = model.fit(x=concat_ds, epochs=100, verbose=2, callbacks=[early_stopper,lr_scheduler])

#model.summary()

Epoch 1/100
92/92 - 17s - loss: 3.0959 - lr: 0.0010 - 17s/epoch - 180ms/step
Epoch 2/100
92/92 - 15s - loss: 2.8009 - lr: 0.0010 - 15s/epoch - 159ms/step
Epoch 3/100
92/92 - 14s - loss: 2.5808 - lr: 0.0010 - 14s/epoch - 155ms/step
Epoch 4/100
92/92 - 14s - loss: 2.4475 - lr: 0.0010 - 14s/epoch - 154ms/step
Epoch 5/100
92/92 - 17s - loss: 2.3628 - lr: 0.0010 - 17s/epoch - 185ms/step
Epoch 6/100
92/92 - 21s - loss: 2.2936 - lr: 0.0010 - 21s/epoch - 227ms/step
Epoch 7/100
92/92 - 23s - loss: 2.2394 - lr: 0.0010 - 23s/epoch - 245ms/step
Epoch 8/100
92/92 - 20s - loss: 2.1968 - lr: 0.0010 - 20s/epoch - 220ms/step
Epoch 9/100
92/92 - 18s - loss: 2.1588 - lr: 0.0010 - 18s/epoch - 191ms/step
Epoch 10/100
92/92 - 19s - loss: 2.1303 - lr: 0.0010 - 19s/epoch - 204ms/step
Epoch 11/100
92/92 - 21s - loss: 2.1007 - lr: 0.0010 - 21s/epoch - 226ms/step
Epoch 12/100
92/92 - 19s - loss: 2.0748 - lr: 0.0010 - 19s/epoch - 212ms/step
Epoch 13/100
92/92 - 18s - loss: 2.0515 - lr: 0.0010 - 18s/epoch - 200ms/

<keras.callbacks.History at 0x7f7c88f81640>

In [11]:
# Show the speaker ids known to the speaker lookup layer, in vocabulary order.
speaker_vocabulary = bioguide_lookup_layer.get_vocabulary()
print(speaker_vocabulary)

['C001074', 'E000092', 'G000544', 'H000712', 'H001055', 'H001074', 'M000687', 'O000107', 'R000590', 'S000672']


In [17]:
# prediction: sample text character by character, conditioned on one speaker
initial_text = ''
conditioned_speaker = 'S000672'
max_chars = 250  # stop once the decoded string reaches this length

# Loop-invariant values, computed once instead of on every decoding step.
char_vocabulary = lookup_layer.get_vocabulary()
conditioned_speaker_idx = tf.expand_dims(bioguide_lookup_layer(conditioned_speaker),axis=0)

# Training text was lower-cased and the lookup has no OOV slot, so the seed
# text is lower-cased as well before being encoded.
initial_tokens = ['[BOS]'] + list(initial_text.lower())
# shape (1, n_tokens, vocab): a batch containing the single seed sequence
preds_tensor = tf.cast(tf.expand_dims(lookup_layer(initial_tokens),axis=0),dtype=tf.float32)
decoded_str = initial_text

while len(decoded_str) < max_chars:
    # re-run the model on the whole prefix and keep the last step's logits
    last_of_preds = model((preds_tensor,conditioned_speaker_idx))[:,-1:,:]
    decoded_ch = char_vocabulary[sample(tf.squeeze(tf.math.softmax(last_of_preds)))]
    # greedy alternative:
    #decoded_ch = char_vocabulary[np.argmax(last_of_preds)]
    decoded_str += decoded_ch

    # append the sampled token (one-hot, shape (1, 1, vocab)) to the prefix
    next_token = tf.cast(lookup_layer(decoded_ch),dtype=tf.float32)
    next_token = tf.expand_dims(tf.expand_dims(next_token,axis=0),axis=0)
    preds_tensor = tf.concat([preds_tensor,next_token],axis=1)

print(decoded_str)

. chairmans members.  which is a spergans. we ard mymachation. he was ham out know us jocin deword or nater and a teaker, i hope will go talk like the intersoud that texans, and he moyeal national action. today at wemelihal to the balance of my time.
