### Import TensorFlow and other libraries

In [None]:
# Mount Google Drive inside the Colab VM at /content/drive.
# NOTE(review): later cells read './drive/My Drive/colab/pheme5/', which
# presumably resolves through this mount when the working directory is
# /content — confirm.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import zipfile

# Extract every member of the archive into the current working directory.
# The context manager closes the archive even if extraction raises, which the
# previous explicit close() call did not guarantee.
with zipfile.ZipFile("your_file.zip", 'r') as zip_ref:
    zip_ref.extractall()

In [None]:
from __future__ import absolute_import, division, print_function, unicode_literals

In [None]:
# Best-effort request for TensorFlow 2.x: any exception raised while running
# the magic is deliberately ignored so the notebook can continue.
try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass


In [None]:
import tensorflow as tf

In [None]:
# Show the installed TensorFlow version as the cell output.
tf.__version__

In [None]:
import glob
import imageio
import matplotlib.pyplot as plt
import numpy as np
import os
import PIL
from tensorflow.keras import layers
import time
from sklearn import metrics

from IPython import display

### Load and prepare the dataset

In [None]:
# Number of data files to load; the i-th file (in sorted order) gets label i.
NUM_CLASSES = 5
path = './drive/My Drive/colab/pheme5/'

# Collect the data file names. os.walk yields entries in arbitrary order, so
# the list is sorted to make the file -> label mapping deterministic.
files = []
for r, d, f in os.walk(path):
    files.extend(f)
files.sort()
print(files)

if len(files) < NUM_CLASSES:
    raise FileNotFoundError(
        'Expected at least {} data files under {!r}, found {}'.format(
            NUM_CLASSES, path, len(files)))

# Load each file once and concatenate at the end instead of re-copying the
# growing arrays with np.append on every iteration.
text_chunks = []
label_chunks = []
for i in range(NUM_CLASSES):
    chunk = np.load(os.path.join(path, files[i]))
    text_chunks.append(np.ravel(chunk))
    # One float label per row of the file, equal to the file's index.
    label_chunks.append(np.ones(len(chunk)) * i)

text_load = np.concatenate(text_chunks)
label = np.concatenate(label_chunks)

print(text_load[0])
print(text_load.shape)
print(label.shape)
print(label)

In [None]:
# Replace every whitespace-delimited token longer than 20 characters with the
# placeholder 'UNK', writing the cleaned sentence back into text_load.
for row, sentence in enumerate(text_load):
  cleaned = sentence
  # Tokens are taken from the sentence as it was before any replacement;
  # str.replace then substitutes every occurrence of that token's text.
  for token in sentence.split():
    if len(token) > 20:
      cleaned = cleaned.replace(token, 'UNK')
  text_load[row] = cleaned
print(text_load[0])

In [None]:
# Fit a Keras Tokenizer on the corpus and encode every sentence as a list of
# integer ids.
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(text_load)
text_as_int = tokenizer.texts_to_sequences(text_load)
print(len(text_as_int))
print(text_as_int[0])
# Sanity check: decode the first encoded sentence back to text (cell output).
tokenizer.sequences_to_texts([text_as_int[0]])

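The last line of the cell above converts the integers back to words as a sanity check. The next cell measures the longest sequence, counts the distinct token ids, and zero-pads every sequence to the same length.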

In [None]:
# Token count of every encoded sentence; the largest one is the padded length.
len_sen = list(map(len, text_as_int))
longest = max(len_sen)
print(longest)

# Flatten all sentences into one list of ids and count the distinct ones.
list_of_int = []
for sublist in text_as_int:
    list_of_int.extend(sublist)
vocab = sorted(set(list_of_int))
print ('{} unique words'.format(len(vocab)))
# vocab = 8317
# tokenizer.sequences_to_texts([[1]])

# Right-pad every sentence with 0 up to the longest sentence length.
text_as_int = tf.keras.preprocessing.sequence.pad_sequences(
    text_as_int,
    maxlen=longest,
    padding='post',
    value=0)
print(text_as_int.shape)

In [None]:
# Inspect the first padded token-id sequence and its label.
print(text_as_int[0])
print(label[0])

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

# Fixed seed so the split and the oversampling are reproducible.
SPLIT_SEED = 42

# Length, in tokens, of every padded input sequence.
seq_length = max(len_sen)
# examples_per_epoch = 1
labels = np.asarray(label)

# The samples were loaded one class after another, so an unshuffled split
# would fill the 20% test set with the last class(es) only. Shuffle and
# stratify so both splits keep the class proportions.
text_as_int_train, text_as_int_test, labels_train, labels_test = train_test_split(
    text_as_int, labels,
    test_size=0.2,
    shuffle=True,
    stratify=labels,
    random_state=SPLIT_SEED)
print(text_as_int_test.shape)
print(labels_test.shape)

# Oversample the training split only, so the test split stays untouched.
# `fit_resample` replaces `fit_sample`, which newer imbalanced-learn releases
# no longer provide.
# NOTE(review): SMOTE interpolates between feature vectors; applied to token
# ids the synthetic rows may not correspond to real sentences — confirm this
# is intended.
smote = SMOTE(random_state=SPLIT_SEED)
res_features, res_labels = smote.fit_resample(text_as_int_train, labels_train)
print(res_features.shape)

In [None]:
# train_data = tf.data.Dataset.from_tensor_slices((text_as_int, labels))
def _preview_first_example(dataset):
    """Print the first (token ids, label) pair of `dataset` and its decoded text."""
    for (token_ids, stance) in dataset.take(1):
        print(token_ids)
        print(tokenizer.sequences_to_texts([token_ids.numpy()]))
        print(stance)


# Wrap the oversampled training arrays in a tf.data pipeline and peek at it.
train_data = tf.data.Dataset.from_tensor_slices((res_features, res_labels))
print(train_data)
_preview_first_example(train_data)

# Shuffle with a 10000-element buffer, then peek again to see the effect.
# train_data = train_data.shuffle(10000,reshuffle_each_iteration = False)
train_data = train_data.shuffle(10000)
_preview_first_example(train_data)



In [None]:
# test_data = tf.data.Dataset.from_tensor_slices((res_features_test, res_labels_test))
# Wrap the held-out arrays (not oversampled) in a tf.data pipeline.
test_data = tf.data.Dataset.from_tensor_slices((text_as_int_test, labels_test))
print(test_data)

# Peek at the first held-out example before shuffling.
for (token_ids, stance) in test_data.take(1):
    print(token_ids)
    print(tokenizer.sequences_to_texts([token_ids.numpy()]))
    print(stance)

# test_data = test_data.shuffle(10000,reshuffle_each_iteration = False)
test_data = test_data.shuffle(10000)

# Peek again after shuffling with a 10000-element buffer.
for (token_ids, stance) in test_data.take(1):
    print(token_ids)
    print(tokenizer.sequences_to_texts([token_ids.numpy()]))
    print(stance)

In [None]:
# Number of examples per batch; the models defined later in this notebook are
# built with this fixed batch size.
BATCH_SIZE = 256
# BATCH_SIZE =1024

# drop_remainder=True discards a final partial batch so that every batch has
# exactly BATCH_SIZE rows.
train_data = train_data.batch(BATCH_SIZE, drop_remainder = True)
print(train_data)

test_data = test_data.batch(BATCH_SIZE, drop_remainder = True)
print(test_data)

# Peek at one training batch: the whole batch, its first row, that row decoded
# back to text, and the first label.
for (input_example, target_example) in train_data.take(1):
    print(input_example)
    print(input_example[0])
    print ('Input data: ', repr(tokenizer.sequences_to_texts([input_example[0].numpy()])))
    print(target_example[0])

In [None]:
# !wget http://nlp.stanford.edu/data/glove.6B.zip

In [None]:
# !unzip glove*.zip
# !ls
# !pwd

In [None]:
# print('Indexing word vectors.')

# embeddings_index = {}
# f = open('glove.6B.100d.txt', encoding='utf-8')
# for line in f:
#     values = line.split()
#     word = values[0]
#     coefs = np.asarray(values[1:], dtype='float32')
#     embeddings_index[word] = coefs
# f.close()

# print('Found %s word vectors.' % len(embeddings_index))

## Create the models

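The CVAE's encoder and decoder networks and the discriminator are each defined using the [Keras Sequential API](https://www.tensorflow.org/guide/keras#sequential_model); the CVAE itself is a subclassed `tf.keras.Model` that wraps its two networks.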

### CVAE

In [None]:
import numpy as np
class CVAE(tf.keras.Model):
  """Sequence variational autoencoder made of two stacked-LSTM networks.

  `inference_net` (encoder) maps a batch of token-id sequences to a tensor
  whose last axis has `2 * latent_dim` entries; `encode` splits that axis into
  `mean` and `logvar`. `generative_net` (decoder) maps a latent sequence of
  shape `(batch_size, seq_length, latent_dim)` to one score per vocabulary
  entry at every position.

  The decoder input length is read from the module-level `seq_length` when the
  model is constructed.

  Args:
    latent_dim: size of the latent vector at each sequence position.
    vocab_size: number of rows of the embedding and width of the decoder output.
    embedding_dim: width of the token embedding.
    rnn_units: number of units in every LSTM layer.
    batch_size: fixed batch size both networks are built for.
  """

  def __init__(self, latent_dim, vocab_size, embedding_dim, rnn_units, batch_size):
    super(CVAE, self).__init__()
    self.latent_dim = latent_dim
    self.vocab_size = vocab_size
    self.embedding_dim = embedding_dim
    self.rnn_units = rnn_units
    self.batch_size = batch_size
    # Remember the sequence length the decoder is built for so `sample` can
    # draw noise with a matching shape.
    self.seq_length = seq_length
#     self.weight_matrix = weight_matrix
    # NOTE(review): stateful=True carries LSTM state from one batch to the
    # next and pins the batch size — confirm this is wanted for shuffled,
    # independent examples.
    # NOTE(review): the ReLU on the last Dense forces both mean and logvar to
    # be >= 0 — confirm this is intended.
    self.inference_net = tf.keras.Sequential([
        tf.keras.layers.Embedding(vocab_size, embedding_dim,batch_input_shape=[batch_size, None]),
        tf.keras.layers.LSTM(rnn_units,
                            return_sequences=True,
                            stateful=True,
                            recurrent_initializer='glorot_uniform'),
        tf.keras.layers.LSTM(rnn_units,
                            return_sequences=True,
                            stateful=True,
                            recurrent_initializer='glorot_uniform'),
        tf.keras.layers.Dense(latent_dim + latent_dim, activation = tf.nn.relu)
      ]
    )

    # NOTE(review): generator_loss consumes this network's output with
    # from_logits=True although a ReLU is applied here — confirm.
    self.generative_net = tf.keras.Sequential(
        [
          tf.keras.layers.InputLayer(input_shape=(seq_length,latent_dim),batch_size = batch_size),
          tf.keras.layers.LSTM(rnn_units,
                            return_sequences=True,
                            stateful=True,
                            recurrent_initializer='glorot_uniform'),
          tf.keras.layers.LSTM(rnn_units,
                            return_sequences=True,
                            stateful=True,
                            recurrent_initializer='glorot_uniform'),
          tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(vocab_size,activation = tf.nn.relu)),
        ]
    )

  @tf.function
  def sample(self, eps=None):
    """Decode latent noise and return the sigmoid of the decoder output.

    Args:
      eps: latent tensor of shape `(batch_size, seq_length, latent_dim)`.
        When None, standard-normal noise of that shape is drawn.
    """
    if eps is None:
      # The decoder expects a rank-3 input with the batch size it was built
      # for, so the default noise uses the model's own dimensions rather than
      # a rank-2 shape based on a notebook-level constant.
      eps = tf.random.normal(
          shape=(self.batch_size, self.seq_length, self.latent_dim))
    return self.decode(eps, apply_sigmoid=True)

  def encode(self, x):
    """Run the encoder on `x` and split its last axis into (mean, logvar)."""
#     print(x.shape)
    mean, logvar = tf.split(self.inference_net(x), num_or_size_splits=2, axis=2)
    return mean, logvar

  def reparameterize(self, mean, logvar):
    """Return `mean + eps * exp(logvar / 2)` with `eps` drawn from N(0, 1)."""
    # tf.shape gives the runtime shape, so this also works when the static
    # shape has unknown dimensions (e.g. while tracing a tf.function).
    eps = tf.random.normal(shape=tf.shape(mean))
    return eps * tf.exp(logvar * .5) + mean

  def decode(self, z, apply_sigmoid=False):
    """Run the decoder on `z`; optionally pass its output through a sigmoid."""
    logits = self.generative_net(z)
    if apply_sigmoid:
      probs = tf.sigmoid(logits)
      return probs

    return logits

In [None]:
def make_generator_model(latent_dim,vocab_size, embedding_dim, rnn_units, batch_size):
    """Construct and return a CVAE configured with the given hyper-parameters."""
    return CVAE(
        latent_dim=latent_dim,
        vocab_size=vocab_size,
        embedding_dim=embedding_dim,
        rnn_units=rnn_units,
        batch_size=batch_size,
    )

In [None]:
def generate_text(model, epoch, test_input):
  """Decode `test_input` with `model.sample` and print one sampled sentence.

  Only the first sequence of the batch is sampled and printed. `epoch` is
  accepted but not used. Returns the full tensor produced by `model.sample`.
  """
  predictions = model.sample(test_input)
  i = 0  # index of the single batch element that is previewed
  # NOTE(review): model.sample returns sigmoid outputs, while
  # tf.random.categorical interprets its input as unnormalized
  # log-probabilities — confirm this is the intended sampling distribution.
  drawn = tf.random.categorical(predictions[i], num_samples=1)
  token_ids = tf.squeeze(drawn, axis=-1).numpy()
  decoded = tokenizer.sequences_to_texts([token_ids])
  print("Prediction \n", i, repr("".join(decoded)))
  return predictions

In [None]:
# Hyper-parameters shared by the generator (CVAE) and the discriminator.
latent_dim = 16
num_examples_to_generate = BATCH_SIZE
# Derived from the fitted tokenizer instead of hard-coded: word ids start at 1
# and id 0 is used for padding, hence the + 1. A stale constant would leave
# token ids outside the embedding/output range whenever the corpus changes.
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 32
rnn_units = 32
batch_size = BATCH_SIZE

In [None]:
generator = make_generator_model(latent_dim,vocab_size, embedding_dim, rnn_units,batch_size)
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
generator.inference_net.summary()
generator.generative_net.summary()

In [None]:
# Draw one standard-normal latent sequence per example to generate.
noise_shape = [num_examples_to_generate, seq_length, latent_dim]
random_vector_for_generation = tf.random.normal(shape=noise_shape)
print(random_vector_for_generation.shape)

# Smoke-test the untrained generator on that noise and show the output shape.
generated_texts = generate_text(generator, 0, random_vector_for_generation)
print(generated_texts.shape)

### The Discriminator

The discriminator is an LSTM-based classifier.

In [None]:
def make_discriminator_model(latent_dim,vocab_size,embedding_dim,rnn_units, batch_size):
    """Build the 5-way classifier that operates on latent sequences.

    The input has shape `(batch_size, seq_length, latent_dim)`, where
    `seq_length` is read from module scope. `vocab_size`, `embedding_dim` and
    `rnn_units` are accepted but not used by this function.
    """

    def _bilstm_block():
        # One bidirectional, stateful LSTM layer with 64 units per direction.
        return tf.keras.layers.Bidirectional(
            tf.keras.layers.LSTM(
                64,
                return_sequences=True,
                stateful=True,
                recurrent_initializer='glorot_uniform',
                dropout=0.5,
            )
        )

    stack = [
        tf.keras.layers.InputLayer(input_shape=(seq_length,latent_dim),batch_size = batch_size),
    ]
    # Three identical recurrent blocks, created in order.
    stack += [_bilstm_block() for _ in range(3)]
    stack += [
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(32,activation=tf.nn.relu,kernel_regularizer=tf.keras.regularizers.l2(0.001)),
        tf.keras.layers.Dense(5,activation=tf.nn.softmax),
    ]
    return tf.keras.Sequential(stack)

In [None]:
discriminator = make_discriminator_model(latent_dim,vocab_size,embedding_dim,rnn_units, batch_size)
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
discriminator.summary()


## Define the loss and optimizers

Define loss functions and optimizers for both models.


In [None]:
# Categorical cross-entropy loss object used by discriminator_loss.
# from_logits=False means the predictions passed to it are treated as
# probabilities rather than raw logits.
cross_entropy = tf.keras.losses.CategoricalCrossentropy(from_logits=False)

### Discriminator loss

In [None]:
def discriminator_loss(output, label):
  """Return the categorical cross-entropy between `label` (true) and `output` (predicted)."""
  loss_value = cross_entropy(y_true=label, y_pred=output)
  return loss_value

### CVAE loss

In [None]:
def log_normal_pdf(sample, mean, logvar, raxis=1):
  """Log-density of `sample` under a diagonal Gaussian, summed over `raxis`.

  The Gaussian is parameterized by `mean` and the log-variance `logvar`.
  """
  log2pi = tf.math.log(2. * np.pi)
  squared_error = (sample - mean) ** 2.
  per_element = -.5 * (squared_error * tf.exp(-logvar) + logvar + log2pi)
  return tf.reduce_sum(per_element, axis=raxis)

# @tf.function
# def compute_loss(model, x):
#   mean, logvar = model.encode(x)
#   z = model.reparameterize(mean, logvar)
#   x_logit = model.decode(z)

#   cross_ent = tf.nn.sigmoid_cross_entropy_with_logits(logits=x_logit, labels=x)
#   logpx_z = -tf.reduce_sum(cross_ent, axis=[1, 2, 3])
#   logpz = log_normal_pdf(z, 0., 0.)
#   logqz_x = log_normal_pdf(z, mean, logvar)
#   return -tf.reduce_mean(logpx_z + logpz - logqz_x)

In [None]:
def generator_loss(generated_images, mean, logvar, z, images):
    """Negative ELBO of the CVAE, averaged over the batch.

    Args:
      generated_images: decoder output, one score per vocabulary entry at
        every position; treated as logits.
      mean, logvar: encoder outputs parameterizing q(z|x).
      z: latent sample drawn from q(z|x).
      images: integer token ids the decoder output is scored against.

    Returns:
      Scalar tensor: mean over examples of -(log p(x|z) + log p(z) - log q(z|x)).
    """
    def _non_batch_axes(tensor):
        # Every axis except the leading (batch) one.
        return list(range(1, len(tensor.shape)))

    cross_ent = tf.keras.losses.sparse_categorical_crossentropy(
            images, generated_images, from_logits=True)
    # All three terms are reduced to one value per example. Previously the
    # reconstruction term was summed over the whole batch while the Gaussian
    # terms were reduced over axis 1 only, so the batch total was broadcast
    # against per-position values and the terms were on mismatched scales.
    logpx_z = -tf.reduce_sum(cross_ent, axis=_non_batch_axes(cross_ent))
    latent_axes = _non_batch_axes(z)
    logpz = log_normal_pdf(z, 0., 0., raxis=latent_axes)
    logqz_x = log_normal_pdf(z, mean, logvar, raxis=latent_axes)
    loss=-tf.reduce_mean(logpx_z + logpz - logqz_x)
    return loss

The discriminator and the CVAE optimizers are different since we will train two networks separately.

In [None]:
# Separate Adam optimizers: learning rate 1e-5 for the generator (CVAE) and
# 5e-4 for the discriminator.
generator_optimizer = tf.keras.optimizers.Adam(1e-5)
discriminator_optimizer = tf.keras.optimizers.Adam(5*1e-4)

### Save checkpoints
This notebook also demonstrates how to save and restore models, which can be helpful in case a long running training task is interrupted.

In [None]:
# Checkpoint that tracks only the generator and its optimizer.
checkpoint_dir_gen = './training_checkpoints_vae_rumor'
checkpoint_prefix_gen = os.path.join(checkpoint_dir_gen, "ckpt")
checkpoint_gen = tf.train.Checkpoint(generator_optimizer=generator_optimizer,
                                 generator=generator)

# Checkpoint that tracks both models and both optimizers. It references the
# same generator and generator_optimizer objects as checkpoint_gen.
checkpoint_dir = './training_checkpoints_vaed_rumor'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(generator_optimizer=generator_optimizer,
                                 discriminator_optimizer=discriminator_optimizer,
                                 generator=generator,
                                 discriminator=discriminator)

## Define the training loop



In [None]:
# Number of full passes over the training dataset.
EPOCHS = 300

In [None]:
# The `tf.function` decorator below is commented out, so train_step runs
# eagerly rather than as a compiled graph.
# @tf.function
def train_step(inp, target, test, epoch):
    """Run one optimization step for the generator and the discriminator.

    Args:
      inp: batch of padded token-id sequences.
      target: batch of class labels (numeric; cast to integer class indices).
      test: accepted but not used.
      epoch: accepted but not used.

    Returns:
      Tuple `(gen_loss, disc_loss, generated_images)`, where
      `generated_images` is the decoder output for this batch.
    """
    with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
      mean, logvar = generator.encode(inp)
      z = generator.reparameterize(mean,logvar)
      generated_images = generator.decode(z)

      # The discriminator classifies the latent sequence, not the decoded text.
      output = discriminator(z, training=True)
      # One-hot encode with TensorFlow ops and take the class count from the
      # discriminator output, instead of a NumPy lookup with a hard-coded 5.
      target_one_hot = tf.one_hot(
          tf.cast(target, tf.int32),
          depth=output.shape[-1],
          dtype=output.dtype)

      disc_loss = discriminator_loss(output, target_one_hot)
      gen_loss = generator_loss(generated_images, mean, logvar, z, inp)

    # Each loss only updates its own network's variables.
    gradients_of_generator = gen_tape.gradient(gen_loss, generator.trainable_variables)
    generator_optimizer.apply_gradients(zip(gradients_of_generator, generator.trainable_variables))

    gradients_of_discriminator = disc_tape.gradient(disc_loss, discriminator.trainable_variables)
    discriminator_optimizer.apply_gradients(zip(gradients_of_discriminator, discriminator.trainable_variables))

    return gen_loss, disc_loss, generated_images

In [None]:
# import numpy as np
def train(dataset, epochs, test):
  """Train both networks for `epochs` passes over `dataset`.

  Every 10th epoch the cell output is cleared and a progress report is
  printed: the last input of the epoch, a sentence sampled from the decoder
  output, its label, the duration of that epoch and the last batch's losses.

  Args:
    dataset: iterable of `(inp, target)` batches.
    epochs: number of passes over `dataset`.
    test: forwarded to `train_step`, which does not use it.

  Raises:
    ValueError: if `dataset` yields no batches.
  """
  for epoch in range(epochs):
    # Restart the timer every epoch so the reported duration really is the
    # time of the epoch named in the message, not of the last ten epochs.
    start = time.time()
    saw_batch = False
    for (batch_n, (inp, target)) in enumerate(dataset):
      gen_loss, disc_loss, generated_images = train_step(inp, target,test, epoch)
      saw_batch = True

    if not saw_batch:
      # Without this the report below would fail on undefined variables.
      raise ValueError('train() received a dataset that yields no batches')

    if (epoch+1)%10==0:
      display.clear_output(wait=True)

      sampled_indices = tf.random.categorical(generated_images[0], num_samples=1)
      sampled_indices = tf.squeeze(sampled_indices,axis=-1).numpy()
      print("Input: \n", repr("".join(tokenizer.sequences_to_texts([inp[0].numpy()] ))))
      print("Predictions: \n", repr("".join(tokenizer.sequences_to_texts([sampled_indices ] ))))
      print("Stance: \n", target[0].numpy())

      print ('Time for epoch {} is {} sec'.format(epoch + 1, time.time()-start))
      print ('gen_loss {}, disc_loss {}'.format(np.mean(gen_loss),np.mean(disc_loss)))

In [None]:
# import zipfile
# zip_ref = zipfile.ZipFile("./drive/My Drive/Colab Notebooks/checkpoints/file.zip", 'r')
# zip_ref.extractall()
# zip_ref.close()
# checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))
# Restore the generator and its optimizer from the latest checkpoint found in
# checkpoint_dir_gen.
# NOTE(review): tf.train.latest_checkpoint presumably returns None when the
# directory holds no checkpoint — confirm restore() is then a no-op as intended.
checkpoint_gen.restore(tf.train.latest_checkpoint(checkpoint_dir_gen))

## Visualize

In [None]:
import datetime
# Create a timestamped TensorBoard log directory and a summary writer for it.
# NOTE(review): train_summary_writer is not referenced anywhere else in the
# visible part of this notebook — confirm whether summaries should be written.
current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
train_log_dir = 'logs/gradient_tape/' + current_time + '/train'
train_summary_writer = tf.summary.create_file_writer(train_log_dir)

In [None]:
# !pip install tensorflow
%load_ext tensorboard
tensorboard_callback = tf.keras.callbacks.TensorBoard("/logs/gradient_tape", histogram_freq=1)
%tensorboard --logdir logs/gradient_tape

In [None]:
%%time
# Train on the batched training set for EPOCHS epochs. test_data is passed
# along, but train() only forwards it to train_step, which does not use it.
train(train_data, EPOCHS, test_data)
  
 