Updated colab notebook: https://colab.research.google.com/drive/19oMDAjNZ7hj64pw91TVGv46lVOYCcqZf#scrollTo=eAi-u9COtlWJ

In [None]:
import tensorflow as tf
import tensorflow.keras.layers as layers
from tensorflow.keras.layers import Dense as Linear
import numpy as np
from google.colab import files
import pandas as pd

#The decoder is only needed when generating text.
#Sentiment analysis alone only needs: encoder --> feed forward

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive so files
# stored there can be opened by path (the CSV read later in this notebook
# lives under /content/drive/MyDrive).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
class Transformer(tf.Module):
  """Encoder/decoder model that maps raw text to per-token vocabulary probabilities.

  Pipeline: TextVectorization -> Embedding -> positional encoding ->
  repeated Encoder passes -> repeated Decoder passes -> Dense -> Softmax.

  One Encoder instance and one Decoder instance are reused for every pass,
  so all passes of the same kind share their weights.
  """

  def __init__(self, vocab_size, input_dim, embedding_dim, out_seq_length, n_heads=3):
    """Create the sub-layers.

    Args:
      vocab_size: maximum number of tokens kept by the text vectorizer; also
        the width of the final Dense/Softmax output.
      input_dim: not used by this class; kept so existing constructor calls
        keep working.
      embedding_dim: width of every token vector inside the model.
      out_seq_length: number of token ids produced per text (the vectorizer
        pads or truncates to this length).
      n_heads: number of attention heads given to the Encoder and Decoder.
    """
    super().__init__()

    #text -> integer token ids
    self.TextVectorization = tf.keras.layers.TextVectorization(max_tokens = vocab_size, output_mode='int', output_sequence_length=out_seq_length)

    #integer token ids -> dense vectors; positional information is added
    #afterwards by positional_encoding
    self.Embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)

    self.Encoder = Encoder(embedding_dim, n_heads=n_heads)
    self.Decoder = Decoder(embedding_dim, n_heads=n_heads)

    #final classification layers of model (see graphic if confused later)
    #Extra dense layer might help later: output = self.Dense()
    self.Dense = tf.keras.layers.Dense(vocab_size)
    self.Softmax = tf.keras.layers.Softmax()

    #the vocabulary is learned lazily, on the first forward pass
    self._vocabulary_adapted = False

  def positional_encoding(self, matrix, n=10000):
    """Return `matrix` plus sinusoidal positional encodings.

    The last two axes of `matrix` are read as (position, embedding_dim); any
    leading axes are broadcast over. Even columns receive sin, odd columns
    receive cos. When embedding_dim is odd, the last column is left unchanged.

    Args:
      matrix: numpy array or tensor with at least two axes.
      n: base of the geometric progression of wavelengths.

    Returns:
      An array/tensor shaped like `matrix`.
    """
    seq_length, embedding_dim = matrix.shape[-2], matrix.shape[-1]
    n_pairs = embedding_dim // 2

    positions = np.arange(seq_length)[:, np.newaxis]
    pair_index = np.arange(n_pairs)[np.newaxis, :]
    angles = positions / np.power(n, 2 * pair_index / embedding_dim)

    encoding = np.zeros((seq_length, embedding_dim))
    encoding[:, 0:2 * n_pairs:2] = np.sin(angles)
    encoding[:, 1:2 * n_pairs:2] = np.cos(angles)

    if tf.is_tensor(matrix):
      #match the tensor dtype explicitly so the add never mixes float widths
      return matrix + tf.cast(encoding, matrix.dtype)
    return matrix + encoding

  def _forward(self, converted_dataset, n_encoder_passes, n_decoder_passes):
    """Run the full pipeline with the given number of encoder/decoder passes."""
    #learn the vocabulary once; re-adapting on every call would silently
    #change which id each word maps to and invalidate the embedding
    if not self._vocabulary_adapted:
      self.TextVectorization.adapt(converted_dataset)
      self._vocabulary_adapted = True

    token_ids = self.TextVectorization(converted_dataset)
    embedded = self.positional_encoding(self.Embedding(token_ids))

    encoder_output = embedded
    for _ in range(n_encoder_passes):
      encoder_output = self.Encoder(encoder_output)

    #NOTE(review): only one input is available here, so the decoder starts
    #from the embedded source sequence; a real seq2seq setup would feed the
    #(shifted) target sequence instead -- confirm intended behaviour.
    decoder_output = embedded
    for _ in range(n_decoder_passes):
      decoder_output = self.Decoder(decoder_output, encoder_output)

    #output probabilities over the vocabulary, one distribution per position
    return self.Softmax(self.Dense(decoder_output))

  def call(self, converted_dataset):
    """Forward pass: 3 encoder passes followed by 4 decoder passes.

    Args:
      converted_dataset: strings accepted by both `TextVectorization.adapt`
        and a direct call of the layer (presumably a numpy array or tensor
        of strings -- TODO confirm against the caller).

    Returns:
      Softmax probabilities whose last axis has size vocab_size.
    """
    return self._forward(converted_dataset, n_encoder_passes=3, n_decoder_passes=4)

  def __call__(self, converted_dataset):
    """Allow `model(data)`, matching how Encoder and Decoder are invoked."""
    return self.call(converted_dataset)

  def old_call(self, converted_dataset):
    """Legacy forward pass: 3 encoder passes followed by 3 decoder passes.

    Kept for backward compatibility; prefer `call`.
    """
    return self._forward(converted_dataset, n_encoder_passes=3, n_decoder_passes=3)

#tf.Module = parent class
#info can be passed from encoder to encoder 
class Encoder(tf.Module):
  """One encoder block: self-attention and a feed-forward net, each followed by add & norm."""

  def __init__(self, embedding_dim, n_heads=3):
    """Build the attention module, the feed-forward stack and both layer norms.

    Args:
      embedding_dim: width used for the attention module and both Dense layers.
      n_heads: number of attention heads.
    """
    super().__init__()

    self.MultiHeadedAttention = MultiHeadedAttention(embedding_dim, n_heads=n_heads)

    #layer norm applied after the attention residual
    #https://www.tensorflow.org/api_docs/python/tf/keras/layers/LayerNormalization
    self.norm1 = tf.keras.layers.LayerNormalization()

    #Dense -> ReLU -> Dense; both Dense layers keep the embedding width so
    #the result can be added back onto its input
    ff_layers = [
        tf.keras.layers.Dense(embedding_dim),
        tf.keras.layers.ReLU(),
        tf.keras.layers.Dense(embedding_dim),
    ]
    self.feed_forward = tf.keras.Sequential(ff_layers)

    #layer norm applied after the feed-forward residual
    self.norm2 = tf.keras.layers.LayerNormalization()

  def __call__(self, x):
    """Apply the block to `x` and return the result.

    Computes norm1(attention(x) + x), then norm2(feed_forward(h) + h) where
    h is the output of the first step.
    """
    #self-attention, then residual add & norm
    attended = self.MultiHeadedAttention.forward(x)
    hidden = self.norm1(attended + x)

    #feed forward, then residual add & norm
    transformed = self.feed_forward(hidden)
    return self.norm2(transformed + hidden)
    

class MultiHeadedAttention(tf.Module):
  """Scaled dot-product attention with `n_heads` independent heads.

  Every head has its own query/key/value Dense projection of width
  `embedding_dim`. Head outputs are concatenated along the last axis and
  projected back to `embedding_dim` by `concat_layer`.
  """

  def __init__(self, embedding_dim, mask=False, n_heads=3):
    """Create the per-head projections.

    Args:
      embedding_dim: output width of every Q/K/V projection and of the final
        projection.
      mask: when true, each position is prevented from attending to later
        positions.
      n_heads: number of attention heads.
    """
    super().__init__()

    #applies mask to future words if necessary
    self.mask = mask
    self.n_heads = n_heads

    #passing x in does matrix multiplication, multiplies x by q weight
    self.embedding_dim = embedding_dim #size (col) of what we want final output to be
    self.Q = [tf.keras.layers.Dense(self.embedding_dim) for i in range(n_heads)]
    self.K = [tf.keras.layers.Dense(self.embedding_dim) for i in range(n_heads)]
    self.V = [tf.keras.layers.Dense(self.embedding_dim) for i in range(n_heads)]
    #projects the concatenated heads (..., words, embedding_dim * n_heads)
    #back to (..., words, embedding_dim)
    self.concat_layer = tf.keras.layers.Dense(self.embedding_dim)
    self.softmax = tf.keras.layers.Softmax()

  def _causal_mask(self, scores):
    """Return an additive mask: 0 on/below the diagonal, -inf above it."""
    n_queries, n_keys = scores.shape[-2], scores.shape[-1]
    blocked = np.triu(np.full((n_queries, n_keys), -np.inf), k=1)
    return tf.constant(blocked, dtype=scores.dtype)

  def forward(self, x, encoder_output=None):
    """Attend from `x` to `encoder_output` (or to `x` itself when it is None).

    The last two axes of the inputs are treated as (words, features); any
    leading axes are carried through by the batched matmul.

    Args:
      x: input that the queries are computed from.
      encoder_output: optional source for keys and values (cross-attention).

    Returns:
      Tensor whose last axis has size `embedding_dim`.
    """
    #cross-attention reads keys/values from the encoder, self-attention from x
    source = x if encoder_output is None else encoder_output

    heads_output = []
    for query_layer, key_layer, value_layer in zip(self.Q, self.K, self.V):
      queries = query_layer(x)
      keys = key_layer(source)
      values = value_layer(source)

      #https://www.tensorflow.org/api_docs/python/tf/linalg/matmul
      scores = tf.linalg.matmul(queries, keys, transpose_b=True)

      #scale by sqrt of the key width (every key projection is embedding_dim wide)
      scores = scores / tf.sqrt(tf.cast(self.embedding_dim, scores.dtype))

      if self.mask:
        #-inf scores become zero weight after the softmax
        scores = scores + self._causal_mask(scores)

      weights = self.softmax(scores)

      #shape of one head: (..., words, embedding_dim)
      heads_output.append(tf.linalg.matmul(weights, values))

    #shape: (..., words, embedding_dim * n_heads)
    final_heads = tf.concat(heads_output, axis=-1)

    #back to (..., words, embedding_dim)
    return self.concat_layer(final_heads)

  def __call__(self, x, encoder_output=None):
    """Allow `module(x, encoder_output)` as a shorthand for `forward`."""
    return self.forward(x, encoder_output)


#google what is the first character input --> 1 row, embed size
#input is 1 vector for the first time
#input # of vectors increases by one each time it runs through the encoder/decoder
class Decoder(tf.Module):
  """One decoder block.

  Masked self-attention, then cross-attention over the encoder output, then
  a feed-forward net; each stage is followed by a residual add and a layer
  normalization.
  """

  def __init__(self, embedding_dim, n_heads=3):
    """Build both attention modules, the feed-forward stack and three layer norms.

    Args:
      embedding_dim: width used for the attention modules and Dense layers.
      n_heads: number of attention heads in each attention module.
    """
    super().__init__()

    #masked self-attention over the decoder input
    self.MultiHeadedAttention1 = MultiHeadedAttention(embedding_dim, mask=True, n_heads=n_heads)

    self.norm1 = tf.keras.layers.LayerNormalization()

    #2nd multi headed attention layer (keys/values come from the encoder)
    self.MultiHeadedAttention2 = MultiHeadedAttention(embedding_dim, n_heads=n_heads)

    self.norm2 = tf.keras.layers.LayerNormalization()

    #feed forward block (linear, relu, linear)
    self.feed_forward = tf.keras.Sequential([tf.keras.layers.Dense(embedding_dim),
                                             tf.keras.layers.ReLU(),
                                             tf.keras.layers.Dense(embedding_dim)])

    self.norm3 = tf.keras.layers.LayerNormalization()

  def __call__(self, previous_outputs, encoder_output):
    """Apply the block and return the result.

    Args:
      previous_outputs: decoder input. This class owns no embedding or
        positional-encoding layer, so the input must already be embedded and
        positionally encoded by the caller.
      encoder_output: encoder result used as keys and values for the second
        attention module.

    Returns:
      The output of the final layer normalization.
    """
    x = previous_outputs

    #masked multi-head self-attention (call forward explicitly: it is the
    #entry point MultiHeadedAttention defines)
    attention_output = self.MultiHeadedAttention1.forward(x)

    #skip connection pathway/add & norm; the result provides the queries
    #for the cross-attention below
    query = self.norm1(attention_output + x)

    #cross-attention: queries from the decoder, keys/values from the encoder
    cross_output = self.MultiHeadedAttention2.forward(query, encoder_output)
    x = self.norm2(query + cross_output)

    #feed forward
    ff_output = self.feed_forward(x)

    #skip connection pathway
    return self.norm3(ff_output + x)

#y-data --> label smoothing

In [None]:
# Load the training CSV from the mounted Drive. Five column names are
# supplied explicitly and the file is decoded as latin-1.
# NOTE(review): the stock Sentiment140 release has six columns (target, ids,
# date, flag, user, text). If train_data.csv still has that layout, passing
# only five names would presumably make pandas use the first column as the
# index and shift the labels -- verify with dataset.head().
dataset = pd.read_csv("/content/drive/MyDrive/Datasets/Sentiment140/train_data.csv", encoding='latin-1', names=['sentiment', 'date', 'no_query', 'username', 'text'])

In [None]:
# Print a summary of the loaded DataFrame (columns, dtypes, non-null counts).
dataset.info()

In [None]:
# Keep only the 'text' column as a pandas Series.
text_only = dataset['text']