In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import _pickle as pickle
def save(file,name, folder = ""):
    """Pickle ``file`` to ``<name>.pickle``, optionally inside ``folder``.

    Args:
        file: The Python object to serialize (an object, not a file handle).
        name: Base name of the target file, without the ``.pickle`` suffix.
        folder: Optional directory, relative to the current working
            directory. It is created when missing. With the default empty
            string the file is written to the current working directory.
    """
    import os  # local import keeps this helper self-contained

    if folder != "":
        # Create the target directory up front so a long computation is not
        # lost to a missing folder at write time.
        os.makedirs('./'+folder, exist_ok=True)
        path = './'+folder+'/'+name+'.pickle'
    else:
        path = name+'.pickle'

    # The context manager flushes and closes the handle even if pickling raises.
    with open(path, 'wb') as outfile:
        pickle.dump(file, outfile)
    
def load(name, folder = ""):
    """Load and return the object stored in ``<name>.pickle``.

    Args:
        name: Base name of the file, without the ``.pickle`` suffix.
        folder: Optional directory, relative to the current working
            directory. With the default empty string the file is read from
            the current working directory.

    Returns:
        The unpickled Python object.

    Note:
        Unpickling can execute arbitrary code; only load files you created
        yourself or otherwise trust.
    """
    if folder != "":
        path = './'+folder+'/'+name+'.pickle'
    else:
        path = name+'.pickle'

    # The context manager closes the handle even if unpickling raises.
    with open(path, 'rb') as infile:
        return pickle.load(infile)

from tqdm.notebook import tqdm

from transformers import BertTokenizer, TFBertForSequenceClassification, TFBertModel, RobertaTokenizer, TFRobertaModel, TFRobertaMainLayer
import tensorflow as tf

def equal(a, b):
    """Return True when ``a[i] == b[i]`` for every index ``i`` of ``a``.

    Only the first ``len(a)`` positions are compared, so ``b`` may be longer
    than ``a``. Every index of ``a`` is visited (there is no early exit), so
    a ``b`` that is shorter than ``a`` raises ``IndexError``.
    """
    differing = [pos for pos in range(len(a)) if a[pos] != b[pos]]
    return len(differing) == 0

import difflib
import os

# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
# max_length = 64

# Tokenizer matching the 'roberta-base' checkpoint used for the encoder.
# NOTE(review): `do_lower_case` looks like a leftover from the BERT tokenizer
# call commented out above; presumably it has no effect on RobertaTokenizer —
# confirm against the installed transformers version.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=True)
# Fixed sequence length (in tokens) used when encoding each text.
max_length = 64

In [None]:
# Column names supplied explicitly to read_csv via `names`.
cols = ['target', 'ids', 'date', 'flag', 'user', 'text']
df = (
    pd.read_csv('training.1600000.processed.noemoticon.csv', encoding='latin-1', names=cols)
    # Rescale the raw target by 1/4 (presumably maps labels {0, 4} to {0, 1} — confirm).
    .assign(target=lambda frame: frame['target'].values / 4)
)

In [None]:
# Sanity check: (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
# Preview the first rows to verify the column assignment.
df.head()

In [None]:
def reboot(a):
    """Create six independent, zero-filled buffers of length ``a``.

    Returns:
        A 6-tuple ``(X, X_masks, X_masked, X_masks_masked, Y, Y_label)``.
        Each element is a separate Python list holding ``a`` NumPy zeros,
        meant to be overwritten slot by slot.
    """
    n_buffers = 6
    return tuple(list(np.zeros(a)) for _ in range(n_buffers))

In [None]:
# Number of tweets stored in each pickled batch file.
a = 100000

max_length = 64
MASK_FRACTION = 0.15                 # share of the non-pad tokens that gets masked
PAD_ID = tokenizer.pad_token_id      # 1 for roberta-base
MASK_ID = tokenizer.mask_token_id    # 50264 for roberta-base
rng = np.random.RandomState(42)      # fixed seed -> reproducible masking

# Make sure the output folder exists before the (long) loop starts.
os.makedirs('batch', exist_ok=True)

n_rows = df.shape[0]
# Iterating over the two needed columns avoids building a Series per row.
for index, (s, target) in enumerate(tqdm(zip(df['text'], df['target']), total = n_rows)):

    ind = index % a

    # Start of a new batch: flush the previous one (if any) and reset the buffers.
    if ind == 0:
        if index != 0:
            save((X, X_masks, X_masked, X_masks_masked, Y, Y_label), 'batch_'+str(index//a), 'batch')
        X, X_masks, X_masked, X_masks_masked, Y, Y_label = reboot(a)

    # NOTE(review): the code below assumes encode_plus pads AND truncates to
    # max_length — confirm for the installed transformers version.
    encoded = tokenizer.encode_plus(s, add_special_tokens = True, max_length = max_length, pad_to_max_length = True)

    input_ids = np.array(encoded['input_ids'])
    attention_masks = np.array(encoded['attention_mask'])

    # Keep untouched copies of the inputs: both arrays are modified in place
    # by the masking step below.
    X[ind] = input_ids.copy()
    X_masks[ind] = attention_masks.copy()
    Y_label[ind] = target

    ## Random masking
    M = int((input_ids != PAD_ID).sum())   # number of non-pad tokens
    # Positions 0 and M-1 hold the sequence delimiters added by
    # add_special_tokens; they are excluded from masking.
    candidates = np.arange(1, M - 1)
    n_to_mask = min(int(MASK_FRACTION * M), len(candidates))
    if n_to_mask > 0:
        # Sampling without replacement: a position drawn twice would have its
        # label overwritten by the mask id on the second visit.
        to_mask = rng.choice(candidates, size = n_to_mask, replace = False)
    else:
        to_mask = []

    # Per-position labels: 0 everywhere except at the masked positions,
    # which carry the original token id.
    pred = list(np.zeros(max_length))

    for elt in to_mask:
        pred[elt] = input_ids[elt]
        input_ids[elt] = MASK_ID
        attention_masks[elt] = 0

    X_masked[ind] = input_ids
    X_masks_masked[ind] = attention_masks
    Y[ind] = pred

# Flush the last batch, which the in-loop save never reaches. Only the slots
# that were actually filled are kept.
if n_rows > 0:
    filled = ind + 1
    save(tuple(part[:filled] for part in (X, X_masks, X_masked, X_masks_masked, Y, Y_label)),
         'batch_'+str(index//a + 1), 'batch')
    
        
    

## Building RoBERTa

In [None]:
# Assemble the token-level prediction model: a RoBERTa encoder followed by
# dropout and a dense layer that scores every vocabulary entry per position.
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout, TimeDistributed, Embedding, Concatenate

# Width of the output layer (presumably the roberta-base vocabulary size — confirm).
vocab_size = 50265
max_length = 64

# Two integer inputs of length max_length: token ids and the attention mask.
inputs_ids = Input(shape = (max_length,), dtype = 'int32')
inputs_mask = Input(shape = (max_length,), dtype = 'int32')

inputs = [inputs_ids, inputs_mask]


sentence_encoder = TFRobertaModel.from_pretrained('roberta-base',
                                               output_attentions = False,
                                               output_hidden_states = False,
                                               )
# Switch the encoder to two token types and swap in a new Keras Embedding for them.
# NOTE(review): the replacement layer is created here, so presumably its
# weights are freshly initialised rather than pretrained — confirm this is intended.
sentence_encoder.config.type_vocab_size = 2 
sentence_encoder.roberta.embeddings.token_type_embeddings = Embedding(2, sentence_encoder.config.hidden_size)

encoded = sentence_encoder(inputs_ids, attention_mask = inputs_mask)
# Keep only the first output of the encoder (per the transformers docs, the
# per-token last hidden state — TODO confirm for the installed version).
encoded = encoded[0]

drop = Dropout(0.3)(encoded)

# No activation is given, so the layer emits raw scores over vocab_size entries.
out = tf.keras.layers.Dense(vocab_size)(drop)


model = Model(inputs, out)

In [None]:
# Write the encoder's current weights to 'test.h5'.
sentence_encoder.save_weights('test.h5')

In [None]:
# Print the layer-by-layer summary of the assembled model.
model.summary()

In [None]:
# Per-position cross-entropy on raw logits. Reduction is deferred so that
# positions without a label can be excluded in loss_function.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
                    from_logits=True, reduction='none')

def loss_function(real, pred):
    """Cross-entropy averaged over the labelled positions only.

    A label of 0 marks a position that carries no target; such positions
    contribute nothing to the loss.

    Args:
        real: Target ids, one per position; 0 means "no target here".
        pred: Logits with one score per vocabulary entry for each position.

    Returns:
        Scalar tensor: the summed loss of the labelled positions divided by
        their count, or 0 when the batch has no labelled position.
    """
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    # Normalise by the number of labelled positions. Averaging over every
    # position would scale the loss by the fraction that happens to be
    # labelled. divide_no_nan keeps an all-unlabelled batch from yielding NaN.
    return tf.math.divide_no_nan(tf.reduce_sum(loss_), tf.reduce_sum(mask))

def sparse_acc(true,pred):
    """Accuracy of the arg-max prediction, measured on labelled positions only.

    A label of 0 marks a position without a target; such positions are
    ignored in both the numerator and the denominator.

    Args:
        true: Target ids, one per position; 0 means "no target here".
        pred: Scores with one value per vocabulary entry for each position.

    Returns:
        Scalar float32 tensor in [0, 1]; 0 when there is no labelled position.
    """
    pred = tf.cast(tf.math.argmax(pred, axis = -1), dtype = true.dtype)

    # Accumulate in float32 so the final division is a true division
    # regardless of the dtype the labels arrive in.
    hits = tf.cast(tf.equal(true, pred), tf.float32)
    mask = tf.cast(tf.math.logical_not(tf.math.equal(true, 0)), tf.float32)

    # divide_no_nan returns 0 instead of NaN when nothing is labelled.
    return tf.math.divide_no_nan(tf.reduce_sum(hits * mask), tf.reduce_sum(mask))
    
    
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Learning-rate schedule with warm-up followed by inverse-sqrt decay.

    lr(step) = rsqrt(d_model) * min(rsqrt(step), step * warmup_steps**-1.5) / factor

    Args:
        d_model: Scale term; the rate is multiplied by ``d_model ** -0.5``.
        warmup_steps: Step at which the two branches of the ``min`` meet.
        factor: Constant divisor applied to the resulting rate.
    """
    def __init__(self, d_model, warmup_steps=4000, factor = 1):
        super(CustomSchedule, self).__init__()

        # The plain value is kept for get_config(); the tensor is used in __call__.
        self._d_model_value = d_model
        self.d_model = tf.cast(d_model, tf.float32)

        self.warmup_steps = warmup_steps
        self.factor = factor

    def __call__(self, step):
        # Optimizers may pass the step counter as an integer tensor, which
        # rsqrt does not accept; the cast is a no-op for float32 input.
        step = tf.cast(step, tf.float32)
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)

        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2) / self.factor

    def get_config(self):
        """Return the constructor arguments so the schedule can be serialised."""
        return {
            'd_model': self._d_model_value,
            'warmup_steps': self.warmup_steps,
            'factor': self.factor,
        }
    

In [None]:
# Resume point: restore the weights saved after this epoch / batch. The
# training loop further down starts at epoch `epochs_save`, batch `batch_save + 1`.
epochs_save = 0
batch_save = 5
# NOTE(review): the '.h5py' suffix is not one of Keras' usual HDF5 extensions,
# so presumably this is read as a TensorFlow-format checkpoint prefix — confirm.
model.load_weights('./checkpoints/tweetberta_epoch_'+str(epochs_save)+'_batch_'+str(batch_save)+'/checkpoint.h5py')

In [None]:
EPOCHS = 2
BATCH_STOP = 16   # exclusive upper bound: batch files first_batch .. BATCH_STOP-1 are used

# NOTE(review): this schedule is built but not handed to the optimizer, which
# runs with a constant rate of 3e-5.
learning_rate = CustomSchedule(512, factor = 1)
loss_classif     =  loss_function
optimizer = tf.keras.optimizers.Adam(
                3e-5, beta_1=0.9, beta_2=0.98, epsilon=1e-9)
metrics_classif  =  ['sparse_categorical_accuracy', sparse_acc]

# Compile once, outside the loops, so the optimizer state (Adam moments)
# carries over from one epoch to the next instead of being reset.
model.compile(loss=loss_classif,
          optimizer=optimizer,
          metrics=metrics_classif)

for i in range(epochs_save, EPOCHS):
    print('**********************      EPOCH '+str(i)+'        **************************')

    # Only the resumed epoch skips the batches already trained on; every
    # later epoch starts from batch 1. The resume markers are left untouched
    # so re-running this cell behaves the same way.
    first_batch = batch_save + 1 if i == epochs_save else 1

    for batch in range(first_batch, BATCH_STOP):
        print('*********** BATCH '+str(batch))

        X, X_masks, X_masked, X_masks_masked, Y, Y_label = load('batch_'+str(batch), 'batch')

        # Inputs are the masked token ids with their attention masks; the
        # targets are the per-position labels.
        X_train = [np.array(X_masked), np.array(X_masks_masked)]
        y_train = np.array(Y)

        print('batch_loaded')

        batch_size = 32
        epochs = 1
        history = model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs)

        model_dir = './checkpoints/tweetberta_epoch_'+str(i)+'_batch_'+str(batch)
        encoder_dir = './checkpoints/roberta_layer_epoch_'+str(i)+'_batch_'+str(batch)
        # makedirs(exist_ok=True) creates missing parents and tolerates
        # re-runs, so a finished fit is never lost to a directory error.
        os.makedirs(model_dir, exist_ok=True)
        os.makedirs(encoder_dir, exist_ok=True)
        model.save_weights(model_dir+'/checkpoint.h5py')
        sentence_encoder.save_weights(encoder_dir+'/checkpoint.h5py')