In [44]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import _pickle as pickle
def save(file,name, folder = ""):
    if folder != "":
        outfile = open('./'+folder+'/'+name+'.pickle', 'wb')
    else:
        outfile = open(name+'.pickle', 'wb')
    pickle.dump(file, outfile)
    outfile.close
    
def load(name, folder = ""):
    if folder != "":
        outfile = open('./'+folder+'/'+name+'.pickle', 'rb')
    else:
        outfile = open(name+'.pickle', 'rb')
    file = pickle.load(outfile)
    outfile.close
    return file

from tqdm.notebook import tqdm

from transformers import BertTokenizer, TFBertForSequenceClassification, TFBertModel, RobertaTokenizer, TFRobertaModel, TFRobertaMainLayer
import tensorflow as tf

def equal(a, b):
#     assert len(a) == len(b)
    val = True
    for i in range(len(a)):
        if a[i] != b[i]:
            val = False
    return val

import difflib
import os

# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
# max_length = 64

tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=True)
max_length = 64

from sklearn.model_selection import train_test_split

In [2]:
cols = ['target', 'ids', 'date', 'flag', 'user', 'text']
df = pd.read_csv('training.1600000.processed.noemoticon.csv' ,encoding='latin-1', names = cols)
df['target'] = df['target'].values / 4

In [3]:
df.shape

(1600000, 6)

In [4]:
df.head()

Unnamed: 0,target,ids,date,flag,user,text
0,0.0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0.0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0.0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0.0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0.0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [12]:
df['text'].iloc[5].replace(' ', '')

'@Kwesideinotthewholecrew'

In [64]:
X = list(np.zeros(50000))
Y = list(np.zeros(50000))

max_length = 32
for index, line in tqdm(df.iterrows(), total = df.shape[0]):
    if index < 50000:
        s = line['text']
        s1 = s.replace(' ', '')
        
        
        x = tokenizer.encode(s1, add_special_tokens = True, max_length = max_length, pad_to_max_length = True)
        y = tokenizer.encode(s, add_special_tokens = True, max_length = max_length, pad_to_max_length = True)
        
        X[index] = x
        Y[index] = y

HBox(children=(FloatProgress(value=0.0, max=1600000.0), HTML(value='')))

KeyboardInterrupt: 

In [65]:
X=  np.array(X)
Y = np.array(Y)

X[X == 1] = 0
X[:,0] = 1

Y[Y == 1] = 0
Y[:,0] = 1


y = Y[:,1:]
Y = Y[:,:-1]

## Building RoberTa

In [68]:
X_t, X_v, Y_t, Y_v = train_test_split(X, Y, random_state=42, test_size=0.1)
y_train, y_test, _, _ = train_test_split(y, y, random_state=42, test_size=0.1)

X_train = [X_t, Y_t]
X_test = [X_v, Y_v]

In [76]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout, TimeDistributed, Embedding, Concatenate
from tf_transformers2 import *

vocab_size = 50265
max_length = 32

inputs_enc = Input(shape = (max_length,), dtype = 'int32')
inputs_dec = Input(shape = (max_length - 1,), dtype = 'int32')

inputs = [inputs_enc, inputs_dec]

transformers = Transformer(
    num_layers = 6, d_model = 512, num_heads = 8, 
    dff = 1024, input_vocab_size = vocab_size, target_vocab_size = vocab_size, 
    pe_input = 64, pe_target = 64,num_types = 2, rate=0.1, 
    bidirectional_encoder = True, bidirectional_decoder = False
)

out, _ = transformers(inputs = inputs_enc, tar = inputs_dec)


model = Model(inputs, out)

In [77]:
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_11 (InputLayer)           [(None, 32)]         0                                            
__________________________________________________________________________________________________
input_12 (InputLayer)           [(None, 31)]         0                                            
__________________________________________________________________________________________________
transformer_5 (Transformer)     ((None, 31, 50265),  108800601   input_11[0][0]                   
Total params: 108,800,601
Trainable params: 108,800,601
Non-trainable params: 0
__________________________________________________________________________________________________


In [78]:
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
                    from_logits=True, reduction='none')

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
  
    return tf.reduce_mean(loss_)

def sparse_acc(true,pred):
    
    pred = tf.cast(tf.math.argmax(pred, axis = -1), dtype = true.dtype)
    
    p = tf.equal(true, pred)
    p = tf.cast(p, dtype = true.dtype)
    
    mask = tf.math.logical_not(tf.math.equal(true, 0))
    mask = tf.cast(mask, dtype = true.dtype)
    
    p = p*mask
    
    
    return tf.reduce_sum(p) / tf.reduce_sum(mask)
    
    
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000, factor = 1):
        super(CustomSchedule, self).__init__()

        self.d_model = d_model
        self.d_model = tf.cast(self.d_model, tf.float32)

        self.warmup_steps = warmup_steps
        self.factor = factor
    
    def __call__(self, step):
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)

        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2) / self.factor
    

In [79]:
# learning_rate = CustomSchedule(512, factor = 1)
loss_classif     =  loss_function
optimizer = tf.keras.optimizers.Adam(
                    3e-5, beta_1=0.9, beta_2=0.98, epsilon=1e-9)
metrics_classif  =  ['sparse_categorical_accuracy', sparse_acc]

model.compile(loss=loss_classif,
              optimizer=optimizer,
              metrics=metrics_classif)

In [81]:
epochs = 2
batch_size = 64
history = model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs,validation_data=(X_test,  y_test))

Train on 45000 samples, validate on 5000 samples
Epoch 1/2
Epoch 2/2
 1792/45000 [>.............................] - ETA: 6:52 - loss: 3.5019 - sparse_categorical_accuracy: 0.1314 - sparse_acc: 0.2080

KeyboardInterrupt: 

In [104]:
input_sequence = 'the dr'
input_sequence = input_sequence.replace(' ', '')
print(input_sequence)

input_sequence = tokenizer.encode(input_sequence,add_special_tokens = True, max_length = max_length, pad_to_max_length = True)

iamintoit


In [108]:
def generate(input_sequence):
    count = 0
    start = [1]
    
    while len(start)<31:
        print(count)
        count+=1
        st = start + [0 for elt in range(31-len(start))]
        pred = model.predict([np.array([input_sequence]), np.array([st])])
        
        pred = pred[0][len(start) -1]
        
        a = np.argmax(pred)
        
        start.append(a)
    return start
        

In [109]:
a = generate(input_sequence)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29


In [110]:
tokenizer.decode(a)

'<pad>i have to a a a a a a me  you you to</s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s>'