In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from keras.utils import np_utils

import _pickle as pickle

## Helper functions for pickling objects to disk and loading them back
def save(file, name, folder = ""):
    """Pickle an object to ``<name>.pickle``.

    Parameters
    ----------
    file : object
        The object to serialise (any picklable Python object).
    name : str
        Target file name without the ``.pickle`` extension.
    folder : str, optional
        Directory to write into, relative to the current working directory.
        With the default ``""`` the file is written to the working directory.
        The directory is not created here; it must already exist.
    """
    if folder != "":
        path = './'+folder+'/'+name+'.pickle'
    else:
        path = name+'.pickle'
    # The context manager flushes and closes the handle even if pickling raises.
    with open(path, 'wb') as outfile:
        pickle.dump(file, outfile)
    
def load(name, folder = ""):
    """Load and return the object stored in ``<name>.pickle``.

    Parameters
    ----------
    name : str
        File name without the ``.pickle`` extension.
    folder : str, optional
        Directory to read from, relative to the current working directory.
        With the default ``""`` the file is read from the working directory.

    Returns
    -------
    object
        Whatever object was pickled into the file.
    """
    if folder != "":
        path = './'+folder+'/'+name+'.pickle'
    else:
        path = name+'.pickle'
    # SECURITY: unpickling can execute arbitrary code; only load files you trust.
    # The context manager closes the handle even if unpickling raises.
    with open(path, 'rb') as infile:
        return pickle.load(infile)

from transformers import BertTokenizer, TFBertForSequenceClassification, TFBertForQuestionAnswering, TFBertModel, TFBertForNextSentencePrediction

from tqdm.notebook import tqdm

from sklearn.model_selection import train_test_split

import tensorflow as tf


# from tensorflow.keras.backend.tensorflow_backend import set_session
# import tensorflow as tf
# config = tf.ConfigProto()
# config.gpu_options.allow_growth = True
# sess = tf.Session(config=config)
# set_session(sess)


from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense

In [None]:
# Each file holds one example per line; line i of the title file is presumably
# the headline of line i of the article file -- TODO confirm against the dataset.
# The files are read as plain text rather than through a CSV parser: recent
# pandas releases reject '\n' as a separator, and CSV quote handling can merge
# or split lines that contain a '"' character.
with open('train.article.txt', encoding='utf-8') as article_file:
    articles = [line.rstrip('\n') for line in article_file]
with open('train.title.txt', encoding='utf-8') as title_file:
    titles = [line.rstrip('\n') for line in title_file]

# Fail loudly instead of silently pairing articles with the wrong titles.
if len(articles) != len(titles):
    raise ValueError(
        'train.article.txt has %d lines but train.title.txt has %d'
        % (len(articles), len(titles))
    )

df = pd.DataFrame({'article': articles, 'title': titles})

In [None]:
# Cache the parsed frame as train_gigaword.pickle; the next cell reloads it.
save(df, 'train_gigaword')

In [None]:
# Restore the cached frame from train_gigaword.pickle (written by the cell above).
df = load('train_gigaword')

In [None]:
# Keep only the first 200,000 article/title pairs.
df = df.head(200000)

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
max_length = 64       # articles are padded to this many token ids
max_length_out = 16   # titles are padded to this many token ids

# Results are collected by position (append) rather than by DataFrame index
# label, so the cell works whatever index `df` carries.
X = []         # article input ids
X_type = []    # article token-type ids
X_masks = []   # article attention masks
Y = []         # title input ids
text_pairs = []  # decoded article ids, kept for eyeballing the tokenisation

for s1, s2 in tqdm(zip(df['article'], df['title']), total = df.shape[0]):
    # str() guards against non-string cells (e.g. NaN) reaching the tokenizer.
    tokenized = tokenizer.encode_plus(str(s1), add_special_tokens = True, max_length = max_length, pad_to_max_length = True)
    answer = tokenizer.encode_plus(str(s2), add_special_tokens = True, max_length = max_length_out, pad_to_max_length = True)

    X.append(tokenized['input_ids'])
    X_type.append(tokenized['token_type_ids'])
    X_masks.append(tokenized['attention_mask'])
    Y.append(answer['input_ids'])
    text_pairs.append(tokenizer.decode(tokenized['input_ids']))

In [None]:
# Stack the per-title id lists into one array (one row per title).
Y1 = np.array(Y)

In [None]:
# Wherever the second-to-last position holds a non-zero id, overwrite it with
# 102 (presumably the [SEP] id of bert-base-uncased -- confirm); zeros stay zero.
Y1[:, -2] = np.where(Y1[:, -2] != 0, 102, 0)
# The last position is always cleared to 0.
Y1[:, -1] = 0

In [None]:
# Decoder input drops the last column, decoder target drops the first one,
# so target position t holds the id that follows input position t.
Y_input, Y_output = Y1[:, :-1], Y1[:, 1:]

In [None]:
# Inspect the decoder-input ids.
Y_input

In [None]:
# Inspect the decoder-target ids (Y_input shifted left by one position).
Y_output

In [None]:
# Store every encoded feature next to its source text, one list/array per row.
encoded_columns = {
    'inputs_ids': X,
    'token_type_ids': X_type,
    'attention_masks': X_masks,
    'target': Y,
    'target_input': list(Y_input),
    'target_output': list(Y_output),
}
for column_name, column_values in encoded_columns.items():
    df[column_name] = column_values

In [None]:
# Check how the decoder inputs are stored in the frame.
df['target_input'].values

In [None]:
# Cache the frame with the encoded columns as small_data_refined.pickle.
save(df, 'small_data_refined')

In [None]:
# Restore the encoded frame from small_data_refined.pickle.
df = load('small_data_refined')

In [None]:
# Preview the last rows of the reloaded frame.
df.tail()

In [None]:
# Rebuild 2-D arrays from the per-row sequences stored in the frame.
# Encoder-side features:
X = np.array(list(map(list, df['inputs_ids'].values))).astype(int)
X_masks = np.array(list(map(list, df['attention_masks'].values))).astype(int)
X_type = np.array(list(map(list, df['token_type_ids']))).astype(int)

# Decoder-side features; note that the decoder input is cast to float32
# while the other arrays are cast to int.
Y = np.array(list(map(list, df['target']))).astype(int)
Y_input = np.array(list(map(list, df['target_input']))).astype('float32')
Y_output = np.array(list(map(list, df['target_output']))).astype(int)
# from keras.utils import np_utils
# y = np_utils.to_categorical(Y)

In [None]:
# Inspect the decoder inputs rebuilt from the frame (float32 at this point).
Y_input

In [None]:
# One call splits every array with the same shuffled row indices, so encoder
# inputs, decoder inputs and targets stay row-aligned by construction.
# 10% of the rows are held out; random_state fixes the shuffle.
(X_train_ids, X_test_ids,
 X_train_masks, X_test_masks,
 X_train_type, X_test_type,
 y_train_input, y_test_input,
 y_train, y_test) = train_test_split(
    X, X_masks, X_type, Y_input, Y_output, random_state=42, test_size=0.1)

# Model inputs in the order: token ids, attention masks, token-type ids, decoder input ids.
X_train = [X_train_ids, X_train_masks, X_train_type, y_train_input]
X_test = [X_test_ids, X_test_masks, X_test_type, y_test_input]

In [None]:
# Largest token id that occurs in the targets; a one-hot encoding of the
# targets needs at least this value + 1 classes.
Y.max()

In [None]:
def generator(X, y, batch_size = 64, num_classes = 29611):
    """Yield an endless stream of random, one-hot encoded mini-batches.

    Parameters
    ----------
    X : sequence of four arrays
        ``[ids, masks, types, decoder_ids]``; every array is indexed along its
        first axis with the same random row indices.
    y : array
        Target token ids, indexed with the same row indices as ``X``.
    batch_size : int, optional
        Number of rows drawn per batch.
    num_classes : int, optional
        Width of the one-hot encoding applied to ``y`` and ``decoder_ids``.

    Yields
    ------
    tuple
        ``(batch_x, batch_y)`` where ``batch_x`` is
        ``[ids, masks, types, one_hot(decoder_ids)]`` and ``batch_y`` is
        ``one_hot(y)`` for the sampled rows.
    """
    ids, masks, types, decoder_ids = X[0], X[1], X[2], X[3]
    n_rows = len(ids)

    while True:
        # Rows are drawn with replacement, so a batch may repeat a row and an
        # "epoch" of n_rows // batch_size steps does not visit every row once.
        batch = np.random.randint(0, n_rows, batch_size)

        # One-hot encoding happens per batch, not for the whole data set.
        batch_y = np_utils.to_categorical(y[batch], num_classes = num_classes)
        decoder_input = np_utils.to_categorical(decoder_ids[batch], num_classes = num_classes)

        batch_x = [ids[batch], masks[batch], types[batch], decoder_input]
        yield (batch_x, batch_y)

In [None]:
# Build a training-batch generator with the default batch size, for a quick look at its output.
a = generator(X_train, y_train)

In [None]:
# Pull one (inputs, targets) batch from the generator.
b = next(a)

In [None]:
# Shape of the one-hot decoder input, the fourth model input in the batch.
b[0][3].shape

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout, TimeDistributed, LSTM

max_length = 64        # encoder (article) length in token ids
max_length_out = 15    # decoder length: the 16-id titles lose one position to the input/target shift
vocab_size = 29611     # one-hot width; must equal the num_classes used by the batch generator

# Encoder inputs are integer id sequences; the decoder input arrives one-hot encoded.
inputs_ids = Input(shape = (max_length,), dtype = 'int32')
inputs_mask = Input(shape = (max_length,), dtype = 'int32')
inputs_type = Input(shape = (max_length,), dtype = 'int32')
inputs_decoder = Input(shape = (max_length_out,vocab_size,), dtype = 'float32')

inputs = [inputs_ids, inputs_mask, inputs_type, inputs_decoder]

sentence_encoder = TFBertModel.from_pretrained(
    "bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab.
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)

encoded = sentence_encoder(inputs_ids, attention_mask = inputs_mask, token_type_ids = inputs_type)

# First BERT output feeds the LSTM encoder; the second output is kept but unused below.
word_embedding = encoded[0]
pooled_encoded = encoded[1]

# The LSTM encoder condenses BERT's per-token output into its final (h, c) state.
encoder = LSTM(64, activation = 'tanh', return_state=True, recurrent_dropout = 0.001)
encoder_outputs, state_h, state_c = encoder(word_embedding)
encoder_states = [state_h, state_c]

# The decoder starts from the encoder state and reads the one-hot decoder input
# sequence, emitting one hidden vector per output position.
decoder_lstm = LSTM(64, activation = 'tanh', return_sequences=True, return_state=True, recurrent_dropout = 0.001)
decoder_outputs, _, _ = decoder_lstm(inputs_decoder,
                                     initial_state=encoder_states)

# Softmax yields one probability distribution over the vocabulary per position,
# which is what the categorical cross-entropy loss used for training expects.
decoder_outputs = TimeDistributed(Dense(vocab_size, activation = 'softmax'))(decoder_outputs)


model = Model(inputs, decoder_outputs)

In [None]:
# Print the layer-by-layer summary of the assembled model.
model.summary()

In [None]:
# Shape of the integer training targets (before the generator one-hot encodes them).
y_train.shape

In [None]:
from tensorflow.keras.losses import sparse_categorical_crossentropy
from tensorflow.keras.metrics import sparse_categorical_accuracy
import tensorflow as tf

# The batch generator one-hot encodes the targets, so the dense (non-sparse)
# categorical cross-entropy is the matching loss.
loss_classif     =  'categorical_crossentropy'
# Keyword arguments matter here: Adam's second positional parameter is beta_1,
# so a positional 1e-8 would switch off momentum instead of setting epsilon.
optimizer        =  Adam(learning_rate=3e-5, epsilon=1e-8)
metrics_classif  =  ['accuracy']


model.compile(loss=loss_classif,
              optimizer=optimizer,
              metrics=metrics_classif)

In [None]:
# Shape of the training token-id array; its first dimension sets steps_per_epoch below.
X_train[0].shape

In [None]:
batch_size = 32
epochs = 4

# Random training batches; one "epoch" is as many steps as full batches fit in the training set.
train_batches = generator(X_train, y_train, batch_size=batch_size)
steps = len(X_train[0]) // batch_size

# Pin training to the first GPU.
with tf.device('/device:GPU:0'):
    history = model.fit_generator(train_batches, steps_per_epoch=steps, epochs=epochs)

In [None]:
# Persist the trained model under the name 'text_summary'.
model.save('text_summary')

In [None]:
from tensorflow.python.client import device_lib

def get_available_gpus():
    """Return the names of the local devices reported by TensorFlow.

    No filtering by device type is applied: the result contains the name of
    every entry returned by ``device_lib.list_local_devices()``, not only GPUs.
    """
    device_names = []
    for device in device_lib.list_local_devices():
        device_names.append(device.name)
    return device_names

In [None]:
import tensorflow as tf

In [None]:
# NOTE(review): the object returned by tf.device(...) is never entered with
# `with`, so this line presumably does not change device placement -- confirm.
tf.device('XLA_GPU:0')

In [None]:
# List the local device names TensorFlow can see.
get_available_gpus()

In [None]:
# Record the TensorFlow version this notebook runs on.
import tensorflow
tensorflow.__version__

In [None]:
# Draw another batch and turn its one-hot targets back into token ids.
np.argmax(next(a)[1], axis = 2)

In [None]:
# Draw a single random held-out example and run the model on it.
a = generator(X_test, y_test, batch_size = 1)
batch = next(a)
x, y = batch            # x: list of model inputs, y: one-hot targets
pred = model.predict(x)

In [None]:
def encode(x, num_classes = 29611):
    """One-hot encode the token ids in ``x``.

    Parameters
    ----------
    x : array-like of int
        Token ids to encode.
    num_classes : int, optional
        Width of the one-hot encoding; the default matches the value used by
        the batch generator and the model in this notebook.

    Returns
    -------
    The result of ``np_utils.to_categorical(x, num_classes=num_classes)``.
    """
    return np_utils.to_categorical(x, num_classes = num_classes)

In [None]:
# The one-hot decoder input of the sampled example.
x[3]

In [None]:
# Hand-built decoder input: id 101 followed by ten zeros, one-hot encoded and
# wrapped in a batch axis of size 1.
np.array([encode([101,0,0,0,0,0,0,0,0,0,0])])

In [None]:
# Shape of the one-hot decoder input for the sampled example.
x[3].shape

In [None]:
# Token ids of the decoder input, recovered from its one-hot encoding.
x[3].argmax(axis = 2)[0]

In [None]:
# Most probable token id at every output position of the prediction.
tok_pred = pred.argmax(axis = 2)[0]

In [None]:
# Reference token ids, recovered from the one-hot target.
tok_true = y.argmax(axis = 2)[0]

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Print, in order: the source article, the reference title and the predicted
# title, each decoded back to text and followed by a blank separator.
for token_ids in (x[0][0], tok_true, tok_pred):
    print(tokenizer.decode(token_ids))
    print('\n')