In [None]:
# Install the notebook's third-party dependencies through shell escapes.
# NOTE(review): neither install is version-pinned, so a fresh run resolves to
# whatever releases are current at that time — consider pinning.
!pip install transformers
!pip install --upgrade tensorflow
# !pip install bert
# !pip list

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split

import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.layers import Dense, Dropout,Embedding, LSTM, Bidirectional, Input, Dropout, GlobalAveragePooling1D
from tensorflow.keras import Sequential
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras import backend

# Used for Huggingface Transformers
import transformers as trans

# Used for TFHub
# import bert
# import tensorflow_hub as hub
# from tqdm import tqdm_notebook

In [35]:
from tensorflow.python.client import device_lib

def get_available_devices():
  """Return the `name` of every device listed by `device_lib.list_local_devices()`."""
  device_names = []
  for device in device_lib.list_local_devices():
    device_names.append(device.name)
  return device_names

# Print the list of device names returned by the helper above.
print(get_available_devices())

['/device:CPU:0', '/device:XLA_CPU:0']


## Data

In [None]:
# Read temporal split file
# Both CSVs are fetched over HTTPS from the `master` branch of a GitHub
# repository, so this cell needs network access.
latest_train_url = 'https://raw.githubusercontent.com/harish-cu/tweet-url-relationships/master/data/temp/sampling_blind/latest/articles_train.csv'
latest_test_url  = 'https://raw.githubusercontent.com/harish-cu/tweet-url-relationships/master/data/temp/sampling_blind/latest/articles_test.csv'

# Load the train and test article tables into DataFrames.
df_articles_train  = pd.read_csv(latest_train_url)
df_articles_test   = pd.read_csv(latest_test_url)

## Get Pre-trained BERT

In [None]:
# Name of the Hugging Face checkpoint used for both the tokenizer and the encoder.
bert_model_type = 'bert-base-uncased'
# bert_model_type = 'bert-large-uncased'
# bert_model_type = 'bert-base-cased'
# bert_model_type = 'bert-large-cased'

# Lower-case the text only for "uncased" checkpoints. Hard-coding
# `do_lower_case=True` would silently strip casing if one of the cased
# checkpoints above were selected.
bert_tokenizer_transformer = trans.BertTokenizer.from_pretrained(
    bert_model_type,
    do_lower_case=('uncased' in bert_model_type),
)
# Pre-trained encoder (downloaded or read from the local cache on first use).
bert_model = trans.TFBertModel.from_pretrained(bert_model_type)

## Tokenize

In [None]:
def _get_inputs(df,_maxlen,tokenizer):
    
    sentences_tokenized = []
    sentences_mask = []
    # sentences_segment = []

    for idx, row in df.iterrows():
      text_seq = row[0]

      encoded_dict = tokenizer.encode_plus(text_seq, 
                                           max_length = _maxlen,
                                           pad_to_max_length = True)

      sentences_tokenized.append(encoded_dict['input_ids'])
      sentences_mask.append(encoded_dict['attention_mask'])
      # sentences_segment.append(encoded_dict['token_type_ids']) 

    # print(sentences_tokenized[0])
    # print(sentences_mask[0])
    # print(sentences_segment[0])

    # Convert list into tensor integer arrays and return it
    return [tf.cast(sentences_tokenized,tf.int32), 
            tf.cast(sentences_mask,tf.int32)
            # tf.cast(sentences_segment,tf.int32)
            ]

##########################################################

def _get_inputs_source(df,_maxlen,tokenizer):
    
    sentences_tokenized = []
    sentences_mask = []
    sentences_segment = []

    for idx, row in df.iterrows():
      source_seq = row[0]
      text_seq   = row[1]

      encoded_dict = tokenizer.encode_plus(source_seq, 
                                           text_seq, 
                                           max_length = _maxlen,
                                           pad_to_max_length = True)

      sentences_tokenized.append(encoded_dict['input_ids'])
      sentences_mask.append(encoded_dict['attention_mask'])
      sentences_segment.append(encoded_dict['token_type_ids']) 

    # print(sentences_tokenized[0])
    # print(sentences_mask[0])
    # print(sentences_segment[0])

    # Convert list into tensor integer arrays and return it
    return [tf.cast(sentences_tokenized,tf.int32), 
            tf.cast(sentences_mask,tf.int32),
            tf.cast(sentences_segment,tf.int32)]            

## Model 1 (Article Text)

In [None]:
# Model 1 feature: the `content_body_clean` column only. The double brackets
# keep it a one-column DataFrame, which is what `_get_inputs` iterates over.
X_train = df_articles_train[['content_body_clean']]
X_test  = df_articles_test[['content_body_clean']]

# Regression target: the `blind_mean_rating` column, also kept as a one-column DataFrame.
y_train = df_articles_train[['blind_mean_rating']]
y_test  = df_articles_test[['blind_mean_rating']]

In [None]:
# Passed to `_get_inputs` as `_maxlen`, i.e. the tokenizer's `max_length`.
max_sequence_length = 128

# Each call returns the list [input_ids, attention_mask] for its split.
bert_inputs_train = _get_inputs(df=X_train, tokenizer=bert_tokenizer_transformer, _maxlen=max_sequence_length)
bert_inputs_test  = _get_inputs(df=X_test,  tokenizer=bert_tokenizer_transformer, _maxlen=max_sequence_length)

# Training inputs and targets (targets converted from DataFrame to NumPy array).
Xtr = bert_inputs_train
ytr = np.asarray(y_train)

# Test inputs and targets, prepared the same way.
Xte = bert_inputs_test
yte = np.asarray(y_test)

In [None]:
inputs_length = max_sequence_length

# `shape` must be a tuple: `(inputs_length)` is only a parenthesised int, so the
# trailing comma is needed to declare a 1-D sequence of `inputs_length` tokens.
token_inputs = Input((inputs_length,), dtype=tf.int32, name='input_word_ids')
mask_inputs = Input((inputs_length,), dtype=tf.int32, name='input_masks')

# Take element [0] (the per-token hidden states) instead of tuple-unpacking the
# encoder output. Integer indexing works whether the encoder returns a plain
# tuple or a dict-like `ModelOutput`; unpacking the latter would yield its keys
# rather than its tensors. It also avoids rebinding IPython's `_` variable.
seq_output = bert_model([token_inputs, mask_inputs])[0]

# Regression head: average the token vectors, one hidden layer, one linear output.
# NOTE(review): the pooling layer is not given the attention mask, so padded
# positions contribute to the average — confirm this is intended.
X = GlobalAveragePooling1D()(seq_output)
X = Dense(100, activation='relu')(X)
output_= Dense(1, activation='linear', name='output')(X)

bert_model2 = Model([token_inputs, mask_inputs],output_)
bert_model2.summary()

Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 128)]        0                                            
__________________________________________________________________________________________________
input_masks (InputLayer)        [(None, 128)]        0                                            
__________________________________________________________________________________________________
input_segments (InputLayer)     [(None, 128)]        0                                            
__________________________________________________________________________________________________
tf_bert_model (TFBertModel)     multiple             109482240   input_word_ids[0][0]             
                                                                 input_masks[0][0]          

In [None]:
lr = 0.0001  ## 0.1, 0.01, 0.001, 2e-5
opt = tf.keras.optimizers.Adam(learning_rate=lr)

def rmse(y_true, y_pred):
	"""Return the root-mean-squared error of a single batch.

	Kept so existing references to `rmse` keep working. It is no longer used as
	the compiled metric, because Keras averages a function metric across
	batches, and the mean of per-batch RMSE values is not the dataset RMSE.
	"""
	return backend.sqrt(backend.mean(backend.square(y_pred - y_true)))

# The stateful metric accumulates squared error over the whole epoch/evaluation
# and takes the square root once, so the reported RMSE equals sqrt(MSE).
# It is named 'rmse' so history keys and the metric order are unchanged.
bert_model2.compile(optimizer=opt, loss='mse',
                    metrics=['mae', tf.keras.metrics.RootMeanSquaredError(name='rmse')])

# Callbacks
# `restore_best_weights=True` rolls the model back to the epoch with the lowest
# validation loss instead of keeping the weights from `patience` epochs later.
earlyStopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, verbose=0, mode='min',
                                                 restore_best_weights=True)
# reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.000001)


In [None]:
# Train Model 1 for up to 25 epochs, holding out 20% of (Xtr, ytr) for
# validation; the early-stopping callback can end training sooner.
# No batch size is passed, so the Keras default applies.
history = bert_model2.fit(Xtr, ytr, 
                          epochs=25, 
                          validation_split = 0.2,
                          # batch_size = 100,
                          # validation_data=(Xtr_val, ytr_val),
                          verbose = 1,
                          callbacks=[earlyStopping]
                          # callbacks=[earlyStopping, reduce_lr]
                          )

In [None]:
# Evaluate Model 1 on the held-out test split.
# The indices below follow the compile call: loss ('mse') first, then the
# metrics in the order they were listed ('mae', then RMSE).
test_results = bert_model2.evaluate(Xte, yte)
print('Test MSE: {0:.4f}'.format(test_results[0]))
print('Test MAE: {0:.4f}'.format(test_results[1]))
print('Test RMSE: {0:.4f}'.format(test_results[2]))

In [None]:
# Per-epoch loss curves recorded by `fit` for Model 1.
train_loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(len(train_loss))

# Legend label corrected from "Traning Loss".
plt.plot(epochs, train_loss, label='Training Loss')
plt.plot(epochs, val_loss, label='Validation Loss')
plt.title('Training and Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
# Predict ratings for the test inputs and display the first ten predictions.
y_pred = bert_model2.predict(Xte)
y_pred[:10]

In [None]:
# First ten test targets, for comparison with the first ten predictions.
yte[:10]

In [None]:
# y_pred = y_pred.reshape(y_pred.shape[0])
# d = yte - y_pred
# mse_f = np.mean(d**2)
# mae_f = np.mean(abs(d))
# rmse_f = np.sqrt(mse_f)
# # r2_f = 1-(np.sum(d**2)/np.sum((y-np.mean(y))**2))

# print("RESULTS...")
# print("MAE:",mae_f)
# print("MSE:", mse_f)
# print("RMSE:", rmse_f)
# print("R-Squared:", r2_f)

In [None]:
# def plot_history(histories, key='loss'):
#   plt.figure(figsize=(10,8))
    
#   for name, history in histories:
#     val = plt.plot([x+1 for x in history.epoch], history.history['val_'+key],
#                    label='Val_'+key)
#     plt.plot([x+1 for x in history.epoch], history.history[key],
#              label='Train_'+key)

#   plt.xlabel('Epochs')
#   plt.xticks([x+1 for x in history.epoch])
#   plt.ylabel(key.replace('_',' ').title())
#   plt.legend()

#  plot_history([('linear_model', history)]) 

In [None]:
### Fit on test data
plt.plot(yte, y_pred, '.')

# # plot a line, a perfet predict would all fall on this line
x = np.linspace(0, 5)
y = x
plt.plot(x, y)
plt.show()

## Model 2 (Source + Article Text)

In [None]:
# Model 2 features: source description first, article body second. The column
# order matters because `_get_inputs_source` reads the two columns by position.
X_train_source = df_articles_train[['content_source_desc','content_body_clean']]
X_test_source  = df_articles_test[['content_source_desc','content_body_clean']]

# Same regression target column as Model 1, re-selected here.
y_train = df_articles_train[['blind_mean_rating']]
y_test  = df_articles_test[['blind_mean_rating']]

In [None]:
# Passed to `_get_inputs_source` as `_maxlen`, i.e. the tokenizer's `max_length`.
max_sequence_length = 128

# Each call returns the list [input_ids, attention_mask, token_type_ids] for its split.
bert_inputs_train_source = _get_inputs_source(df=X_train_source, tokenizer=bert_tokenizer_transformer, _maxlen=max_sequence_length)
bert_inputs_test_source  = _get_inputs_source(df=X_test_source,  tokenizer=bert_tokenizer_transformer, _maxlen=max_sequence_length)

# Training inputs and targets (targets converted from DataFrame to NumPy array).
Xtr_source = bert_inputs_train_source
ytr = np.asarray(y_train)

# Test inputs and targets, prepared the same way.
Xte_source = bert_inputs_test_source
yte = np.asarray(y_test)

In [42]:
inputs_length = max_sequence_length

# `shape` must be a tuple: `(inputs_length)` is only a parenthesised int, so the
# trailing comma is needed to declare a 1-D sequence of `inputs_length` tokens.
token_inputs = Input((inputs_length,), dtype=tf.int32, name='input_word_ids')
mask_inputs = Input((inputs_length,), dtype=tf.int32, name='input_masks')
seg_inputs = Input((inputs_length,), dtype=tf.int32, name='input_segments')

# Load a fresh pre-trained encoder for this model. Reusing `bert_model` would
# share its weights with Model 1, so Model 2 would start from weights already
# fine-tuned there and training it would also alter Model 1.
# This holds a second copy of the encoder in memory.
bert_model_source = trans.TFBertModel.from_pretrained(bert_model_type)

# Take element [0] (the per-token hidden states) instead of tuple-unpacking the
# encoder output. Integer indexing works whether the encoder returns a plain
# tuple or a dict-like `ModelOutput`; unpacking the latter would yield its keys
# rather than its tensors. It also avoids rebinding IPython's `_` variable.
seq_output = bert_model_source([token_inputs, mask_inputs, seg_inputs])[0]

# Regression head: average the token vectors, one hidden layer, one linear output.
# NOTE(review): the pooling layer is not given the attention mask, so padded
# positions contribute to the average — confirm this is intended.
X = GlobalAveragePooling1D()(seq_output)
X = Dense(100, activation='relu')(X)
output_= Dense(1, activation='linear', name='output')(X)

bert_model3 = Model([token_inputs, mask_inputs, seg_inputs],output_)
bert_model3.summary()

Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 128)]        0                                            
__________________________________________________________________________________________________
input_masks (InputLayer)        [(None, 128)]        0                                            
__________________________________________________________________________________________________
input_segments (InputLayer)     [(None, 128)]        0                                            
__________________________________________________________________________________________________
tf_bert_model_1 (TFBertModel)   ((None, 128, 768), ( 109482240   input_word_ids[0][0]             
                                                                 input_masks[0][0]          

In [None]:
lr = 0.0001  ## 0.1, 0.01, 0.001, 2e-5
# A new optimizer instance for this model.
opt = tf.keras.optimizers.Adam(learning_rate=lr)

# The duplicate `rmse` function definition that used to live in this cell is
# dropped: Keras averages a function metric across batches, and the mean of
# per-batch RMSE values is not the dataset RMSE. The stateful metric below
# accumulates squared error over the whole epoch/evaluation and takes the
# square root once. It is named 'rmse' so history keys and the metric order
# are unchanged.
bert_model3.compile(optimizer=opt, loss='mse',
                    metrics=['mae', tf.keras.metrics.RootMeanSquaredError(name='rmse')])

# Callbacks
# `restore_best_weights=True` rolls the model back to the epoch with the lowest
# validation loss instead of keeping the weights from `patience` epochs later.
earlyStopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, verbose=0, mode='min',
                                                 restore_best_weights=True)
# reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.000001)


In [None]:
# Train Model 2 for up to 10 epochs, holding out 20% of (Xtr_source, ytr) for
# validation; the early-stopping callback can end training sooner.
# No batch size is passed, so the Keras default applies.
history_source = bert_model3.fit(Xtr_source, ytr, 
                                epochs=10, 
                                validation_split = 0.2,
                                # batch_size = 100,
                                # validation_data=(Xtr_val, ytr_val),
                                verbose = 1,
                                callbacks=[earlyStopping]
                                # callbacks=[earlyStopping, reduce_lr]
                                )

In [None]:
# Evaluate Model 2 on the held-out test split.
# The indices below follow the compile call: loss ('mse') first, then the
# metrics in the order they were listed ('mae', then RMSE).
test_results = bert_model3.evaluate(Xte_source, yte)
print('Test MSE: {0:.4f}'.format(test_results[0]))
print('Test MAE: {0:.4f}'.format(test_results[1]))
print('Test RMSE: {0:.4f}'.format(test_results[2]))

In [None]:
# Per-epoch loss curves recorded by `fit` for Model 2.
train_loss = history_source.history['loss']
val_loss = history_source.history['val_loss']
epochs = range(len(train_loss))

# Legend label corrected from "Traning Loss".
plt.plot(epochs, train_loss, label='Training Loss')
plt.plot(epochs, val_loss, label='Validation Loss')
plt.title('Training and Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
# Predict ratings for the test inputs and display the first ten predictions.
# This rebinds `y_pred`, replacing the Model 1 predictions.
y_pred = bert_model3.predict(Xte_source)
y_pred[:10]

In [None]:
# First ten test targets, for comparison with the first ten predictions.
yte[:10]

In [None]:
# y_pred = y_pred.reshape(y_pred.shape[0])
# d = yte - y_pred
# mse_f = np.mean(d**2)
# mae_f = np.mean(abs(d))
# rmse_f = np.sqrt(mse_f)
# # r2_f = 1-(np.sum(d**2)/np.sum((y-np.mean(y))**2))

# print("RESULTS...")
# print("MAE:",mae_f)
# print("MSE:", mse_f)
# print("RMSE:", rmse_f)
# print("R-Squared:", r2_f)

In [None]:
# def plot_history(histories, key='loss'):
#   plt.figure(figsize=(10,8))
    
#   for name, history in histories:
#     val = plt.plot([x+1 for x in history.epoch], history.history['val_'+key],
#                    label='Val_'+key)
#     plt.plot([x+1 for x in history.epoch], history.history[key],
#              label='Train_'+key)

#   plt.xlabel('Epochs')
#   plt.xticks([x+1 for x in history.epoch])
#   plt.ylabel(key.replace('_',' ').title())
#   plt.legend()

#  plot_history([('linear_model', history)]) 

In [None]:
### Fit on test data
plt.plot(yte, y_pred, '.')

# # plot a line, a perfet predict would all fall on this line
x = np.linspace(0, 5)
y = x
plt.plot(x, y)
plt.show()