In [22]:
import numpy as np
import pandas as pd

import os
import re

import tensorflow as tf
import tensorflow_hub as hub
import bert
from tensorflow.keras import backend as K
from transformers import *

## Data Prep

In [52]:
# Load the train and test article tables from CSV, then pull out the cleaned
# article text (model input) and the blind mean rating (used as y below).
df_articles_train = pd.read_csv('Data/latest/articles_train.csv')
df_articles_test = pd.read_csv('Data/latest/articles_test.csv')

X_train, y_train = (
    df_articles_train['content_body_clean'],
    df_articles_train['blind_mean_rating'],
)
X_test, y_test = (
    df_articles_test['content_body_clean'],
    df_articles_test['blind_mean_rating'],
)

## Using TFHub

In [None]:
# TF-Hub handle of the BERT module, loaded as a Keras layer whose weights are
# left trainable. The layer's vocabulary asset is read by the next cell.
bert_path = "https://tfhub.dev/google/bert_en_uncased_L-12_H-768_A-12/1"
bert_layer = hub.KerasLayer(
    bert_path,
    trainable=True,
)

In [None]:
# Path of the vocabulary file bundled with the TF-Hub BERT layer, and a
# lower-casing tokenizer built on top of that vocabulary.
vocab_file1 = bert_layer.resolved_object.vocab_file.asset_path.numpy()
bert_tokenizer_tfhub = bert.bert_tokenization.FullTokenizer(
    vocab_file1,
    do_lower_case=True,
)

## Using Huggingface Transformers

In [41]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Hugging Face tokenizer for the 'bert-base-uncased' checkpoint (lower-cases input).
bert_tokenizer_transformer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

In [50]:
def _get_segments(sentences):
    sentences_segments = []
    for sent in sentences:
      temp = []
      i = 0
      for token in sent.split(" "):
        temp.append(i)
        if token == "[SEP]":
          i += 1
      sentences_segments.append(temp)
    return sentences_segments

def _get_inputs(df,_maxlen,tokenizer,use_keras_pad=False):
    """Encode a Series of texts into fixed-length BERT input tensors.

    Every text has punctuation and newlines removed, is tokenized, truncated
    to ``(_maxlen - 20) // 2`` tokens, wrapped as ``[CLS] ... [SEP]`` and
    right-padded with ``[PAD]`` up to ``_maxlen`` tokens.

    Args:
        df: pandas Series of strings (it is accessed through ``.str``).
        _maxlen: length of every returned sequence; must be at least 20.
        tokenizer: object exposing ``tokenize(text) -> list[str]`` and
            ``convert_tokens_to_ids(tokens) -> list[int]``.
        use_keras_pad: when true, pad with Keras ``pad_sequences`` instead of
            plain list concatenation. Both paths produce the same tokens.

    Returns:
        A list of three ``tf.int32`` tensors, in this order:
        ``[token ids, segment ids, attention mask]``. The mask is 1 for real
        tokens (including ``[CLS]``/``[SEP]``) and 0 for padding.

    Raises:
        ValueError: if ``_maxlen`` is below 20, or if the padded sequences do
            not all have length ``_maxlen``.
    """
    # Validate before doing any tokenization work.
    if _maxlen < 20:
        raise ValueError("max length cannot be less than 20")

    max_text_tokens = (_maxlen - 20) // 2

    # Remove everything that is neither a word character nor whitespace, and
    # newlines as well. regex=True is explicit because the default differs
    # between pandas versions.
    pattern = r'[^\w\s]+|\n'
    cleaned_texts = df.str.replace(pattern, '', regex=True).tolist()

    # Work on token lists rather than space-joined strings so that no empty
    # pseudo-token can sneak in through a stray separator.
    token_lists = [
        ["[CLS]"] + tokenizer.tokenize(text)[:max_text_tokens] + ["[SEP]"]
        for text in cleaned_texts
    ]

    # Attention mask: 1 for every real token, 0 for every padding position.
    sentences_mask = [
        [1] * len(tokens) + [0] * (_maxlen - len(tokens))
        for tokens in token_lists
    ]

    # Right-pad every token list with [PAD] up to _maxlen.
    if use_keras_pad:
        padded_tokens = pad_sequences(
            token_lists,
            dtype=object,
            maxlen=_maxlen,
            value='[PAD]',
            padding='post',
            truncating='post',
        ).tolist()
    else:
        padded_tokens = [
            tokens + ["[PAD]"] * (_maxlen - len(tokens))
            for tokens in token_lists
        ]

    # Sanity check: every sequence must be exactly _maxlen tokens long.
    lengths = {len(tokens) for tokens in padded_tokens}
    if lengths - {_maxlen}:
        print(lengths)
        raise ValueError("sentences are not of same size")

    sentences_converted = [
        tokenizer.convert_tokens_to_ids(tokens) for tokens in padded_tokens
    ]

    # _get_segments expects space-separated token strings.
    sentences_segment = _get_segments(
        [" ".join(tokens) for tokens in padded_tokens]
    )

    return [
        tf.cast(sentences_converted, tf.int32),
        tf.cast(sentences_segment, tf.int32),
        tf.cast(sentences_mask, tf.int32),
    ]

In [None]:
# Try the encoder on the first few training articles with a 256-token window.
bert_inputs = _get_inputs(
    df=X_train.head(),
    _maxlen=256,
    tokenizer=bert_tokenizer_transformer,
)

## Add Pre-trained model using Transformer

In [34]:
#reference: https://github.com/huggingface/transformers

In [53]:
from tensorflow.keras.layers import Dense, Dropout,Embedding, LSTM, Bidirectional, Input, Dropout, GlobalAveragePooling1D
from tensorflow.keras import Sequential
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing import sequence

In [None]:
# Encode a small sample of each split. The targets are sliced with the same
# .head() as the texts so features and labels have the same number of rows,
# and both splits use the 100-token length that the model's Input layers
# (MAX_SEQUENCE_LENGTH) are declared with.
Xtr = _get_inputs(X_train.head(), _maxlen=100, tokenizer=bert_tokenizer_transformer)
ytr = np.asarray(y_train.head())

Xte = _get_inputs(X_test.head(), _maxlen=100, tokenizer=bert_tokenizer_transformer)
yte = np.asarray(y_test.head())

## BERT Model

In [None]:
#reference: https://github.com/huggingface/transformers/issues/1350

MAX_SEQUENCE_LENGTH = 100

# One int32 input per tensor produced by _get_inputs. Shapes are 1-tuples.
token_inputs = Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32, name='input_word_ids')
mask_inputs = Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32, name='input_masks')
seg_inputs = Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32, name='input_segments')

bert_model = TFBertModel.from_pretrained("bert-base-uncased")

# Name each tensor explicitly so the roles cannot be mixed up, and take the
# first output (per-token hidden states) by index, which works whether the
# model returns a plain tuple or an output object.
seq_output = bert_model({
    'input_ids': token_inputs,
    'attention_mask': mask_inputs,
    'token_type_ids': seg_inputs,
})[0]

X = GlobalAveragePooling1D()(seq_output)
X = Dense(100, activation='relu')(X)
# The target (blind_mean_rating) is a single value per article, so the head
# has one unit.
# NOTE(review): sigmoid (with the binary_crossentropy loss used at compile
# time) assumes ratings are scaled to [0, 1] - confirm against the data.
output_ = Dense(1, activation='sigmoid', name='output')(X)

# Input order matches what _get_inputs returns: [token ids, segment ids, mask].
bert_model2 = Model([token_inputs, seg_inputs, mask_inputs], output_)
bert_model2.summary()


In [None]:
# All BERT weights are being fine-tuned, which needs a far smaller step size
# than Adam's default of 1e-3; values around 2e-5 to 5e-5 are the usual choice
# for BERT fine-tuning.
bert_model2.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
    loss='binary_crossentropy',
)

In [None]:
# One epoch over the encoded training tensors, in mini-batches of three.
bert_model2.fit(
    x=Xtr,
    y=ytr,
    batch_size=3,
    epochs=1,
)

In [None]:
# Run the model on the encoded test tensors; the trailing expression displays
# the shape of the prediction array.
result = bert_model2.predict(
    Xte,
)
result.shape