In [1]:
# Fetch tokenization.py from the tensorflow/models repository into the working directory.
# NOTE(review): the import cell uses `from bert import tokenization`, which presumably
# resolves to the pip-installed `bert` package rather than this downloaded file, so this
# download looks unused -- confirm before removing.
!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py

In [2]:
!pip install bert-tensorflow==1.0.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [33]:
import re
import nltk
import logging
import warnings
import numpy as np
import pandas as pd
from tqdm import tqdm
import tensorflow as tf
tf.gfile = tf.io.gfile
import tensorflow_hub as hub
from bert import tokenization
from keras.models import Model        
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from keras.callbacks import EarlyStopping            
from nltk.stem.porter import PorterStemmer
from keras.layers import Input, Dense, Dropout
from tensorflow.keras.utils import to_categorical

warnings.filterwarnings("ignore", category = UserWarning, module = 'bs4')

In [34]:
# Emit INFO-level log records for the rest of the notebook.
logging.basicConfig(level=logging.INFO)
# TF Hub handle of the BERT encoder used below (uncased English, L-24 / H-1024 / A-16 per the URL).
module_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1"
# Wrap the hub module as a Keras layer; trainable=True is passed so the encoder
# is fine-tuned together with the classification head built in build_model.
bert_layer = hub.KerasLayer(module_url, trainable=True)

In [35]:
# Load the labelled dataset from the working directory.
train = pd.read_csv('data.csv')
# Rename the columns to the names the rest of the notebook uses.
# Assumes data.csv has exactly two columns, text first and label second -- TODO confirm.
train.columns = ['Phrase', 'Sentiment']

In [36]:
def clean(df):
  """Normalise the 'Phrase' column of ``df`` in place and return ``df``.

  Every phrase goes through the same pipeline:
    1. each character that is not an ASCII letter is replaced by a space,
    2. the text is lower-cased and split on whitespace,
    3. NLTK English stopwords are dropped, except 'not', which is kept,
    4. the remaining words are Porter-stemmed and re-joined with single spaces.

  Args:
    df: DataFrame with a 'Phrase' column holding strings.

  Returns:
    The same DataFrame object that was passed in, with 'Phrase' overwritten.
  """
  nltk.download('stopwords')

  # Loop-invariant setup: build the stemmer and the stopword set once instead
  # of once per row (and, for the set, once per word).
  stemmer = PorterStemmer()
  stopword_set = set(stopwords.words('english')) - {'not'}

  def _normalise(phrase):
    """Apply the cleaning pipeline to a single phrase."""
    words = re.sub('[^a-zA-Z]', ' ', phrase).lower().split()
    return ' '.join(stemmer.stem(word) for word in words if word not in stopword_set)

  # Assign the whole column at once. This avoids chained indexing
  # (df['Phrase'][i] = ...), which can write to a temporary copy, and it no
  # longer requires the index to be exactly 0..n-1.
  df['Phrase'] = [_normalise(phrase) for phrase in df['Phrase']]
  return df

In [37]:
# Clean the phrases; clean() modifies and returns the same DataFrame, so
# `data` and `train` refer to one object.
data = clean(train)
train = data

# Integer class ids for the three sentiment labels.
encoded_dict = {'negative': 0, 'neutral': 1, 'positive': 2}
sentiment_codes = train.Sentiment.map(encoded_dict)

# Series.map yields NaN for any value missing from the dict (a misspelt label,
# a missing value, or already-encoded ints when this cell is run a second time).
# Fail here with a clear message instead of passing NaN labels on to training.
unmapped = train.Sentiment[sentiment_codes.isna()].unique()
if len(unmapped) > 0:
  raise ValueError(
      f"Sentiment values without an entry in encoded_dict: {list(unmapped)!r}"
  )

train['Sentiment'] = sentiment_codes

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [38]:
# Read the vocabulary file path and the lower-casing flag from the loaded hub
# module, so the tokenizer is configured from the same module as bert_layer.
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
# Tokenizer used by bert_encode to turn raw text into tokens and token ids.
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

def bert_encode(texts, tokenizer, max_len=512):
  """Convert raw texts into the three fixed-length arrays fed to the BERT layer.

  Each text is tokenized, cut to at most ``max_len - 2`` tokens, wrapped in
  "[CLS]" ... "[SEP]", converted to ids and right-padded with 0 up to
  ``max_len``.

  Args:
    texts: iterable of strings.
    tokenizer: object exposing ``tokenize(text)`` and
      ``convert_tokens_to_ids(tokens)``.
    max_len: length of every output row.

  Returns:
    Tuple ``(token_ids, masks, segment_ids)`` of numpy arrays with one row per
    text. ``masks`` holds 1 for real tokens and 0 for padding; ``segment_ids``
    is all zeros.
  """
  token_rows, mask_rows, segment_rows = [], [], []
  # Two positions are reserved for the "[CLS]" and "[SEP]" markers.
  body_budget = max_len - 2

  for raw_text in texts:
    pieces = ["[CLS]", *tokenizer.tokenize(raw_text)[:body_budget], "[SEP]"]
    n_real = len(pieces)
    n_pad = max_len - n_real

    token_rows.append(tokenizer.convert_tokens_to_ids(pieces) + [0] * n_pad)
    mask_rows.append([1] * n_real + [0] * n_pad)
    segment_rows.append([0] * max_len)

  return np.array(token_rows), np.array(mask_rows), np.array(segment_rows)

def build_model(bert_layer, max_len = 512, learning_rate = 2e-5):
  """Build and compile a 3-class softmax classifier on top of ``bert_layer``.

  Args:
    bert_layer: Keras layer that takes ``[word_ids, mask, segment_ids]`` and
      returns ``(pooled_output, sequence_output)``.
    max_len: length of each of the three integer input sequences.
    learning_rate: Adam learning rate. The default of 2e-5 is in the range
      commonly recommended for BERT fine-tuning; compiling with the string
      'adam' would use Keras' default of 1e-3, which is too large when the
      encoder weights are being updated as well.

  Returns:
    A compiled Keras ``Model`` with inputs named "input_word_ids",
    "input_mask" and "segment_ids" and a 3-way softmax output, using
    categorical cross-entropy loss and an accuracy metric.
  """
  input_word_ids = Input(shape = (max_len,), dtype = tf.int32, name = "input_word_ids")
  input_mask = Input(shape = (max_len,), dtype = tf.int32, name = "input_mask")
  segment_ids = Input(shape = (max_len,), dtype = tf.int32, name = "segment_ids")

  # Only the per-token output is used; the pooled output is discarded.
  _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
  # Position 0 of every sequence is the "[CLS]" token inserted by bert_encode.
  clf_output = sequence_output[:, 0, :]

  # Small fully connected head with dropout between the layers.
  net = Dense(64, activation = 'relu')(clf_output)
  net = Dropout(0.2)(net)
  net = Dense(32, activation = 'relu')(net)
  net = Dropout(0.2)(net)
  out = Dense(3, activation = 'softmax')(net)

  model = Model(inputs = [input_word_ids, input_mask, segment_ids], outputs=out)
  model.compile(
      optimizer = tf.keras.optimizers.Adam(learning_rate = learning_rate),
      loss = 'categorical_crossentropy',
      metrics = ['accuracy'],
  )

  return model

In [39]:
# Sequence length used for both encoding and the model inputs; bert_encode
# keeps at most max_len - 2 tokens per phrase and pads the rest.
max_len = 150
# Tuple of three arrays (token ids, masks, segment ids), in the order the model's inputs are declared.
train_input = bert_encode(train.Phrase.values, tokenizer, max_len = max_len)
# One-hot encode the integer sentiment ids into 3 columns, matching the 3-unit softmax output.
train_labels = to_categorical(train.Sentiment.values, num_classes = 3)

# Build the classifier with the same max_len that was used for encoding.
model = build_model(bert_layer, max_len = max_len)
model.summary()

Model: "model_6"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_word_ids (InputLayer)    [(None, 150)]        0           []                               
                                                                                                  
 input_mask (InputLayer)        [(None, 150)]        0           []                               
                                                                                                  
 segment_ids (InputLayer)       [(None, 150)]        0           []                               
                                                                                                  
 keras_layer_1 (KerasLayer)     [(None, 1024),       335141889   ['input_word_ids[0][0]',         
                                 (None, 150, 1024)]               'input_mask[0][0]',       

In [42]:
# Stop training once validation accuracy fails to improve for one epoch.
# The patience has to be smaller than `epochs` below, otherwise the callback
# can never trigger (the previous patience of 5 with 3 epochs was a no-op).
# restore_best_weights=True asks Keras to roll the model back to the weights of
# the best monitored epoch when early stopping triggers.
earlystopping = EarlyStopping(
    monitor='val_accuracy',
    patience=1,
    restore_best_weights=True,
    verbose=1
)

# Fine-tune on the encoded phrases; the last 20% of the samples are held out
# for validation by validation_split.
train_history = model.fit(
    train_input, train_labels, 
    validation_split=0.2,
    epochs=3,
    callbacks=[earlystopping],
    batch_size=8,
    verbose=1
)

Epoch 1/3
Epoch 2/3
Epoch 3/3
