BERT Classification Model using TFHUB

In [2]:
!pip install tensorflow==2.0.0

Collecting tensorflow==2.0.0
[?25l  Downloading https://files.pythonhosted.org/packages/46/0f/7bd55361168bb32796b360ad15a25de6966c9c1beb58a8e30c01c8279862/tensorflow-2.0.0-cp36-cp36m-manylinux2010_x86_64.whl (86.3MB)
[K     |████████████████████████████████| 86.3MB 78kB/s 
[?25hCollecting tensorflow-estimator<2.1.0,>=2.0.0
[?25l  Downloading https://files.pythonhosted.org/packages/fc/08/8b927337b7019c374719145d1dceba21a8bb909b93b1ad6f8fb7d22c1ca1/tensorflow_estimator-2.0.1-py2.py3-none-any.whl (449kB)
[K     |████████████████████████████████| 450kB 29.7MB/s 
Collecting tensorboard<2.1.0,>=2.0.0
[?25l  Downloading https://files.pythonhosted.org/packages/76/54/99b9d5d52d5cb732f099baaaf7740403e83fe6b0cedde940fabd2b13d75a/tensorboard-2.0.2-py3-none-any.whl (3.8MB)
[K     |████████████████████████████████| 3.8MB 31.4MB/s 
Collecting gast==0.2.2
  Downloading https://files.pythonhosted.org/packages/4e/35/11749bf99b2d4e3cceb4d55ca22590b0d7c2c62b9de38ac4a4a7f4687421/gast-0.2.2.tar.gz
Bu

In [3]:
!pip install bert_for_tf2

Collecting bert_for_tf2
[?25l  Downloading https://files.pythonhosted.org/packages/35/5c/6439134ecd17b33fe0396fb0b7d6ce3c5a120c42a4516ba0e9a2d6e43b25/bert-for-tf2-0.14.4.tar.gz (40kB)
[K     |████████                        | 10kB 15.2MB/s eta 0:00:01[K     |████████████████▏               | 20kB 1.9MB/s eta 0:00:01[K     |████████████████████████▎       | 30kB 2.7MB/s eta 0:00:01[K     |████████████████████████████████| 40kB 2.3MB/s 
[?25hCollecting py-params>=0.9.6
  Downloading https://files.pythonhosted.org/packages/a4/bf/c1c70d5315a8677310ea10a41cfc41c5970d9b37c31f9c90d4ab98021fd1/py-params-0.9.7.tar.gz
Collecting params-flow>=0.8.0
  Downloading https://files.pythonhosted.org/packages/a9/95/ff49f5ebd501f142a6f0aaf42bcfd1c192dc54909d1d9eb84ab031d46056/params-flow-0.8.2.tar.gz
Building wheels for collected packages: bert-for-tf2, py-params, params-flow
  Building wheel for bert-for-tf2 (setup.py) ... [?25l[?25hdone
  Created wheel for bert-for-tf2: filename=bert_for_tf2

In [0]:
import tensorflow_hub as hub
import tensorflow as tf
import bert
from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional, Input, Dropout, GlobalAveragePooling1D
from tensorflow.keras import Sequential
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing import sequence
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import unicodedata
import re

In [5]:
tf.__version__

'2.0.0'

In [0]:
pd.set_option('display.max_colwidth', 500)

In [0]:
# BERT_URL = 'https://tfhub.dev/google/bert_cased_L-12_H-768_A-12/1'
# module = hub.load(BERT_URL)

### Import BERT TFHub model and create a tokenizer

In [0]:
bert_path = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1'

In [0]:
bert_layer = hub.KerasLayer(bert_path)

In [0]:
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()

In [0]:
bert_tokenizer_tfhub = bert.bert_tokenization.FullTokenizer(vocab_file, do_lower_case=True)

### Methods to transform input texts (two columns - Tweet Text & Headline) into BERT compatable form. 


In [0]:
def _get_segments(sentences):
    sentences_segments = []
    for sent in sentences:
      temp = []
      i = 0
      for token in sent.split(" "):
        temp.append(i)
        if token == "[SEP]":
          i += 1
      sentences_segments.append(temp)
    return sentences_segments

def _get_inputs(df,_maxlen,tokenizer,use_keras_pad=False, verbose=0):

    maxqnans = np.int((_maxlen-20)/2)
    pattern = '[^\w\s]+|\n' # remove everything including newline (|\n) other than words (\w) or spaces (\s)
    
    sentences = ["[CLS] " + " ".join(tokenizer.tokenize(txt)) +" [SEP] " 
              + " ".join(tokenizer.tokenize(head)) +" [SEP]" 
                for (txt,head) 
                in 
              zip(df['Text'].str.replace(pattern, '').values.tolist(),
              df['News headline'].str.replace(pattern, '').values.tolist())]
   
    # print(sentences)
    if (verbose > 0):
      for i in range(min(len(sentences),3)):
        print(sentences[i])
      
    #generate masks
    # bert requires a mask for the words which are padded. 
    # Say for example, maxlen is 100, sentence size is 90. then, [1]*90 + [0]*[100-90]
    sentences_mask = [[1]*len(sent.split(" "))+[0]*(_maxlen - len(sent.split(" "))) for sent in sentences]
 
    #generate input ids  
    # if less than max length provided then the words are padded
    if use_keras_pad:
      sentences_padded = pad_sequences(sentences.split(" "), dtype=object, maxlen=10, value='[PAD]',padding='post')
    else:
      sentences_padded = [sent + " [PAD]"*(_maxlen-len(sent.split(" "))) if len(sent.split(" "))!=_maxlen else sent for sent in sentences ]

    sentences_converted = [tokenizer.convert_tokens_to_ids(s.split(" ")) for s in sentences_padded]
    
    #generate segments
    # for each separation [SEP], a new segment is converted
    sentences_segment = _get_segments(sentences_padded)

    genLength = set([len(sent.split(" ")) for sent in sentences_padded])

    if _maxlen < 20:
      raise Exception("max length cannot be less than 20")
    elif len(genLength)!=1: 
      print(genLength)
      raise Exception("sentences are not of same size")

    return [tf.cast(sentences_converted,tf.int32), tf.cast(sentences_segment,tf.int32), tf.cast(sentences_mask,tf.int32)]

### Test method for checking inputs to Bert layer

In [0]:
df_t = pd.DataFrame(columns=['Text', 'News headline'])
df_t['Text'] = ['It''s a 45 person, lady.',' It''s the second sentence for testing']
df_t['News headline'] = ['The Senate is voting on 23654 a 20-week abortion ban. Opponents say it''s ‚Äúbasically relying on junk science.‚Äù',' Iam the 2nd sentence. And then again the next'] 

In [40]:
df_t

Unnamed: 0,Text,News headline
0,"Its a 45 person, lady.",The Senate is voting on 23654 a 20-week aborti...
1,Its the second sentence for testing,Iam the 2nd sentence. And then again the next


In [41]:
bert_inputs = _get_inputs(df_t,tokenizer=bert_tokenizer_tfhub,_maxlen=50, verbose=1)

[CLS] its a 45 person lady [SEP] the senate is voting on 236 ##54 a 20 ##week abortion ban opponents say its au ##bas ##ically relying on junk science ##au [SEP]
[CLS] its the second sentence for testing [SEP] ia ##m the 2nd sentence and then again the next [SEP]


In [42]:
bert_inputs

[<tf.Tensor: id=23038, shape=(2, 50), dtype=int32, numpy=
 array([[  101,  2049,  1037,  3429,  2711,  3203,   102,  1996,  4001,
          2003,  6830,  2006, 23593, 27009,  1037,  2322, 28075, 11324,
          7221,  7892,  2360,  2049,  8740, 22083, 15004, 18345,  2006,
         18015,  2671,  4887,   102,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0],
        [  101,  2049,  1996,  2117,  6251,  2005,  5604,   102, 24264,
          2213,  1996,  3416,  6251,  1998,  2059,  2153,  1996,  2279,
           102,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0]], dtype=int32)>,
 <tf.Tensor: id=23039, shape=(2, 50), dtype=int32, numpy=
 array([[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

### Data preprocessing
Remove Special Characters  
Create Train and test datasets  
Convert them to Bert inputs

In [47]:
!wget https://raw.githubusercontent.com/harish-cu/tweet-url-relationships/master/data/raw/news-url-data-annotated-4-19-2020.csv

--2020-04-30 14:35:03--  https://raw.githubusercontent.com/harish-cu/tweet-url-relationships/master/data/raw/news-url-data-annotated-4-19-2020.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 209597 (205K) [text/plain]
Saving to: ‘news-url-data-annotated-4-19-2020.csv.1’


2020-04-30 14:35:03 (4.80 MB/s) - ‘news-url-data-annotated-4-19-2020.csv.1’ saved [209597/209597]



In [0]:
df = pd.read_csv("news-url-data-annotated-4-19-2020.csv")

In [49]:
df.head()

Unnamed: 0,Label,Text,Tweet,News headline
0,COMMENT,Ironic @voxdotcom attempts to discredit scienc...,Ironic @voxdotcom attempts to discredit scienc...,The Senate is voting on a 20-week abortion ban...
1,SUBJ-ARTICLE,#reproductiverights #abortionrights https://t....,#reproductiverights #abortionrights https://t....,The Senate is voting on a 20-week abortion ban...
2,COMMENT,i feel like this issue boils down to at what p...,i feel like this issue boils down to at what p...,The Senate is voting on a 20-week abortion ban...
3,COMMENT,I stg they just pick a random number from a ha...,I stg they just pick a random number from a ha...,The Senate is voting on a 20-week abortion ban...
4,COMMENT,"Maybe you can't regulate evil, but it sure loo...","Maybe you can't regulate evil, but it sure loo...",The Senate is voting on a 20-week abortion ban...


In [0]:
def preprocess(s):
  # for details, see https://www.tensorflow.org/alpha/tutorials/sequences/nmt_with_attention
  s = ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')
  s = re.sub(r"([?.!,¿])", r" \1 ", s)
  s = re.sub(r'[" "]+', " ", s)
  s = re.sub(r"[^a-zA-Z?.!,¿]+", " ", s)
  s = s.strip()
  return s

In [0]:
df['Text'] = df['Text'].astype(str).apply(preprocess)

In [0]:
df['News headline'] = df['News headline'].astype(str).apply(preprocess)

In [0]:
# df['Label'].unique()

In [0]:
df.Label = df.Label.str.rstrip()

In [0]:
df_filtered = df[(df['Label'] != 'QUOTE') & (df['Label'] != 'I am disappointed in twitter censorship https://t.co/vFAEUUv70N') & (df.Label.notnull()) & (df['Label'] != 'COMMENT + RHET')]

In [56]:
df_filtered.Label.astype('category').cat.codes.unique()

array([0, 7, 1, 5, 8, 2, 4, 6, 3], dtype=int8)

In [57]:
df_filtered.Label.unique()

array(['COMMENT', 'SUBJ-ARTICLE', 'DIRECT', 'RHET', 'SUMMARY', 'HEADLINE',
       'NON-EN', 'SPAM', 'META'], dtype=object)

In [0]:
cat_codes = list(df_filtered.Label.astype('category').cat.codes.unique())

In [0]:
categories = list(df_filtered.Label.unique())

In [0]:
dict_cat = {}
for i in range(len(cat_codes)):
  dict_cat[cat_codes[i]] = categories[i]

In [61]:
dict_cat

{0: 'COMMENT',
 1: 'DIRECT',
 2: 'HEADLINE',
 3: 'META',
 4: 'NON-EN',
 5: 'RHET',
 6: 'SPAM',
 7: 'SUBJ-ARTICLE',
 8: 'SUMMARY'}

In [62]:
df_filtered['target'] = df_filtered.Label.astype('category').cat.codes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [63]:
df_filtered.head()

Unnamed: 0,Label,Text,Tweet,News headline,target
0,COMMENT,Ironic voxdotcom attempts to discredit science...,Ironic @voxdotcom attempts to discredit scienc...,The Senate is voting on a week abortion ban . ...,0
1,SUBJ-ARTICLE,reproductiverights abortionrights https t . co...,#reproductiverights #abortionrights https://t....,The Senate is voting on a week abortion ban . ...,7
2,COMMENT,i feel like this issue boils down to at what p...,i feel like this issue boils down to at what p...,The Senate is voting on a week abortion ban . ...,0
3,COMMENT,I stg they just pick a random number from a ha...,I stg they just pick a random number from a ha...,The Senate is voting on a week abortion ban . ...,0
4,COMMENT,"Maybe you can t regulate evil , but it sure lo...","Maybe you can't regulate evil, but it sure loo...",The Senate is voting on a week abortion ban . ...,0


In [0]:
X_train, X_test, y_train, y_test = train_test_split(df_filtered[['Text','News headline']], df_filtered['target'], test_size=0.2)

In [65]:
X_train.head()

Unnamed: 0,Text,News headline
526,If Twitter thinks talking about abortions is b...,Twitter Blocks Marsha Blackburn Senate Announc...
198,Irony Congress votes on wk abortion ban based ...,The Senate is voting on a week abortion ban . ...
218,Pregnant women should be able to decide what t...,The Senate is voting on a week abortion ban . ...
160,. . but HouseGOP is attacking women s health a...,The Senate is voting on a week abortion ban . ...
581,TN will stay Red ! ! https t . co E xjvvUtEO,Twitter Blocks Marsha Blackburn Senate Announc...


In [0]:
y_train = y_train.to_numpy()

In [70]:
bert_inputs = _get_inputs(X_train,tokenizer=bert_tokenizer_tfhub,_maxlen=200, verbose=1)

[CLS] if twitter thinks talking about abortion ##s is bad they re gonna be shocked when they hear what pp ##act has been up to https t co ty ##q ##f ##q ##k ##l ##wo ##c [SEP] twitter blocks marsh ##a blackburn senate announcement because of her pro life stance [SEP]
[CLS] irony congress votes on w ##k abortion ban based on un ##sc ##ient ##ific fetal pain claim lets insurance for m kids ex ##pire https t co vu ##vy ##z ##va ##x ##gy [SEP] the senate is voting on a week abortion ban opponents say it s basically relying on junk science [SEP]
[CLS] pregnant women should be able to decide what to with their bodies [SEP] the senate is voting on a week abortion ban opponents say it s basically relying on junk science [SEP]


### Model Creation

In [0]:
def build_model_fullyconnected(MAX_SEQUENCE_LENGTH = 200):
    """add pretrained bert model as a keras layer"""
    input_word_ids = Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32, name='input_word_ids')
    input_masks = Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32, name='input_masks')
    input_segments = Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32, name='input_segments')
    sout, _word_emb = bert_layer([input_word_ids, input_masks, input_segments])
    X= Dense(100, activation='relu')(sout) 
    X= Dense(64, activation='relu')(X) 
    # X = GlobalAveragePooling1D()(X)
    output_= Dense(len(categories), activation='softmax', name='output')(X)

    #model = Model(input_,output_)
    model = Model([input_word_ids, input_masks, input_segments],output_)
    print(model.summary())

    return model

In [0]:
model1 = model1 = build_model_fullyconnected()

model1.compile(optimizer = "adam",loss='sparse_categorical_crossentropy',
              metrics=['acc'])

earlyStopping = tf.keras.callbacks.EarlyStopping(monitor='acc', patience=20, verbose=0)
cp_save = tf.keras.callbacks.ModelCheckpoint('model-e{epoch:03d}.ckpt', 
                                             save_best_only=True, monitor='acc', mode='min')

history1 = model1.fit(bert_inputs, y_train, epochs=50, verbose=2, 
                     callbacks=[earlyStopping, cp_save] )
                    #  ,validation_split=0.2 )

Model: "model_15"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 200)]        0                                            
__________________________________________________________________________________________________
input_masks (InputLayer)        [(None, 200)]        0                                            
__________________________________________________________________________________________________
input_segments (InputLayer)     [(None, 200)]        0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        [(None, 768), (None, 109482241   input_word_ids[0][0]             
                                                                 input_masks[0][0]         







537/537 - 463s - loss: 1.6896 - acc: 0.5047
Epoch 2/50








537/537 - 456s - loss: 1.4914 - acc: 0.5400
Epoch 3/50








537/537 - 463s - loss: 1.4884 - acc: 0.5400
Epoch 4/50








537/537 - 460s - loss: 1.4813 - acc: 0.5400
Epoch 5/50








537/537 - 463s - loss: 1.4841 - acc: 0.5400
Epoch 6/50








537/537 - 463s - loss: 1.4727 - acc: 0.5400
Epoch 7/50








537/537 - 463s - loss: 1.4733 - acc: 0.5400
Epoch 8/50








537/537 - 457s - loss: 1.4772 - acc: 0.5400
Epoch 9/50








537/537 - 458s - loss: 1.4832 - acc: 0.5400
Epoch 10/50








537/537 - 457s - loss: 1.4870 - acc: 0.5400
Epoch 11/50








537/537 - 463s - loss: 1.4734 - acc: 0.5400
Epoch 12/50








537/537 - 468s - loss: 1.4725 - acc: 0.5400
Epoch 13/50








537/537 - 468s - loss: 1.4683 - acc: 0.5400
Epoch 14/50








537/537 - 480s - loss: 1.4616 - acc: 0.5400
Epoch 15/50








537/537 - 481s - loss: 1.4701 - acc: 0.5400
Epoch 16/50








537/537 - 480s - loss: 1.4610 - acc: 0.5400
Epoch 17/50








537/537 - 477s - loss: 1.4645 - acc: 0.5400
Epoch 18/50








537/537 - 481s - loss: 1.4680 - acc: 0.5400
Epoch 19/50








537/537 - 481s - loss: 1.4693 - acc: 0.5400
Epoch 20/50








537/537 - 479s - loss: 1.4607 - acc: 0.5400
Epoch 21/50








537/537 - 487s - loss: 1.4578 - acc: 0.5400
Epoch 22/50








537/537 - 490s - loss: 1.4550 - acc: 0.5400
Epoch 23/50








537/537 - 484s - loss: 1.4625 - acc: 0.5400
Epoch 24/50








537/537 - 484s - loss: 1.4552 - acc: 0.5400
Epoch 25/50








537/537 - 489s - loss: 1.4677 - acc: 0.5400
Epoch 26/50








537/537 - 485s - loss: 1.4508 - acc: 0.5400
Epoch 27/50








537/537 - 486s - loss: 1.4453 - acc: 0.5400
Epoch 28/50








537/537 - 487s - loss: 1.4544 - acc: 0.5400
Epoch 29/50








537/537 - 488s - loss: 1.4537 - acc: 0.5400
Epoch 30/50








537/537 - 488s - loss: 1.4510 - acc: 0.5400
Epoch 31/50








537/537 - 493s - loss: 1.4450 - acc: 0.5400
Epoch 32/50








537/537 - 493s - loss: 1.4497 - acc: 0.5400
Epoch 33/50








537/537 - 494s - loss: 1.4413 - acc: 0.5400
Epoch 34/50








537/537 - 492s - loss: 1.4439 - acc: 0.5400
Epoch 35/50








537/537 - 487s - loss: 1.4399 - acc: 0.5400
Epoch 36/50








537/537 - 490s - loss: 1.4433 - acc: 0.5400
Epoch 37/50








537/537 - 491s - loss: 1.4378 - acc: 0.5400
Epoch 38/50








537/537 - 484s - loss: 1.4422 - acc: 0.5400
Epoch 39/50








537/537 - 483s - loss: 1.4465 - acc: 0.5400
Epoch 40/50








537/537 - 478s - loss: 1.4443 - acc: 0.5400
Epoch 41/50








537/537 - 475s - loss: 1.4406 - acc: 0.5400
Epoch 42/50


In [0]:
# type(y_train)

In [0]:
def build_model_bertembed(MAX_SEQUENCE_LENGTH = 200):

    input_ = Input(shape = (768), name='bert_enconding')
    X= Dense(100, activation='relu')(input_) 
    X= Dense(64, activation='relu')(X) 
    # X = GlobalAveragePooling1D()(X)
    output_= Dense(len(categories), activation='softmax', name='output')(X)
    
    model = Model(input_,output_)
    print(model.summary())
    return model

In [0]:
Xtr_bert,_ = bert_layer(bert_inputs)

In [73]:
Xtr_bert.shape

TensorShape([537, 768])

In [74]:
model2 = build_model_bertembed()

model2.compile(optimizer = "adam",loss='sparse_categorical_crossentropy',
              metrics=['acc'])

earlyStopping = tf.keras.callbacks.EarlyStopping(monitor='acc', patience=100, verbose=0)
cp_save = tf.keras.callbacks.ModelCheckpoint('model-e{epoch:03d}.ckpt', 
                                             save_best_only=True, monitor='acc', mode='min')

history2 = model2.fit(Xtr_bert, y_train, epochs=200, verbose=2, 
                     callbacks=[earlyStopping, cp_save])
                      # , validation_split=0.2 )

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert_enconding (InputLayer)  [(None, 768)]             0         
_________________________________________________________________
dense (Dense)                (None, 100)               76900     
_________________________________________________________________
dense_1 (Dense)              (None, 64)                6464      
_________________________________________________________________
output (Dense)               (None, 9)                 585       
Total params: 83,949
Trainable params: 83,949
Non-trainable params: 0
_________________________________________________________________
None
Train on 537 samples
Epoch 1/200
Instructions for updating:
If using Keras pass *_constraint arguments to layers.


Instructions for updating:
If using Keras pass *_constraint arguments to layers.


INFO:tensorflow:Assets written to: model-e001.ckpt/assets


INFO:tensorflow:Assets written to: model-e001.ckpt/assets


537/537 - 1s - loss: 1.6001 - acc: 0.5121
Epoch 2/200
537/537 - 0s - loss: 1.5203 - acc: 0.5345
Epoch 3/200
537/537 - 0s - loss: 1.5394 - acc: 0.5345
Epoch 4/200
537/537 - 0s - loss: 1.5213 - acc: 0.5345
Epoch 5/200
537/537 - 0s - loss: 1.5030 - acc: 0.5345
Epoch 6/200
537/537 - 0s - loss: 1.5061 - acc: 0.5345
Epoch 7/200
537/537 - 0s - loss: 1.5010 - acc: 0.5345
Epoch 8/200
537/537 - 0s - loss: 1.4922 - acc: 0.5345
Epoch 9/200
537/537 - 0s - loss: 1.4982 - acc: 0.5345
Epoch 10/200
537/537 - 0s - loss: 1.5037 - acc: 0.5345
Epoch 11/200
537/537 - 0s - loss: 1.4942 - acc: 0.5345
Epoch 12/200
537/537 - 0s - loss: 1.4898 - acc: 0.5345
Epoch 13/200
537/537 - 0s - loss: 1.4904 - acc: 0.5345
Epoch 14/200
537/537 - 0s - loss: 1.5092 - acc: 0.5345
Epoch 15/200
537/537 - 0s - loss: 1.4973 - acc: 0.5345
Epoch 16/200
537/537 - 0s - loss: 1.4971 - acc: 0.5345
Epoch 17/200
537/537 - 0s - loss: 1.4846 - acc: 0.5345
Epoch 18/200
537/537 - 0s - loss: 1.4823 - acc: 0.5345
Epoch 19/200
537/537 - 0s - los

In [78]:
test_bert_inputs = _get_inputs(X_test,tokenizer=bert_tokenizer_tfhub,_maxlen=200, verbose=1)

[CLS] guess we shouldn t be surprised that fake ##ne ##ws liberals prefer to hide the truth https t co w ##dm w ##p ##b r ##j [SEP] twitter blocks marsh ##a blackburn senate announcement because of her pro life stance [SEP]
[CLS] ce ##rno ##vich hi ##ck ##field twitter blocks marsh ##a blackburn senate announcement because of her pro life stance https t co o v g ##dy ##r [SEP] twitter blocks marsh ##a blackburn senate announcement because of her pro life stance [SEP]
[CLS] doesn t matter now the bill is passed pro ##life https t co p ##x ##oo ##c ##gs ##h z [SEP] the senate is voting on a week abortion ban opponents say it s basically relying on junk science [SEP]


In [0]:
Xtst_bert,_ = bert_layer(test_bert_inputs)

In [0]:
y_test = y_test.to_numpy()

In [81]:
model2.evaluate(Xtst_bert, y_test)



[1.426576895183987, 0.5555556]

In [98]:
model2.predict(Xtst_bert[:2])

array([[0.45591283, 0.12246379, 0.13989314, 0.00906001, 0.01216222,
        0.10056213, 0.00885698, 0.06756611, 0.0835228 ],
       [0.44960558, 0.11820823, 0.14978886, 0.00860436, 0.01341835,
        0.0955203 , 0.00900478, 0.06873979, 0.08710971]], dtype=float32)

In [0]:
def return_labels(np_arr):
  return [dict_cat[idx] for idx in list(np_arr)]

def prediction_labels(mdl, np_arr):
  argmax_output = [np.argmax(lst) for lst in list(mdl.predict(np_arr))]
  return return_labels(argmax_output)

In [114]:
X_test[:4]

Unnamed: 0,Text,News headline
386,Guess we shouldn t be surprised that fakenews liberals prefer to hide the truth https t . co wdM WPB rJ,Twitter Blocks Marsha Blackburn Senate Announcement Because of Her Pro Life Stance
457,Cernovich hickfield Twitter Blocks Marsha Blackburn Senate Announcement Because of Her Pro Life Stance https t . co o v GDYR,Twitter Blocks Marsha Blackburn Senate Announcement Because of Her Pro Life Stance
52,Doesn t matter now . . . the bill is PASSED ! ProLife https t . co pxOocGSh z,The Senate is voting on a week abortion ban . Opponents say it s basically relying on junk science .


In [117]:
print('Actual Label -', return_labels(y_test[:4]))

Actual Label - ['COMMENT', 'HEADLINE', 'COMMENT', 'DIRECT']


In [120]:
print('Predicted Label -', prediction_labels(model2, Xtst_bert[:4]))

Predicted Label - ['COMMENT', 'COMMENT', 'COMMENT', 'COMMENT']


In [0]:
# def convert_sentence_to_features(sentence, tokenizer, max_seq_len):
#     tokens = ['[CLS]']
#     tokens.extend(tokenizer.tokenize(sentence))
#     if len(tokens) > max_seq_len-1:
#         tokens = tokens[:max_seq_len-1]
#     tokens.append('[SEP]')
    
#     segment_ids = [0] * len(tokens)
#     input_ids = tokenizer.convert_tokens_to_ids(tokens)
#     input_mask = [1] * len(input_ids)

#     #Zero Mask till seq_length
#     zero_mask = [0] * (max_seq_len-len(tokens))
#     input_ids.extend(zero_mask)
#     input_mask.extend(zero_mask)
#     segment_ids.extend(zero_mask)
    
#     return input_ids, input_mask, segment_ids

# def convert_sentences_to_features(sentences, tokenizer, max_seq_len=20):
#     all_input_ids = []
#     all_input_mask = []
#     all_segment_ids = []
    
#     for sentence in sentences:
#         input_ids, input_mask, segment_ids = convert_sentence_to_features(sentence, tokenizer, max_seq_len)
#         all_input_ids.append(input_ids)
#         all_input_mask.append(input_mask)
#         all_segment_ids.append(segment_ids)
    
#     return all_input_ids, all_input_mask, all_segment_ids