# BERT & TF

In [1]:
import tensorflow as tf
import pandas as pd
import tensorflow_hub as hub
import os
import re
import numpy as np
import pickle as pk
import string
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer
from bert.tokenization import FullTokenizer
from tqdm import tqdm_notebook
from collections import defaultdict
from tensorflow.keras import backend as K
from sklearn.model_selection import train_test_split

# Params for bert model and tokenization
bert_path = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"

### Load Data - Latest Data Split - Blind

In [2]:
data_train = pd.read_csv("Data/latest/articles_train.csv")
data_test = pd.read_csv("Data/latest/articles_test.csv")

In [3]:
data_train.head()

Unnamed: 0,content_id,month,day,year,date,content_source_desc,content_title_clean,content_body_clean,blind_mean_rating,blind_rating_count,blind_ratings
0,2932,11,2,2017,2017-11-02,The New York Times,A Tax Cut That Lifts the Economy? Opinions Are...,Yet if the House plan resolves some longstandi...,3.177778,45,"[4.5, 1.5, 0.5, 4.5, 1.0, 4.0, 3.5, 3.5, 1.5, ..."
1,2870,11,1,2017,2017-11-01,Fox News,"Tom Tancredo enters Colorado governor's race, ...",Former U.S. Rep. Tom Tancredo announced Tuesda...,2.375,16,"[3.5, 4.0, 3.0, 2.5, 0.5, 3.0, 0.5, 0.5, 4.5, ..."
2,2869,11,1,2017,2017-11-01,The New York Times,Panel Recommends Opioid Solutions but Puts No ...,President Trump’s bipartisan commission on th...,3.916667,6,"[5.0, 4.0, 4.5, 3.5, 2.0, 4.5]"
3,2864,11,1,2017,2017-11-01,Fox News,"Trump vows to end non merit-base immigration, ...",President Trump vowed Wednesday to scrap the f...,2.1,5,"[2.0, 2.5, 4.0, 0.5, 1.5]"
4,2868,11,1,2017,2017-11-01,Breitbart,WATCH: Congress Holds Hearing on Banning Abort...,Congress will hold a hearing Wednesday on a bi...,2.428571,21,"[2.0, 1.0, 0.5, 1.0, 3.5, 5.0, 2.0, 0.5, 2.5, ..."


In [4]:
#clean data
def clean_data(text):
    #remove punctuation, digits, extra stuff. make lowercase
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = re.sub('[‘’“”…]', '', text)
    #lemma it - include POS tag in order to lemma it better
    tag_map = defaultdict(lambda : wordnet.NOUN)
    tag_map['J'] = wordnet.ADJ
    tag_map['V'] = wordnet.VERB
    tag_map['R'] = wordnet.ADV
    lemmatizer = WordNetLemmatizer()
    textTokens = word_tokenize(text)
    #remove stopwords
    word_tokens_nostop = [w for w in textTokens if not w in stopwords.words('english')] 
    #now lemma
    text = [lemmatizer.lemmatize(tok, tag_map[tag[0]]) for tok, tag in pos_tag(word_tokens_nostop)]
    return " ".join(text)

### Get clean body

In [5]:
clean_body_train = data_train["content_body_clean"].apply(clean_data)
y_train = data_train['blind_mean_rating']

In [6]:
clean_body_test = data_test["content_body_clean"].apply(clean_data)
y_test = data_test['blind_mean_rating']

### Split train and val

In [8]:
X_train, X_val, y_train_t, y_val = train_test_split(clean_body_train, y_train, test_size=0.15, shuffle=True, random_state=3)

In [17]:
y_train_std = np.array(list(y_train_t.div(5)))
y_val_std = np.array(list(y_val.div(5)))
y_test_std = np.array(list(y_test.div(5)))

### Word2Vec Embeddings

In [55]:
EMBED_DIM = 64

In [56]:
train_body = []
articles = list(clean_body_train)

for article in articles:
    art_tokens = word_tokenize(article)
    train_body.append(art_tokens)

In [57]:
len(train_body)

1301

In [58]:
import gensim

# train word2vec model
modelw2 = gensim.models.Word2Vec(sentences=train_body, size=EMBED_DIM, window=5, workers=4, min_count=1)
# vocab size
total_words = list(modelw2.wv.vocab)
print("Vocab size",len(total_words))

Vocab size 29377


In [59]:
modelw2.wv.most_similar('president')

[('donald', 0.9432460069656372),
 ('realdonaldtrump', 0.9398722648620605),
 ('barack', 0.9383809566497803),
 ('crosshairs', 0.9366528391838074),
 ('jinping', 0.9324452877044678),
 ('obama', 0.9309636950492859),
 ('emmanuel', 0.9299267530441284),
 ('mr', 0.9168338775634766),
 ('eldest', 0.916319727897644),
 ('sought', 0.9155704975128174)]

In [60]:
modelw2.wv.most_similar_cosmul(positive=['health','president'], negative=['trump'])

[('faithfully', 0.9857660531997681),
 ('intensive', 0.9810300469398499),
 ('stafford', 0.969285249710083),
 ('act', 0.9682179689407349),
 ('affordable', 0.9674921035766602),
 ('nanoscale', 0.9595308899879456),
 ('care', 0.9423438310623169),
 ('inhibitor', 0.9407963156700134),
 ('dirtcheap', 0.9346645474433899),
 ('public', 0.93406081199646)]

In [61]:
modelw2.wv.save_word2vec_format('Data/word2vec_embed.txt',binary=False)

In [62]:
embeddings_index = {}
f = open(os.path.join('Data','word2vec_embed.txt'), encoding="utf-8")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:])
    embeddings_index[word] = coefs
f.close()

### Tokenize, Sequence, Pad

In [63]:
VOCAB_SIZE = 29377

tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(clean_body_train)

# Note, the tokenizer's word_index will not respect VOCAB_SIZE.
# but, that parameter will be respected in later methods,
# (for example, when you call text_to_sequences).
# Also note that '0' is a reserved index for padding.
word_index = tokenizer.word_index
print("Word index", len(word_index))

Word index 29377


In [64]:
# Use the texts_to_sequences utility to vectorize your training, 
# validation, and test questions. 
sequences_train = tokenizer.texts_to_sequences(X_train)
sequences_val = tokenizer.texts_to_sequences(X_val)
sequences_test = tokenizer.texts_to_sequences(clean_body_test)

### Choose max sequence length

In [65]:
train_word_lengths = []
for w in sequences_train:
    train_word_lengths.append(len(w))
words_length = np.array(train_word_lengths)

print("# of Words in the 90 percentile:",np.percentile(words_length, 90))
print("# of Words in the 95 percentile:",np.percentile(words_length, 95))
print("# of Words in the 99 percentile:",np.percentile(words_length, 99))
print("# of Words in the 100 percentile:",np.percentile(words_length, 100))

# of Words in the 90 percentile: 764.6
# of Words in the 95 percentile: 1103.6
# of Words in the 99 percentile: 1894.2000000000035
# of Words in the 100 percentile: 9317.0


In [45]:
MAX_SEQ_LEN = 2000

### Pad

In [66]:
padded_train = tf.keras.preprocessing.sequence.pad_sequences(sequences_train, maxlen=MAX_SEQ_LEN)
padded_val = tf.keras.preprocessing.sequence.pad_sequences(sequences_val, maxlen=MAX_SEQ_LEN)
padded_test = tf.keras.preprocessing.sequence.pad_sequences(sequences_test, maxlen=MAX_SEQ_LEN)

In [67]:
len(padded_train[0])

2000

### Map embeddings to tokenizer

In [68]:
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words,EMBED_DIM))

for word, i in word_index.items():
    if i > num_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be 0s
        embedding_matrix[i] = embedding_vector

### Initial LSTM model

In [77]:
from tensorflow.keras.layers import Dense, Embedding, Flatten, Input, LSTM, GRU, Bidirectional
#from tensorflow.keras.layers.embeddings import Embedding
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.initializers import Constant

In [78]:
model = Sequential()
embedding_layer = Embedding(input_dim=num_words,
                            output_dim=EMBED_DIM,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=MAX_SEQ_LEN,
                            trainable=False
                           )
model.add(embedding_layer)
model.add(Bidirectional(LSTM(64)))
#model.add(GRU(units=32, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(64, activation="relu"))
model.add(Dense(1, activation="sigmoid"))

model.summary()

"""[
    tf.keras.layers.Embedding(input_dim=VOCAB_SIZE, output_dim=64, input_length=MAX_SEQ_LEN),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation="relu"),
    tf.keras.layers.Dense(1, activation="sigmoid")
]"""

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 2000, 64)          1880192   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 128)               66048     
_________________________________________________________________
dense_10 (Dense)             (None, 64)                8256      
_________________________________________________________________
dense_11 (Dense)             (None, 1)                 65        
Total params: 1,954,561
Trainable params: 74,369
Non-trainable params: 1,880,192
_________________________________________________________________


'[\n    tf.keras.layers.Embedding(input_dim=VOCAB_SIZE, output_dim=64, input_length=MAX_SEQ_LEN),\n    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),\n    tf.keras.layers.Dense(64, activation="relu"),\n    tf.keras.layers.Dense(1, activation="sigmoid")\n]'

In [79]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

earlyStopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=11, verbose=0, mode='min')
cp_save = tf.keras.callbacks.ModelCheckpoint('model-e{epoch:03d}.ckpt', 
                                             save_best_only=True, monitor='val_loss', mode='min')

In [None]:
history = model.fit(padded_train, y_train_std, 
                     epochs=100, 
                     verbose=2, 
                     callbacks=[earlyStopping,cp_save],
                     validation_data=(padded_val,y_val_std))

Train on 1105 samples, validate on 196 samples
Epoch 1/100
INFO:tensorflow:Assets written to: model-e001.ckpt\assets
1105/1105 - 482s - loss: 0.6612 - accuracy: 0.0326 - val_loss: 0.6717 - val_accuracy: 0.0102
Epoch 2/100


In [None]:
#plotting accuracy and loss as a function of epochs to ensure no overfitting
def plot(history):
    # The history object contains results on the training and test
    # sets for each epoch
    acc = history.history['accuracy']
    val_acc = history.history['val_accuracy']
    loss = history.history['loss']
    val_loss = history.history['val_loss']

    # Get the number of epochs
    epochs = range(len(acc))

    plt.title('Training and validation accuracy')
    plt.plot(epochs, acc, color='blue', label='Train')
    plt.plot(epochs, val_acc, color='orange', label='Val')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.legend()

    _ = plt.figure()
    plt.title('Training and validation loss')
    plt.plot(epochs, loss, color='blue', label='Train')
    plt.plot(epochs, val_loss, color='orange', label='Val')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()