### Import Libraries

In [1]:
import numpy as np
import pandas as pd
import keras
from keras import initializers
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional
from keras.layers import *
from keras.models import *
from keras import backend as K
import os
import shutil
import pathlib

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization  # to create AdamW optimizer
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import keras.backend as K
from sklearn.model_selection import train_test_split

!pip install tensorflow-addons
import tensorflow_addons as tfa
tf.get_logger().setLevel('ERROR')




[notice] A new release of pip available: 22.3 -> 22.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


### Load Data

In [2]:
df = pd.read_csv('data/labelled_sub.csv')

### Pre Processing

In [3]:
df.dropna(inplace = True)

In [4]:
#Setting label = 1 for all subjective data
for i in range(len(df['Label'])):
    if df['Label'][i] == -1:
        df['Label'][i] = 1
df = df.reset_index()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


### Making class weights equal

In [5]:
df['Label'].value_counts()

1    759
0    369
Name: Label, dtype: int64

In [6]:
pos = df[df['Label'] == 1]
neut = df[df['Label'] == 0]
l = df['Label'].value_counts()[0]
df = pd.concat([pos.head(l), neut])

In [7]:
df['Label'].value_counts()

1    369
0    369
Name: Label, dtype: int64

0 = Neutral

1 = Opinionated 

### Define Metrics

In [6]:
def recall_m(y_true, y_pred, threshold = 0.5):
#     y_pred = tf.convert_to_tensor(y_pred)
#     threshold = tf.cast(threshold, y_pred.dtype)
#     y_pred = tf.cast(y_pred > threshold, y_pred.dtype)
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred, threshold = 0.5):
#     y_pred = tf.convert_to_tensor(y_pred)
#     threshold = tf.cast(threshold, y_pred.dtype)
#     y_pred = tf.cast(y_pred > threshold, y_pred.dtype)
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred, threshold = 0.5):
#     y_pred = tf.convert_to_tensor(y_pred)
#     threshold = tf.cast(threshold, y_pred.dtype)
#     y_pred = tf.cast(y_pred > threshold, y_pred.dtype)
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [7]:
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
metrics = [tf.metrics.BinaryAccuracy(), f1_m, precision_m, recall_m]
callback = tf.keras.callbacks.EarlyStopping(monitor = 'val_f1_m', patience = 3)

### Train test split

In [9]:
X_train, X_test, y_train, y_test = train_test_split(df['Lemmatized_Text'], df['Label'], test_size=0.2, random_state=42)

In [10]:
train_dict = {
    'text': X_train,
    'target':y_train
}
train_df = pd.DataFrame(train_dict)

test_dict = {
    'text': X_test,
    'target':y_test
}
test_df = pd.DataFrame(test_dict)

### Encoding text

In [4]:
bert_model_name = 'small_bert/bert_en_uncased_L-4_H-512_A-8'  #@param ["bert_en_uncased_L-12_H-768_A-12", "bert_en_cased_L-12_H-768_A-12", "bert_multi_cased_L-12_H-768_A-12", "small_bert/bert_en_uncased_L-2_H-128_A-2", "small_bert/bert_en_uncased_L-2_H-256_A-4", "small_bert/bert_en_uncased_L-2_H-512_A-8", "small_bert/bert_en_uncased_L-2_H-768_A-12", "small_bert/bert_en_uncased_L-4_H-128_A-2", "small_bert/bert_en_uncased_L-4_H-256_A-4", "small_bert/bert_en_uncased_L-4_H-512_A-8", "small_bert/bert_en_uncased_L-4_H-768_A-12", "small_bert/bert_en_uncased_L-6_H-128_A-2", "small_bert/bert_en_uncased_L-6_H-256_A-4", "small_bert/bert_en_uncased_L-6_H-512_A-8", "small_bert/bert_en_uncased_L-6_H-768_A-12", "small_bert/bert_en_uncased_L-8_H-128_A-2", "small_bert/bert_en_uncased_L-8_H-256_A-4", "small_bert/bert_en_uncased_L-8_H-512_A-8", "small_bert/bert_en_uncased_L-8_H-768_A-12", "small_bert/bert_en_uncased_L-10_H-128_A-2", "small_bert/bert_en_uncased_L-10_H-256_A-4", "small_bert/bert_en_uncased_L-10_H-512_A-8", "small_bert/bert_en_uncased_L-10_H-768_A-12", "small_bert/bert_en_uncased_L-12_H-128_A-2", "small_bert/bert_en_uncased_L-12_H-256_A-4", "small_bert/bert_en_uncased_L-12_H-512_A-8", "small_bert/bert_en_uncased_L-12_H-768_A-12", "albert_en_base", "electra_small", "electra_base", "experts_pubmed", "experts_wiki_books", "talking-heads_base"]
map_name_to_handle = {'small_bert/bert_en_uncased_L-4_H-512_A-8': 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1'}
map_model_to_preprocess = {'small_bert/bert_en_uncased_L-4_H-512_A-8': 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'}
tfhub_handle_encoder = map_name_to_handle[bert_model_name]
tfhub_handle_preprocess = map_model_to_preprocess[bert_model_name]

print(f'BERT model selected           : {tfhub_handle_encoder}')
print(f'Preprocess model auto-selected: {tfhub_handle_preprocess}')

BERT model selected           : https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1
Preprocess model auto-selected: https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3


In [5]:
bert_preprocess_model = hub.KerasLayer(tfhub_handle_preprocess)

In [16]:
x_train = bert_preprocess_model(train_df['text'])['input_word_ids']
x_test = bert_preprocess_model(test_df['text'])['input_word_ids']

In [17]:
high = max(x_test[0])
for i in x_train:
    if max(i) > high:
        high = max(i)

for i in x_test:
    if max(i) > high:
        high = max(i)
print(high)

tf.Tensor(29593, shape=(), dtype=int32)


In [18]:
n_unique_words = high + 1 # cut texts after this number of words
maxlen = 256
batch_size = 64

### Padding Text

In [19]:
x = sequence.pad_sequences(x_train, maxlen=maxlen, padding='post', truncating='post')

x_test = sequence.pad_sequences(x_test, maxlen=maxlen, padding='post', truncating='post')

y = np.array(train_df['target']).reshape((-1,1))

y_test = np.array(test_df['target']).reshape((-1,1))

## Defining Models

### 1. Bi-LSTM

In [20]:
def build_LSTM():
    model = Sequential()
    model.add(Embedding(n_unique_words, 128, input_length=maxlen))
    model.add(Bidirectional(LSTM(64)))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=metrics) 
    return model

### Define Attention Class

In [21]:
class attention(Layer):
    def __init__(self, return_sequences=True):
        self.return_sequences = return_sequences

        super(attention,self).__init__()

    def build(self, input_shape):
        self.W=self.add_weight(name="att_weight", shape=(input_shape[-1],1),initializer="normal")
        self.b=self.add_weight(name="att_bias", shape=(input_shape[1],1), initializer="zeros")
        super(attention,self).build(input_shape)


    def call(self, x):
        e = K.tanh(K.dot(x,self.W)+self.b)
        a = K.softmax(e, axis=1)
        output = x*a
        if self.return_sequences:
            return output
        return K.sum(output, axis=1)

### 2. Attention based Bi-LSTM

In [22]:
def build_Att_Bi_LSTM():
    model = Sequential()
    model.add(Embedding(n_unique_words, 64, input_length=maxlen))
    model.add(Bidirectional(LSTM(32,return_sequences=True)))
    model.add(attention(return_sequences=False))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=metrics) 
    return model

## Training and Evaluation

In [23]:
checkpoint_path = "weights/att_subjectivity.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

# Create a callback that saves the model's weights
cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                 save_weights_only=True,
                                                 verbose=1)

### LSTM

In [24]:
model = build_LSTM()
model.summary()
X_train, X_val, y_train, y_val = train_test_split(x, y, test_size=0.5, random_state=22)
model.fit(X_train, y_train, epochs=8, validation_data = [X_val, y_val])

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 256, 128)          3788032   
                                                                 
 bidirectional (Bidirectiona  (None, 128)              98816     
 l)                                                              
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 3,886,977
Trainable params: 3,886,977
Non-trainable params: 0
_________________________________________________________________
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x28b971a4630>

In [25]:
model.evaluate(x_test, y_test)



[0.6280935406684875,
 0.6216216087341309,
 0.611852765083313,
 0.6799784302711487,
 0.5704761743545532]

### Attention based Bi-LSTM

In [26]:
tf.config.run_functions_eagerly(True)

In [27]:
model = build_Att_Bi_LSTM()
model.summary()
X_train, X_val, y_train, y_val = train_test_split(x, y, test_size=0.5, random_state=15)
model.fit(X_train, y_train, epochs=12, validation_data = [X_val, y_val], callbacks = callback)

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 256, 64)           1894016   
                                                                 
 bidirectional_1 (Bidirectio  (None, 256, 64)          24832     
 nal)                                                            
                                                                 
 attention (attention)       (None, 64)                320       
                                                                 
 dense_1 (Dense)             (None, 1)                 65        
                                                                 
Total params: 1,919,233
Trainable params: 1,919,233
Non-trainable params: 0
_________________________________________________________________
Epoch 1/12


  "Even though the `tf.config.experimental_run_functions_eagerly` "


Epoch 2/12
Epoch 3/12
Epoch 4/12


<keras.callbacks.History at 0x28b97e0cd68>

In [28]:
model.evaluate(x_test, y_test)



[0.6895248889923096,
 0.6689189076423645,
 0.7298630475997925,
 0.6430768966674805,
 0.8507143259048462]

loss: 0.6196 - binary_accuracy: 0.6892 - f1_m: 0.7617 - precision_m: 0.6551 - recall_m: 0.9233

In [34]:
def compute_class_weights(labels):

    # Count number of postive and negative bags.
    negative_count = len(np.where(labels == 0)[0])
    positive_count = len(np.where(labels == 1)[0])
    total_count = negative_count + positive_count

    # Build class weight dictionary.
    return {
        0: (1 / negative_count) * (total_count / 2),
        1: (1 / positive_count) * (total_count / 2),
    }
