In [2]:
!pip install -q tf-models-official==2.11.0

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
dask-cudf 21.12.2 requires cupy-cuda115, which is not installed.
cudf 21.12.2 requires cupy-cuda115, which is not installed.
tfx-bsl 1.12.0 requires google-api-python-client<2,>=1.7.11, but you have google-api-python-client 2.83.0 which is incompatible.
tfx-bsl 1.12.0 requires pyarrow<7,>=6, but you have pyarrow 5.0.0 which is incompatible.
tensorflow-transform 1.12.0 requires pyarrow<7,>=6, but you have pyarrow 5.0.0 which is incompatible.
onnx 1.13.1 requires protobuf<4,>=3.20.2, but you have protobuf 3.19.6 which is incompatible.
distributed 2021.11.2 requires dask==2021.11.2, but you have dask 2022.2.0 which is incompatible.
dask-cudf 21.12.2 requires dask<=2021.11.2,>=2021.11.1, but you have dask 2022.2.0 which is incompatible.
apache-beam 2.44.0 requires dill<0.3.2,>=0.3.1.1, but you have dill 0.3.6 whi

In [3]:
import os
import shutil

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization  # to create AdamW optimizer

import matplotlib.pyplot as plt

tf.get_logger().setLevel('ERROR')

In [4]:
import pandas as pd
from tensorflow import keras

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
def load_imdb():
  """Download the IMDB review archive and return it as train/test Series.

  The archive is fetched and unpacked relative to the current directory via
  ``tf.keras.utils.get_file``. Only the ``pos`` and ``neg`` sub-folders of
  ``train`` and ``test`` are read; the raw review text is returned unchanged
  (no HTML stripping happens here).

  Returns:
    Tuple ``(x_train, y_train, x_test, y_test)`` of pandas Series: review
    texts and integer labels (1 for "pos", 0 for "neg").
  """
  url = 'https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'

  dataset = tf.keras.utils.get_file('aclImdb_v1.tar.gz', url,
                                    untar=True, cache_dir='.',
                                    cache_subdir='')

  dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')
  train_dir = os.path.join(dataset_dir, 'train')
  test_dir = os.path.join(dataset_dir, 'test')

  # Remove the 'unsup' folder of the training split. Guarded so that a call
  # against a copy where the folder is already gone does not raise.
  remove_dir = os.path.join(train_dir, 'unsup')
  if os.path.isdir(remove_dir):
    shutil.rmtree(remove_dir)

  label2id = {"pos" : 1, "neg" : 0}
  df_train = _read_imdb_split(train_dir, label2id)
  df_test = _read_imdb_split(test_dir, label2id)

  x_train, y_train = df_train["text"], df_train["label"]
  x_test, y_test = df_test["text"], df_test["label"]

  return x_train, y_train, x_test, y_test


def _read_imdb_split(split_dir, label2id):
  """Read every file under ``split_dir/<label>`` into a text/label DataFrame."""
  rows = []
  for label, label_id in label2id.items():
    label_dir = os.path.join(split_dir, label)
    # os.listdir returns entries in arbitrary order; sort for a stable row order.
    for file_name in sorted(os.listdir(label_dir)):
      # Explicit encoding: without it decoding depends on the platform locale.
      with open(os.path.join(label_dir, file_name), 'r', encoding='utf-8') as f:
        rows.append([f.read(), label_id])
  return pd.DataFrame(rows, columns=['text', 'label'])

In [7]:
def load_fin(path='/kaggle/input/financial-sentiment-analysis/data.csv', random_state=42):
  """Load the financial sentiment CSV and split it into train and test sets.

  Args:
    path: Location of the CSV file; it must provide ``Sentence`` and
      ``Sentiment`` columns. Defaults to the Kaggle input path.
    random_state: Seed forwarded to ``train_test_split`` so the split is the
      same on every run. Pass ``None`` for a different split each time.

  Returns:
    Tuple ``(x_train, y_train, x_test, y_test)`` of pandas Series: sentences
    and integer labels (0 = negative, 1 = neutral, 2 = positive).

  Raises:
    KeyError: If a ``Sentiment`` value is not one of the three known labels.
  """
  df_raw = pd.read_csv(path)
  label2id = {"positive" : 2, "neutral" : 1, "negative" : 0}

  # assign() returns a new frame, leaving the freshly read data untouched.
  df_labeled = df_raw.assign(
      Sentiment=df_raw["Sentiment"].map(lambda name: label2id[name]))

  # Uses train_test_split's default test size; only the seed is fixed here.
  df_train, df_test = train_test_split(df_labeled, random_state=random_state)
  x_train, y_train = df_train["Sentence"], df_train["Sentiment"]
  x_test, y_test = df_test["Sentence"], df_test["Sentiment"]

  return x_train, y_train, x_test, y_test

In [8]:
def load_sst5():
    """Read the SST-5 train and test CSV files and return their columns.

    Both files are fetched with ``pd.read_csv`` (train first, then test) and
    must provide ``sentence`` and ``label`` columns.

    Returns:
        Tuple ``(x_train, y_train, x_test, y_test)`` of pandas Series.
    """
    train_url = 'https://raw.githubusercontent.com/christycty/sentiment-analysis-review/main/data/sst5_train.csv'
    test_url = 'https://raw.githubusercontent.com/christycty/sentiment-analysis-review/main/data/sst5_test.csv'

    # Collect (sentences, labels) for each split in the order the caller unpacks.
    series = []
    for csv_url in (train_url, test_url):
        frame = pd.read_csv(csv_url)
        series.append(frame["sentence"])
        series.append(frame["label"])

    return tuple(series)

In [9]:
def load_sst2():
    """Build a binary sentiment dataset from the SST-5 CSV files.

    Rows labelled 2 are dropped; labels 0 and 1 become 0, labels 3 and 4
    become 1. The original row index of the kept rows is preserved.

    Returns:
        Tuple ``(x_train, y_train, x_test, y_test)`` of pandas Series.

    Raises:
        KeyError: If a label other than 0-4 appears in the data.
    """
    # The binary task is derived from the five-class files, hence the sst5 URLs.
    train_url = 'https://raw.githubusercontent.com/christycty/sentiment-analysis-review/main/data/sst5_train.csv'
    test_url = 'https://raw.githubusercontent.com/christycty/sentiment-analysis-review/main/data/sst5_test.csv'

    df_train = _sst5_to_binary(pd.read_csv(train_url))
    df_test = _sst5_to_binary(pd.read_csv(test_url))

    x_train, y_train = df_train["sentence"], df_train["label"]
    x_test, y_test = df_test["sentence"], df_test["label"]

    return x_train, y_train, x_test, y_test


def _sst5_to_binary(df):
    """Drop label-2 rows and collapse the remaining labels to 0/1."""
    label2id = {0:0, 1:0, 3:1, 4:1}
    polar = df[df["label"] != 2]
    # assign() writes into a fresh copy; assigning a column on the filtered
    # frame directly is chained assignment and triggers SettingWithCopyWarning.
    return polar.assign(label=polar["label"].map(lambda x: label2id[x]))

In [10]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
import re

# Compiled once: preprocess() is applied to every row of a dataset.
_TAG_RE = re.compile(r'<[^>]+>')
_NON_ALPHA_RE = re.compile('[^a-zA-Z]')
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return the NLTK English stop-word set, loading it on first use only."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def preprocess(text_inp):
    """Normalise one text: strip tags, keep letters only, drop stop words.

    Steps, in order: markup of the form ``<...>` is replaced by a space,
    every character outside ``a-z``/``A-Z`` is replaced by a space, the text
    is lower-cased and split on whitespace, and tokens found in the NLTK
    English stop-word list are removed.

    Args:
        text_inp: The raw text (a ``str``).

    Returns:
        The remaining tokens joined by single spaces ("" if none remain).
    """
    # Replace tags with a space, not '': otherwise "good<br />bad" would be
    # glued into the single token "goodbad".
    text = _TAG_RE.sub(' ', text_inp)
    text = _NON_ALPHA_RE.sub(' ', text)

    # split() already collapses runs of whitespace, so no extra pass is needed.
    stopwords_set = _english_stopwords()
    return " ".join(w for w in text.lower().split() if w not in stopwords_set)

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [27]:
def build_bert(bert_url, preprocess_url, num_class, epochs=10, steps_per_epoch=500, init_lr=3e-5):
    """Build and compile a BERT text classifier from TF-Hub modules.

    The model maps raw strings to class probabilities: Hub preprocessing
    layer -> trainable Hub BERT encoder -> ``pooled_output`` -> Dense(64,
    relu) -> Dropout(0.2) -> output layer.

    Args:
        bert_url: TF-Hub handle of the BERT encoder (fine-tuned, trainable).
        preprocess_url: TF-Hub handle of the matching preprocessing model.
        num_class: Number of target classes. 2 yields a single sigmoid unit
            with binary cross-entropy; more than 2 yields a softmax layer
            with sparse categorical cross-entropy (integer labels).
        epochs: Number of training epochs the learning-rate schedule spans.
        steps_per_epoch: Optimizer steps per epoch. ``epochs *
            steps_per_epoch`` is handed to the optimizer as the total number
            of training steps, so it should match the real training run.
        init_lr: Initial learning rate passed to ``create_optimizer``.

    Returns:
        A compiled Keras model that reports the ``accuracy`` metric.

    Raises:
        ValueError: If ``num_class`` is smaller than 2.
    """
    if num_class < 2:
        # A one-unit softmax always outputs 1.0 and cannot learn anything.
        raise ValueError(f"num_class must be at least 2, got {num_class}")
    is_binary = (num_class == 2)

    text_input = keras.layers.Input(shape=(), dtype=tf.string, name='text')

    preprocess_layer = hub.KerasLayer(preprocess_url)
    preprocessed_input = preprocess_layer(text_input)

    bert_layer = hub.KerasLayer(bert_url, trainable=True)
    bert_output = bert_layer(preprocessed_input)

    net = bert_output['pooled_output']
    net = keras.layers.Dense(64, activation='relu')(net)
    net = keras.layers.Dropout(0.2)(net)

    # Output layer and loss are chosen together so they cannot get out of sync.
    if is_binary:
        net = keras.layers.Dense(1, activation='sigmoid')(net)
        loss = 'binary_crossentropy'
    else:
        net = keras.layers.Dense(num_class, activation='softmax')(net)
        loss = 'sparse_categorical_crossentropy'

    model = keras.models.Model(text_input, net)

    # Warm up over the first 10% of all training steps.
    num_train_steps = steps_per_epoch * epochs
    num_warmup_steps = int(0.1*num_train_steps)

    optimizer = optimization.create_optimizer(init_lr=init_lr,
                                              num_train_steps=num_train_steps,
                                              num_warmup_steps=num_warmup_steps,
                                              optimizer_type='adamw')

    model.compile(loss=loss, optimizer=optimizer, metrics=['accuracy'])
    return model

In [14]:
def train(model, x_train, y_train, x_val, y_val, epochs=10):
  """Fit ``model`` and report its loss and accuracy on the validation data.

  Args:
    model: Object providing Keras-style ``fit`` and ``evaluate`` methods;
      ``evaluate`` must return exactly two values (loss, accuracy).
    x_train, y_train: Training inputs and targets.
    x_val, y_val: Validation inputs and targets, used both during fitting and
      for the final evaluation printed by this function.
    epochs: Number of epochs passed to ``fit``.

  Returns:
    Whatever ``model.fit`` returns (the training history).
  """
  fit_history = model.fit(
      x_train,
      y_train,
      epochs=epochs,
      validation_data=(x_val, y_val),
  )

  # Score the fitted model once more on the validation data and print it.
  loss, accuracy = model.evaluate(x_val, y_val)
  print(f'Validation loss: {loss:.4f}, Validation accuracy: {accuracy:.4f}')

  return fit_history

In [30]:
def test_model(data, model_name):
    if data == "fin":
        x_train_raw, y_train_, x_test_raw, y_test = load_fin()
        num_classes = 3
        steps_per_epoch = 110
        
    elif data == "imdb":
        x_train_raw, y_train_, x_test_raw, y_test = load_imdb()
        num_classes = 2
        steps_per_epoch = 625
        
    elif data == "sst5":
        x_train_raw, y_train_, x_test_raw, y_test = load_sst5()
        num_classes = 5
        steps_per_epoch = 214
        
    elif data == "sst2":
        x_train_raw, y_train_, x_test_raw, y_test = load_sst2()
        num_classes = 2
        steps_per_epoch = 173
    
    x_train_ = x_train_raw.apply(preprocess)
    x_test = x_test_raw.apply(preprocess)
    y_train = y_train_
    
    if model_name == "sbert":
        model_url = 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-128_A-2/1'
        pre_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'
        epochs = 10
        
    elif model_name == 'bert':
        model_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3'
        pre_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'
        epochs = 5
    
    
    model = build_bert(model_url, pre_url, num_classes, epochs, steps_per_epoch)
    print(model.summary())
    hist = train(model, x_train, y_train, x_test, y_test, epochs)
    
    model.evaluate(x_test, y_test)
    hist_df = pd.DataFrame(hist.history)
    hist_df.to_csv(model_save + "_hist.csv")

In [None]:
# Fine-tune the small BERT encoder ('sbert') on the IMDB reviews.
# NOTE(review): stored as In [None] with no output, i.e. no recorded execution.
test_model('imdb', 'sbert')

In [31]:
# Fine-tune the L-12/H-768 BERT encoder ('bert') on binary SST-2.
test_model('sst2', 'bert')

(5536,) (1384,)
<keras.engine.functional.Functional object at 0x77a9630788d0>
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Validation loss: 0.7144, Validation accuracy: 0.8353


In [32]:
# Fine-tune the L-12/H-768 BERT encoder ('bert') on the IMDB reviews.
test_model('imdb', 'bert')

(20000,) (5000,)
<keras.engine.functional.Functional object at 0x77a9385c3550>
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Validation loss: 0.5547, Validation accuracy: 0.8902


In [33]:
# Fine-tune the L-12/H-768 BERT encoder ('bert') on five-class SST-5.
test_model('sst5', 'bert')

(6835,) (1709,)
<keras.engine.functional.Functional object at 0x77ad02068dd0>
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Validation loss: 1.5410, Validation accuracy: 0.4447


In [34]:
# Fine-tune the L-12/H-768 BERT encoder ('bert') on the three-class financial sentiment data.
test_model('fin', 'bert')

(3504,) (877,)
<keras.engine.functional.Functional object at 0x77ad05e03050>
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Validation loss: 0.6696, Validation accuracy: 0.7526


In [None]:
# NOTE(review): repeats the earlier test_model('imdb', 'bert') call and is stored
# as In [None] with no output — confirm whether this cell is still needed.
test_model('imdb', 'bert')

In [20]:
# Fine-tune the small BERT encoder ('sbert') on binary SST-2.
# NOTE(review): execution count 20 is lower than the counts of the cells above
# (31-34), so the output below was produced out of order — re-run to confirm.
test_model('sst2', 'sbert')

4588    0
3596    1
7045    0
7386    0
91      1
3171    0
3688    1
298     1
5903    0
2436    1
Name: label, dtype: int64 (5536,)
(5536,) (1384,)
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Validation loss: 1.2021, Validation accuracy: 0.7319
