In [1]:
import tensorflow as tf
import pandas as pd
import os
import shutil
from sklearn.model_selection import train_test_split
import numpy as np

In [2]:
def load_imdb():
  # download dataset
  url = 'https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'

  dataset = tf.keras.utils.get_file('aclImdb_v1.tar.gz', url,
                                    untar=True, cache_dir='.',
                                    cache_subdir='')

  dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')
  train_dir = os.path.join(dataset_dir, 'train')
  test_dir = os.path.join(dataset_dir, 'test')
  
  # remove irrelevant data
  remove_dir = os.path.join(train_dir, 'unsup')
  shutil.rmtree(remove_dir)

  # load to dataframes
  train_lst, test_lst = [], []
  label2id = {"pos" : 1, "neg" : 0}

  for label in ['pos', 'neg']:
    path = train_dir + "/" + label
    files = os.listdir(path)
    for _file in files:
      with open(os.path.join(path, _file), 'r') as f:
        # strip <br /> tags
        text = f.read()
        train_lst.append([text, label2id[label]])
    
    path = test_dir + "/" + label
    files = os.listdir(path)
    for _file in files:
      with open(os.path.join(path, _file), 'r') as f:
        text = f.read()
        test_lst.append([text, label2id[label]])
    
  df_train = pd.DataFrame(train_lst, columns=['text', 'label'])
  df_test  = pd.DataFrame(test_lst, columns=['text', 'label'])
  x_train, y_train = df_train["text"], df_train["label"]
  x_test, y_test = df_test["text"], df_test["label"]

  return x_train, y_train, x_test, y_test

In [3]:
def load_fin():
  # download dataset
  url = '/kaggle/input/financial-sentiment-analysis/data.csv'

  # load to dataframes
  df_raw = pd.read_csv(url)
  label2id = {"positive" : 2, "neutral" : 1, "negative" : 0}
  df_raw["Sentiment"] = df_raw["Sentiment"].apply(lambda x : label2id[x])
    
  df_train, df_test = train_test_split(df_raw)
  x_train, y_train = df_train["Sentence"], df_train["Sentiment"]
  x_test, y_test = df_test["Sentence"], df_test["Sentiment"]

  return x_train, y_train, x_test, y_test

In [4]:
def load_sst5():
    train_url = 'https://raw.githubusercontent.com/christycty/sentiment-analysis-review/main/data/sst5_train.csv'
    test_url = 'https://raw.githubusercontent.com/christycty/sentiment-analysis-review/main/data/sst5_test.csv'
    
    df_train = pd.read_csv(train_url)
    df_test = pd.read_csv(test_url)
    
    x_train, y_train = df_train["sentence"], df_train["label"]
    x_test, y_test = df_test["sentence"], df_test["label"]

    return x_train, y_train, x_test, y_test

In [5]:
def load_sst2():
    train_url = 'https://raw.githubusercontent.com/christycty/sentiment-analysis-review/main/data/sst5_train.csv'
    test_url = 'https://raw.githubusercontent.com/christycty/sentiment-analysis-review/main/data/sst5_test.csv'
    
    df_train = pd.read_csv(train_url)
    df_test = pd.read_csv(test_url)
    
    # remove neutral
    df_train = df_train[df_train["label"] != 2]
    df_test = df_test[df_test["label"] != 2]
    
    # map to positive or negative
    label2id = {0:0, 1:0, 3:1, 4:1}
    df_train["label"] = df_train["label"].apply(lambda x : label2id[x])
    df_test["label"] = df_test["label"].apply(lambda x : label2id[x])
    
    x_train, y_train = df_train["sentence"], df_train["label"]
    x_test, y_test = df_test["sentence"], df_test["label"]

    return x_train, y_train, x_test, y_test

In [6]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
import tensorflow_hub as hub

def get_embeddings(x_train, output_len=128):
    embed = TextVectorization(max_tokens=10000, output_mode='int', 
                                        output_sequence_length=output_len)
    embed.adapt(x_train)
    return embed

In [7]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
import re

def preprocess(text_inp):
    TAG_RE = re.compile(r'<[^>]+>')

    text = TAG_RE.sub('', text_inp)
    text = re.sub('[^a-zA-Z]', ' ', text) # non alphabets
    text = re.sub(r'\s+', ' ', text)  # multiple space
    
    # stopwords
    text = text.lower().split()
    stopwords_set = set(stopwords.words('english'))
    text = [x for x in text if x not in stopwords_set]
    return " ".join(text)

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers

In [15]:
def build_cnn(num_class, in_len=128):
    model = Sequential([
        tf.keras.layers.Embedding(10000, 16, input_length=in_len),
        tf.keras.layers.Conv1D(filters = 32, kernel_size = 7, activation = "relu"),
        tf.keras.layers.MaxPooling1D(5),
        tf.keras.layers.Conv1D(filters = 32, kernel_size = 7, activation = "relu"),
        tf.keras.layers.MaxPooling1D(3),
        tf.keras.layers.Dense(32, activation='relu'),
        tf.keras.layers.Dropout(0.5),
    ])
    
    if (num_class == 2):
        net = tf.keras.layers.Dense(1, activation='sigmoid')
    else:
        net = tf.keras.layers.Dense(num_class, activation='softmax')
    model.add(net)
    
    if num_class == 2:
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    else:
        model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [10]:
def build_rnn(num_class, in_len=128):
    model = Sequential([
        tf.keras.layers.Embedding(10000, 16, input_length=in_len),
        tf.keras.layers.SimpleRNN(64, dropout=0.5),
        tf.keras.layers.Dense(32, activation='relu'),
        tf.keras.layers.Dropout(0.5),
    ])
    if (num_class == 2):
        net = tf.keras.layers.Dense(1, activation='sigmoid')
    else:
        net = tf.keras.layers.Dense(num_class, activation='softmax')
    model.add(net)
    
    if num_class == 2:
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    else:
        model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [11]:
def build_lstm(num_class, in_len=128):
    model = Sequential([
        tf.keras.layers.Embedding(10000, 16, input_length=in_len),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, dropout=0.5)),
        tf.keras.layers.Dense(32, activation='relu'),
        tf.keras.layers.Dropout(0.5),
    ])
    
    if (num_class == 2):
        net = tf.keras.layers.Dense(1, activation='sigmoid')
    else:
        net = tf.keras.layers.Dense(num_class, activation='softmax')
    model.add(net)
    
    if num_class == 2:
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    else:
        model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model
  

In [12]:
def train(model, x_train, y_train, x_val, y_val, epochs=10):
  # Train the model
  history = model.fit(x_train, y_train, epochs=epochs,
                      validation_data=(x_val, y_val))

  # Evaluate the model on the validation set
  loss, accuracy = model.evaluate(x_val, y_val)
  print(f'Validation loss: {loss:.4f}, Validation accuracy: {accuracy:.4f}')
  return history

In [1]:
def test_model(data, model_name):
    if data == "fin":
        x_train_raw, y_train_, x_test_raw, y_test = load_fin()
        num_classes = 3
    elif data == "imdb":
        x_train_raw, y_train_, x_test_raw, y_test = load_imdb()
        num_classes = 2
    elif data == "sst5":
        x_train_raw, y_train_, x_test_raw, y_test = load_sst5()
        num_classes = 5
    elif data == "sst2":
        x_train_raw, y_train_, x_test_raw, y_test = load_sst2()
        num_classes = 2
    
    x_train_ = x_train_raw.apply(preprocess)
    x_test_ = x_test_raw.apply(preprocess)
    
    maxlen = 500
    embed = get_embeddings(x_train_)
    x_train_ = embed(x_train_)
    x_test = embed(x_test_)
    
    x_train, x_val, y_train, y_val = train_test_split(x_train_.numpy(), y_train_, test_size=0.2)
    
    print(y_train[:10])
    print(x_train.shape, x_val.shape)
    
    if model_name == "rnn":
        model = build_rnn(num_classes)
    elif model_name == "lstm":
        model = build_lstm(num_classes)
    elif model_name == "cnn":
        model = build_cnn(num_classes)
        
    print(model)
        
    hist = train(model, x_train, y_train, x_val, y_val)
    
    model.evaluate(x_test, y_test)
    model_save = f"/kaggle/working/{data}_{model_name}"
    
#     model.save(model_save)
#     shutil.make_archive(model_save, 'zip', "/kaggle/working")
    
    hist_df = pd.DataFrame(hist.history)
    hist_df.to_csv(model_save + "_hist.csv")

In [17]:
test_model('imdb', 'cnn')

19970    0
12531    0
23754    0
14206    0
17305    0
11094    1
15485    0
17552    0
15691    0
7438     1
Name: label, dtype: int64
(20000, 128) (5000, 128)
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Validation loss: 1.4614, Validation accuracy: 0.7333


In [20]:
test_model('sst2', 'cnn')

3831    1
7819    0
604     1
8542    0
8313    1
2079    1
1743    1
1771    1
5177    0
1263    1
Name: label, dtype: int64
(5536, 128) (1384, 128)
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Validation loss: 0.9228, Validation accuracy: 0.5476


In [24]:
test_model('fin', 'cnn')

3039    2
629     1
3232    2
334     1
4139    0
1421    1
24      1
1911    0
2193    1
4391    0
Name: Sentiment, dtype: int64
(3504, 128) (877, 128)
Epoch 1/10


InvalidArgumentError: Graph execution error:

Detected at node 'sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/assert_equal_1/Assert/Assert' defined at (most recent call last):
    File "/opt/conda/lib/python3.7/runpy.py", line 193, in _run_module_as_main
      "__main__", mod_spec)
    File "/opt/conda/lib/python3.7/runpy.py", line 85, in _run_code
      exec(code, run_globals)
    File "/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py", line 17, in <module>
      app.launch_new_instance()
    File "/opt/conda/lib/python3.7/site-packages/traitlets/config/application.py", line 1041, in launch_instance
      app.start()
    File "/opt/conda/lib/python3.7/site-packages/ipykernel/kernelapp.py", line 712, in start
      self.io_loop.start()
    File "/opt/conda/lib/python3.7/site-packages/tornado/platform/asyncio.py", line 199, in start
      self.asyncio_loop.run_forever()
    File "/opt/conda/lib/python3.7/asyncio/base_events.py", line 541, in run_forever
      self._run_once()
    File "/opt/conda/lib/python3.7/asyncio/base_events.py", line 1786, in _run_once
      handle._run()
    File "/opt/conda/lib/python3.7/asyncio/events.py", line 88, in _run
      self._context.run(self._callback, *self._args)
    File "/opt/conda/lib/python3.7/site-packages/ipykernel/kernelbase.py", line 510, in dispatch_queue
      await self.process_one()
    File "/opt/conda/lib/python3.7/site-packages/ipykernel/kernelbase.py", line 499, in process_one
      await dispatch(*args)
    File "/opt/conda/lib/python3.7/site-packages/ipykernel/kernelbase.py", line 406, in dispatch_shell
      await result
    File "/opt/conda/lib/python3.7/site-packages/ipykernel/kernelbase.py", line 730, in execute_request
      reply_content = await reply_content
    File "/opt/conda/lib/python3.7/site-packages/ipykernel/ipkernel.py", line 387, in do_execute
      cell_id=cell_id,
    File "/opt/conda/lib/python3.7/site-packages/ipykernel/zmqshell.py", line 528, in run_cell
      return super().run_cell(*args, **kwargs)
    File "/opt/conda/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 2976, in run_cell
      raw_cell, store_history, silent, shell_futures, cell_id
    File "/opt/conda/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3030, in _run_cell
      return runner(coro)
    File "/opt/conda/lib/python3.7/site-packages/IPython/core/async_helpers.py", line 78, in _pseudo_sync_runner
      coro.send(None)
    File "/opt/conda/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3258, in run_cell_async
      interactivity=interactivity, compiler=compiler, result=result)
    File "/opt/conda/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3473, in run_ast_nodes
      if (await self.run_code(code, result,  async_=asy)):
    File "/opt/conda/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3553, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "/tmp/ipykernel_23/3969946307.py", line 1, in <module>
      test_model('fin', 'cnn')
    File "/tmp/ipykernel_23/1606620890.py", line 35, in test_model
      hist = train(model, x_train, y_train, x_val, y_val)
    File "/tmp/ipykernel_23/3296847510.py", line 4, in train
      validation_data=(x_val, y_val))
    File "/opt/conda/lib/python3.7/site-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/opt/conda/lib/python3.7/site-packages/keras/engine/training.py", line 1650, in fit
      tmp_logs = self.train_function(iterator)
    File "/opt/conda/lib/python3.7/site-packages/keras/engine/training.py", line 1249, in train_function
      return step_function(self, iterator)
    File "/opt/conda/lib/python3.7/site-packages/keras/engine/training.py", line 1233, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/opt/conda/lib/python3.7/site-packages/keras/engine/training.py", line 1222, in run_step
      outputs = model.train_step(data)
    File "/opt/conda/lib/python3.7/site-packages/keras/engine/training.py", line 1024, in train_step
      loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "/opt/conda/lib/python3.7/site-packages/keras/engine/training.py", line 1083, in compute_loss
      y, y_pred, sample_weight, regularization_losses=self.losses
    File "/opt/conda/lib/python3.7/site-packages/keras/engine/compile_utils.py", line 265, in __call__
      loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "/opt/conda/lib/python3.7/site-packages/keras/losses.py", line 152, in __call__
      losses = call_fn(y_true, y_pred)
    File "/opt/conda/lib/python3.7/site-packages/keras/losses.py", line 284, in call
      return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "/opt/conda/lib/python3.7/site-packages/keras/losses.py", line 2103, in sparse_categorical_crossentropy
      axis=axis,
    File "/opt/conda/lib/python3.7/site-packages/keras/backend.py", line 5634, in sparse_categorical_crossentropy
      labels=target, logits=output
Node: 'sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/assert_equal_1/Assert/Assert'
assertion failed: [Condition x == y did not hold element-wise:] [x (sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/Shape_1:0) = ] [32 1] [y (sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/strided_slice:0) = ] [32 6]
	 [[{{node sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/assert_equal_1/Assert/Assert}}]] [Op:__inference_train_function_59117]

In [25]:
test_model('sst5', 'cnn')

170     2
4768    2
3049    2
736     4
91      3
1722    3
4733    2
5489    1
955     4
7658    0
Name: label, dtype: int64
(6835, 128) (1709, 128)
Epoch 1/10


InvalidArgumentError: Graph execution error:

Detected at node 'sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/assert_equal_1/Assert/Assert' defined at (most recent call last):
    File "/opt/conda/lib/python3.7/runpy.py", line 193, in _run_module_as_main
      "__main__", mod_spec)
    File "/opt/conda/lib/python3.7/runpy.py", line 85, in _run_code
      exec(code, run_globals)
    File "/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py", line 17, in <module>
      app.launch_new_instance()
    File "/opt/conda/lib/python3.7/site-packages/traitlets/config/application.py", line 1041, in launch_instance
      app.start()
    File "/opt/conda/lib/python3.7/site-packages/ipykernel/kernelapp.py", line 712, in start
      self.io_loop.start()
    File "/opt/conda/lib/python3.7/site-packages/tornado/platform/asyncio.py", line 199, in start
      self.asyncio_loop.run_forever()
    File "/opt/conda/lib/python3.7/asyncio/base_events.py", line 541, in run_forever
      self._run_once()
    File "/opt/conda/lib/python3.7/asyncio/base_events.py", line 1786, in _run_once
      handle._run()
    File "/opt/conda/lib/python3.7/asyncio/events.py", line 88, in _run
      self._context.run(self._callback, *self._args)
    File "/opt/conda/lib/python3.7/site-packages/ipykernel/kernelbase.py", line 510, in dispatch_queue
      await self.process_one()
    File "/opt/conda/lib/python3.7/site-packages/ipykernel/kernelbase.py", line 499, in process_one
      await dispatch(*args)
    File "/opt/conda/lib/python3.7/site-packages/ipykernel/kernelbase.py", line 406, in dispatch_shell
      await result
    File "/opt/conda/lib/python3.7/site-packages/ipykernel/kernelbase.py", line 730, in execute_request
      reply_content = await reply_content
    File "/opt/conda/lib/python3.7/site-packages/ipykernel/ipkernel.py", line 387, in do_execute
      cell_id=cell_id,
    File "/opt/conda/lib/python3.7/site-packages/ipykernel/zmqshell.py", line 528, in run_cell
      return super().run_cell(*args, **kwargs)
    File "/opt/conda/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 2976, in run_cell
      raw_cell, store_history, silent, shell_futures, cell_id
    File "/opt/conda/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3030, in _run_cell
      return runner(coro)
    File "/opt/conda/lib/python3.7/site-packages/IPython/core/async_helpers.py", line 78, in _pseudo_sync_runner
      coro.send(None)
    File "/opt/conda/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3258, in run_cell_async
      interactivity=interactivity, compiler=compiler, result=result)
    File "/opt/conda/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3473, in run_ast_nodes
      if (await self.run_code(code, result,  async_=asy)):
    File "/opt/conda/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3553, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "/tmp/ipykernel_23/2046397980.py", line 1, in <module>
      test_model('sst5', 'cnn')
    File "/tmp/ipykernel_23/1606620890.py", line 35, in test_model
      hist = train(model, x_train, y_train, x_val, y_val)
    File "/tmp/ipykernel_23/3296847510.py", line 4, in train
      validation_data=(x_val, y_val))
    File "/opt/conda/lib/python3.7/site-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/opt/conda/lib/python3.7/site-packages/keras/engine/training.py", line 1650, in fit
      tmp_logs = self.train_function(iterator)
    File "/opt/conda/lib/python3.7/site-packages/keras/engine/training.py", line 1249, in train_function
      return step_function(self, iterator)
    File "/opt/conda/lib/python3.7/site-packages/keras/engine/training.py", line 1233, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/opt/conda/lib/python3.7/site-packages/keras/engine/training.py", line 1222, in run_step
      outputs = model.train_step(data)
    File "/opt/conda/lib/python3.7/site-packages/keras/engine/training.py", line 1024, in train_step
      loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "/opt/conda/lib/python3.7/site-packages/keras/engine/training.py", line 1083, in compute_loss
      y, y_pred, sample_weight, regularization_losses=self.losses
    File "/opt/conda/lib/python3.7/site-packages/keras/engine/compile_utils.py", line 265, in __call__
      loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "/opt/conda/lib/python3.7/site-packages/keras/losses.py", line 152, in __call__
      losses = call_fn(y_true, y_pred)
    File "/opt/conda/lib/python3.7/site-packages/keras/losses.py", line 284, in call
      return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "/opt/conda/lib/python3.7/site-packages/keras/losses.py", line 2103, in sparse_categorical_crossentropy
      axis=axis,
    File "/opt/conda/lib/python3.7/site-packages/keras/backend.py", line 5634, in sparse_categorical_crossentropy
      labels=target, logits=output
Node: 'sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/assert_equal_1/Assert/Assert'
assertion failed: [Condition x == y did not hold element-wise:] [x (sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/Shape_1:0) = ] [32 1] [y (sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/strided_slice:0) = ] [32 6]
	 [[{{node sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/assert_equal_1/Assert/Assert}}]] [Op:__inference_train_function_61408]