In [None]:
import tensorflow as tf
import pandas as pd
import os
import shutil
from sklearn.model_selection import train_test_split
import numpy as np

In [None]:
def load_imdb():
    """Download the Stanford IMDB review archive and load it into train/test splits.

    The archive is fetched through ``tf.keras.utils.get_file`` into the current
    directory, the ``train/unsup`` folder is deleted, and every file under the
    ``pos`` / ``neg`` folders is read as one review.

    Returns:
        tuple: ``(x_train, y_train, x_test, y_test)`` as pandas Series. The
        ``x_*`` Series hold the raw review text and the ``y_*`` Series hold the
        labels (``pos`` -> 1, ``neg`` -> 0). Only the training rows are shuffled.
    """
    # download dataset
    url = 'https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'

    dataset = tf.keras.utils.get_file('aclImdb_v1.tar.gz', url,
                                      untar=True, cache_dir='.',
                                      cache_subdir='')

    dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')
    train_dir = os.path.join(dataset_dir, 'train')
    test_dir = os.path.join(dataset_dir, 'test')

    # remove irrelevant data; on a re-run the folder has already been deleted,
    # so only remove it when it is actually there
    remove_dir = os.path.join(train_dir, 'unsup')
    if os.path.isdir(remove_dir):
        shutil.rmtree(remove_dir)

    label2id = {"pos": 1, "neg": 0}

    def _read_split(split_dir):
        """Read every review under split_dir/<label>/ into a (text, label) frame."""
        rows = []
        for label, label_id in label2id.items():
            label_dir = os.path.join(split_dir, label)
            for _file in os.listdir(label_dir):
                # explicit encoding so decoding does not depend on the platform default
                with open(os.path.join(label_dir, _file), 'r', encoding='utf-8') as f:
                    rows.append([f.read(), label_id])
        return pd.DataFrame(rows, columns=['text', 'label'])

    # load to dataframes (training rows are shuffled, test rows keep read order)
    df_train = _read_split(train_dir).sample(frac=1)
    df_test = _read_split(test_dir)

    x_train, y_train = df_train["text"], df_train["label"]
    x_test, y_test = df_test["text"], df_test["label"]

    return x_train, y_train, x_test, y_test

In [None]:
def load_fin():
    """Load the financial sentiment CSV and split it into train and test parts.

    The ``Sentiment`` column is converted from its string name to an integer id
    (negative -> 0, neutral -> 1, positive -> 2) before the split, and the
    training part is shuffled afterwards.

    Returns:
        tuple: ``(x_train, y_train, x_test, y_test)`` where the ``x_*`` Series
        come from the ``Sentence`` column and the ``y_*`` Series from the
        encoded ``Sentiment`` column.
    """
    csv_path = '/kaggle/input/financial-sentiment-analysis/data.csv'
    sentiment_ids = {"positive" : 2, "neutral" : 1, "negative" : 0}

    # read the raw table and encode the label column in place
    frame = pd.read_csv(csv_path)
    frame["Sentiment"] = frame["Sentiment"].apply(lambda name: sentiment_ids[name])

    # split with the library defaults, then shuffle the training rows
    train_part, test_part = train_test_split(frame)
    train_part = train_part.sample(frac=1)

    return (
        train_part["Sentence"],
        train_part["Sentiment"],
        test_part["Sentence"],
        test_part["Sentiment"],
    )

In [None]:
def load_sst5():
    """Load the SST-5 train and test CSV files from their GitHub URLs.

    Returns:
        tuple: ``(x_train, y_train, x_test, y_test)`` taken from the
        ``sentence`` and ``label`` columns. Training rows are shuffled,
        test rows keep file order.
    """
    train_url = 'https://raw.githubusercontent.com/christycty/sentiment-analysis-review/main/data/sst5_train.csv'
    test_url = 'https://raw.githubusercontent.com/christycty/sentiment-analysis-review/main/data/sst5_test.csv'

    train_frame = pd.read_csv(train_url)
    train_frame = train_frame.sample(frac=1)  # shuffle the training rows
    test_frame = pd.read_csv(test_url)

    return (
        train_frame["sentence"],
        train_frame["label"],
        test_frame["sentence"],
        test_frame["label"],
    )

In [None]:
def load_sst2():
    """Build a two-class dataset from the SST-5 CSV files.

    Rows with label 2 are dropped; labels 0 and 1 become 0, labels 3 and 4
    become 1. Training rows are shuffled, test rows keep file order.

    Returns:
        tuple: ``(x_train, y_train, x_test, y_test)`` taken from the
        ``sentence`` column and the remapped ``label`` column.
    """
    # the binary set is derived from the five-class files
    train_url = 'https://raw.githubusercontent.com/christycty/sentiment-analysis-review/main/data/sst5_train.csv'
    test_url = 'https://raw.githubusercontent.com/christycty/sentiment-analysis-review/main/data/sst5_test.csv'

    coarse_label = {0:0, 1:0, 3:1, 4:1}

    def _to_binary(frame):
        # remove neutral, then map to positive or negative
        polar = frame.loc[frame["label"] != 2].copy()
        polar["label"] = polar["label"].apply(lambda fine: coarse_label[fine])
        return polar

    train_frame = _to_binary(pd.read_csv(train_url).sample(frac=1))
    test_frame = _to_binary(pd.read_csv(test_url))

    return (
        train_frame["sentence"],
        train_frame["label"],
        test_frame["sentence"],
        test_frame["label"],
    )

In [None]:
# Text-cleaning dependencies for preprocess() below: NLTK supplies the English
# stop-word list and `re` handles tag / non-letter stripping.
import nltk
from nltk.corpus import stopwords
# Fetch the 'stopwords' corpus that stopwords.words('english') reads from.
nltk.download('stopwords')
import re

# Compiled once at definition time: preprocess() is applied to every row of a dataset.
_TAG_RE = re.compile(r'<[^>]+>')
_NON_ALPHA_RE = re.compile('[^a-zA-Z]')
_STOPWORDS_CACHE = None  # filled on first use by _english_stopwords()


def _english_stopwords():
    """Return the English stop-word set, reading the NLTK corpus only once."""
    global _STOPWORDS_CACHE
    if _STOPWORDS_CACHE is None:
        _STOPWORDS_CACHE = frozenset(stopwords.words('english'))
    return _STOPWORDS_CACHE


def preprocess(text_inp):
    """Clean one raw text: drop markup tags, non-letters and English stop words.

    Args:
        text_inp (str): raw text, possibly containing tags such as ``<br />``.

    Returns:
        str: the remaining lower-cased words joined by single spaces (an empty
        string when nothing remains).
    """
    # Replace tags with a space, not '': otherwise the words on either side of
    # a tag are glued together ("good<br />bad" -> "goodbad").
    text = _TAG_RE.sub(' ', text_inp)
    text = _NON_ALPHA_RE.sub(' ', text)  # non alphabets

    # split() with no argument already collapses runs of whitespace, so no
    # separate whitespace-normalising substitution is needed.
    words = text.lower().split()

    # stopwords
    stopwords_set = _english_stopwords()
    return " ".join(word for word in words if word not in stopwords_set)

In [None]:
def _token_length_stats(texts):
    """Return (max, mean) of the per-text token counts after preprocess()."""
    cleaned = texts.apply(preprocess)
    # split() without a separator yields [] for an empty string, so a text that
    # is empty after cleaning counts as 0 tokens (split(' ') would report 1).
    lengths = cleaned.apply(lambda x: len(str(x).split()))
    return lengths.max(), lengths.mean()


def get_max_len(text1_, text2_, name):
    """Print the max and mean token counts of a train and a test text Series.

    Args:
        text1_ (pandas.Series): raw training texts.
        text2_ (pandas.Series): raw test texts.
        name (str): label used as the prefix of both printed lines.

    Returns:
        None. The statistics are only printed.
    """
    maxlen1, meanlen1 = _token_length_stats(text1_)
    maxlen2, meanlen2 = _token_length_stats(text2_)

    print(f"{name}: train max len {maxlen1}, test max len {maxlen2}")
    print(f"{name}: train mean len {meanlen1}, test mean len {meanlen2}")

In [None]:
def evaluate_data():
    """Print token-length statistics for every dataset.

    Each loader is called in turn (fin, imdb, sst5, sst2) and its train/test
    texts are passed to ``get_max_len`` under the dataset's own name.
    """
    loaders = (
        ("fin", load_fin),
        ("imdb", load_imdb),
        ("sst5", load_sst5),
        ("sst2", load_sst2),  # was reported under the label "sst5"
    )
    for name, loader in loaders:
        # labels are not needed for length statistics
        x_train_raw, _, x_test_raw, _ = loader()
        get_max_len(x_train_raw, x_test_raw, name)

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
import gensim.downloader as api
import gensim

def _load_glove_vectors(path):
    """Parse a GloVe text file (one "word v1 v2 ..." entry per line) into a dict."""
    vectors = {}
    # context manager closes the file even if parsing fails; explicit encoding
    # so decoding does not depend on the platform default
    with open(path, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                continue  # skip blank lines
            vectors[values[0]] = np.asarray(values[1:], dtype='float32')
    return vectors


def get_embeddings(dataset, name="word2vec", max_words=15000):
    """Fit a tokenizer on ``dataset`` and build the matching embedding matrix.

    Args:
        dataset: iterable of texts the tokenizer is fitted on.
        name (str): which pretrained vectors to load: ``"word2vec"``,
            ``"glove100"`` or ``"glove200"``.
        max_words (int): upper bound on the number of rows in the matrix
            (also passed to the tokenizer as ``num_words``).

    Returns:
        tuple: ``(tokenizer, embedding_matrix, embed_dim)``. ``embedding_matrix``
        has shape ``(min(max_words, len(word_index) + 1), embed_dim)``. Row ``i``
        holds the pretrained vector of the word with tokenizer index ``i``, or
        a random normal vector when the word has no pretrained vector.

    Raises:
        ValueError: if ``name`` is not one of the supported embedding names.
    """
    glove_files = {
        "glove100": ('/kaggle/input/glove-global-vectors-for-word-representation/glove.6B.100d.txt', 100),
        "glove200": ('/kaggle/input/glove-global-vectors-for-word-representation/glove.6B.200d.txt', 200),
    }
    if name != "word2vec" and name not in glove_files:
        # fail before any expensive work instead of hitting an unbound variable later
        raise ValueError(
            f"unknown embedding name {name!r}; expected 'word2vec', 'glove100' or 'glove200'"
        )

    # tokenize on training dataset
    tokenizer = Tokenizer(num_words = max_words, oov_token="<oov>")
    tokenizer.fit_on_texts(dataset)
    word_index = tokenizer.word_index
    print(f"there are {len(word_index)} unique words in dataset")
    # number of words to keep in dictionary
    num_words = min(max_words, len(word_index) + 1)

    # import the word embeddings
    if name == "word2vec":
        word2vec_path = '../input/google-word2vec/GoogleNews-vectors-negative300.bin'
        embeddings = gensim.models.KeyedVectors.load_word2vec_format(word2vec_path, binary = True)
        embed_dim = 300
    else:
        glove_path, embed_dim = glove_files[name]
        embeddings = _load_glove_vectors(glove_path)

    # create embedding matrix (map tokenizer index to word embeddings)
    embedding_matrix = np.zeros((num_words, embed_dim))

    for word, i in word_index.items():
        if i >= num_words:
            # skip rather than stop: does not rely on word_index being ordered by index
            continue
        if word in embeddings:
            embedding_matrix[i] = embeddings[word]
        else:
            # no pretrained vector for this word: random initialisation
            embedding_matrix[i] = np.random.randn(embed_dim)

    return tokenizer, embedding_matrix, embed_dim

## Model Part

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers

In [None]:
def build_rnn(num_class, num_words, embed_dim, embed_matrix):
    """Build and compile a SimpleRNN classifier on top of a pretrained embedding.

    Args:
        num_class (int): number of target classes; 2 selects a single sigmoid
            output with binary cross-entropy, anything else a softmax over
            ``num_class`` units with sparse categorical cross-entropy.
        num_words (int): first argument of the Embedding layer.
        embed_dim (int): second argument of the Embedding layer.
        embed_matrix: initial embedding weights (kept trainable).

    Returns:
        The compiled Sequential model.
    """
    keras_layers = tf.keras.layers
    is_binary = (num_class == 2)

    model = Sequential()
    model.add(keras_layers.Embedding(num_words,
                                     embed_dim,
                                     weights=[embed_matrix],
                                     trainable=True))
    model.add(keras_layers.SimpleRNN(32, dropout=0.5))
    model.add(keras_layers.Dense(32, activation='relu'))
    model.add(keras_layers.Dropout(0.5))

    # output layer and loss are chosen together from the class count
    if is_binary:
        model.add(keras_layers.Dense(1, activation='sigmoid'))
        loss_name = 'binary_crossentropy'
    else:
        model.add(keras_layers.Dense(num_class, activation='softmax'))
        loss_name = 'sparse_categorical_crossentropy'

    model.compile(loss=loss_name, optimizer='adam', metrics=['accuracy'])
    return model

In [None]:
def build_lstm(num_class, num_words, embed_dim, embed_matrix):
    """Build and compile a bidirectional-LSTM classifier on a pretrained embedding.

    Args:
        num_class (int): number of target classes; 2 selects a single sigmoid
            output with binary cross-entropy, anything else a softmax over
            ``num_class`` units with sparse categorical cross-entropy.
        num_words (int): first argument of the Embedding layer.
        embed_dim (int): second argument of the Embedding layer.
        embed_matrix: initial embedding weights (kept trainable).

    Returns:
        The compiled Sequential model.
    """
    keras_layers = tf.keras.layers
    is_binary = (num_class == 2)

    model = Sequential()
    model.add(keras_layers.Embedding(num_words,
                                     embed_dim,
                                     weights=[embed_matrix],
                                     trainable=True))
    model.add(keras_layers.Bidirectional(keras_layers.LSTM(16, dropout=0.5)))
    model.add(keras_layers.Dense(16, activation='relu'))
    model.add(keras_layers.Dropout(0.5))

    # output layer and loss are chosen together from the class count
    if is_binary:
        model.add(keras_layers.Dense(1, activation='sigmoid'))
        loss_name = 'binary_crossentropy'
    else:
        model.add(keras_layers.Dense(num_class, activation='softmax'))
        loss_name = 'sparse_categorical_crossentropy'

    model.compile(loss=loss_name, optimizer='adam', metrics=['accuracy'])
    return model
  

In [None]:
def build_nn(num_class, num_words, embed_dim, embed_matrix):
    """Build and compile a feed-forward classifier on a frozen pretrained embedding.

    The embedded sequence is averaged over the sequence axis before the dense
    stack. Without that reduction the Dense layers would be applied to every
    timestep separately and the model would emit one prediction per token,
    which cannot be matched against one label per sample.

    Args:
        num_class (int): number of target classes; 2 selects a single sigmoid
            output with binary cross-entropy, anything else a softmax over
            ``num_class`` units with sparse categorical cross-entropy.
        num_words (int): first argument of the Embedding layer.
        embed_dim (int): second argument of the Embedding layer.
        embed_matrix: initial embedding weights (not trainable).

    Returns:
        The compiled Sequential model.
    """
    model = Sequential([
        tf.keras.layers.Embedding(num_words,
                                  embed_dim,
                                  weights=[embed_matrix],
                                  trainable=False),
        # collapse (batch, seq_len, embed_dim) to (batch, embed_dim)
        tf.keras.layers.GlobalAveragePooling1D(),
        tf.keras.layers.Dense(2048, activation='relu'),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(1024, activation='relu'),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(256, activation='relu'),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(16, activation='relu'),
        tf.keras.layers.Dropout(0.5),
    ])

    # output layer and loss are chosen together from the class count
    if num_class == 2:
        model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
        loss = 'binary_crossentropy'
    else:
        model.add(tf.keras.layers.Dense(num_class, activation='softmax'))
        loss = 'sparse_categorical_crossentropy'

    model.compile(loss=loss, optimizer='adam', metrics=['accuracy'])
    return model
  

In [None]:
def train(model, x_train, y_train, x_val, y_val, epochs=10):
    """Fit ``model`` and report its loss and accuracy on the validation data.

    Args:
        model: object exposing ``fit`` and ``evaluate``.
        x_train, y_train: training inputs and labels passed to ``fit``.
        x_val, y_val: validation inputs and labels, used both as
            ``validation_data`` during fitting and for the final evaluation.
        epochs (int): number of epochs passed to ``fit``.

    Returns:
        Whatever ``model.fit`` returns.
    """
    validation_pair = (x_val, y_val)

    # fit with per-epoch validation
    history = model.fit(
        x_train,
        y_train,
        epochs=epochs,
        validation_data=validation_pair,
    )

    # one more pass over the validation data for the summary line
    loss, accuracy = model.evaluate(*validation_pair)
    print(f'Validation loss: {loss:.4f}, Validation accuracy: {accuracy:.4f}')
    return history

In [None]:
def test_model(data, model_name, embed_name, in_epoch=-1):
    if data == "fin":
        x_train_raw, y_train, x_test_raw, y_test = load_fin()
        num_classes = 3
        seq_len = 40  # trim extra
        
    elif data == "imdb":
        x_train_raw, y_train, x_test_raw, y_test = load_imdb()
        num_classes = 2
        seq_len = 500 # trim extra
        
    elif data == "sst5":
        x_train_raw, y_train, x_test_raw, y_test = load_sst5()
        num_classes = 5
        seq_len = 30
        
    elif data == "sst2":
        x_train_raw, y_train, x_test_raw, y_test = load_sst2()
        num_classes = 2
        seq_len = 30
    
    # text preprocessing
    x_train_ = x_train_raw.apply(preprocess)
    x_test_ = x_test_raw.apply(preprocess)
    
    tokenizer, embed_matrix, embed_dim = get_embeddings(x_train_, embed_name)
    num_words = min(15000, len(tokenizer.word_index) + 1)
    
    # tokenize sentences (text to sequence of indices)
    x_train = tokenizer.texts_to_sequences(x_train_)
    x_test = tokenizer.texts_to_sequences(x_test_)
    
    x_train = pad_sequences(x_train, seq_len, truncating="post")
    x_test = pad_sequences(x_test, seq_len, truncating="post")
    
    # build models
    if model_name == "rnn":
        model = build_rnn(num_classes, num_words, embed_dim, embed_matrix)
        epochs = 10
        
    elif model_name == "lstm":
        model = build_lstm(num_classes, num_words, embed_dim, embed_matrix)
        epochs = 8
        
    elif model_name == "nn":
        model = build_nn(num_classes, num_words, embed_dim, embed_matrix)
        epochs = 20
    
    if in_epoch != -1:
        epochs = in_epoch
        
    print(model.summary())
    hist = train(model, x_train, y_train, x_test, y_test, epochs=epochs)
    
    model.evaluate(x_test, y_test)
    model_save = f"/kaggle/working/{data}_{model_name}"
    
#     model.save(model_save)
#     shutil.make_archive(model_save, 'zip', "/kaggle/working")
    
    hist_df = pd.DataFrame(hist.history)
    hist_df.to_csv(model_save + "_hist.csv")

In [None]:
# Run the IMDB / SimpleRNN / word2vec configuration for 15 epochs.
test_model(data="imdb", model_name="rnn", embed_name="word2vec", in_epoch=15)

In [None]:
# test_model("fin", "lstm", "glove100", 15)

In [None]:
# test_model("imdb", "lstm", "glove100", 15)

In [None]:
# test_model("fin", "rnn", "glove100", 15)

In [None]:
# test_model("imdb", "rnn", "glove100", 15)