# Imports

In [1]:

import numpy as np
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import nltk
#nltk.download('stopwords')
#nltk.download('punkt')
#nltk.download('averaged_perceptron_tagger')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import pos_tag
import string
import re
import time

!pip install vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

from tensorflow.keras.layers import Dense, Input, Flatten, Reshape, concatenate, Dropout
from tensorflow.keras.layers import Conv1D, Conv2D, MaxPooling1D, MaxPooling2D, Embedding
from tensorflow.keras.layers import LSTM, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Model
from tensorflow.keras import optimizers
from tensorflow.keras import regularizers
from tensorflow.keras.constraints import max_norm

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn import metrics


# Dataset

In [2]:

# Text cleaning

def cleanStr(string):
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)     
    string = re.sub(r"\'s", " \'s", string) 
    string = re.sub(r"\'ve", " \'ve", string) 
    string = re.sub(r"n\'t", " n\'t", string) 
    string = re.sub(r"\'re", " \'re", string) 
    string = re.sub(r"\'d", " \'d", string) 
    string = re.sub(r"\'ll", " \'ll", string) 
    string = re.sub(r",", " , ", string) 
    string = re.sub(r"!", " ! ", string) 
    string = re.sub(r"\(", " \( ", string) 
    string = re.sub(r"\)", " \) ", string) 
    string = re.sub(r"\?", " \? ", string) 
    string = re.sub(r"\s{2,}", " ", string)    
    return string.strip().lower()


In [3]:

data_folder = ["rt-polarity.pos", "rt-polarity.neg"]

sentences = []
labels = []

path = 'MovieReviews/rt-polaritydata/'

pos_file = data_folder[0]
neg_file = data_folder[1]

with open(path + pos_file, "rb") as f:
    for line in f:       
        temp = line.strip()
        temp = temp.decode("latin-1") 
        temp = cleanStr(temp)

        sentences.append(temp)
        labels.append(1)

with open(path + neg_file, "rb") as f:
    for line in f:
        temp = line.strip()
        temp = temp.decode("latin-1") 
        temp = cleanStr(temp)

        sentences.append(temp)
        labels.append(0)



In [4]:

df = pd.DataFrame(list(zip(sentences, labels)), columns = ['sentence', 'sentiment'])
df = df.sample(frac = 1, random_state = 0) # shuffle the dataset
df.head()


Unnamed: 0,sentence,sentiment
7168,"serry wants to blend politics and drama , an a...",0
8649,"an amateurish , quasi improvised acting exerci...",0
3381,playing a role of almost bergmanesque intensit...,1
8718,"no big whoop , nothing new to see , zero thril...",0
5367,"please , someone , stop eric schaeffer before ...",0


In [5]:

df['sentiment'].value_counts()


0    5331
1    5331
Name: sentiment, dtype: int64

In [6]:

# max token count in all sentences = 56

def vaderSentiment(sentence):
    analyzer = SentimentIntensityAnalyzer()
    vs = analyzer.polarity_scores(sentence)

    if vs['compound'] <= -0.05:
        return [0 for i in range(56)]
    elif vs['compound'] >= 0.05:
        return [2 for i in range(56)]
    return [1 for i in range(56)]

df['vader'] = df['sentence'].apply(lambda x: vaderSentiment(x))
df.head()


Unnamed: 0,sentence,sentiment,vader
7168,"serry wants to blend politics and drama , an a...",0,"[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ..."
8649,"an amateurish , quasi improvised acting exerci...",0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3381,playing a role of almost bergmanesque intensit...,1,"[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ..."
8718,"no big whoop , nothing new to see , zero thril...",0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
5367,"please , someone , stop eric schaeffer before ...",0,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [7]:
# VADER
# 0: negative
# 1: neutral
# 2: positive

def vaderAccuracy(s, v):
    if s == 1 and v[0] == 2:
        return 1
    elif s == 0 and v[0] == 0:
        return 1
    else:
        return 0

df['match'] = df.apply(lambda x: vaderAccuracy(x['sentiment'], x['vader']), axis = 1)
df.head()


Unnamed: 0,sentence,sentiment,vader,match
7168,"serry wants to blend politics and drama , an a...",0,"[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...",0
8649,"an amateurish , quasi improvised acting exerci...",0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1
3381,playing a role of almost bergmanesque intensit...,1,"[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...",1
8718,"no big whoop , nothing new to see , zero thril...",0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1
5367,"please , someone , stop eric schaeffer before ...",0,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0


In [8]:

sum(df['match'])/len(df['match'])


0.5485837553929844

In [9]:

df['sentiment'].value_counts()


0    5331
1    5331
Name: sentiment, dtype: int64

In [10]:

# Tokenize the sentences
df['sentence'] = df['sentence'].apply(lambda x: word_tokenize(x))
df.head()


Unnamed: 0,sentence,sentiment,vader,match
7168,"[serry, wants, to, blend, politics, and, drama...",0,"[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...",0
8649,"[an, amateurish, ,, quasi, improvised, acting,...",0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1
3381,"[playing, a, role, of, almost, bergmanesque, i...",1,"[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...",1
8718,"[no, big, whoop, ,, nothing, new, to, see, ,, ...",0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1
5367,"[please, ,, someone, ,, stop, eric, schaeffer,...",0,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0


In [11]:

sentences = df['sentence'].tolist()
#sentences[0:5]


In [12]:

labels = df['sentiment'].tolist()
#labels[0:5]


In [13]:

vaders = df['vader'].tolist()
#vaders[0:5]


In [14]:

# Calculate the maximum number of tokens in a sentence
max_len = max([len(s) for s in sentences])
max_len


56

In [15]:

# unique tokens in the dataset
dataset_vocabulary = set([item for sublist in sentences for item in sublist])
#dataset_vocabulary


In [16]:

# assign id to each unique token
word_index = dict(zip(dataset_vocabulary, range(2, 2 + len(dataset_vocabulary))))
#word_index


In [17]:

sentences_ready = []
vaders_ready = []
for sentence, vad in zip(sentences, vaders):
    temp = [0 for i in range(max_len)]

    for i in range(min(max_len, len(sentence))):
        temp[i] = word_index[sentence[i]]

    sentences_ready.append(np.array(temp))
    vaders_ready.append(np.array(vad))

sentences_ready = np.array(sentences_ready)
labels_ready = np.array(labels)
vaders_ready = np.array(vaders_ready)

print(sentences_ready[:5])
print(sentences_ready.shape)

print(labels_ready[:5])
print(labels_ready.shape)

print(vaders_ready[:5])
print(vaders_ready.shape)


[[17121  8339  3990  2509  8646 10049 14303 15219 14365  1865   801 11817
   3071 17537  3325 13645 18068  8058  8563 10816  1755  3990 16207  7781
   7873 14841 10131   914 15701  4480     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0]
 [14365 15093 15219   713  4561  1876  8764  6750  6885 14283  8726 12232
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0]
 [14250   914 17333  5992  8276  6100  5349   773 14841 16863 12268 10049
   7896     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0   

In [18]:

# Classification using Logistic Regression
def logisticRegression(x_train, y_train, x_test, y_test):

    # Create a Logistic Regression classifier
    clf = LogisticRegression(random_state = 0, solver = 'liblinear', multi_class = 'ovr', max_iter = 1000)

    # Train the model using the training set
    clf.fit(x_train, y_train)

    # Predict the classes for the test set
    y_pred = clf.predict(x_test)

    # Calculate accuracy of the model
    accuracy = metrics.accuracy_score(y_test, y_pred)

    return accuracy


In [19]:

# Classification using Support Vector Machines
def supportVectorMachines(x_train, y_train, x_test, y_test):

    # Create a Support Vector Machines classifier
    clf = svm.LinearSVC(random_state = 0, C = 0.01)

    # Train the model using the training set
    clf.fit(x_train, y_train)

    # Predict the classes for the test set
    y_pred = clf.predict(x_test)

    # Calculate accuracy of the model
    accuracy = metrics.accuracy_score(y_test, y_pred)

    return accuracy


In [20]:

dimension = 300 # 50 100 200 300

glove_embeddings = {}


In [21]:

# Load GloVe embeddings into a dictionary

f = open('GloVe/glove.6B.' + str(dimension) + 'd.txt', 'r', encoding = 'utf8')

for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype = 'float32')
    glove_embeddings[word] = coefs
    
f.close()

print('Found %s word vectors in GloVe.' % len(glove_embeddings))


Found 400000 word vectors in GloVe.


## Plain

In [22]:

# Prepare the plain word embeddings
num_tokens = len(dataset_vocabulary) + 2
hits = 0
misses = 0

# Prepare embedding matrix
embedding_matrix = np.zeros((num_tokens, dimension))
for word, i in word_index.items():
    embedding_vector = glove_embeddings.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        """
        For the unknown words create random word vectors.    
        0.25 is chosen so the unknown vectors have (approximately) same variance as pre-trained ones.
        """
        embedding_matrix[i] = np.random.uniform(-0.25, 0.25, dimension)
        misses += 1

print("Converted %d words (%d misses)" % (hits, misses))


Converted 17640 words (944 misses)


### GloVe | Plain | LR & SVM

In [23]:

kf = KFold(n_splits = 10, shuffle = True, random_state = 42)
kf.get_n_splits(sentences_ready)

model_acc_lr = 0
model_acc_svm = 0

for train_index, test_index in kf.split(sentences_ready):
    #print("TRAIN:", train_index, "TEST:", test_index)
    plain_x_train, plain_x_test = sentences_ready[train_index], sentences_ready[test_index]
    plain_y_train, plain_y_test = labels_ready[train_index], labels_ready[test_index]

    # plain GloVe
    logreg_plain_x_train = []

    for sent in plain_x_train:
        temp = []
        for token in sent:
            temp.extend(embedding_matrix[token])
        
        logreg_plain_x_train.append(temp)


    logreg_plain_x_test = []

    for sent in plain_x_test:
        temp = []
        for token in sent:
            temp.extend(embedding_matrix[token])

        logreg_plain_x_test.append(temp)

    logreg_plain_y_train = plain_y_train.copy()
    logreg_plain_y_test = plain_y_test.copy()
   
    accuracy_lr = logisticRegression(logreg_plain_x_train, logreg_plain_y_train, logreg_plain_x_test, logreg_plain_y_test)
    accuracy_svm = supportVectorMachines(logreg_plain_x_train, logreg_plain_y_train, logreg_plain_x_test, logreg_plain_y_test)

    model_acc_lr += accuracy_lr
    model_acc_svm += accuracy_svm

    #print('Test accuracy LR: %.4f' % accuracy_lr)
    #print('Test accuracy SVM: %.4f' % accuracy_svm)

print('10-fold CV Logistic Regression accuracy: %.4f' % (model_acc_lr/10))
print('10-fold CV Support Vector Machines accuracy: %.4f' % (model_acc_svm/10))


10-fold CV Logistic Regression accuracy: 0.6682
10-fold CV Support Vector Machines accuracy: 0.6808


### GloVe | Plain | CNN

In [24]:

plain_embedding_layer = Embedding(
    num_tokens,
    dimension,
    embeddings_initializer = keras.initializers.Constant(embedding_matrix),
    input_length = max_len,
    trainable = False,
)


In [25]:

kf = KFold(n_splits = 10, shuffle = True, random_state = 42)
kf.get_n_splits(sentences_ready)

model_acc = 0

for train_index, test_index in kf.split(sentences_ready):
    #print("TRAIN:", train_index, "TEST:", test_index)
    plain_x_train, plain_x_test = sentences_ready[train_index], sentences_ready[test_index]
    plain_y_train, plain_y_test = labels_ready[train_index], labels_ready[test_index]

    plain_x_train, plain_x_val, plain_y_train, plain_y_val = train_test_split(plain_x_train, plain_y_train, test_size = 0.2, random_state = 42)

    
    ##### Convolutional Neural Network - single input #####
    
    sequence_input = Input(shape = (max_len,), dtype = 'int64')
    embedded_sequences = plain_embedding_layer(sequence_input)
    print(embedded_sequences.shape)

    # add first conv filter
    x = Conv1D(100, 5, activation = 'relu', kernel_constraint = max_norm(3))(embedded_sequences)
    x = MaxPooling1D(max_len - 5 + 1)(x)

    # add second conv filter
    y = Conv1D(100, 4, activation = 'relu', kernel_constraint = max_norm(3))(embedded_sequences)
    y = MaxPooling1D(max_len - 4 + 1)(y)

    # add third conv filter
    z = Conv1D(100, 3, activation = 'relu', kernel_constraint = max_norm(3))(embedded_sequences)
    z = MaxPooling1D(max_len - 3 + 1)(z)

    # concate the conv layers
    alpha = concatenate([x,y,z])

    # flatted the pooled features
    alpha = Flatten()(alpha)

    # dropout
    alpha = Dropout(0.5)(alpha)

    # predictions
    preds = Dense(1, activation = 'sigmoid')(alpha)

    # build model
    model = Model(inputs = sequence_input, outputs = preds)

    lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
        0.0001,
        decay_steps = 100000,
        decay_rate = 0.95,
        staircase = True)

    optimizer = optimizers.Adam(learning_rate = lr_schedule)

    model.compile(loss = 'binary_crossentropy',
                      optimizer = optimizer,
                      metrics = ['acc'])

    model.summary()


    best_val_loss = None
    best_test_acc = 0
    test_acc = None

    for m in range(25):
        history = model.fit(plain_x_train,
              plain_y_train,
              batch_size = 50, 
              epochs = 1, 
              validation_data = (plain_x_val, plain_y_val))
        
        if best_val_loss is None or history.history['val_loss'][0] < best_val_loss:
            best_val_loss = history.history['val_loss'][0]

            _, test_acc = model.evaluate(plain_x_test, plain_y_test, batch_size = 50)

            if test_acc > best_test_acc:
                best_test_acc = test_acc

    model_acc += best_test_acc

    #print('Test accuracy: %.4f' % best_test_acc)

print('10-fold CV Convolutional Neural Network accuracy: %.4f' % (model_acc/10))


10-fold CV Convolutional Neural Network accuracy: 0.7738


## Context-refined

In [26]:

# Add context

context_len = 5

context_word_index = {}
context_embedding_dict = {}

sentences_ready = []


hits = 0
misses = 0

word_index_count = 2

for sentence in sentences:

    indexed_sentence = [0 for i in range(max_len)]

    for index, token in enumerate(sentence):

        found = False
        vec = [0 for i in range(dimension)]
        count = 0

        item = []

        for i in range(index-context_len, index+context_len+1):
            if i >= 0 and i < len(sentence):
                item.append(sentence[i])
                if sentence[i] in glove_embeddings:
                    vec += glove_embeddings[sentence[i]]
                    count += 1
                    found = True
        
        if not found:
            """
            For the unknown words create random word vectors.    
            0.25 is chosen so the unknown vectors have (approximately) same variance as pre-trained ones.
            """
            vec = np.random.uniform(-0.25, 0.25, dimension)
            misses += 1
        else:
            hits += 1

        item_tuple = tuple(item)
        #print(item_tuple)

        if item_tuple not in context_word_index:
            context_word_index[item_tuple] = word_index_count
            
            if count > 1:
                vec /= count
            
            context_embedding_dict[word_index_count] = vec

            word_index_count += 1
        
        indexed_sentence[index] = context_word_index[item_tuple]
    
    sentences_ready.append(np.array(indexed_sentence))


sentences_ready = np.array(sentences_ready)
labels_ready = np.array(labels)

print("Converted %d words (%d misses)" % (hits, misses))


Converted 218520 words (0 misses)


In [27]:

context_embedding_matrix = np.zeros((len(context_word_index) + 2, dimension))

for i, val in context_embedding_dict.items():
    context_embedding_matrix[i] = val


### GloVe | Context-refined | LR & SVM

In [28]:

kf = KFold(n_splits = 10, shuffle = True, random_state = 42)
kf.get_n_splits(sentences_ready)

model_acc_lr = 0
model_acc_svm = 0

for train_index, test_index in kf.split(sentences_ready):
    #print("TRAIN:", train_index, "TEST:", test_index)
    context_x_train, context_x_test = sentences_ready[train_index], sentences_ready[test_index]
    context_y_train, context_y_test = labels_ready[train_index], labels_ready[test_index]

    # context GloVe
    logreg_context_x_train = []

    for sent in context_x_train:
        temp = []
        for token in sent:
            temp.extend(context_embedding_matrix[token])
        
        logreg_context_x_train.append(temp)


    logreg_context_x_test = []

    for sent in context_x_test:
        temp = []
        for token in sent:
            temp.extend(context_embedding_matrix[token])

        logreg_context_x_test.append(temp)

    logreg_context_y_train = context_y_train.copy()
    logreg_context_y_test = context_y_test.copy()


    accuracy_lr = logisticRegression(logreg_context_x_train, logreg_context_y_train, logreg_context_x_test, logreg_context_y_test)
    accuracy_svm = supportVectorMachines(logreg_context_x_train, logreg_context_y_train, logreg_context_x_test, logreg_context_y_test)

    model_acc_lr += accuracy_lr
    model_acc_svm += accuracy_svm

    #print('Test accuracy LR: %.4f' % accuracy_lr)
    #print('Test accuracy SVM: %.4f' % accuracy_svm)

print('10-fold CV Logistic Regression accuracy: %.4f' % (model_acc_lr/10))
print('10-fold CV Support Vector Machines accuracy: %.4f' % (model_acc_svm/10))


10-fold CV Logistic Regression accuracy: 0.7160
10-fold CV Support Vector Machines accuracy: 0.7364


### GloVe | Context-refined | CNN

In [29]:

context_embedding_layer = Embedding(
    len(context_word_index) + 2,
    dimension,
    embeddings_initializer = keras.initializers.Constant(context_embedding_matrix),
    input_length = max_len,
    trainable = False,
)


In [30]:

kf = KFold(n_splits = 10, shuffle = True, random_state = 42)
kf.get_n_splits(sentences_ready)

model_acc = 0

for train_index, test_index in kf.split(sentences_ready):
    #print("TRAIN:", train_index, "TEST:", test_index)
    context_x_train, context_x_test = sentences_ready[train_index], sentences_ready[test_index]
    context_y_train, context_y_test = labels_ready[train_index], labels_ready[test_index]

    context_x_train, context_x_val, context_y_train, context_y_val = train_test_split(context_x_train, context_y_train, test_size = 0.2, random_state = 42)
    

    ##### Convolutional Neural Network - single input #####
    
    sequence_input = Input(shape = (max_len,), dtype = 'int64')
    embedded_sequences = context_embedding_layer(sequence_input)
    print(embedded_sequences.shape)

    # add first conv filter
    x = Conv1D(100, 5, activation = 'relu', kernel_constraint = max_norm(3))(embedded_sequences)
    x = MaxPooling1D(max_len - 5 + 1)(x)

    # add second conv filter
    y = Conv1D(100, 4, activation = 'relu', kernel_constraint = max_norm(3))(embedded_sequences)
    y = MaxPooling1D(max_len - 4 + 1)(y)

    # add third conv filter
    z = Conv1D(100, 3, activation = 'relu', kernel_constraint = max_norm(3))(embedded_sequences)
    z = MaxPooling1D(max_len - 3 + 1)(z)

    # concate the conv layers
    alpha = concatenate([x,y,z])

    # flatted the pooled features
    alpha = Flatten()(alpha)

    # dropout
    alpha = Dropout(0.5)(alpha)

    # predictions
    preds = Dense(1, activation = 'sigmoid')(alpha)

    # build model
    model = Model(inputs = sequence_input, outputs = preds)

    lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
        0.0001,
        decay_steps = 100000,
        decay_rate = 0.95,
        staircase = True)

    optimizer = optimizers.Adam(learning_rate = lr_schedule)

    model.compile(loss = 'binary_crossentropy',
                      optimizer = optimizer,
                      metrics = ['acc'])

    model.summary()


    best_val_loss = None
    best_test_acc = 0
    test_acc = None

    for m in range(25):
        history = model.fit(context_x_train,
              context_y_train,
              batch_size = 50, 
              epochs = 1, 
              validation_data = (context_x_val, context_y_val))
        
        if best_val_loss is None or history.history['val_loss'][0] < best_val_loss:
            best_val_loss = history.history['val_loss'][0]

            _, test_acc = model.evaluate(context_x_test, context_y_test, batch_size = 50)

            if test_acc > best_test_acc:
                best_test_acc = test_acc

    model_acc += best_test_acc

    #print('Test accuracy: %.4f' % best_test_acc)

print('10-fold CV Convolutional Neural Network accuracy: %.4f' % (model_acc/10))


10-fold CV Convolutional Neural Network accuracy: 0.7764


## Sentiment & context-refined

### GloVe | Sentiment & context-refined | LR & SVM

In [31]:

kf = KFold(n_splits = 10, shuffle = True, random_state = 42)
kf.get_n_splits(sentences_ready)

model_acc_lr = 0
model_acc_svm = 0

for train_index, test_index in kf.split(sentences_ready):
    #print("TRAIN:", train_index, "TEST:", test_index)
    context_x_train, context_x_test = sentences_ready[train_index], sentences_ready[test_index]
    context_y_train, context_y_test = labels_ready[train_index], labels_ready[test_index]

    vader_train, vader_test = vaders_ready[train_index], vaders_ready[test_index]

    # vader + context GloVe
    logreg_vader_x_train = []

    for sentiment_sent, context_sent in zip(vader_train, context_x_train):
        temp = []
        for token in sentiment_sent:
            temp.extend(sentiment_sent)
        
        for token in context_sent:
            temp.extend(context_embedding_matrix[token])
        
        logreg_vader_x_train.append(temp)


    logreg_vader_x_test = []

    for sentiment_sent, context_sent in zip(vader_test, context_x_test):
        temp = []
        for token in sentiment_sent:
            temp.extend(sentiment_sent)
        
        for token in context_sent:
            temp.extend(context_embedding_matrix[token])
        
        logreg_vader_x_test.append(temp)


    logreg_vader_y_train = context_y_train.copy()
    logreg_vader_y_test = context_y_test.copy()


    accuracy_lr = logisticRegression(logreg_vader_x_train, logreg_vader_y_train, logreg_vader_x_test, logreg_vader_y_test)
    accuracy_svm = supportVectorMachines(logreg_vader_x_train, logreg_vader_y_train, logreg_vader_x_test, logreg_vader_y_test)

    model_acc_lr += accuracy_lr
    model_acc_svm += accuracy_svm

    #print('Test accuracy LR: %.4f' % accuracy_lr)
    #print('Test accuracy SVM: %.4f' % accuracy_svm)

print('10-fold CV Logistic Regression accuracy: %.4f' % (model_acc_lr/10))
print('10-fold CV Support Vector Machines accuracy: %.4f' % (model_acc_svm/10))


10-fold CV Logistic Regression accuracy: 0.7205
10-fold CV Support Vector Machines accuracy: 0.7418


In [32]:

vader_embedding_layer = Embedding(
    3,
    1,
    input_length = max_len,
)


### GloVe | Sentiment & context-refined | CNN

In [33]:

kf = KFold(n_splits = 10, shuffle = True, random_state = 42)
kf.get_n_splits(sentences_ready)

model_acc = 0

for train_index, test_index in kf.split(sentences_ready):
    #print("TRAIN:", train_index, "TEST:", test_index)
    context_x_train, context_x_test = sentences_ready[train_index], sentences_ready[test_index]
    context_y_train, context_y_test = labels_ready[train_index], labels_ready[test_index]

    context_x_train, context_x_val, context_y_train, context_y_val = train_test_split(context_x_train, context_y_train, test_size=0.2, random_state = 42)
    

    vader_x_train, vader_x_test = vaders_ready[train_index], vaders_ready[test_index]
    vader_y_train, vader_y_test = labels_ready[train_index], labels_ready[test_index]

    vader_x_train, vader_x_val, vader_y_train, vader_y_val = train_test_split(vader_x_train, vader_y_train, test_size=0.2, random_state = 42)



    ##### Convolutional Neural Network - multiple input #####
    
    # sentiment word embeddings
    vader_input = Input(shape = (max_len,), dtype = 'float32')
    vader_embedded_sequences = vader_embedding_layer(vader_input)

    # context word embeddings
    context_input = Input(shape = (max_len,), dtype = 'int64')
    context_embedded_sequences = context_embedding_layer(context_input)

    # concate the word embeddings
    embedded_sequences = concatenate([vader_embedded_sequences, context_embedded_sequences])

    # add first conv filter
    x = Conv1D(100, 5, activation = 'relu', kernel_constraint = max_norm(3))(embedded_sequences)
    x = MaxPooling1D(max_len - 5 + 1)(x)

    # add second conv filter
    y = Conv1D(100, 4, activation = 'relu', kernel_constraint = max_norm(3))(embedded_sequences)
    y = MaxPooling1D(max_len - 4 + 1)(y)

    # add third conv filter
    z = Conv1D(100, 3, activation = 'relu', kernel_constraint = max_norm(3))(embedded_sequences)
    z = MaxPooling1D(max_len - 3 + 1)(z)

    # concate the conv layers
    alpha = concatenate([x,y,z])

    # flatted the pooled features
    alpha = Flatten()(alpha)

    # dropout
    alpha = Dropout(0.5)(alpha)

    # predictions
    preds = Dense(1, activation = 'sigmoid')(alpha)

    # build model
    model = Model(inputs = [vader_input, context_input], outputs = preds)

    lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
        0.0001,
        decay_steps = 100000,
        decay_rate = 0.95,
        staircase = True)

    optimizer = optimizers.Adam(learning_rate = lr_schedule)

    model.compile(loss = 'binary_crossentropy',
                      optimizer = optimizer,
                      metrics = ['acc'])

    model.summary()

    
    best_val_loss = None
    best_test_acc = 0
    test_acc = None

    for m in range(25):
        history = model.fit([vader_x_train, context_x_train],
              context_y_train,
              batch_size = 50, 
              epochs = 1, 
              validation_data = ([vader_x_val, context_x_val], context_y_val))
        
        if best_val_loss is None or history.history['val_loss'][0] < best_val_loss:
            best_val_loss = history.history['val_loss'][0]

            _, test_acc = model.evaluate([vader_x_test, context_x_test], context_y_test, batch_size = 50)

            if test_acc > best_test_acc:
                best_test_acc = test_acc

    model_acc += best_test_acc

    #print('Test accuracy: %.4f' % best_test_acc)

print('10-fold CV Convolutional Neural Network accuracy: %.4f' % (model_acc/10))


10-fold CV Convolutional Neural Network accuracy: 0.7796
