In [52]:
# Import statements and global helper methods
import numpy as np
import nltk
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
from nltk.corpus import wordnet, stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.svm import SVC, LinearSVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV, PredefinedSplit
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm_notebook
from gensim.models import word2vec
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
import re

In [53]:
# Load in the data sets

# Sentence text is the second tab-separated field of every row after the header.
with open("datasets/datasetSentences.txt", encoding="utf-8") as sentence_file:
    next(sentence_file)  # header row
    raw_sentences = [
        row.split("\t")[1].strip() for row in tqdm_notebook(sentence_file)
    ]

# "index|score" rows (header skipped): index is kept as a string key,
# score is parsed to float.
sentiment_map = {}
with open("datasets/sentiment_labels.txt") as sentiment_file:
    next(sentiment_file)
    for row in tqdm_notebook(sentiment_file):
        index, score = row.split("|")
        sentiment_map[index] = float(score.strip())

# "phrase|phrase_num" rows; every line is read (no header is skipped here).
# The phrase number is stored as a stripped string so that it can be used
# directly as a key into sentiment_map.
with open("datasets/dictionary.txt", encoding="utf-8") as dictionary_file:
    sentence_to_phrase_map = {
        phrase: phrase_num.strip()
        for phrase, phrase_num in (
            row.split("|") for row in tqdm_notebook(dictionary_file)
        )
    }

# Pair each sentence with its score: sentence -> phrase number -> score.
labelled_sentences = [
    (text, sentiment_map[sentence_to_phrase_map[text]]) for text in raw_sentences
]

# One bucket per split label; "index,split" rows follow a header line.
splits = ([], [], [])
with open("datasets/datasetSplit.txt") as split_file:
    next(split_file)
    for row in tqdm_notebook(split_file):
        index, split = map(int, row.split(","))
        # Both the split label (1-3) and the sentence index are 1-based
        splits[split - 1].append(labelled_sentences[index - 1])

# Bucket order follows the split labels: 1 = train, 2 = test, 3 = dev
train_set, test_set, dev_set = splits
train_sentences, train_y = zip(*train_set)
dev_sentences, dev_y = zip(*dev_set)
test_sentences, test_y = zip(*test_set)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [54]:
# Tools for feature extraction

# When lemmatizing, we need to convert from NLTK's part-of-speech tags
# to the parts of speech that wordnet recognizes
def get_wordnet_pos(treebank_pos):
    """Map a POS tag string to a wordnet part-of-speech constant.

    The tag is matched by prefix: 'J' -> ADJ, 'V' -> VERB, 'N' -> NOUN,
    'R' -> ADV. Any other tag falls back to NOUN.
    """
    prefix_table = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_table:
        if treebank_pos.startswith(prefix):
            return wordnet_pos
    # Unrecognized tags are treated as nouns
    return wordnet.NOUN

def sentence_tokenize(sentence, lem = WordNetLemmatizer()):
    """Split a sentence into tokens and return the lemma of each one.

    Tokens are POS-tagged with nltk.pos_tag, and each tag is translated by
    get_wordnet_pos before being passed to the lemmatizer.

    `lem` defaults to one WordNetLemmatizer instance created when this
    function is defined, so every call that omits it shares that instance.
    """
    lemmas = []
    for word, treebank_tag in nltk.pos_tag(nltk.word_tokenize(sentence)):
        lemmas.append(lem.lemmatize(word, pos=get_wordnet_pos(treebank_tag)))
    return lemmas

# Unigram counts over the lemmatized tokens
count_vectorizer = CountVectorizer(input="content", tokenizer=sentence_tokenize)

# Same tokenizer, but counting only bigrams
tuple_count_vectorizer = CountVectorizer(
    input="content",
    tokenizer=sentence_tokenize,
    ngram_range=(2, 2),
)

# Both vocabularies are learned from the training sentences only
for _vectorizer in (count_vectorizer, tuple_count_vectorizer):
    _vectorizer.fit(train_sentences)

def count_vectorize(sentences, ngram=False):
    """Transform sentences with one of the already-fitted count vectorizers.

    ngram=False (default) uses count_vectorizer (unigrams);
    a truthy ngram uses tuple_count_vectorizer (bigrams).
    """
    vectorizer = tuple_count_vectorizer if ngram else count_vectorizer
    return vectorizer.transform(sentences)

In [55]:
# Label conversion
def coarse_label(sentiment):
    """Return the binary label for a sentiment score.

    Scores of 0.5 and above are "Positive"; everything else is "Negative".
    """
    return "Positive" if sentiment >= 0.5 else "Negative"
    
def fine_label(sentiment):
    """Return the five-way label for a sentiment score.

    Each bucket's upper bound is inclusive: <= 0.2 "Very Negative",
    <= 0.4 "Negative", <= 0.6 "Neutral", <= 0.8 "Positive"; anything that
    matches none of those bounds is "Very Positive".
    """
    upper_bounds = (
        (0.2, "Very Negative"),
        (0.4, "Negative"),
        (0.6, "Neutral"),
        (0.8, "Positive"),
    )
    for bound, label in upper_bounds:
        if sentiment <= bound:
            return label
    return "Very Positive"
    
# Binary labels for each split (one progress bar per list)
coarse_train_y = list(map(coarse_label, tqdm_notebook(train_y)))
coarse_dev_y = list(map(coarse_label, tqdm_notebook(dev_y)))
coarse_test_y = list(map(coarse_label, tqdm_notebook(test_y)))

# Five-way labels for each split
fine_train_y = list(map(fine_label, tqdm_notebook(train_y)))
fine_dev_y = list(map(fine_label, tqdm_notebook(dev_y)))
fine_test_y = list(map(fine_label, tqdm_notebook(test_y)))

HBox(children=(IntProgress(value=0, max=8544), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1101), HTML(value='')))




HBox(children=(IntProgress(value=0, max=2210), HTML(value='')))




HBox(children=(IntProgress(value=0, max=8544), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1101), HTML(value='')))




HBox(children=(IntProgress(value=0, max=2210), HTML(value='')))




In [56]:
# Common Setup

# Fold assignment for the stacked train+dev rows: training rows get -1,
# dev rows get fold 0, so the dev set is the single validation fold.
ps = PredefinedSplit([-1] * len(train_sentences) + [0] * len(dev_sentences))

# Smoothing values tried for the Naive Bayes models
nb_grid = dict(alpha=[1e-4, 0.01, 0.1, 1.0, 2.0, 10.0])

# Hyper-parameters tried for the SVM
svm_grid = dict(
#         kernel=["linear", "poly", "rbf", "sigmoid"],
#         degree=range(1, 20, 2),
    C=[1e-4, 0.01, 0.1, 1.0, 2.0, 10.0],
    tol=[1e-4, 0.01, 0.1, 1.0, 2.0, 10.0],
    max_iter=range(1000, 10001, 1000),
)

In [57]:
# Vectorizing

# Train and dev sentences are stacked (train first) so that the row order
# matches the fold assignment used for the grid searches.
train_dev_sentences = train_sentences + dev_sentences

count_Xs = count_vectorize(train_dev_sentences)
bigram_Xs = count_vectorize(train_dev_sentences, ngram=True)

count_test_Xs = count_vectorize(test_sentences)
bigram_test_Xs = count_vectorize(test_sentences, ngram=True)

In [58]:
# Naive Bayes

bnb = BernoulliNB()
bnb_grid = GridSearchCV(bnb, nb_grid, cv=ps, refit=False)
mnb = MultinomialNB()
mnb_grid = GridSearchCV(mnb, nb_grid, cv=ps, refit=False)

n_train = len(train_sentences)
label_sets = (
    (coarse_train_y, coarse_dev_y, coarse_test_y),  # Coarse
    (fine_train_y, fine_dev_y, fine_test_y),        # Fine
)

# BernoulliNB first, then MultinomialNB; each on coarse labels, then fine.
# Per run: pick alpha on the train/dev split, print it, retrain on the
# training rows only with that alpha, then print test accuracy.
for nb_model, nb_search in ((bnb, bnb_grid), (mnb, mnb_grid)):
    for y_train, y_dev, y_test in label_sets:
        nb_search.fit(count_Xs, y_train + y_dev)
        print(nb_search.best_params_)
        nb_model.set_params(**nb_search.best_params_)
        nb_model.fit(count_Xs[:n_train], y_train)
        print(nb_model.score(count_test_Xs, y_test))

{'alpha': 1.0}
0.7683257918552037
{'alpha': 1.0}
0.39592760180995473
{'alpha': 1.0}
0.7710407239819005
{'alpha': 1.0}
0.40180995475113124


In [60]:
# Bigram Naive Bayes

b_bnb = BernoulliNB()
b_bnb_grid = GridSearchCV(b_bnb, nb_grid, cv=ps, refit=False)
b_mnb = MultinomialNB()
b_mnb_grid = GridSearchCV(b_mnb, nb_grid, cv=ps, refit=False)

n_train = len(train_sentences)
label_sets = (
    (coarse_train_y, coarse_dev_y, coarse_test_y),  # Coarse
    (fine_train_y, fine_dev_y, fine_test_y),        # Fine
)

# BernoulliNB first, then MultinomialNB; each on coarse labels, then fine.
# Per run: pick alpha on the train/dev split, print it, retrain on the
# training rows only with that alpha, then print test accuracy.
for nb_model, nb_search in ((b_bnb, b_bnb_grid), (b_mnb, b_mnb_grid)):
    for y_train, y_dev, y_test in label_sets:
        nb_search.fit(bigram_Xs, y_train + y_dev)
        print(nb_search.best_params_)
        nb_model.set_params(**nb_search.best_params_)
        nb_model.fit(bigram_Xs[:n_train], y_train)
        print(nb_model.score(bigram_test_Xs, y_test))

{'alpha': 0.1}
0.711764705882353
{'alpha': 0.1}
0.3556561085972851
{'alpha': 1.0}
0.7303167420814479
{'alpha': 1.0}
0.36787330316742084


In [None]:
# SVM

# Linear SVM Training (Coarse)
svm = LinearSVC()
# The search object gets its own name. Rebinding `svm_grid` (the parameter
# grid dict from the setup cell) to the GridSearchCV instance would make this
# cell fail when re-run, because the search would then be handed a
# GridSearchCV object as its param_grid.
svm_search = GridSearchCV(svm, svm_grid, cv=ps, refit=False)
svm_search.fit(count_Xs, coarse_train_y + coarse_dev_y)

# Linear SVM Results (Coarse)
# Retrain on the training rows only with the selected parameters, then
# report accuracy on the test set.
print(svm_search.best_params_)
svm.set_params(**svm_search.best_params_)
svm.fit(count_Xs[0:len(train_sentences)], coarse_train_y)
print(svm.score(count_test_Xs, coarse_test_y))

# Linear SVM Training (Fine)
svm_search.fit(count_Xs, fine_train_y + fine_dev_y)

# Linear SVM Results (Fine)
print(svm_search.best_params_)
svm.set_params(**svm_search.best_params_)
svm.fit(count_Xs[0:len(train_sentences)], fine_train_y)
print(svm.score(count_test_Xs, fine_test_y))



{'C': 0.1, 'max_iter': 9000, 'tol': 1.0}
0.7633484162895927
{'C': 0.01, 'max_iter': 4000, 'tol': 2.0}
0.40316742081447965


In [None]:
# Neural Networks (feed-forward: the models below are stacks of Dense layers)

# Hyper-parameters tried for the networks
nn_grid = dict(
#     epochs=[10, 100, 200, 500, 1000],
#     batch_size=[5, 10, 100, 1000]
    epochs=[100, 200],
    batch_size=[5],
)

# Label encoders are fitted on the training labels only
fine_encoder = LabelEncoder().fit(fine_train_y)
coarse_encoder = LabelEncoder().fit(coarse_train_y)

# Integer-encoded labels: train+dev stacked (train first), and test
encoded_fine_y = fine_encoder.transform(fine_train_y + fine_dev_y)
encoded_fine_test_y = fine_encoder.transform(fine_test_y)

encoded_coarse_y = coarse_encoder.transform(coarse_train_y + coarse_dev_y)
encoded_coarse_test_y = coarse_encoder.transform(coarse_test_y)

def fine_count_model():
    """Build and compile the five-class network for the unigram features.

    Layers: Dense(32) sized to the columns of count_Xs -> relu -> Dense(5)
    -> softmax, compiled with categorical cross-entropy, the adam optimizer
    and an accuracy metric.
    """
    network = Sequential()
    network.add(Dense(32, input_dim=count_Xs.shape[1]))
    network.add(Activation('relu'))
    network.add(Dense(5))
    network.add(Activation('softmax'))
    network.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return network

# Select epochs/batch_size on the predefined train/dev split. With
# refit=False the search never retrains a final model, so the search object
# itself cannot be scored; the best configuration is retrained explicitly
# below, in the same way as the Naive Bayes and SVM cells.
fine_count_nn = GridSearchCV(KerasClassifier(build_fn=fine_count_model, verbose=0), nn_grid, cv=ps, refit=False)

onehot_fine_y = np_utils.to_categorical(encoded_fine_y)
fine_count_nn.fit(count_Xs, onehot_fine_y)
print(fine_count_nn.best_params_)

# Retrain on the training rows only with the selected parameters, then
# report accuracy on the test set.
fine_count_best = KerasClassifier(build_fn=fine_count_model, verbose=0, **fine_count_nn.best_params_)
fine_count_best.fit(count_Xs[0:len(train_sentences)], onehot_fine_y[0:len(train_sentences)])
print(fine_count_best.score(count_test_Xs, np_utils.to_categorical(encoded_fine_test_y)))

def fine_bigram_model():
    """Build and compile the five-class network for the bigram features.

    Layers: Dense(32) sized to the columns of bigram_Xs -> relu -> Dense(5)
    -> softmax, compiled with categorical cross-entropy, the adam optimizer
    and an accuracy metric.
    """
    network = Sequential()
    network.add(Dense(32, input_dim=bigram_Xs.shape[1]))
    network.add(Activation('relu'))
    network.add(Dense(5))
    network.add(Activation('softmax'))
    network.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return network

# Select epochs/batch_size on the predefined train/dev split. With
# refit=False the search never retrains a final model, so the search object
# itself cannot be scored; the best configuration is retrained explicitly
# below, in the same way as the Naive Bayes and SVM cells.
fine_bigram_nn = GridSearchCV(KerasClassifier(build_fn=fine_bigram_model, verbose=0), nn_grid, cv=ps, refit=False)

onehot_fine_y = np_utils.to_categorical(encoded_fine_y)
fine_bigram_nn.fit(bigram_Xs, onehot_fine_y)
print(fine_bigram_nn.best_params_)

# Retrain on the training rows only with the selected parameters, then
# report accuracy on the test set.
fine_bigram_best = KerasClassifier(build_fn=fine_bigram_model, verbose=0, **fine_bigram_nn.best_params_)
fine_bigram_best.fit(bigram_Xs[0:len(train_sentences)], onehot_fine_y[0:len(train_sentences)])
print(fine_bigram_best.score(bigram_test_Xs, np_utils.to_categorical(encoded_fine_test_y)))