In [None]:
# Import statements and global helper methods
import nltk
import numpy as np
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV, PredefinedSplit
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from tqdm import tqdm_notebook

In [None]:
# Load in the data sets

# Collect the sentence text (second tab-separated column) of every data row.
raw_sentences = []

with open("datasets/datasetSentences.txt", encoding="utf-8") as sentence_file:
    next(sentence_file)  # the first line is a header, not data
    raw_sentences.extend(
        row.split("\t")[1].strip()
        for row in tqdm_notebook(sentence_file)
    )
    
# Map each id (kept as a string, exactly as it appears before the "|")
# to its sentiment score parsed as a float.
sentiment_map = {}
with open("datasets/sentiment_labels.txt") as sentiment_file:
    next(sentiment_file)  # discard the first line before parsing
    for record in tqdm_notebook(sentiment_file):
        phrase_id, raw_score = record.split("|")
        sentiment_map[phrase_id] = float(raw_score.strip())
    
# Map each phrase string to its phrase number (kept as a stripped string so it
# can be used directly as a key into sentiment_map).
sentence_to_phrase_map = {}
with open("datasets/dictionary.txt", encoding="utf-8") as dictionary_file:
    for line in tqdm_notebook(dictionary_file):
        # Split on the LAST "|" only: the phrase number is the final field, so
        # a phrase that itself contains "|" no longer breaks the unpacking.
        phrase, phrase_num = line.rsplit("|", 1)
        sentence_to_phrase_map[phrase] = phrase_num.strip()

# Attach to every sentence the sentiment score of its matching phrase entry.
labelled_sentences = []
for sentence_text in raw_sentences:
    phrase_id = sentence_to_phrase_map[sentence_text]
    labelled_sentences.append((sentence_text, sentiment_map[phrase_id]))

# One bucket per split label found in the split file.
splits = ([], [], [])

with open("datasets/datasetSplit.txt") as split_file:
    next(split_file)  # discard the first line before parsing
    for record in tqdm_notebook(split_file):
        sentence_index, split_label = map(int, record.split(","))
        # Split labels run 1-3 and sentence indices are offset by one as well,
        # so both are shifted down to index into the Python sequences.
        splits[split_label - 1].append(labelled_sentences[sentence_index - 1])

# Buckets are unpacked in label order: 1 -> train, 2 -> test, 3 -> dev.
train_set, test_set, dev_set = splits
train_sentences, train_y = zip(*train_set)
test_sentences, test_y = zip(*test_set)
dev_sentences, dev_y = zip(*dev_set)

In [None]:
# Tools for feature extraction

# When lemmatizing, we need to convert the Penn Treebank part-of-speech tags
# produced by NLTK's tagger into the parts of speech that WordNet recognizes
def get_wordnet_pos(treebank_pos):
    """Translate a POS tag string into one of WordNet's POS constants.

    The tag is matched on its leading letter: 'J' -> adjective, 'V' -> verb,
    'N' -> noun, 'R' -> adverb. Any other tag falls back to noun.
    """
    prefix_table = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_table:
        if treebank_pos.startswith(prefix):
            return wordnet_pos
    # Unrecognized tags are treated as nouns.
    return wordnet.NOUN

def sentence_tokenize(sentence, lem = WordNetLemmatizer()):
    """Tokenize a sentence and return the lemma of each token.

    The sentence is word-tokenized and POS-tagged with NLTK; each token is
    then lemmatized by `lem` using the WordNet POS derived from its tag.

    Args:
        sentence: The sentence string to tokenize.
        lem: Lemmatizer exposing `lemmatize(word, pos=...)`. The default
            instance is created once, when the function is defined.

    Returns:
        A list of lemmas, one per token, in token order.
    """
    lemmas = []
    for word, tag in nltk.pos_tag(nltk.word_tokenize(sentence)):
        lemmas.append(lem.lemmatize(word, pos=get_wordnet_pos(tag)))
    return lemmas

# Options shared by both vectorizers: raw strings in, lemmatizing tokenizer.
_vectorizer_options = {"input": "content", "tokenizer": sentence_tokenize}

# Single-token counts, with the vocabulary learned from the training sentences.
count_vectorizer = CountVectorizer(**_vectorizer_options)
count_vectorizer.fit(train_sentences)

# Bigram-only counts (ngram_range of exactly 2), fitted on the same sentences.
tuple_count_vectorizer = CountVectorizer(ngram_range=(2, 2), **_vectorizer_options)
tuple_count_vectorizer.fit(train_sentences)

def count_vectorize(sentences, ngram=False):
    """Transform sentences into count vectors with a pre-fitted vectorizer.

    Args:
        sentences: Sentences to transform.
        ngram: When truthy, use the bigram vectorizer; otherwise use the
            single-token vectorizer.

    Returns:
        Whatever the selected vectorizer's `transform` returns.
    """
    vectorizer = tuple_count_vectorizer if ngram else count_vectorizer
    return vectorizer.transform(sentences)
    
def binary_vectorize(sentences, ngram=False):
    """Vectorize sentences into presence/absence (0 or 1) features.

    Args:
        sentences: Sentences to transform; passed through to count_vectorize.
        ngram: Forwarded to count_vectorize to select the bigram vectorizer
            instead of the single-token one.

    Returns:
        A copy of the count matrix in which every stored count has been
        clamped to the range [0, 1].
    """
    raw_vectors = count_vectorize(sentences, ngram)
    # CountVectorizer.transform yields a scipy sparse matrix, and np.clip
    # cannot be applied to a sparse matrix directly. Clamp only the stored
    # values, on a copy so the count matrix itself is left untouched.
    binary_vectors = raw_vectors.copy()
    binary_vectors.data = np.clip(binary_vectors.data, 0, 1)
    return binary_vectors

In [None]:
# Label conversion
def coarse_label(sentiment):
    """Return the two-class label for a sentiment score.

    Scores of 0.5 or above are "Positive"; everything else is "Negative".
    """
    return "Positive" if sentiment >= 0.5 else "Negative"
    
def fine_label(sentiment):
    """Return the five-class label for a sentiment score.

    Each bucket is bounded above by an exclusive threshold (0.2, 0.4, 0.6,
    0.8); a score below none of them is "Very Positive".
    """
    upper_bounds = (
        (0.2, "Very Negative"),
        (0.4, "Negative"),
        (0.6, "Neutral"),
        (0.8, "Positive"),
    )
    for upper_bound, label in upper_bounds:
        if sentiment < upper_bound:
            return label
    return "Very Positive"
    
# Two-class labels for each split
coarse_train_y = list(map(coarse_label, tqdm_notebook(train_y)))
coarse_dev_y = list(map(coarse_label, tqdm_notebook(dev_y)))
coarse_test_y = list(map(coarse_label, tqdm_notebook(test_y)))

# Five-class labels for each split
fine_train_y = list(map(fine_label, tqdm_notebook(train_y)))
fine_dev_y = list(map(fine_label, tqdm_notebook(dev_y)))
fine_test_y = list(map(fine_label, tqdm_notebook(test_y)))

In [None]:
# Common Setup
# Single hold-out split for the grid searches below, which are fitted on
# train + dev data concatenated in that order. A fold value of -1 keeps a
# sample in the training portion of every split; 0 places it in the one
# validation fold. The array must therefore be sized from the train and dev
# sets (not the test set) so that it lines up with the fitted data.
ps = PredefinedSplit([-1] * len(train_sentences) + [0] * len(dev_sentences))
# Candidate smoothing values searched for both Naive Bayes variants.
nb_grid = {"alpha": [1e-4, 0.01, 0.1, 1.0, 2.0, 10.0]}

In [None]:
# Naive Bayes

# BernoulliNB Vectorizing (train + dev for the grid search, test held out)
binary_Xs = binary_vectorize(train_sentences + dev_sentences)
binary_test_Xs = binary_vectorize(test_sentences)

print("Vectorized for BernoulliNB")

# BernoulliNB Training: search nb_grid using the predefined split `ps`
bernoulli_naive_bayes = GridSearchCV(BernoulliNB(), nb_grid, cv=ps)
bernoulli_naive_bayes.fit(binary_Xs, coarse_train_y + coarse_dev_y)

# BernoulliNB Results: score on the test set, then the selected parameters
# (best_params_ reports the grid-search winner; get_params() would only echo
# the search configuration)
print(bernoulli_naive_bayes.score(binary_test_Xs, coarse_test_y))
print(bernoulli_naive_bayes.best_params_)

# MultinomialNB Vectorizing (train + dev for the grid search, test held out)
count_Xs = count_vectorize(train_sentences + dev_sentences)
count_test_Xs = count_vectorize(test_sentences)

print("Vectorized for MultinomialNB")

# MultinomialNB Training: search nb_grid using the predefined split `ps`
multinomial_naive_bayes = GridSearchCV(MultinomialNB(), nb_grid, cv=ps)
multinomial_naive_bayes.fit(count_Xs, coarse_train_y + coarse_dev_y)

# Results: score on the test set, then the selected parameters
print(multinomial_naive_bayes.score(count_test_Xs, coarse_test_y))
print(multinomial_naive_bayes.best_params_)

In [None]:
# Bigram Naive Bayes

# BernoulliNB Vectorizing (bigram features; train + dev for the grid search)
bigram_binary_Xs = binary_vectorize(train_sentences + dev_sentences, ngram=True)
bigram_binary_test_Xs = binary_vectorize(test_sentences, ngram=True)

print("Vectorized for BernoulliNB")

# BernoulliNB Training: search nb_grid using the predefined split `ps`
bernoulli_bigram_nb = GridSearchCV(BernoulliNB(), nb_grid, cv=ps)
bernoulli_bigram_nb.fit(bigram_binary_Xs, coarse_train_y + coarse_dev_y)

# BernoulliNB Results: score on the test set, then the selected parameters
# (best_params_ reports the grid-search winner; get_params() would only echo
# the search configuration)
print(bernoulli_bigram_nb.score(bigram_binary_test_Xs, coarse_test_y))
print(bernoulli_bigram_nb.best_params_)

# MultinomialNB Vectorizing (bigram features; train + dev for the grid search)
bigram_count_Xs = count_vectorize(train_sentences + dev_sentences, ngram=True)
bigram_count_test_Xs = count_vectorize(test_sentences, ngram=True)

print("Vectorized for MultinomialNB")

# MultinomialNB Training: search nb_grid using the predefined split `ps`
multinomial_bigram_nb = GridSearchCV(MultinomialNB(), nb_grid, cv=ps)
multinomial_bigram_nb.fit(bigram_count_Xs, coarse_train_y + coarse_dev_y)

# Results: score on the test set, then the selected parameters
print(multinomial_bigram_nb.score(bigram_count_test_Xs, coarse_test_y))
print(multinomial_bigram_nb.best_params_)

In [None]:
# SVM

In [None]:
# Word Vector Averaging

In [None]:
# Recurrent Neural Networks

In [None]:
# Matrix Vector RNN

In [None]:
# Recurrent Tensor Neural Networks