In [None]:
# Import statements and global helper methods
import nltk
import numpy as np
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from scipy.sparse import issparse
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB

In [None]:
# Load in the data sets

# Collect the sentence text (second tab-separated column) from every row
# after the first line of the sentences file.
raw_sentences = []
with open("datasets/datasetSentences.txt", encoding="utf-8") as sentence_file:
    next(sentence_file)  # discard the header row
    raw_sentences.extend(
        row.split("\t")[1].strip() for row in sentence_file
    )
    
# Map each id in the labels file (kept as a string) to its float score.
# Rows have the form "<id>|<score>"; the first line is skipped.
sentiment_map = {}
with open("datasets/sentiment_labels.txt") as sentiment_file:
    next(sentiment_file)
    for record in sentiment_file:
        phrase_id, raw_score = record.split("|")
        sentiment_map[phrase_id] = float(raw_score.strip())
    
# Map each phrase's text to its id (kept as a string, whitespace stripped).
# Rows have the form "<phrase>|<id>". The id is always the last field, so the
# row is split on the LAST "|" only; a phrase whose text itself contains "|"
# is then kept whole instead of breaking the two-value unpacking.
sentence_to_phrase_map = {}
with open("datasets/dictionary.txt", encoding="utf-8") as dictionary_file:
    for line in dictionary_file:
        phrase, phrase_num = line.rsplit("|", 1)
        sentence_to_phrase_map[phrase] = phrase_num.strip()

# Pair every sentence with its score: sentence text -> phrase id -> score.
labelled_sentences = list(zip(
    raw_sentences,
    (sentiment_map[sentence_to_phrase_map[text]] for text in raw_sentences),
))
            
# One bucket per split label. The file labels splits 1-3 and the sentence
# index is used 1-based, so both are shifted down to 0-based positions.
splits = ([], [], [])

with open("datasets/datasetSplit.txt") as split_file:
    next(split_file)  # skip the first line
    for row in split_file:
        sentence_number, split_label = map(int, row.split(","))
        bucket = splits[split_label - 1]
        bucket.append(labelled_sentences[sentence_number - 1])

train_set, test_set, dev_set = splits

In [None]:
# Tools for feature extraction

# When lemmatizing, we need to convert NLTK's (Penn Treebank) part-of-speech
# tags into the parts of speech that WordNet recognizes
def get_wordnet_pos(treebank_pos):
    """Return the WordNet part-of-speech constant for a treebank tag.

    Tags starting with ``J``, ``V``, ``N`` or ``R`` map to ``wordnet.ADJ``,
    ``wordnet.VERB``, ``wordnet.NOUN`` and ``wordnet.ADV`` respectively.
    Any other tag falls back to ``wordnet.NOUN``.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if treebank_pos.startswith(prefix):
            return wordnet_pos
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN

def sentence_tokenize(sentence, lemmatizer = WordNetLemmatizer()):
    """Split a sentence into lemmatized word tokens.

    The sentence is tokenized with ``nltk.word_tokenize`` and tagged with
    ``nltk.pos_tag``. Each token is then lemmatized using the WordNet part
    of speech that ``get_wordnet_pos`` derives from its tag.

    Args:
        sentence: The sentence text to tokenize.
        lemmatizer: Object exposing ``lemmatize(word, pos=...)``. The default
            instance is created once, when the function is defined, and is
            reused by every call that does not pass its own.

    Returns:
        A list with one lemma per token, in token order.
    """
    tokens = nltk.word_tokenize(sentence)
    tagged_tokens = nltk.pos_tag(tokens)
    return [
        lemmatizer.lemmatize(word, pos=get_wordnet_pos(tag))
        for word, tag in tagged_tokens
    ]

# Vectorizer over single tokens (default n-gram range), tokenized and
# lemmatized by sentence_tokenize.
count_vectorizer = CountVectorizer(
    input = "content",
    tokenizer = sentence_tokenize
)

# Same tokenization, but ngram_range = (2, 2) restricts features to bigrams.
tuple_count_vectorizer = CountVectorizer(
    input = "content",
    tokenizer = sentence_tokenize,
    ngram_range = (2, 2)
)

# Fit both vocabularies on the training sentences only; the sentence list is
# built once and shared by both fits.
train_sentences = [s for s, _ in train_set]
count_vectorizer.fit(train_sentences)
tuple_count_vectorizer.fit(train_sentences)

def count_vectorize(sentences, ngram=False):
    """Transform sentences into count vectors with a fitted vectorizer.

    Args:
        sentences: Sentences handed to the vectorizer's ``transform``.
        ngram: When truthy, use ``tuple_count_vectorizer``; otherwise use
            ``count_vectorizer``.

    Returns:
        Whatever the selected vectorizer's ``transform`` returns.
    """
    vectorizer = tuple_count_vectorizer if ngram else count_vectorizer
    return vectorizer.transform(sentences)
    
def binary_vectorize(sentences, ngram=False):
    """Transform sentences into presence/absence (0 or 1) vectors.

    The count vectors from ``count_vectorize`` are clipped to the range
    [0, 1], so any feature that occurs at least once becomes 1.

    Args:
        sentences: Sentences to vectorize.
        ngram: Forwarded to ``count_vectorize`` to choose the vectorizer.

    Returns:
        The clipped vectors, in the same container kind (sparse or dense)
        that ``count_vectorize`` returned. A sparse input is copied, so the
        count matrix itself is not modified.
    """
    raw_vectors = count_vectorize(sentences, ngram)
    if issparse(raw_vectors):
        # np.clip cannot be applied to a SciPy sparse matrix directly, so
        # clip only the explicitly stored values of a copy instead.
        binary_vectors = raw_vectors.copy()
        binary_vectors.data = np.clip(binary_vectors.data, 0, 1)
        return binary_vectors
    return np.clip(raw_vectors, 0, 1)

In [None]:
# Naive Bayes

# Step 1: Vectorize binary and count vectors based on the train vocabulary
# Step 2: Run BernoulliNB and MultinomialNB on those vectors, testing performance
# Step 3: Run the same models on the vectors but using broader categories (positive, negative)
# Step 4: Tune the hyper-parameters



In [None]:
# Bigram Naive Bayes

In [None]:
# SVM

In [None]:
# Word Vector Averaging

In [None]:
# Recurrent Neural Networks

In [None]:
# Matrix Vector RNN

In [None]:
# Recurrent Tensor Neural Networks