In [None]:
# Import statements and global helper methods
import numpy as np
import nltk
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
from nltk.corpus import wordnet, stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, PredefinedSplit
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC, LinearSVC
from tqdm import tqdm_notebook
from gensim.models import word2vec
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
import re

In [None]:
# Dataset toggle: when True, the `if stanford_run:` cell reads
# datasetSentences.txt, sentiment_labels.txt, dictionary.txt and
# datasetSplit.txt; when False, the `if not stanford_run:` cell reads the
# IMDB-train/valid/test text files instead.
stanford_run = True

In [None]:
if stanford_run:
    # Read the four dataset files and build (sentence, score) pairs per split

    # Sentence text is the second tab-separated column; the first row is a header
    raw_sentences = []
    with open("datasets/datasetSentences.txt", encoding="utf-8") as sentence_file:
        next(sentence_file)
        for row in tqdm_notebook(sentence_file):
            raw_sentences.append(row.split("\t")[1].strip())

    # Phrase id (kept as text) -> sentiment score, from "id|score" rows after a header
    sentiment_map = {}
    with open("datasets/sentiment_labels.txt") as sentiment_file:
        next(sentiment_file)
        for row in tqdm_notebook(sentiment_file):
            phrase_id, raw_score = row.split("|")
            sentiment_map[phrase_id] = float(raw_score.strip())

    # Phrase text -> phrase id, from "phrase|id" rows (no header row is skipped here)
    sentence_to_phrase_map = {}
    with open("datasets/dictionary.txt", encoding="utf-8") as dictionary_file:
        for row in tqdm_notebook(dictionary_file):
            phrase_text, phrase_id = row.split("|")
            sentence_to_phrase_map[phrase_text] = phrase_id.strip()

    # Each sentence is looked up as a phrase to find its id, then its score
    labelled_sentences = [
        (text, sentiment_map[sentence_to_phrase_map[text]])
        for text in raw_sentences
    ]

    # One bucket per split number
    splits = ([], [], [])
    with open("datasets/datasetSplit.txt") as split_file:
        next(split_file)
        for row in tqdm_notebook(split_file):
            sentence_no, split_no = (int(field) for field in row.split(","))
            # Split numbers run 1-3 and sentence numbers are used 1-based,
            # so both are shifted down by one before indexing
            splits[split_no - 1].append(labelled_sentences[sentence_no - 1])

    # Bucket order is train, test, dev
    train_set, test_set, dev_set = splits
    train_sentences, train_y = zip(*train_set)
    test_sentences, test_y = zip(*test_set)
    dev_sentences, dev_y = zip(*dev_set)

In [None]:
if not stanford_run:
    #Method used to load the data from the given files
    def openFile(fileName):
        """Load reviews and their scores from a tab-separated text file.

        The file is read as a sequence of records, each made of a review,
        a tab, a score and a newline. Everything after a tab up to the next
        newline is the score of the review that precedes the tab; the text
        after that newline starts the next review.

        Parameters
        ----------
        fileName : str
            Path of the UTF-8 text file to read.

        Returns
        -------
        tuple of (list of str, list of str)
            The reviews and their scores in file order. Both lists always
            have the same length (empty when the file contains no tab).
            Scores are returned as the text found in the file, not as numbers.
        """
        #open file and read it entirely
        with open(fileName, 'r', encoding = 'utf-8') as file:
            data = file.read()

        #every chunk after the first one is "<score><newline><next review>"
        chunks = data.split('\t')

        reviews = []
        scores = []

        pending_review = chunks[0]
        for chunk in chunks[1:]:
            #cut at the first newline only, so the score keeps all of its
            #characters and the next review does not start with a newline
            score, _, following = chunk.partition('\n')
            reviews.append(pending_review)
            scores.append(score.strip())
            pending_review = following

        #whatever follows the last score is not paired with a score and is dropped
        return reviews, scores

    #loading all the data
    # NOTE(review): scores stay strings here, while coarse_label/fine_label
    # compare their argument against float thresholds - confirm whether a
    # numeric conversion is needed on this path.
    train_sentences, train_y = openFile('datasets/IMDB-train.txt')
    dev_sentences, dev_y = openFile('datasets/IMDB-valid.txt')
    test_sentences, test_y = openFile('datasets/IMDB-test.txt')

In [None]:
# Tools for feature extraction

# When lemmatizing, we need to convert from NLTK's part-of-speech tags
# to WordNet's recognized parts of speech
def get_wordnet_pos(treebank_pos):
    """Translate a POS tag into the matching WordNet constant.

    Tags starting with J, V, N or R give wordnet.ADJ, wordnet.VERB,
    wordnet.NOUN and wordnet.ADV respectively; any other tag is treated
    as a noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if treebank_pos.startswith(prefix):
            return wordnet_pos
    # Fallback for every tag that matched none of the prefixes
    return wordnet.NOUN

def sentence_tokenize(sentence, lem = WordNetLemmatizer()):
    """Tokenize, POS-tag and lemmatize a sentence; return the list of lemmas.

    The default lemmatizer is created once, when the function is defined,
    and shared by every call that does not pass its own.
    """
    lemmas = []
    for token, tag in nltk.pos_tag(nltk.word_tokenize(sentence)):
        # The tagger's tag is mapped to a WordNet POS before lemmatizing
        lemmas.append(lem.lemmatize(token, pos=get_wordnet_pos(tag)))
    return lemmas

# Both vectorizers tokenize with sentence_tokenize and learn their vocabulary
# from the training sentences only (fit returns the fitted vectorizer itself).

# Single-token counts
count_vectorizer = CountVectorizer(
    input="content", tokenizer=sentence_tokenize
).fit(train_sentences)

# Counts of single tokens and of adjacent token pairs
tuple_count_vectorizer = CountVectorizer(
    input="content", tokenizer=sentence_tokenize, ngram_range=(1, 2)
).fit(train_sentences)

def count_vectorize(sentences, ngram=False):
    """Turn sentences into count features using the already-fitted vectorizers.

    With ngram true the (1, 2)-gram vectorizer is used; otherwise the
    single-token one.
    """
    vectorizer = tuple_count_vectorizer if ngram else count_vectorizer
    return vectorizer.transform(sentences)

In [None]:
# Label conversion
def coarse_label(sentiment):
    """Two-class label: "Positive" for scores of 0.5 and above, else "Negative"."""
    return "Positive" if sentiment >= 0.5 else "Negative"
    
def fine_label(sentiment):
    """Five-class label for a sentiment score.

    The score falls into the first bucket whose inclusive upper bound it
    does not exceed (0.2, 0.4, 0.6, 0.8); anything above 0.8 is
    "Very Positive".
    """
    upper_bounds = (
        (0.2, "Very Negative"),
        (0.4, "Negative"),
        (0.6, "Neutral"),
        (0.8, "Positive"),
    )
    for bound, label in upper_bounds:
        if sentiment <= bound:
            return label
    return "Very Positive"
    
# Two-class labels for each split
coarse_train_y = list(map(coarse_label, tqdm_notebook(train_y)))
coarse_dev_y = list(map(coarse_label, tqdm_notebook(dev_y)))
coarse_test_y = list(map(coarse_label, tqdm_notebook(test_y)))

# Five-class labels for each split
fine_train_y = list(map(fine_label, tqdm_notebook(train_y)))
fine_dev_y = list(map(fine_label, tqdm_notebook(dev_y)))
fine_test_y = list(map(fine_label, tqdm_notebook(test_y)))

In [None]:
# Common Setup
# One fold marker per sentence, training sentences first: -1 keeps a row out
# of the validation fold, 0 places it in the single validation fold.
ps = PredefinedSplit([-1] * len(train_sentences) + [0] * len(dev_sentences))

In [None]:
# Vectorizing

# Single-token count features: train rows followed by dev rows, then the test rows
count_Xs = count_vectorize(train_sentences + dev_sentences, ngram=False)
count_test_Xs = count_vectorize(test_sentences, ngram=False)

# Same row layout, with the (1, 2)-gram vectorizer
bigram_Xs = count_vectorize(train_sentences + dev_sentences, ngram=True)
bigram_test_Xs = count_vectorize(test_sentences, ngram=True)

In [None]:
def test_model(underlying_model, param_grid, cv=ps, fine=False, bigrams=False):
    """Grid-search a model on train+dev and print its best params and test score.

    Parameters
    ----------
    underlying_model : estimator
        Estimator handed to GridSearchCV.
    param_grid : dict
        Hyper-parameter grid handed to GridSearchCV.
    cv : cross-validation splitter, optional
        Forwarded to GridSearchCV. Defaults to the module-level `ps`
        (bound when this function is defined).
    fine : bool, optional
        Use the five-class labels instead of the two-class ones.
    bigrams : bool, optional
        Use the (1, 2)-gram count features instead of single-token counts.

    Returns
    -------
    None
        The best parameters and the score on the test set are printed.
    """
    if fine:
        train_labels, dev_labels, test_labels = fine_train_y, fine_dev_y, fine_test_y
    else:
        train_labels, dev_labels, test_labels = coarse_train_y, coarse_dev_y, coarse_test_y

    if bigrams:
        Xs, test_Xs = bigram_Xs, bigram_test_Xs
    else:
        Xs, test_Xs = count_Xs, count_test_Xs

    # Honour the caller's splitter rather than always using the global `ps`
    grid = GridSearchCV(underlying_model, param_grid, cv=cv)
    # Feature rows are train followed by dev, so the labels are concatenated
    # in the same order
    grid.fit(Xs, train_labels + dev_labels)

    print(grid.best_params_)
    print(grid.score(test_Xs, test_labels))

In [None]:
# Naive Bayes
# Smoothing values searched for both Naive Bayes variants
nb_grid = dict(alpha=[1e-4, 0.01, 0.1, 1.0, 2.0, 10.0])

# Each variant is run on: unigram coarse, unigram fine, bigram coarse, bigram fine
bnb = BernoulliNB()
test_model(bnb, nb_grid, fine=False, bigrams=False)
test_model(bnb, nb_grid, fine=True, bigrams=False)
test_model(bnb, nb_grid, fine=False, bigrams=True)
test_model(bnb, nb_grid, fine=True, bigrams=True)

# Blank line between the two variants' printed results
print("")

mnb = MultinomialNB()
test_model(mnb, nb_grid, fine=False, bigrams=False)
test_model(mnb, nb_grid, fine=True, bigrams=False)
test_model(mnb, nb_grid, fine=False, bigrams=True)
test_model(mnb, nb_grid, fine=True, bigrams=True)

In [None]:
# SVM
# Grid over regularization strength, stopping tolerance and iteration cap
svm_grid = dict(
    C=[1e-4, 0.01, 0.1, 1.0, 2.0, 10.0],
    tol=[1e-4, 0.01, 0.1, 1.0, 2.0, 10.0],
    max_iter=range(1000, 10001, 1000),
)

# Coarse labels first, then fine labels, both on single-token counts
svm = LinearSVC()
test_model(svm, svm_grid, fine=False)
test_model(svm, svm_grid, fine=True)

In [None]:
# Random Forest
# Grid over forest size, split criterion and tree depth limit
rf_grid = dict(
    n_estimators=[10, 20, 30, 40, 50],
    criterion=["gini", "entropy"],
    max_depth=[5, 10, 50, 100, 1000],
)

# Coarse labels first, then fine labels, both on single-token counts
rf = RandomForestClassifier()
test_model(rf, rf_grid, fine=False)
test_model(rf, rf_grid, fine=True)

In [None]:
# Logistic Regression
# Grid over penalty type, stopping tolerance and inverse regularization strength
lr_grid = {
    "penalty": ["l1", "l2"],
    "tol": [1e-4, 0.01, 0.1, 1.0, 2.0, 10.0],
    "C": [1e-4, 0.01, 0.1, 1.0, 2.0, 10.0]
}

# The grid searches both "l1" and "l2", so the solver is pinned to liblinear,
# which accepts both. Relying on the default solver breaks the "l1" half of
# the grid on scikit-learn releases whose default (lbfgs) only supports "l2".
lr = LogisticRegression(solver="liblinear")
test_model(lr, lr_grid)
test_model(lr, lr_grid, fine=True)

In [None]:
# K-Nearest Neighbors
# Grid over neighbourhood size and vote weighting
kn_grid = dict(
    n_neighbors=[5, 10, 15, 20],
    weights=["uniform", "distance"],
)

# Coarse labels first, then fine labels, both on single-token counts
kn = KNeighborsClassifier()
test_model(kn, kn_grid, fine=False)
test_model(kn, kn_grid, fine=True)

In [None]:
# Word Vector Averaging

def review_wordlist(review, remove_stopwords=False):
    """Split a review into lowercase words made of ASCII letters only.

    Every character outside a-z / A-Z is replaced by a space, the text is
    lowercased and split on whitespace. When remove_stopwords is true,
    NLTK's English stopwords are dropped from the result.
    """
    letters_only = re.sub("[^a-zA-Z]", " ", review)
    words = letters_only.lower().split()
    if not remove_stopwords:
        return words

    # A set makes the per-word membership test cheap
    stops = set(stopwords.words("english"))
    return [word for word in words if word not in stops]

# Word2Vec settings. Only min_count=1 is passed to Word2Vec below; the other
# values are not forwarded to it. num_features is used later as the length of
# the averaged feature vectors.
num_features = 100    # Word vector dimensionality
min_word_count = 5    # Minimum word count
num_workers = 2       # Number of parallel threads
context = 5           # Context window size
downsampling = 1e-3   # (0.001) Downsample setting for frequent words

# Word lists for every train and dev sentence, stopwords kept
combined_training_sentences = list(train_sentences + dev_sentences)
tokenized_sentences = [
    review_wordlist(text, remove_stopwords=False)
    for text in combined_training_sentences
]

# Build the model from the tokenized sentences, keeping words seen only once
model = word2vec.Word2Vec(tokenized_sentences, min_count=1)

# Explicit training pass of 10 epochs over the same sentences
model.train(
    tokenized_sentences,
    total_examples=len(tokenized_sentences),
    epochs=10,
)

# Function to average all word vectors in a review
def featureVecMethod(words, model, num_features):
    """Average the word vectors of the in-vocabulary words of one review.

    Parameters
    ----------
    words : iterable of str
        Tokens of the review.
    model : object with a `wv` attribute
        Trained word-vector model; `model.wv` must support `word in wv`
        and `wv[word]`.
    num_features : int
        Length of the word vectors, and of the returned vector.

    Returns
    -------
    numpy.ndarray
        1-D array of length num_features holding the mean vector of the
        words found in the vocabulary. All zeros when none is found, in
        which case `words` is also printed.
    """
    # Membership and lookup both go through the keyed vectors, so no set of
    # the whole vocabulary has to be rebuilt on every call
    word_vectors = model.wv

    featureVec = np.zeros(num_features, dtype="float32")
    nwords = 0

    for word in words:
        if word in word_vectors:
            nwords += 1
            featureVec = np.add(featureVec, word_vectors[word])

    # Nothing to average: report the review and return the zero vector
    if nwords == 0:
        print(words)
        return featureVec

    return np.divide(featureVec, nwords)

# Function for calculating average word vectors for all reviews
def getAvgFeatureVecs(reviews, model, num_features):
    """Build a (len(reviews), num_features) float32 matrix of averaged vectors.

    Row i is featureVecMethod applied to reviews[i]. A progress line is
    printed before every 1000th review, starting with the first.
    """
    total = len(reviews)
    reviewFeatureVecs = np.zeros((total, num_features), dtype="float32")

    for position, review in enumerate(reviews):
        if position % 1000 == 0:
            print("Review %d of %d"%(position, total))
        reviewFeatureVecs[position] = featureVecMethod(review, model, num_features)

    return reviewFeatureVecs
    
# Averaged vectors for the train+dev sentences tokenized above
trainDataVecs = getAvgFeatureVecs(tokenized_sentences, model, num_features)

# Test sentences get the same tokenization (stopwords kept) before averaging
filtered_test_reviews = [
    review_wordlist(text, remove_stopwords=False) for text in test_sentences
]
testDataVecs = getAvgFeatureVecs(filtered_test_reviews, model, num_features)

clf = LinearSVC(max_iter=20000)

# Two-class task: fit on train+dev, print the test score
clf.fit(trainDataVecs, coarse_train_y + coarse_dev_y)
print(clf.score(testDataVecs, coarse_test_y))

# Five-class task: the same classifier object is refitted, then scored
clf.fit(trainDataVecs, fine_train_y + fine_dev_y)
print(clf.score(testDataVecs, fine_test_y))