In [45]:
# Import statements and global helper methods
import numpy as np
import nltk
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
from nltk.corpus import wordnet, stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, PredefinedSplit
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC, LinearSVC
from tqdm import tqdm_notebook
from gensim.models import word2vec
# from keras.models import Sequential
# from keras.layers import Dense, Activation
# from keras.wrappers.scikit_learn import KerasClassifier
# from keras.utils import np_utils
import re

In [46]:
# Dataset switch for the whole notebook: when True the cells below load the
# Stanford files (datasets/datasetSentences.txt, sentiment_labels.txt, ...);
# when False they load the datasets/IMDB-*.txt files instead.
# Fine-grained labels are only computed when this is True.
stanford_run = True

In [47]:
if stanford_run:
    # Load the Stanford-run data: the sentences, the per-phrase sentiment
    # scores, the phrase -> phrase-id dictionary and the train/test/dev split.
    # Every file is opened with an explicit utf-8 encoding so the result does
    # not depend on the platform's default locale.

    # Sentences in file order; the sentence text is the second tab-separated field.
    raw_sentences = []
    with open("datasets/datasetSentences.txt", encoding="utf-8") as sentence_file:
        # Skip header line
        next(sentence_file)
        for line in tqdm_notebook(sentence_file):
            sentence = line.split("\t")[1].strip()
            raw_sentences.append(sentence)

    # phrase id (kept as a string) -> sentiment score as a float
    sentiment_map = {}
    with open("datasets/sentiment_labels.txt", encoding="utf-8") as sentiment_file:
        # Skip header line
        next(sentiment_file)
        for line in tqdm_notebook(sentiment_file):
            index, score = line.split("|")
            score = float(score.strip())
            sentiment_map[index] = score

    # phrase text -> phrase id (string). The id is the last "|"-separated
    # field, so split from the right: a phrase that itself contains "|" no
    # longer breaks the two-way unpacking.
    sentence_to_phrase_map = {}
    with open("datasets/dictionary.txt", encoding="utf-8") as dictionary_file:
        for line in tqdm_notebook(dictionary_file):
            phrase, phrase_num = line.rsplit("|", 1)
            sentence_to_phrase_map[phrase] = phrase_num.strip()

    # Pair every sentence with the score of the phrase that has the same text.
    # Raises KeyError if a sentence has no exact match in the dictionary.
    labelled_sentences = [
        (sentence, sentiment_map[sentence_to_phrase_map[sentence]])
        for sentence in raw_sentences
    ]

    splits = ([], [], [])

    with open("datasets/datasetSplit.txt", encoding="utf-8") as split_file:
        # Skip header line
        next(split_file)
        for line in tqdm_notebook(split_file):
            index, split = line.split(",")
            index, split = int(index), int(split)
            # Splits are labelled 1-3 and sentence indices are used 1-based
            splits[split - 1].append(labelled_sentences[index - 1])

    # Split label 1 is unpacked as train, 2 as test, 3 as dev.
    train_set, test_set, dev_set = splits
    train_sentences, train_y = zip(*train_set)
    dev_sentences, dev_y = zip(*dev_set)
    test_sentences, test_y = zip(*test_set)

    # Separate names for the sentence sets that later get their neutral
    # examples filtered out (the originals above stay untouched).
    train_sentences_coarse = train_sentences[:]
    dev_sentences_coarse = dev_sentences[:]
    test_sentences_coarse = test_sentences[:]

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [48]:
if not stanford_run:
    def openFile(fileName):
        """Read a tab-separated file and return ``(reviews, scores)`` as two tuples.

        Each line is stripped and split on tabs; the first column is collected
        into ``reviews`` and the second into ``scores`` (both kept as strings).
        """
        rows = []
        with open(fileName, 'r', encoding = 'utf-8') as handle:
            for raw_line in tqdm_notebook(handle):
                rows.append(raw_line.strip().split("\t"))
        reviews, scores = zip(*rows)
        return reviews, scores

    # Load the three IMDB splits; the labels go straight into the coarse label names.
    train_sentences, coarse_train_y = openFile('datasets/IMDB-train.txt')
    dev_sentences, coarse_dev_y = openFile('datasets/IMDB-valid.txt')
    test_sentences, coarse_test_y = openFile('datasets/IMDB-test.txt')

    # Expose the same sentences under the "*_coarse" names used by later cells.
    train_sentences_coarse = train_sentences[:]
    dev_sentences_coarse = dev_sentences[:]
    test_sentences_coarse = test_sentences[:]

In [49]:
# Tools for feature extraction

# When lemmatizing, we need to convert from NLTK's part-of-speech tags
# to WordNet's recognized parts of speech
def get_wordnet_pos(treebank_pos):
    """Translate a POS tag into the WordNet POS constant used for lemmatizing.

    The decision is made on the tag's first letter: ``J`` -> adjective,
    ``V`` -> verb, ``N`` -> noun, ``R`` -> adverb. Any other tag (including an
    empty one) falls back to noun.
    """
    first_letter_to_pos = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    return first_letter_to_pos.get(treebank_pos[:1], wordnet.NOUN)

def sentence_tokenize(sentence, lem = WordNetLemmatizer()):
    """Tokenize ``sentence``, POS-tag the tokens and return their lemmas as a list.

    ``lem`` is the lemmatizer to use; the default instance is created once,
    when the function is defined, and shared by every call.
    """
    lemmas = []
    for token, tag in nltk.pos_tag(nltk.word_tokenize(sentence)):
        lemmas.append(lem.lemmatize(token, pos=get_wordnet_pos(tag)))
    return lemmas

# Both vectorizers read raw strings and share the lemmatizing tokenizer;
# they differ only in the n-gram range.
count_vectorizer = CountVectorizer(input="content", tokenizer=sentence_tokenize)

tuple_count_vectorizer = CountVectorizer(
    ngram_range=(1, 2),  # unigrams and bigrams
    input="content",
    tokenizer=sentence_tokenize,
)

# Learn both vocabularies from the training sentences only
count_vectorizer.fit(tqdm_notebook(train_sentences))
tuple_count_vectorizer.fit(tqdm_notebook(train_sentences))

def count_vectorize(sentences, ngram=False):
    """Transform ``sentences`` with one of the module-level fitted vectorizers.

    ``ngram=True`` selects ``tuple_count_vectorizer`` (n-gram range 1-2);
    otherwise ``count_vectorizer`` is used. A progress bar wraps the input.
    """
    vectorizer = tuple_count_vectorizer if ngram else count_vectorizer
    return vectorizer.transform(tqdm_notebook(sentences))

HBox(children=(IntProgress(value=0, max=8544), HTML(value='')))




HBox(children=(IntProgress(value=0, max=8544), HTML(value='')))




In [50]:
# Label conversion
def coarse_label(sentiment):
    """Bucket a sentiment score into "Negative", "Neutral" or "Positive".

    Scores strictly below 0.4 are negative, strictly above 0.6 positive, and
    everything else (both boundaries included) is neutral.
    """
    if sentiment < 0.4:
        return "Negative"
    if sentiment > 0.6:
        return "Positive"
    return "Neutral"
    
def fine_label(sentiment):
    """Bucket a sentiment score into one of five labels.

    The upper bounds 0.2, 0.4, 0.6 and 0.8 are inclusive; any score above 0.8
    (or one that compares false against every bound) is "Very Positive".
    """
    inclusive_upper_bounds = (
        (0.2, "Very Negative"),
        (0.4, "Negative"),
        (0.6, "Neutral"),
        (0.8, "Positive"),
    )
    for upper_bound, label in inclusive_upper_bounds:
        if sentiment <= upper_bound:
            return label
    return "Very Positive"

if stanford_run:
    # Turn the raw scores of each split into coarse (3-way) labels ...
    coarse_train_y = list(map(coarse_label, tqdm_notebook(train_y)))
    coarse_dev_y = list(map(coarse_label, tqdm_notebook(dev_y)))
    coarse_test_y = list(map(coarse_label, tqdm_notebook(test_y)))

    # ... and into fine (5-way) labels.
    fine_train_y = list(map(fine_label, tqdm_notebook(train_y)))
    fine_dev_y = list(map(fine_label, tqdm_notebook(dev_y)))
    fine_test_y = list(map(fine_label, tqdm_notebook(test_y)))

# Remove neutral coarse labels and sentences from the earlier copies of sentences
def remove_neutral_labels(coarse_sentences, coarse_labels):
    """Drop every example whose label is "Neutral".

    Sentences and labels are matched by position. Returns two new lists,
    ``(sentences, labels)``, holding only the non-neutral examples in their
    original order.
    """
    kept_pairs = [
        (coarse_sentences[position], label)
        for position, label in enumerate(coarse_labels)
        if label != "Neutral"
    ]
    non_neutral_sentences = [sentence for sentence, _ in kept_pairs]
    non_neutral_labels = [label for _, label in kept_pairs]
    return non_neutral_sentences, non_neutral_labels

if stanford_run:
    # Filter from the full, unfiltered sentence tuples rather than from the
    # "*_coarse" names this block overwrites. The coarse labels are recomputed
    # at full length every time this cell runs, so filtering the
    # already-shortened "*_coarse" lists on a second run would pair sentences
    # with the wrong labels (or raise IndexError). On a first run the result is
    # identical, because "*_coarse" starts out as the same sentences.
    train_sentences_coarse, coarse_train_y = remove_neutral_labels(train_sentences, coarse_train_y)
    dev_sentences_coarse, coarse_dev_y = remove_neutral_labels(dev_sentences, coarse_dev_y)
    test_sentences_coarse, coarse_test_y = remove_neutral_labels(test_sentences, coarse_test_y)

HBox(children=(IntProgress(value=0, max=8544), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1101), HTML(value='')))




HBox(children=(IntProgress(value=0, max=2210), HTML(value='')))




HBox(children=(IntProgress(value=0, max=8544), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1101), HTML(value='')))




HBox(children=(IntProgress(value=0, max=2210), HTML(value='')))




In [51]:
# Common Setup
# One fixed fold: rows marked -1 (training sentences) are never used for
# validation, rows marked 0 (dev sentences) form the single validation fold.
ps = PredefinedSplit([-1] * len(train_sentences) + [0] * len(dev_sentences))
ps_coarse = PredefinedSplit([-1] * len(train_sentences_coarse) + [0] * len(dev_sentences_coarse))

In [114]:
# Vectorizing

# Full label set (train + dev stacked in that order, matching `ps`), using the
# vectorizers fitted on all training sentences.
count_Xs = count_vectorize(train_sentences + dev_sentences)
count_test_Xs = count_vectorize(test_sentences)

bigram_Xs = count_vectorize(train_sentences + dev_sentences, ngram=True)
bigram_test_Xs = count_vectorize(test_sentences, ngram=True)

# Coarse label set: fit *separate* vectorizers with the same parameters on the
# neutral-filtered training sentences. Refitting the shared global vectorizers
# here would leave them holding the coarse vocabulary, so re-running this cell
# would silently build the matrices above with the wrong vocabulary.
coarse_count_vectorizer = CountVectorizer(**count_vectorizer.get_params())
coarse_tuple_count_vectorizer = CountVectorizer(**tuple_count_vectorizer.get_params())
coarse_count_vectorizer.fit(train_sentences_coarse)
coarse_tuple_count_vectorizer.fit(train_sentences_coarse)

count_Xs_coarse = coarse_count_vectorizer.transform(
    tqdm_notebook(train_sentences_coarse + dev_sentences_coarse))
count_test_Xs_coarse = coarse_count_vectorizer.transform(tqdm_notebook(test_sentences_coarse))

bigram_Xs_coarse = coarse_tuple_count_vectorizer.transform(
    tqdm_notebook(train_sentences_coarse + dev_sentences_coarse))
bigram_test_Xs_coarse = coarse_tuple_count_vectorizer.transform(tqdm_notebook(test_sentences_coarse))

HBox(children=(IntProgress(value=0, max=25000), HTML(value='')))

HBox(children=(IntProgress(value=0, max=25000), HTML(value='')))

HBox(children=(IntProgress(value=0, max=25000), HTML(value='')))

HBox(children=(IntProgress(value=0, max=25000), HTML(value='')))

HBox(children=(IntProgress(value=0, max=25000), HTML(value='')))

HBox(children=(IntProgress(value=0, max=25000), HTML(value='')))

HBox(children=(IntProgress(value=0, max=25000), HTML(value='')))

HBox(children=(IntProgress(value=0, max=25000), HTML(value='')))

In [115]:
def test_model(underlying_model, param_grid, fine=False, bigrams=False):
    """Grid-search a model on train+dev and print its best params and test score.

    underlying_model: estimator handed to ``GridSearchCV``.
    param_grid: parameter grid to search.
    fine: use the fine labels with the ``ps`` split instead of the coarse
        labels with ``ps_coarse``. A fine run is silently skipped (returns
        ``None``) when ``stanford_run`` is false.
    bigrams: use the bigram count matrices instead of the plain count matrices.

    Nothing is returned; the best parameters and the score on the test matrix
    are printed.
    """
    if fine and not stanford_run:
        return

    # Only the names of the selected branch are touched, so the fine-label
    # globals are never looked up on a coarse run.
    if fine:
        cv = ps
        fit_labels = fine_train_y + fine_dev_y
        eval_labels = fine_test_y
        Xs, test_Xs = (bigram_Xs, bigram_test_Xs) if bigrams else (count_Xs, count_test_Xs)
    else:
        cv = ps_coarse
        fit_labels = coarse_train_y + coarse_dev_y
        eval_labels = coarse_test_y
        Xs, test_Xs = (
            (bigram_Xs_coarse, bigram_test_Xs_coarse)
            if bigrams
            else (count_Xs_coarse, count_test_Xs_coarse)
        )

    grid = GridSearchCV(underlying_model, param_grid, cv=cv)
    grid.fit(Xs, fit_labels)

    print(grid.best_params_)
    print(grid.score(test_Xs, eval_labels))

In [116]:
# Naive Bayes
nb_grid = dict(alpha=[1e-4, 0.01, 0.1, 1.0, 2.0, 10.0])

bnb = BernoulliNB()
mnb = MultinomialNB()

# Same order as before: for each feature set, Bernoulli then Multinomial,
# each evaluated on coarse labels first and fine labels second.
for nb_heading, nb_use_bigrams in (("Bag of Words:", False), ("Bigrams:", True)):
    print(nb_heading)
    for nb_model in (bnb, mnb):
        for nb_fine in (False, True):
            test_model(nb_model, nb_grid, fine=nb_fine, bigrams=nb_use_bigrams)

Bag of Words:
{'alpha': 1.0}
0.82856
{'alpha': 1.0}
0.80684
Bigrams:
{'alpha': 0.1}
0.85252
{'alpha': 0.1}
0.84368


In [117]:
# SVM
# Search space for the linear SVM: regularisation strength, stopping
# tolerance and iteration cap (1000 to 5000 in steps of 1000).
svm_grid = dict(
    C=[1e-4, 0.01, 0.1, 1.0, 2.0, 10.0],
    tol=[1e-4, 0.01, 0.1, 1.0, 2.0, 10.0],
    max_iter=range(1000, 5001, 1000),
)

svm = LinearSVC()
test_model(svm, svm_grid, fine=False)
test_model(svm, svm_grid, fine=True)

{'C': 0.01, 'max_iter': 3000, 'tol': 2.0}
0.87732


In [118]:
# Random Forest
# Search space for the random forest: forest size, split criterion and depth cap.
rf_grid = {
    "n_estimators": list(range(10, 101, 10)),
    "criterion": ["gini", "entropy"],
    "max_depth": [2800, 3000, 3200]
}

# Seed the forest: without a fixed random_state each run grows different
# trees, so the selected parameters and the printed test score change from
# run to run. GridSearchCV clones the estimator, so every candidate shares
# this seed.
rf = RandomForestClassifier(random_state=0)
test_model(rf, rf_grid)
test_model(rf, rf_grid, fine=True)

{'criterion': 'entropy', 'max_depth': 2800, 'n_estimators': 100}
0.84284


In [119]:
# Logistic Regression
# Search space for logistic regression: penalty type, stopping tolerance and
# inverse regularisation strength.
lr_grid = {
    "penalty": ["l1", "l2"],
    "tol": [1e-4, 0.01, 0.1, 1.0, 2.0, 10.0],
    "C": [1e-4, 0.01, 0.1, 1.0, 2.0, 10.0]
}

# The grid searches both l1 and l2 penalties, so pin a solver that supports
# both. liblinear does; the default solver of newer scikit-learn releases
# (lbfgs) rejects penalty="l1", which would fail half of the grid.
lr = LogisticRegression(solver="liblinear")
test_model(lr, lr_grid)
test_model(lr, lr_grid, fine=True)

{'C': 2.0, 'penalty': 'l1', 'tol': 0.1}
0.85788


In [120]:
# K-Nearest Neighbors
# Search space for k-NN: neighbourhood size and vote weighting.
kn_grid = dict(
    n_neighbors=[15, 20, 30, 50, 70],
    weights=["uniform", "distance"],
)

kn = KNeighborsClassifier()
test_model(kn, kn_grid, fine=False)
test_model(kn, kn_grid, fine=True)

{'n_neighbors': 70, 'weights': 'distance'}
0.6588


In [None]:
# Word Vector Averaging

import warnings
# Blanket filter: once this runs, warnings of every category are suppressed,
# not only the DeprecationWarning / gensim UserWarning filters installed in
# the import cell at the top of the notebook.
warnings.filterwarnings("ignore")

def review_wordlist(review, remove_stopwords=False):
    """Split a review into lower-cased, letters-only words.

    Every character that is not an ASCII letter is replaced by a space before
    the text is lower-cased and split on whitespace. With
    ``remove_stopwords=True`` the NLTK English stopwords are dropped as well.
    Returns a list of words.
    """
    letters_only = re.sub("[^a-zA-Z]"," ",review)
    words = letters_only.lower().split()
    if not remove_stopwords:
        return words

    stops = set(stopwords.words("english"))
    return [w for w in words if w not in stops]

# Function to average all word vectors in a review
def featureVecMethod(words, model, num_features):
    """Average the word vectors of ``words`` that the model knows.

    words: iterable of tokens.
    model: trained word2vec model; only its ``wv`` keyed vectors are used.
    num_features: dimensionality of the word vectors.

    Returns a float32 array of length ``num_features``: the mean of the
    vectors of all in-vocabulary tokens, or all zeros when none of the tokens
    is in the vocabulary.
    """
    featureVec = np.zeros(num_features,dtype="float32")
    nwords = 0

    # Membership and lookup go through the keyed vectors directly. This avoids
    # rebuilding a set of the whole vocabulary on every call (once per review)
    # and the deprecated `model[word]` shortcut.
    vectors = model.wv

    for word in words:
        if word in vectors:
            nwords += 1
            featureVec = np.add(featureVec, vectors[word])

    # No known word: return the zero vector instead of dividing by zero
    if nwords == 0:
        return featureVec
    return np.divide(featureVec, nwords)

# Function for calculating average word vectors for all reviews
def getAvgFeatureVecs(reviews, model, num_features):
    """Build the matrix of averaged word vectors, one row per review.

    reviews: sized sequence of token lists.
    Returns a float32 array of shape ``(len(reviews), num_features)`` whose
    row ``i`` is ``featureVecMethod(reviews[i], model, num_features)``.
    """
    reviewFeatureVecs = np.zeros((len(reviews),num_features),dtype="float32")
    for row, review in enumerate(reviews):
        reviewFeatureVecs[row] = featureVecMethod(review, model, num_features)
    return reviewFeatureVecs

# Hyperparameter search space for word2vec
num_features = [100,150,200]  # Word vector dimensionality
contexts = [15,17,20]      # Context window size
epochs = [10,20]           # Training iterations
min_counts = [1,3,5]       # Minimum word frequency
#sg = [0,1]

# Tokenize the training sentences once for each label set (stopwords kept)
tokenized_sentences_fine = [
    review_wordlist(sentence, remove_stopwords=False)
    for sentence in list(train_sentences)
]
tokenized_sentences_coarse = [
    review_wordlist(sentence, remove_stopwords=False)
    for sentence in list(train_sentences_coarse)
]

# Hypertune word2vec average model parameters
# NOTE: the fine-label half of this search uses fine_train_y / fine_dev_y,
# which are only defined when stanford_run is True.
best_score_coarse = 0
best_score_fine = 0
best_hyperparameters_coarse = []
best_hyperparameters_fine = []

# The dev tokenization does not depend on any hyperparameter, so do it once
# here instead of repeating it in every iteration of the search below.
filtered_dev_reviews_fine = [
    review_wordlist(review, remove_stopwords=False) for review in dev_sentences
]
filtered_dev_reviews_coarse = [
    review_wordlist(review, remove_stopwords=False) for review in dev_sentences_coarse
]

for num_feature in num_features:
    for context in contexts:
        for epoch in epochs:
            for min_count in min_counts:
                # One skip-gram model per label set, trained on the training sentences only
                model_fine = word2vec.Word2Vec(tokenized_sentences_fine, size=num_feature, min_count=min_count, window = context, sample=1e-2, sg=1, iter=epoch)
                model_coarse = word2vec.Word2Vec(tokenized_sentences_coarse, size=num_feature, min_count=min_count, window = context, sample=1e-2, sg=1, iter=epoch)

                # To make the model memory efficient
                model_coarse.wv.init_sims(replace=True)
                model_fine.wv.init_sims(replace=True)

                trainDataVecs_fine = getAvgFeatureVecs(tokenized_sentences_fine, model_fine, num_feature)
                trainDataVecs_coarse = getAvgFeatureVecs(tokenized_sentences_coarse, model_coarse, num_feature)

                devDataVecs_fine = getAvgFeatureVecs(filtered_dev_reviews_fine, model_fine, num_feature)
                devDataVecs_coarse = getAvgFeatureVecs(filtered_dev_reviews_coarse, model_coarse, num_feature)

                # Score the averaged vectors with a default logistic regression
                # on the dev set; a strict ">" keeps the first configuration on ties.
                clf = LogisticRegression()
                clf.fit(trainDataVecs_coarse, coarse_train_y)
                current_score_coarse = clf.score(devDataVecs_coarse, coarse_dev_y)
                if current_score_coarse > best_score_coarse:
                    best_score_coarse = current_score_coarse
                    best_hyperparameters_coarse = [num_feature,context,epoch, min_count]

                clf = LogisticRegression()
                clf.fit(trainDataVecs_fine, fine_train_y)
                current_score_fine = clf.score(devDataVecs_fine, fine_dev_y)
                if current_score_fine > best_score_fine:
                    best_score_fine = current_score_fine
                    best_hyperparameters_fine = [num_feature,context,epoch, min_count]


print('Best hyperparameters (Coarse):', best_hyperparameters_coarse)
print('Best validation score (Coarse):', best_score_coarse)
print('Best hyperparameters (Fine):', best_hyperparameters_fine)
print('Best validation score (Fine):', best_score_fine)

# Tokenize train+dev (used to refit with the selected hyperparameters) and test
filtered_train_dev_reviews_coarse = [
    review_wordlist(review, remove_stopwords=False)
    for review in list(train_sentences_coarse + dev_sentences_coarse)
]
filtered_train_dev_reviews_fine = [
    review_wordlist(review, remove_stopwords=False)
    for review in list(train_sentences + dev_sentences)
]
filtered_test_reviews_coarse = [
    review_wordlist(review, remove_stopwords=False) for review in test_sentences_coarse
]
filtered_test_reviews_fine = [
    review_wordlist(review, remove_stopwords=False) for review in test_sentences
]

# best_hyperparameters_* is [num_feature, context, epoch, min_count]
model_coarse_final = word2vec.Word2Vec(filtered_train_dev_reviews_coarse, size=best_hyperparameters_coarse[0], window = best_hyperparameters_coarse[1],iter=best_hyperparameters_coarse[2], min_count=best_hyperparameters_coarse[3], sample=1e-2, sg=1)
model_fine_final = word2vec.Word2Vec(filtered_train_dev_reviews_fine, size=best_hyperparameters_fine[0], window = best_hyperparameters_fine[1], iter=best_hyperparameters_fine[2], min_count=best_hyperparameters_fine[3], sample=1e-2, sg=1)

# NOTE(review): the constructors above already train for the tuned number of
# iterations; these calls add 10 more epochs on top of that. Confirm intended.
# total_examples must match the corpus actually passed in. The fine model used
# to report the train-only sentence count while training on train+dev.
model_coarse_final.train(filtered_train_dev_reviews_coarse, total_examples=len(filtered_train_dev_reviews_coarse), epochs=10)
model_fine_final.train(filtered_train_dev_reviews_fine, total_examples=len(filtered_train_dev_reviews_fine), epochs=10)

# To make the model memory efficient
model_coarse_final.wv.init_sims(replace=True)
model_fine_final.wv.init_sims(replace=True)

# From here on trainDataVecs_* hold train+dev rows (in that order)
trainDataVecs_coarse = getAvgFeatureVecs(filtered_train_dev_reviews_coarse, model_coarse_final, best_hyperparameters_coarse[0])
trainDataVecs_fine = getAvgFeatureVecs(filtered_train_dev_reviews_fine, model_fine_final, best_hyperparameters_fine[0])
testDataVecs_coarse = getAvgFeatureVecs(filtered_test_reviews_coarse, model_coarse_final, best_hyperparameters_coarse[0])
testDataVecs_fine = getAvgFeatureVecs(filtered_test_reviews_fine, model_fine_final, best_hyperparameters_fine[0])

clf = LogisticRegression()
clf.fit(trainDataVecs_coarse, coarse_train_y + coarse_dev_y)
test_score_coarse = clf.score(testDataVecs_coarse, coarse_test_y)

clf = LogisticRegression()
clf.fit(trainDataVecs_fine, fine_train_y + fine_dev_y)
test_score_fine = clf.score(testDataVecs_fine, fine_test_y)

print('Coarse test score on optimal hyperparameters:', test_score_coarse)
print('Fine test score on optimal hyperparameters:', test_score_fine)

In [None]:
# Hypertune logistic regression on the averaged word2vec features using grid search.
# The estimator is LogisticRegression, not a linear SVM, so the grid gets its
# own name and no longer overwrites the `svm_grid` of the SVM cell.
w2v_lr_grid = {
        "C": [0.01, 10.0, 13.0, 15.0],
        "tol": [1e-4, 0.01, 0.1, 1.0, 2.0, 10.0],
        'penalty':['l1','l2']
}

# liblinear supports both penalties in the grid; the default solver of newer
# scikit-learn releases (lbfgs) rejects penalty='l1'.
# trainDataVecs_* hold train+dev rows here, matching the ps / ps_coarse splits.
grid = GridSearchCV(LogisticRegression(solver="liblinear"), param_grid = w2v_lr_grid, cv=ps_coarse)
grid.fit(trainDataVecs_coarse, coarse_train_y + coarse_dev_y)

print("Best params with VecAvg representation and SoftMax (coarse labels):", grid.best_params_)
print('Optimal accuracy score with VecAvg representation and SoftMax (coarse labels):', grid.score(testDataVecs_coarse, coarse_test_y))

grid = GridSearchCV(LogisticRegression(solver="liblinear"), param_grid = w2v_lr_grid, cv=ps)
grid.fit(trainDataVecs_fine, fine_train_y + fine_dev_y)
print("Best params with VecAvg representation and SoftMax (fine labels):", grid.best_params_)
print('Optimal accuracy score with VecAvg representation and SoftMax (fine labels):', grid.score(testDataVecs_fine, fine_test_y))