In [3]:
import re, time
import pandas as pd
import numpy as np

from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from gensim.models import Word2Vec

from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans

Importing training and testing data.

In [4]:
train = pd.read_csv("labeledTrainData.tsv", header=0, delimiter="\t", quoting=3)
test = pd.read_csv("testData.tsv", header=0, delimiter="\t", quoting=3)
unlabeled_train = pd.read_csv("unlabeledTrainData.tsv", header=0, delimiter="\t", quoting=3)

print(f"Read {train['review'].size} labeled train reviews, {test['review'].size} labeled test reviews, " \
 "and {unlabeled_train['review'].size} unlabeled reviews\n")

Read 25000 labeled train reviews, 25000 labeled test reviews, and {unlabeled_train['review'].size} unlabeled reviews



Function for preprocessing raw reviews. \
Removes HTML elements, non-alphanumeric characters, and stopwords.

In [5]:
def review_to_wordlist( review, remove_stopwords=False ):
    review_text = BeautifulSoup(review).get_text()
    review_text = re.sub("[^a-zA-Z0-9_]"," ", review_text)
    words = review_text.lower().split()
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]
    return(words)

Loading Word2Vec model trained in other notebook

In [6]:
model = Word2Vec.load("300features_40minwords_10context")

## Averaging Word Vectors
Function for averaging all word vectors in a given paragraph

In [7]:
def makeFeatureVec(words, model, num_features):
    featureVec = np.zeros((num_features,),dtype="float32")
    nwords = 0.
    index2word_set = set(model.wv.index_to_key)
    
    for word in words:
        if word in index2word_set: 
            nwords = nwords + 1.
            featureVec = np.add(featureVec,model.wv[word])
    
    featureVec = np.divide(featureVec,nwords)
    return featureVec

Function for gathering the averaged feature vectors of a given set of reviews

In [8]:
def getAvgFeatureVecs(reviews, model, num_features):
    counter = 0
    reviewFeatureVecs = np.zeros((len(reviews),num_features),dtype="float32")
    for review in reviews:
       if counter%1000. == 0:
           print("Review %d of %d" % (counter, len(reviews)))
       reviewFeatureVecs[counter] = makeFeatureVec(review, model, num_features)
       counter = counter + 1
    return reviewFeatureVecs

Getting the averaged feature vectors for our training and testing data

In [None]:
num_features = 300 
clean_train_reviews = []
for review in train["review"]:
    clean_train_reviews.append( review_to_wordlist(review, remove_stopwords=True))
trainDataVecs = getAvgFeatureVecs(clean_train_reviews, model, num_features)

clean_test_reviews = []
for review in test["review"]:
    clean_test_reviews.append(review_to_wordlist(review, remove_stopwords=True))
testDataVecs = getAvgFeatureVecs(clean_test_reviews, model, num_features)

Training a Random Forest Classifier on the averaged features vectors

In [10]:
forest = RandomForestClassifier(n_estimators = 100)
forest = forest.fit(trainDataVecs, train["sentiment"])

Running inference on our testing data with the trained model

In [11]:
result = forest.predict(testDataVecs)

Saving predictions

In [12]:
output = pd.DataFrame(data={"id":test["id"], "sentiment":result})
output.to_csv("results/Word2Vec_AverageVectors.csv", index=False, quoting=3)

## K-Means Clustering

Initialize and train clustering model

In [13]:
start = time.time() # Start time
word_vectors = model.wv.vectors
num_clusters = word_vectors.shape[0] / 5

kmeans_clustering = KMeans(n_clusters = int(num_clusters))
idx = kmeans_clustering.fit_predict(word_vectors)

end = time.time()
elapsed = end - start
print("Time taken for K Means clustering: ", elapsed, "seconds.")

Time taken for K Means clustering:  437.86132550239563 seconds.


Get map of each vocab word to its cluster

In [14]:
word_centroid_map = dict(zip(model.wv.index_to_key, idx))

Function for performing word bagging, but recording centroid frequency rather than word frequency.

In [15]:
def create_bag_of_centroids(wordlist, word_centroid_map):
    num_centroids = max(word_centroid_map.values()) + 1
    bag_of_centroids = np.zeros(num_centroids, dtype="float32")
    for word in wordlist:
        if word in word_centroid_map:
            index = word_centroid_map[word]
            bag_of_centroids[index] += 1
            
    return bag_of_centroids

Getting bag of centroids for our train and test sets

In [16]:
num_clusters = int(num_clusters)
train_centroids = np.zeros((train["review"].size, num_clusters), dtype="float32")

counter = 0
for review in clean_train_reviews:
    train_centroids[counter] = create_bag_of_centroids(review, word_centroid_map)
    counter += 1

test_centroids = np.zeros((test["review"].size, num_clusters), dtype="float32")

counter = 0
for review in clean_test_reviews:
    test_centroids[counter] = create_bag_of_centroids(review, word_centroid_map)
    counter += 1

Initializing and training RF model

In [17]:
forest = RandomForestClassifier(n_estimators = 100)
forest = forest.fit(train_centroids,train["sentiment"])

Running inference on our testing data with the trained model

In [18]:
result = forest.predict(test_centroids)

Saving predictions

In [19]:
output = pd.DataFrame(data={"id":test["id"], "sentiment":result})
output.to_csv("results/Bag_Of_Centroids.csv", index=False, quoting=3)