In [3]:
# Import statements 
import re

import nltk.data
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from nltk.corpus import stopwords

In [4]:
# Read the Data Files
# Labeled training reviews: tab-separated, first row is the header.
# quoting=3 leaves double quotes in the fields as literal characters.
train = pd.read_csv(
    "labeledTrainData.tsv",
    sep="\t",
    header=0,
    quoting=3,
)
train.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


In [5]:
# Test reviews (no sentiment column): same file layout and parsing options
# as the training file.
test = pd.read_csv(
    "testData.tsv",
    sep="\t",
    header=0,
    quoting=3,
)
test.head()

Unnamed: 0,id,review
0,"""12311_10""","""Naturally in a film who's main themes are of ..."
1,"""8348_2""","""This movie is a disaster within a disaster fi..."
2,"""5828_4""","""All in all, this is a movie for kids. We saw ..."
3,"""7186_2""","""Afraid of the Dark left me with the impressio..."
4,"""12128_7""","""A very accurate depiction of small time mob l..."


In [6]:
# Extra reviews without labels; later cells use them only when collecting
# sentences for Word2Vec.
unlabeled_train = pd.read_csv(
    "unlabeledTrainData.tsv",
    sep="\t",
    header=0,
    quoting=3,
)
unlabeled_train.head()

Unnamed: 0,id,review
0,"""9999_0""","""Watching Time Chasers, it obvious that it was..."
1,"""45057_0""","""I saw this film about 20 years ago and rememb..."
2,"""15561_0""","""Minor Spoilers<br /><br />In New York, Joan B..."
3,"""7161_0""","""I went to see this film with a great deal of ..."
4,"""43971_0""","""Yes, I agree with everyone on this site this ..."


In [7]:
# Checking for NULL values
# Summarise the boolean "is missing" mask of the test set, column by column.
test.isnull().describe()

Unnamed: 0,id,review
count,25000,25000
unique,1,1
top,False,False
freq,25000,25000


In [8]:
# Same missing-value summary for the labeled training set.
train.isnull().describe()

Unnamed: 0,id,sentiment,review
count,25000,25000,25000
unique,1,1,1
top,False,False,False
freq,25000,25000,25000


In [9]:
# Same missing-value summary for the unlabeled training set.
unlabeled_train.isnull().describe()

Unnamed: 0,id,review
count,50000,50000
unique,1,1
top,False,False
freq,50000,50000


In [10]:
from functools import lru_cache

from nltk.tokenize import word_tokenize


@lru_cache(maxsize=None)
def _english_stopwords():
    """Return the NLTK English stop-word list as a frozenset, built only once."""
    return frozenset(stopwords.words("english"))


def review_to_wordlist( review, remove_stopwords=False ):
    """Tokenise a piece of review text into a list of lower-cased tokens.

    Parameters
    ----------
    review : str
        Raw text (a whole review or a single sentence).
    remove_stopwords : bool, default False
        When True, tokens found in NLTK's English stop-word list are dropped.

    Returns
    -------
    list of str
        Lower-cased tokens in their original order. Punctuation tokens
        produced by ``word_tokenize`` are kept.
    """
    # NOTE(review): markup such as "<br />" present in the raw reviews is not
    # stripped here, so its pieces end up as tokens -- confirm this is intended.
    words = [token.lower() for token in word_tokenize(review)]
    if remove_stopwords:
        # The stop-word set is cached: this function is called once per
        # review/sentence, and rebuilding the set each time is wasted work.
        stops = _english_stopwords()
        words = [w for w in words if w not in stops]
    return words

In [11]:
# Sentence splitter loaded once and handed to review_to_sentences below.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')


def review_to_sentences( review, tokenizer, remove_stopwords=False ):
    """Split a review into sentences and tokenise each sentence.

    Parameters
    ----------
    review : str
        Raw review text; leading/trailing whitespace is stripped first.
    tokenizer
        Object whose ``tokenize(text)`` returns the sentences of ``text``.
    remove_stopwords : bool, default False
        Forwarded to ``review_to_wordlist``.

    Returns
    -------
    list of list of str
        One token list per non-empty sentence, in order of appearance.
    """
    stripped_review = review.strip()
    return [
        review_to_wordlist(sentence, remove_stopwords)
        for sentence in tokenizer.tokenize(stripped_review)
        if len(sentence) > 0
    ]

In [12]:
# Collect the tokenised sentences of every labeled review followed by every
# unlabeled review into one flat list (the Word2Vec training corpus).
sentences = []
for frame in (train, unlabeled_train):
    for review in frame["review"]:
        sentences.extend(review_to_sentences(review, tokenizer))

print(len(sentences))

795538


In [13]:
# Sanity check: show the token list produced for the first sentence.
print(sentences[0])

['``', 'with', 'all', 'this', 'stuff', 'going', 'down', 'at', 'the', 'moment', 'with', 'mj', 'i', "'ve", 'started', 'listening', 'to', 'his', 'music', ',', 'watching', 'the', 'odd', 'documentary', 'here', 'and', 'there', ',', 'watched', 'the', 'wiz', 'and', 'watched', 'moonwalker', 'again', '.']


In [14]:
# Settings forwarded to Word2Vec in the training cell below.
num_features = 300    # passed as `size`
min_word_count = 40   # passed as `min_count`
num_workers = 4       # passed as `workers`
context = 10          # passed as `window`
downsampling = 1e-3   # passed as `sample`

In [15]:
from gensim.models import word2vec

# Train word vectors on the collected sentences using the settings above.
# NOTE(review): `size` is the keyword used by older gensim releases; newer
# releases appear to call it `vector_size` -- confirm against the installed version.
model = word2vec.Word2Vec(
    sentences,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context,
    sample=downsampling,
)

In [16]:
# Shape of the learned word-vector matrix; the second dimension equals num_features.
model.wv.syn0.shape

(17316, 300)

In [19]:
from sklearn.cluster import KMeans
import time

# Cluster the word vectors into groups averaging about five words each and
# report how long the clustering takes (seconds).
start = time.time()
num_clusters = model.wv.syn0.shape[0] // 5
# Fixed seed: KMeans initialisation is random, and without a seed the cluster
# ids (and every feature derived from them) change on each run.
kmeans_clustering = KMeans( n_clusters = num_clusters, random_state = 42 )
# idx[i] is the cluster index assigned to row i of the word-vector matrix.
idx = kmeans_clustering.fit_predict( model.wv.syn0 )
end = time.time()
elapsed = end - start
print(elapsed)

999.2009742259979


In [20]:
# NOTE(review): in older gensim releases init_sims(replace=True) is documented
# to normalise the vectors in place and drop the originals -- confirm for the
# installed version. It runs after the KMeans cell, so `idx` was computed on
# the vectors as they were before this call.
model.init_sims(replace=True)
# The model is written to a file literally named "pickle" in the working directory.
model_name = "pickle"
model.save(model_name)

In [26]:
# Word -> cluster index lookup, pairing each vocabulary word with the entry of
# `idx` at the same position.
# NOTE(review): relies on index2word being in the same order as the rows of
# syn0 that were clustered -- confirm.
word_centroid_map = dict(zip( model.wv.index2word, idx ))
def create_bag_of_centroids( wordlist, word_centroid_map, num_centroids=None ):
    """Count how many words of a review fall into each word cluster.

    Parameters
    ----------
    wordlist : iterable of str
        Tokens of one review.
    word_centroid_map : dict
        Maps a word to the integer index of its cluster.
    num_centroids : int, optional
        Length of the returned vector. Defaults to the largest cluster index
        in ``word_centroid_map`` plus one (0 for an empty map). Pass the real
        cluster count to guarantee a fixed length even when the
        highest-numbered clusters have no word in the map.

    Returns
    -------
    numpy.ndarray
        float32 vector of length ``num_centroids``; entry ``k`` is the number
        of tokens in ``wordlist`` mapped to cluster ``k``. Tokens missing from
        the map are ignored.
    """
    if num_centroids is None:
        # default=-1 makes an empty map give a zero-length vector rather than
        # the ValueError that max() raises on an empty sequence.
        num_centroids = max( word_centroid_map.values(), default=-1 ) + 1
    bag_of_centroids = np.zeros( num_centroids, dtype="float32" )
    for word in wordlist:
        # Single lookup; compare with None because cluster index 0 is falsy.
        index = word_centroid_map.get(word)
        if index is not None:
            bag_of_centroids[index] += 1
    return bag_of_centroids

In [29]:
# Tokenise every training and test review with stop words removed.
clean_train_reviews = [
    review_to_wordlist(text, remove_stopwords=True) for text in train["review"]
]

clean_test_reviews = [
    review_to_wordlist(text, remove_stopwords=True) for text in test["review"]
]

In [30]:
# Feature matrix for training: one bag-of-centroids row per review,
# pre-allocated and then filled row by row.
train_centroids = np.zeros( (train["review"].size, num_clusters), dtype="float32" )
for row, words in enumerate(clean_train_reviews):
    train_centroids[row] = create_bag_of_centroids( words, word_centroid_map )
    

In [34]:
# Feature matrix for the test set, built the same way as the training matrix.
test_centroids = np.zeros( (test["review"].size, num_clusters), dtype="float32" )

for row, words in enumerate(clean_test_reviews):
    test_centroids[row] = create_bag_of_centroids( words, word_centroid_map )


In [35]:
from sklearn.ensemble import RandomForestClassifier

# Fit a 100-tree random forest on the bag-of-centroids features.
# Fixed seed: tree construction is random, and without a seed the predictions
# written below differ from run to run.
forest = RandomForestClassifier(n_estimators = 100, random_state = 42)
forest = forest.fit(train_centroids,train["sentiment"])
result = forest.predict(test_centroids)

# Write one predicted sentiment per test id. quoting=3 writes the fields
# without adding quotes; the ids were read with quoting=3 and already contain
# their own literal double quotes.
output = pd.DataFrame(data={"id":test["id"], "sentiment":result})
output.to_csv("BagOfCentroids.csv", index=False, quoting=3)