In [24]:
#!/usr/bin/env python

#  Author: Angela Chapman
#  Date: 8/6/2014
#
#  This file contains code to accompany the Kaggle tutorial
#  "Deep learning goes to the movies".  The code in this file
#  is for Part 2 of the tutorial and covers Bag of Centroids
#  for a Word2Vec model. This code assumes that you have already
#  run Word2Vec and saved a model called "300features_40minwords_10context"
#
# *************************************** #
import warnings
warnings.filterwarnings('ignore')
import copy
# Load a pre-trained model
from gensim.models import Word2Vec
from sklearn.cluster import KMeans
import time
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
import numpy as np
import os
from KaggleWord2VecUtility import KaggleWord2VecUtility


# Define a function to create bags of centroids
#
def create_bag_of_centroids( wordlist, word_centroid_map, num_centroids=None ):
    """Count how many words of one review fall into each word cluster.

    Parameters
    ----------
    wordlist : iterable of str
        The words of a single review. Words that are not keys of
        ``word_centroid_map`` are ignored.
    word_centroid_map : dict
        Maps a vocabulary word to its integer cluster index.
    num_centroids : int, optional
        Length of the returned vector. When omitted, it is derived as the
        highest cluster index in ``word_centroid_map`` plus one (zero for
        an empty map). Supplying it avoids re-scanning every value of the
        map on each call, which matters when this function is called once
        per review.

    Returns
    -------
    numpy.ndarray
        One-dimensional float32 vector; element ``i`` is the number of
        words in ``wordlist`` whose cluster index is ``i``.

    Raises
    ------
    IndexError
        If an explicit ``num_centroids`` is not larger than a cluster
        index reached by a word in ``wordlist``.
    """
    #
    # The number of clusters is equal to the highest cluster index
    # in the word / centroid map; default=-1 makes an empty map give 0
    # instead of raising ValueError
    if num_centroids is None:
        num_centroids = max( word_centroid_map.values(), default=-1 ) + 1
    #
    # Pre-allocate the bag of centroids vector (for speed)
    bag_of_centroids = np.zeros( num_centroids, dtype="float32" )
    #
    # Loop over the words in the review. If the word is in the vocabulary,
    # find which cluster it belongs to, and increment that cluster count
    # by one (a single dict lookup per word)
    for word in wordlist:
        index = word_centroid_map.get( word )
        if index is not None:
            bag_of_centroids[index] += 1
    #
    # Return the "bag of centroids"
    return bag_of_centroids

In [5]:
# Load the previously saved Word2Vec model from disk
model = Word2Vec.load("300features_40minwords_10context")


# ****** Run k-means on the word vectors and print a few clusters
#

start = time.time() # Start time

# Set "k" (num_clusters) to be 1/5th of the vocabulary size, or an
# average of 5 words per cluster. Floor division keeps the value an
# integer, and max(1, ...) avoids asking k-means for zero clusters when
# the vocabulary has fewer than five words.
word_vectors = model.syn0
num_clusters = max( 1, word_vectors.shape[0] // 5 )

# Initialize a k-means object and use it to extract centroids.
# A fixed random_state makes the centroid initialisation, and therefore
# the cluster index assigned to each word, repeatable from run to run.
print ("Running K means")
kmeans_clustering = KMeans( n_clusters = num_clusters, random_state = 0 )
idx = kmeans_clustering.fit_predict( word_vectors )

# Get the end time and print how long the process took
end = time.time()
elapsed = end - start
print ("Time taken for K Means clustering: ", elapsed, "seconds.")

Running K means
Time taken for K Means clustering:  372.5064539909363 seconds.


In [28]:
# Pair every vocabulary word with the cluster number k-means assigned
# to it, giving a word -> cluster index dictionary
word_centroid_map = dict(zip( model.index2word, idx ))
wcm_keys = list(word_centroid_map.keys())
wcm_array = list(word_centroid_map.values())

# Show the members of the first ten clusters
for cluster in range(10):
    #
    # Header line with the cluster number
    print ("\nCluster %d" % cluster)
    #
    # Gather, in dictionary order, every word labelled with this cluster
    words = [ word for word, label in zip(wcm_keys, wcm_array) if label == cluster ]
    print (words)

3023

Cluster 0
['tenth']

Cluster 1
['anonymous', 'opium', 'ss']

Cluster 2
['shields']

Cluster 3
['undercurrent', 'illusion', 'aura']

Cluster 4
['polo', 'superstars', 'brigade']

Cluster 5
['headstrong', 'saucy', 'gracious', 'perky', 'feisty', 'coy', 'sweetly', 'spunky', 'bitchy', 'sassy', 'plump', 'foxy', 'snobbish', 'hush']

Cluster 6
['greeted', 'defeated', 'personified', 'teased', 'ridiculed', 'bullied', 'conducted', 'harassed', 'startled', 'terrorized', 'thwarted', 'befriended']

Cluster 7
['contemplate', 'resolve', 'discuss', 'absorb', 'examine', 'explore', 'observe', 'evaluate']

Cluster 8
['terrence', 'ron']

Cluster 9
['smashed', 'blasted', 'dumping', 'someones', 'bashed', 'mangled', 'hammered', 'smashing']


In [30]:
# Build clean_train_reviews and clean_test_reviews as we did before
#

# Read data from files
# NOTE(review): `file` is an absolute, machine-specific directory; point it
# at the folder that contains the Kaggle "data" directory before running.
file = '/Users/david/notebooks/'
data_dir = os.path.join(os.path.dirname(file), 'data')

# Tab-separated input with a header row; quoting=3 is csv.QUOTE_NONE
train = pd.read_csv( os.path.join(data_dir, 'labeledTrainData.tsv'),
                     header=0, delimiter="\t", quoting=3 )
test = pd.read_csv( os.path.join(data_dir, 'testData.tsv'),
                    header=0, delimiter="\t", quoting=3 )


# Turn each raw review into a word list, with stop words removed
print ("Cleaning training reviews")
clean_train_reviews = [
    KaggleWord2VecUtility.review_to_wordlist( review, remove_stopwords=True )
    for review in train["review"]
]

print ("Cleaning test reviews")
clean_test_reviews = [
    KaggleWord2VecUtility.review_to_wordlist( review, remove_stopwords=True )
    for review in test["review"]
]


# ****** Create bags of centroids
#
def _centroid_matrix( reviews, num_rows ):
    """Return a (num_rows, num_clusters) float32 matrix of bags of centroids.

    Row i holds the vector create_bag_of_centroids produces for the i-th
    entry of `reviews`, using the notebook-level word_centroid_map.
    """
    # Pre-allocate the whole matrix up front (for speed)
    matrix = np.zeros( (num_rows, num_clusters), dtype="float32" )
    for row, review in enumerate(reviews):
        matrix[row] = create_bag_of_centroids( review, word_centroid_map )
    return matrix

# Transform the training set reviews into bags of centroids
train_centroids = _centroid_matrix( clean_train_reviews, train["review"].size )

# Repeat for test reviews
test_centroids = _centroid_matrix( clean_test_reviews, test["review"].size )


# ****** Fit a random forest and extract predictions
#
# 100 trees; a fixed random_state makes the bootstrap samples and feature
# choices, and therefore the predictions, repeatable from run to run.
forest = RandomForestClassifier(n_estimators = 100, random_state = 0)

# Fitting the forest may take a few minutes
print ("Fitting a random forest to labeled training data...")
forest = forest.fit(train_centroids,train["sentiment"])

# Predict a sentiment label for every test review
result = forest.predict(test_centroids)

# Write the test results as an id / sentiment table.
# index=False omits the DataFrame row index; quoting=3 is csv.QUOTE_NONE.
output = pd.DataFrame(data={"id":test["id"], "sentiment":result})
output.to_csv("BagOfCentroids.csv", index=False, quoting=3)
print ("Wrote BagOfCentroids.csv")

Cleaning training reviews




 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))


Cleaning test reviews
Fitting a random forest to labeled training data...
Wrote BagOfCentroids.csv
