In [1]:
import re
import os

In [2]:
import pandas as pd
import numpy as np

In [3]:
from bs4 import BeautifulSoup             

In [4]:
from nltk.corpus import stopwords
import nltk.data

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans

In [6]:
import gensim
from gensim.models import word2vec



In [7]:
import logging

-----
# Part 1: For Beginners - Bag of Words

[source](https://www.kaggle.com/c/word2vec-nlp-tutorial/details/part-1-for-beginners-bag-of-words)

## Reading the Data

In [8]:
# Tab-separated file. quoting=3 is csv.QUOTE_NONE, so the double quotes
# inside the reviews are kept as ordinary characters.
train = pd.read_table('data/labeledTrainData.tsv', sep='\t', quoting=3)

train.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


In [9]:
# Tab-separated file. quoting=3 is csv.QUOTE_NONE, so the double quotes
# inside the reviews are kept as ordinary characters.
test = pd.read_table('data/testData.tsv', sep='\t', quoting=3)

test.head()

Unnamed: 0,id,review
0,"""12311_10""","""Naturally in a film who's main themes are of ..."
1,"""8348_2""","""This movie is a disaster within a disaster fi..."
2,"""5828_4""","""All in all, this is a movie for kids. We saw ..."
3,"""7186_2""","""Afraid of the Dark left me with the impressio..."
4,"""12128_7""","""A very accurate depiction of small time mob l..."


In [10]:
# Tab-separated file. quoting=3 is csv.QUOTE_NONE, so the double quotes
# inside the reviews are kept as ordinary characters.
unlabeled_train = pd.read_csv("data/unlabeledTrainData.tsv", sep="\t", quoting=3)

unlabeled_train.head()

Unnamed: 0,id,review
0,"""9999_0""","""Watching Time Chasers, it obvious that it was..."
1,"""45057_0""","""I saw this film about 20 years ago and rememb..."
2,"""15561_0""","""Minor Spoilers<br /><br />In New York, Joan B..."
3,"""7161_0""","""I went to see this film with a great deal of ..."
4,"""43971_0""","""Yes, I agree with everyone on this site this ..."


------
## Data Cleaning and Text Preprocessing

In [11]:
def review_to_words(raw_review):
    """Clean one raw movie review and return it as a single string.

    Markup is stripped with BeautifulSoup, every character that is not an
    ASCII letter is replaced by a space, the text is lower-cased and split
    on whitespace, and English stop words are dropped. The remaining words
    are joined back together with single spaces.
    """
    # Strip markup, then blank out everything that is not a letter
    plain_text = BeautifulSoup(raw_review, 'lxml').get_text()
    alpha_text = re.sub("[^a-zA-Z]", " ", plain_text)

    # A set makes the membership test in the filter below cheap
    stop_set = set(stopwords.words("english"))

    kept = []
    for token in alpha_text.lower().split():
        if token not in stop_set:
            kept.append(token)

    return " ".join(kept)

# review_to_words( train["review"][0] )

In [12]:
%%time
clean_train_reviews = [review_to_words(review) for review in train.review]

CPU times: user 24 s, sys: 1.19 s, total: 25.2 s
Wall time: 26 s


------
## Creating Features from a Bag of Words (Using `scikit-learn`)

In [13]:
vectorizer = CountVectorizer(analyzer = "word",
                             tokenizer = None,
                             preprocessor = None,
                             stop_words = None,
                             max_features = 5000) 

In [14]:
%%time
train_data_features = vectorizer.fit_transform(clean_train_reviews)

CPU times: user 4.73 s, sys: 197 ms, total: 4.93 s
Wall time: 5.1 s


In [15]:
train_data_features = train_data_features.toarray()
# print(train_data_features.shape)

In [16]:
# `get_feature_names` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out`; use whichever this installation provides.
# The result is kept as a plain list in both cases.
if hasattr(vectorizer, "get_feature_names_out"):
    vocab = list(vectorizer.get_feature_names_out())
else:
    vocab = vectorizer.get_feature_names()
# print(vocab)

If you're interested, you can also print the counts of each word in the vocabulary:

In [17]:
# # Sum up the counts of each vocabulary word
# dist = np.sum(train_data_features, axis=0)

# # For each, print the vocabulary word and the number of times it 
# # appears in the training set
# for tag, count in zip(vocab, dist):
#     print(count, tag)

----
## Random Forest

Initializing a Random Forest classifier with 100 trees and fitting the forest to the training set, using the bag of words as features and the sentiment labels as the response variable.

In [18]:
forest = RandomForestClassifier(n_estimators = 100) 

In [19]:
%%time
forest = forest.fit(train_data_features, train["sentiment"])

CPU times: user 2min 6s, sys: 3.41 s, total: 2min 10s
Wall time: 2min 18s


----
## Making Predictions

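Apply the same cleaning and vectorization steps to the test data. Note that the vectorizer is only asked to `transform` here (not `fit_transform`), so the test reviews are encoded with the vocabulary learned from the training set: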

In [20]:
%%time
clean_test_reviews = [review_to_words(review) for review in test.review]

CPU times: user 23.2 s, sys: 1.22 s, total: 24.5 s
Wall time: 24.7 s


In [21]:
%%time
test_data_features = vectorizer.transform(clean_test_reviews)

CPU times: user 5.12 s, sys: 141 ms, total: 5.26 s
Wall time: 5.77 s


In [22]:
test_data_features = test_data_features.toarray()

In [23]:
%%time
result = forest.predict(test_data_features)

CPU times: user 2.66 s, sys: 1.47 s, total: 4.13 s
Wall time: 5.18 s


In [24]:
# # Creating a Submission
# output = pd.DataFrame( data={"id":test["id"], "sentiment":result} )
# output.to_csv("submissions/Bag_of_Words_model.csv", 
#               index = False, 
#               quoting = 3)

-------
# Part 2: Word Vectors

[source](https://www.kaggle.com/c/word2vec-nlp-tutorial/details/part-2-word-vectors)

In [25]:
def review_to_wordlist(review, remove_stopwords = False):
    """Turn a document into a list of lower-case words.

    Markup is stripped with BeautifulSoup, every character that is not an
    ASCII letter becomes a space, and the text is split on whitespace.
    When `remove_stopwords` is true, English stop words are filtered out
    of the returned list.
    """
    text = BeautifulSoup(review, 'lxml').get_text()
    tokens = re.sub("[^a-zA-Z]"," ", text).lower().split()

    # Nothing more to do unless the caller asked for stop-word removal
    if not remove_stopwords:
        return tokens

    stop_set = set(stopwords.words("english"))
    return [token for token in tokens if token not in stop_set]

Load the `punkt` tokenizer

In [26]:
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [27]:
def review_to_sentences(review, tokenizer, remove_stopwords = False):
    """Split a review into sentences, and each sentence into words.

    `tokenizer` must provide a `tokenize` method that splits the stripped
    review text into sentence strings. Each non-empty sentence is passed
    to `review_to_wordlist` together with `remove_stopwords`.

    Returns a list of sentences, where each sentence is a list of words.
    """
    # Split into sentences first, so each word list stays within one sentence
    raw_sentences = tokenizer.tokenize(review.strip())

    # Convert every non-empty sentence to its word list (a list of lists)
    return [review_to_wordlist(raw_sentence, remove_stopwords)
            for raw_sentence in raw_sentences
            if len(raw_sentence) > 0]

In [28]:
%%time
# Collect the word lists of every sentence in the labeled reviews
sentences = []

for review in train["review"]:
    sentences.extend(review_to_sentences(review, tokenizer))

  'Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup


CPU times: user 2min 5s, sys: 4.12 s, total: 2min 9s
Wall time: 2min 11s


In [29]:
%%time
# Add the unlabeled reviews' sentences to the existing `sentences` list.
# Re-running this cell appends them a second time.
for review in unlabeled_train["review"]:
    sentences.extend(review_to_sentences(review, tokenizer))

  'Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  'Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup


CPU times: user 4min 4s, sys: 9.12 s, total: 4min 14s
Wall time: 4min 16s


In [30]:
print(len(sentences))

795538


In [31]:
print(sentences[0])

['with', 'all', 'this', 'stuff', 'going', 'down', 'at', 'the', 'moment', 'with', 'mj', 'i', 've', 'started', 'listening', 'to', 'his', 'music', 'watching', 'the', 'odd', 'documentary', 'here', 'and', 'there', 'watched', 'the', 'wiz', 'and', 'watched', 'moonwalker', 'again']


-----
## Training and Saving Your Model

In [32]:
logging.basicConfig(format = '%(asctime)s : %(levelname)s : %(message)s',
                    level = logging.INFO)

In [33]:
num_features = 300    # Word vector dimensionality                      
min_word_count = 40   # Minimum word count                        
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size                                                                                    
downsampling = 1e-3   # Downsample setting for frequent words

### Initialize and train the model.

This will take some time: with `num_workers = 4` 
```
CPU times: user 8min 20s, sys: 13.3 s, total: 8min 33s
```

In [34]:
%%time
model = word2vec.Word2Vec(sentences, 
                          workers = num_workers, 
                          size = num_features, 
                          min_count = min_word_count,
                          window = context, 
                          sample = downsampling)

2016-11-10 23:49:23,457 : INFO : collecting all words and their counts
2016-11-10 23:49:23,462 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2016-11-10 23:49:23,554 : INFO : PROGRESS: at sentence #10000, processed 225803 words, keeping 17776 word types
2016-11-10 23:49:23,682 : INFO : PROGRESS: at sentence #20000, processed 451867 words, keeping 24947 word types
2016-11-10 23:49:23,766 : INFO : PROGRESS: at sentence #30000, processed 671290 words, keeping 30033 word types
2016-11-10 23:49:23,834 : INFO : PROGRESS: at sentence #40000, processed 897790 words, keeping 34347 word types
2016-11-10 23:49:23,901 : INFO : PROGRESS: at sentence #50000, processed 1116929 words, keeping 37760 word types
2016-11-10 23:49:23,969 : INFO : PROGRESS: at sentence #60000, processed 1338370 words, keeping 40722 word types
2016-11-10 23:49:24,037 : INFO : PROGRESS: at sentence #70000, processed 1561505 words, keeping 43332 word types
2016-11-10 23:49:24,119 : INFO : PROGRESS: 

CPU times: user 8min 2s, sys: 9.51 s, total: 8min 12s
Wall time: 2min 31s


If you don't plan to train the model any further, calling `init_sims` will make the model much more memory-efficient.

In [35]:
model.init_sims(replace = True)

2016-11-10 23:51:55,401 : INFO : precomputing L2-norms of word weight vectors


It can be helpful to create a meaningful model name and save the model for later use. You can load it later using `Word2Vec.load()`

In [36]:
model_name = "300features_40minwords_10context"
model.save(model_name)

2016-11-10 23:51:55,700 : INFO : saving Word2Vec object under 300features_40minwords_10context, separately None
2016-11-10 23:51:55,703 : INFO : not storing attribute cum_table
2016-11-10 23:51:55,705 : INFO : not storing attribute syn0norm
2016-11-10 23:51:56,588 : INFO : saved 300features_40minwords_10context


### Exploring the Model Results

In [37]:
model.doesnt_match("man woman child kitchen".split())

'kitchen'

In [38]:
model.doesnt_match("france england germany berlin".split())

'berlin'

In [39]:
model.doesnt_match("paris berlin london austria".split())

'paris'

In [40]:
model.most_similar("man")

[('woman', 0.6225396394729614),
 ('lady', 0.6021609306335449),
 ('lad', 0.6000834107398987),
 ('monk', 0.5472439527511597),
 ('guy', 0.5301573276519775),
 ('chap', 0.528995156288147),
 ('soldier', 0.5207506418228149),
 ('farmer', 0.5195120573043823),
 ('men', 0.5185949802398682),
 ('businessman', 0.5123093128204346)]

In [41]:
 model.most_similar("queen")

[('princess', 0.6707057952880859),
 ('bride', 0.6562491059303284),
 ('duchess', 0.624799370765686),
 ('victoria', 0.6214991807937622),
 ('stepmother', 0.6054560542106628),
 ('maid', 0.6039066314697266),
 ('mistress', 0.6025797128677368),
 ('latifah', 0.5982476472854614),
 ('eva', 0.5889024138450623),
 ('goddess', 0.5796060562133789)]

In [42]:
model.most_similar("awful")

[('terrible', 0.7803564667701721),
 ('horrible', 0.7395068407058716),
 ('atrocious', 0.7191551923751831),
 ('dreadful', 0.7096336483955383),
 ('abysmal', 0.7004678249359131),
 ('horrendous', 0.6907781958580017),
 ('appalling', 0.6566586494445801),
 ('horrid', 0.6554067134857178),
 ('lousy', 0.6236556768417358),
 ('bad', 0.6007861495018005)]

-----
# Part 3: More Fun With Word Vectors

[source](https://www.kaggle.com/c/word2vec-nlp-tutorial/details/part-3-more-fun-with-word-vectors)

## Numeric Representations of Words

The Word2Vec model trained in Part 2 consists of a feature vector for each word in the vocabulary, stored in a `numpy` array called "`syn0`":

In [43]:
model = word2vec.Word2Vec.load("300features_40minwords_10context")

2016-11-10 23:51:56,722 : INFO : loading Word2Vec object from 300features_40minwords_10context
2016-11-10 23:51:57,390 : INFO : setting ignored attribute cum_table to None
2016-11-10 23:51:57,392 : INFO : setting ignored attribute syn0norm to None
2016-11-10 23:51:57,394 : INFO : loaded 300features_40minwords_10context


In [44]:
type(model.syn0)

numpy.ndarray

In [45]:
model.syn0.shape

(16490, 300)

The number of rows in `syn0` is the number of words in the model's vocabulary, and the number of columns corresponds to the size of the feature vector, which we set in Part 2.  Setting the minimum word count to 40 gave us a total vocabulary of 16,490 words with 300 features apiece (the exact count can vary slightly between runs and library versions). Individual word vectors can be accessed in the following way:

In [46]:
# model["flower"]

-----
## From Words To Paragraphs, Attempt 1: Vector Averaging

In [47]:
def makeFeatureVec(words, model, num_features):
    """Average the word vectors of all in-vocabulary words in a paragraph.

    Parameters
    ----------
    words : iterable of str
        The words of one paragraph / review.
    model : object
        Must expose `index2word` (the vocabulary words) and support
        `model[word]` to look up a word's vector. Each vector is assumed
        to have length `num_features` -- TODO confirm against the model.
    num_features : int
        Length of the returned vector.

    Returns
    -------
    numpy.ndarray
        float64 vector of length `num_features` holding the mean of the
        matched word vectors. If no word of `words` is in the vocabulary
        the zero vector is returned, so the result never contains NaN
        from a division by zero.
    """
    # Running sum of the matched word vectors
    featureVec = np.zeros((num_features,), dtype = "float64")
    nwords = 0

    # index2word lists the words in the model's vocabulary.
    # Convert it to a set, for fast membership tests
    index2word_set = set(model.index2word)

    # Loop over each word in the review and, if it is in the model's
    # vocabulary, add its feature vector to the total
    for word in words:
        if word in index2word_set:
            nwords += 1
            featureVec += model[word]

    # Nothing matched: averaging would compute 0 / 0, so return the zeros
    if nwords == 0:
        return featureVec

    # Divide the sum by the number of matched words to get the average
    return featureVec / nwords


def getAvgFeatureVecs(reviews, model, num_features):
    """Compute the average feature vector of every review.

    Parameters
    ----------
    reviews : sequence of list of str
        The reviews, each one a list of words. `len()` is taken of it.
    model : object
        Passed through unchanged to `makeFeatureVec`.
    num_features : int
        Number of columns in the result.

    Returns
    -------
    numpy.ndarray
        float64 array of shape `(len(reviews), num_features)`; row `i` is
        `makeFeatureVec(reviews[i], model, num_features)`.
    """
    # Preallocate a 2D numpy array, for speed
    reviewFeatureVecs = np.zeros((len(reviews),num_features), dtype = "float64")

    # enumerate yields an integer row index; NumPy rejects float indices
    for counter, review in enumerate(reviews):
        # Print a status message every 1000th review
        if counter % 1000 == 0:
            print("Review %d of %d" % (counter, len(reviews)))

        # Store this review's averaged vector in its row
        reviewFeatureVecs[counter] = makeFeatureVec(review, model, num_features)

    return reviewFeatureVecs

Calculate average feature vectors for training and testing sets,
using the functions we defined above. Notice that we now use stop word
removal.

In [48]:
%%time
# Word lists of the labeled reviews, with stop words removed
clean_train_reviews = [review_to_wordlist(review, remove_stopwords = True)
                       for review in train["review"]]

# One averaged word vector per labeled review
trainDataVecs = getAvgFeatureVecs(clean_train_reviews, model, num_features)



CPU times: user 1min 34s, sys: 5.99 s, total: 1min 40s
Wall time: 1min 42s


In [49]:
%%time
# Word lists of the test reviews, with stop words removed
clean_test_reviews = [review_to_wordlist(review, remove_stopwords=True)
                      for review in test["review"]]

# One averaged word vector per test review
testDataVecs = getAvgFeatureVecs(clean_test_reviews, model, num_features)



CPU times: user 1min 35s, sys: 2.75 s, total: 1min 38s
Wall time: 1min 39s


### Random Forest
Next, use the average paragraph vectors to train a random forest. Note that, as in Part 1, we can only use the labeled training reviews to train the model. 

In [50]:
forest = RandomForestClassifier(n_estimators = 100)

In [51]:
%%time
# forest = forest.fit(trainDataVecs, train["sentiment"])

CPU times: user 7 µs, sys: 1e+03 ns, total: 8 µs
Wall time: 15 µs


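When this notebook was run, fitting the forest on `trainDataVecs` failed with:
```
ValueError: Input contains NaN, infinity or a value too large for dtype('float32').
```
A likely cause is a review that contains no in-vocabulary words: averaging zero word vectors means dividing by `nwords = 0`, which yields a row of NaN. The fit and predict cells are therefore left commented out.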

In [52]:
%%time
# result = forest.predict(testDataVecs)

CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 7.15 µs


In [53]:
# # Write the test results 
# output = pd.DataFrame(data = {"id":test["id"], "sentiment":result})
# output.to_csv("Word2Vec_AverageVectors.csv", 
#               index = False, 
#               quoting = 3)

-----
## From Words to Paragraphs, Attempt 2: Clustering 

Word2Vec creates clusters of semantically related words, so another possible approach is to exploit the similarity of words within a cluster. Grouping vectors in this way is known as "vector quantization." To accomplish this, we first need to find the centers of the word clusters, which we can do by using a clustering algorithm such as K-Means.

Set "k" (`num_clusters`) to be 1/5th of the vocabulary size, or an average of 5 words per cluster

In [54]:
word_vectors = model.syn0
num_clusters = word_vectors.shape[0] // 5

In [55]:
kmeans_clustering = KMeans(n_clusters = num_clusters)

In [56]:
%%time
idx = kmeans_clustering.fit_predict(word_vectors)

CPU times: user 15min 6s, sys: 52.1 s, total: 15min 58s
Wall time: 11min 16s


Create a Word / Index dictionary, mapping each vocabulary word to a cluster number                                                                                            

In [57]:
word_centroid_map = dict(zip( model.index2word, idx ))

This is a little abstract, so let's take a closer look at what our clusters contain. Your clusters may differ, as both Word2Vec and K-Means depend on random initialization and no seed is fixed here. Here is a loop that prints out the words for clusters 0 through 9:



In [58]:
# Print the member words of clusters 0 through 9
for cluster in range(10):
    print("\nCluster %d" % cluster)

    # Find all of the words for that cluster number, and print them out.
    # A single pass over items() avoids rebuilding the full key and value
    # lists on every step, and yields the words in the same order.
    words = [word for word, assigned in word_centroid_map.items()
             if assigned == cluster]
    print(words)


Cluster 0
['sledgehammer']

Cluster 1
['janis', 'joplin', 'harp', 'hendrix', 'mozart']

Cluster 2
['demonic', 'slugs', 'humanoid', 'insects', 'rotting', 'wasps', 'spiders', 'eaters', 'mutated', 'prehistoric', 'toxic', 'rats', 'worms', 'ants', 'raptor', 'infested', 'pollution', 'radioactive']

Cluster 3
['discovery']

Cluster 4
['celebration', 'monastery', 'temporary', 'retreat']

Cluster 5
['custody', 'injured', 'appointed', 'wounded', 'healed', 'slave', 'imprisoned', 'courts', 'condemned', 'freed', 'starving']

Cluster 6
['harrison', 'hopkins', 'jeff', 'lloyd', 'john', 'trevor', 'donald', 'alan', 'richard']

Cluster 7
['downbeat', 'anticlimactic', 'hasty']

Cluster 8
['genius', 'greatness', 'mastery', 'brilliance', 'excellence']

Cluster 9
['jeep', 'getaway', 'crossing', 'car', 'blast', 'truck', 'passenger', 'train', 'helicopter', 'bus']


Now we have a cluster (or "centroid") assignment for each word, and we can define a function to convert reviews into bags-of-centroids. This works just like Bag of Words but uses semantically related clusters instead of individual words.

* The number of clusters is equal to the highest cluster index in the word / centroid map plus one, because cluster indices start at 0

In [59]:
def create_bag_of_centroids(wordlist, word_centroid_map):
    """Count how many words of a review fall into each word cluster.

    `word_centroid_map` maps a vocabulary word to its cluster index.
    Returns a float32 numpy vector with one slot per cluster index up to
    the largest index in the map; slot `i` holds the number of words in
    `wordlist` mapped to cluster `i`. Words missing from the map are
    ignored.
    """
    # One slot for every index from 0 up to the largest cluster index
    vector_length = 1 + max(word_centroid_map.values())
    centroid_counts = np.zeros( vector_length, dtype="float32" )

    for token in wordlist:
        cluster_id = word_centroid_map.get(token)
        if cluster_id is None:
            # Word is not in the vocabulary: nothing to count
            continue
        centroid_counts[cluster_id] += 1

    return centroid_counts

Transform the training set reviews into bags of centroids

In [60]:
%%time
# One row per labeled review, one column per cluster
train_centroids = np.zeros((train["review"].size, num_clusters),
                           dtype = "float32")

# Fill each row with that review's bag of centroids
for row, review in enumerate(clean_train_reviews):
    train_centroids[row] = create_bag_of_centroids(review, word_centroid_map)

CPU times: user 43.1 s, sys: 1.41 s, total: 44.5 s
Wall time: 46.2 s


Repeat for test reviews 

In [61]:
%%time
# One row per test review, one column per cluster
test_centroids = np.zeros((test["review"].size, num_clusters), 
                          dtype = "float32")

# Fill each row with that review's bag of centroids
for row, review in enumerate(clean_test_reviews):
    test_centroids[row] = create_bag_of_centroids(review, word_centroid_map)

CPU times: user 43.3 s, sys: 1.35 s, total: 44.7 s
Wall time: 46 s


### Random Forest

Fit a random forest and extract predictions 

In [62]:
forest = RandomForestClassifier(n_estimators = 100)

In [63]:
%%time
forest = forest.fit(train_centroids, train["sentiment"])

CPU times: user 1min 32s, sys: 1.76 s, total: 1min 34s
Wall time: 2min 10s


In [64]:
%%time
result = forest.predict(test_centroids)

CPU times: user 2.14 s, sys: 450 ms, total: 2.59 s
Wall time: 3.01 s


In [65]:
# # Write the test results 
# output = pd.DataFrame(data={"id":test["id"], "sentiment":result})
# output.to_csv("BagOfCentroids.csv", 
#               index = False, 
#               quoting = 3)