Import pandas, numpy and other packages as required

In [41]:
import numpy as np
import pandas as pd
import re
import nltk
import logging
import warnings
warnings.filterwarnings('ignore')

from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,auc
from sklearn.cross_validation import train_test_split,roc_auc_score
from gensim.models import word2vec

Next, read the data(reviews) which is provided in three different categories - labeled, unlabeled, and test datasets

In [42]:
labeledTrain = pd.read_csv('labeledTrainData.tsv', header=0, quoting=3, delimiter='\t')
testData = pd.read_csv('testData.tsv', header=0, quoting=3, delimiter='\t')
unlabeledTrain = pd.read_csv('unlabeledTrainData.tsv', header=0, quoting=3, delimiter='\t')

Count the number of reviews in each type of dataset in order to check the total number of reviews

In [43]:
print(len(labeledTrain))
print(len(unlabeledTrain))
print(len(testData))

25000
50000
25000


In [44]:
print(labeledTrain.columns)
print(unlabeledTrain.columns)
print(testData.columns)

Index(['id', 'sentiment', 'review'], dtype='object')
Index(['id', 'review'], dtype='object')
Index(['id', 'review'], dtype='object')


Write a function to clean the reviews having noise such as html tags,etc

In [51]:
def clean_review(review, remove_stopwords=False):
    # 1. Remove HTML
    text_data = BeautifulSoup(review,'html.parser').get_text()
    
    # 2. Remove numeric data
    text_data = re.sub("[^A-Za-z]"," ", text_data)
    
    # 3. Convert letters to lower-case and split the data to form a list of words
    words = text_data.lower().split()
    
    # 4. Remove stopwords if true
    if remove_stopwords:
        s = set(stopwords.words("english"))
        words  = [w for w in words if not w in s]
        
    return(words)

Convert the words into word vectors. For this we are going to use the Word2Vec from the gensim package. Word2Vec 
expects as inputs single sentences. For converting the reviews into a list of sentences, we need to split a review into different sentences for which we will use the 'punkt' tokenizer contained in nltk() for splitting up the sentences.

In [6]:
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')  ## fo splitting the review into sentences

def review_to_sentences(review, tokenizer):
    # split the review into different sentences
    raw_review = tokenizer.tokenize(review.strip())
    
    sentences = []
    
    for raw_sentence in raw_review:
        if len(raw_sentence) > 0:
            sentences.append(clean_review(raw_sentence))
    
    #return the list of sentences where each sentence is a list of words
    return(sentences)


Clean the reviews in order to use in Word2Vec

In [7]:
sentences = []

for review in labeledTrain['review']:
    sentences += review_to_sentences(review, tokenizer)
    
for review in unlabeledTrain['review']:
    sentences += review_to_sentences(review,tokenizer)

  'Beautiful Soup.' % markup)
  'Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  'Beautiful Soup.' % markup)
  'Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  'Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  'Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup


In [8]:
len(sentences)                              

795538

In [9]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',\
    level=logging.INFO)

Couldn't import dot_parser, loading of dot files will not be possible.


Define a word2vec model by providing the parameters of your chocie depending upon the size of your dataset

In [10]:
model = word2vec.Word2Vec(sentences, workers=4 , min_count=40, size=300, window=5, sample=1e-3)

In [11]:
model_name = 'myModel'
model.save(model_name)         ## save the model for future use

In [17]:
model.syn0.shape               ##syn0 rows represents the size of vocabulary in the model

(16490, 300)

In [18]:
model.syn0                      ## word vectors stored as numpy arrays with each word having dimensions (1x300)

array([[ 0.55321699, -0.77782232,  0.82532221, ...,  0.72397226,
         0.68414426, -0.35472327],
       [ 0.09805842,  0.03103116,  0.53339213, ..., -0.54614192,
        -0.55391514, -1.27474058],
       [ 0.88128853, -0.43353438,  1.2196058 , ..., -0.67345876,
         0.3025381 ,  0.44857651],
       ..., 
       [-0.01545802, -0.09082257,  0.05533485, ...,  0.00173796,
        -0.0165602 ,  0.04005823],
       [-0.01607578, -0.06936552, -0.08594656, ..., -0.1842301 ,
         0.0749278 , -0.14317097],
       [ 0.01619974, -0.06358982, -0.08173489, ..., -0.13500381,
         0.03976946,  0.06671745]], dtype=float32)

Because each review is of differen length,hence feature vector for each review will be different. In order to handle this problem, we can either attempt 'vector averaging' or 'vector quantization'. It makes sense to make to group similar words together. Hence we can make clusters of word vectors that are somewhat similar. Size of cluster depends on your choice, though small clusters(i.e. clusters with small number of words) are good for making good predictions and it also takes less time to clusters such large number of word vectors.  

In [24]:
from sklearn.cluster import MiniBatchKMeans    ## KMeans will take huge amount of time to cluster so use MiniBatchKMeans

word_vectors = model.syn0             ## all the word vectors are now stored in word_vectors
n_clusters = word_vectors.shape[0]//5  ## 16490/5 will give a float which will not be accepted as number of clusters

In [26]:
kmeans = MiniBatchKMeans(n_clusters=n_clusters, batch_size=100)  
idx = kmeans.fit_predict(word_vectors)   ## fit_predict will generate the index for each cluster

103.9913558959961


Create a mapping of words to the clusters by creating a dictionary

In [27]:
word_vector_clusters_map = dict(zip(model.index2word, idx))   ##index2word contains indexing for each word in vocab

In [37]:
##Print out a number of clusters to check the words in a cluster

for cluster in range(0,10):
    print("Cluster  : ", cluster)
    words = []
    for i in range(len(word_vector_clusters_map.values())):
        if(cluster == list(word_vector_clusters_map.values())[i]):
           words.append(list(word_vector_clusters_map.keys())[i])
    print(words)

Cluster  :  0
['symbol']
Cluster  :  1
['buscemi', 'martin', 'mcqueen', 'carell']
Cluster  :  2
['sergeant', 'deputy', 'meanwhile']
Cluster  :  3
['blame', 'forgive', 'trust', 'tell', 'call']
Cluster  :  4
['hare']
Cluster  :  5
['flight', 'landing', 'crash']
Cluster  :  6
['lost', 'achieved', 'received', 'gained']
Cluster  :  7
['youth']
Cluster  :  8
['unlikeable', 'unlikable', 'menacing', 'lovable', 'unsympathetic', 'likeable', 'credible', 'endearing']
Cluster  :  9
['franz', 'hans']


Convert the reviews into  a bag of clusters. This will give us a numpy array with a fixed size such that for each review we have a fixed number of features.

In [40]:
def create_reviews_clusters_bag(words, word_vector_clusters_map):
    ## Number of clusters will be equal to the maximum value in the word_vector_clusters_map
    num_clusters  = max(word_vector_clusters_map.values()) + 1
    
    ## Create a numpy array and initialzie it to zero. Pre-initializing array will help in faster operations
    bag_of_clusters = np.zeros(num_clusters, dtype='float32')
    
    ##Loop over each word in the review, find its cluser and increment the respective numpy array index value
    for word in words:
        if word in word_vector_clusters_map:
            index  = word_vector_clusters_map[word]
            bag_of_clusters[index] += 1
            
    return bag_of_clusters        

Clean the reveiews again by removing the stopwords in order to have low noise.

In [52]:
cleaned_train_reviews = []
for review in labeledTrain['review']:
    cleaned_train_reviews.append(clean_review(review, remove_stopwords=True))
    
cleaned_test_reviews = []
for review in testData['review']:
    cleaned_test_reviews.append(clean_review(review, remove_stopwords=True))

Make numpy array for training clusters and testing clusters

In [53]:
train_centroids = np.zeros((labeledTrain['review'].size, n_clusters), dtype='float32')
test_centroids = np.zeros((testData['review'].size, n_clusters), dtype='float32')

index = 0 
for review in cleaned_train_reviews:
    train_centroids[index] = create_reviews_clusters_bag(review, word_vector_clusters_map)
    index = index + 1

index = 0
for review in cleaned_test_reviews:
    test_centroids[index] = create_reviews_clusters_bag(review, word_vector_clusters_map)
    index = index + 1

Make predictions with an appropriate model like RandomForest

In [57]:
X_train, X_test, y_train, y_test = train_test_split(train_centroids, labeledTrain['sentiment'], test_size=0.2, random_state=333)

In [62]:
model = RandomForestClassifier(n_estimators=200, oob_score=True, n_jobs=-1)
model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=-1,
            oob_score=True, random_state=None, verbose=0, warm_start=False)

In [63]:
s = model.predict(X_test)

In [65]:
result = model.predict(test_centroids)