In [3]:
import re

import os

In [1]:
import pandas as pd
import numpy as np

In [2]:
from bs4 import BeautifulSoup             

In [5]:
from nltk.corpus import stopwords

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier

In [40]:
# from KaggleWord2VecUtility import KaggleWord2VecUtility

-----
## Part 1: For Beginners - Bag of Words

[source](https://www.kaggle.com/c/word2vec-nlp-tutorial/details/part-1-for-beginners-bag-of-words)

### Reading the Data

In [6]:
# Load the labeled training reviews from the tab-separated file.
# quoting=3 is csv.QUOTE_NONE: double quotes are treated as ordinary characters.
train = pd.read_table('data/labeledTrainData.tsv', sep='\t', quoting=3)

# Preview the first rows.
train.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


In [21]:
# Load the unlabeled test reviews from the tab-separated file.
# quoting=3 is csv.QUOTE_NONE: double quotes are treated as ordinary characters.
test = pd.read_table('data/testData.tsv', sep='\t', quoting=3)

# Preview the first rows.
test.head()

Unnamed: 0,id,review
0,"""12311_10""","""Naturally in a film who's main themes are of ..."
1,"""8348_2""","""This movie is a disaster within a disaster fi..."
2,"""5828_4""","""All in all, this is a movie for kids. We saw ..."
3,"""7186_2""","""Afraid of the Dark left me with the impressio..."
4,"""12128_7""","""A very accurate depiction of small time mob l..."


------
### Data Cleaning and Text Preprocessing

In [7]:
# Cache of stop-word sets keyed by language, so the NLTK corpus is read and
# converted to a set only once instead of once per review.
_STOPWORDS_BY_LANGUAGE = {}


def _load_stopwords(language="english"):
    """Return the NLTK stop words for `language` as a frozenset, loading them once."""
    if language not in _STOPWORDS_BY_LANGUAGE:
        _STOPWORDS_BY_LANGUAGE[language] = frozenset(stopwords.words(language))
    return _STOPWORDS_BY_LANGUAGE[language]


def review_to_words(raw_review):
    """Convert a raw review to a string of cleaned, space-separated words.

    Steps:
      1. Parse the text with BeautifulSoup (lxml parser) and keep only the text.
      2. Replace every character that is not an ASCII letter with a space.
      3. Lower-case and split on whitespace.
      4. Drop English stop words (NLTK list).

    Parameters
    ----------
    raw_review : str
        A single raw movie review.

    Returns
    -------
    str
        The remaining words joined by single spaces (empty string if none remain).
    """
    review_text = BeautifulSoup(raw_review, 'lxml').get_text()
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    words = letters_only.lower().split()

    # Set membership is O(1); the set itself is built once and reused across calls.
    stops = _load_stopwords("english")

    return " ".join(w for w in words if w not in stops)

# review_to_words( train["review"][0] )

In [8]:
%%time
# Clean every training review; the result is a list of strings in row order.
clean_train_reviews = list(map(review_to_words, train["review"]))

CPU times: user 26.4 s, sys: 1.76 s, total: 28.2 s
Wall time: 29.7 s


------
### Creating Features from a Bag of Words (Using `scikit-learn`)

In [9]:
# Bag-of-words feature extractor. The reviews are already cleaned by
# `review_to_words`, so no extra tokenizer/preprocessor/stop-word handling is set.
vectorizer = CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=5000,  # cap on the vocabulary size
)

In [10]:
%%time
# Learn the vocabulary from the cleaned training reviews and build the
# document-term count matrix in one pass.
train_data_features = vectorizer.fit_transform(raw_documents=clean_train_reviews)

CPU times: user 4.09 s, sys: 117 ms, total: 4.2 s
Wall time: 4.22 s


In [11]:
# Convert the vectorizer output to a dense array.
# The guard keeps this cell idempotent: after the first run the name holds a
# dense array (which has no `.toarray()`), so re-running the cell is a no-op
# instead of raising AttributeError.
if hasattr(train_data_features, "toarray"):
    train_data_features = train_data_features.toarray()
# print(train_data_features.shape)

In [14]:
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer `get_feature_names_out()` and fall back only on older releases.
# `.tolist()` keeps `vocab` a plain Python list, as before.
if hasattr(vectorizer, "get_feature_names_out"):
    vocab = vectorizer.get_feature_names_out().tolist()
else:
    vocab = vectorizer.get_feature_names()
# print(vocab)

If you're interested, you can also print the counts of each word in the vocabulary:

In [17]:
# # Sum up the counts of each vocabulary word
# dist = np.sum(train_data_features, axis=0)

# # For each, print the vocabulary word and the number of times it 
# # appears in the training set
# for tag, count in zip(vocab, dist):
#     print(count, tag)

----
### Random Forest

We initialize a Random Forest classifier with 100 trees and fit it to the training set, using the bag-of-words counts as features and the sentiment labels as the response variable.

In [18]:
# Random forest with 100 trees. A fixed `random_state` makes the fitted model
# (and therefore the predictions) reproducible across kernel restarts.
forest = RandomForestClassifier(n_estimators=100, random_state=0)

In [19]:
%%time
# Fit the forest: bag-of-words counts are the features, sentiment is the target.
forest = forest.fit(X=train_data_features, y=train["sentiment"])

CPU times: user 2min 10s, sys: 4.06 s, total: 2min 14s
Wall time: 2min 22s


----
### Making Predictions

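We apply the same cleaning and vectorization steps to the test data. The vectorizer is only applied with `transform` here (not re-fit), so the test features use the vocabulary learned from the training reviews: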

In [22]:
%%time
# Clean every test review; the result is a list of strings in row order.
clean_test_reviews = list(map(review_to_words, test["review"]))

CPU times: user 26.5 s, sys: 1.94 s, total: 28.4 s
Wall time: 30.1 s


In [23]:
%%time
# Encode the test reviews with the already-fitted vectorizer (no re-fitting).
test_data_features = vectorizer.transform(raw_documents=clean_test_reviews)

CPU times: user 4.78 s, sys: 225 ms, total: 5 s
Wall time: 5.29 s


In [24]:
# Convert the vectorizer output to a dense array.
# The guard keeps this cell idempotent: after the first run the name holds a
# dense array (which has no `.toarray()`), so re-running the cell is a no-op
# instead of raising AttributeError.
if hasattr(test_data_features, "toarray"):
    test_data_features = test_data_features.toarray()

In [25]:
%%time
# Predict a sentiment label for every test review.
result = forest.predict(X=test_data_features)

CPU times: user 2.87 s, sys: 1.73 s, total: 4.61 s
Wall time: 6.94 s


----
### Creating a Submission

In [26]:
# Make sure the target folder exists: `to_csv` does not create missing
# parent directories and would fail on a fresh checkout.
os.makedirs("submissions", exist_ok=True)

# One row per test review: its id and the predicted sentiment.
output = pd.DataFrame(data={"id": test["id"], "sentiment": result})

# quoting=3 is csv.QUOTE_NONE: fields are written without adding quotes.
output.to_csv("submissions/Bag_of_Words_model.csv",
              index=False,
              quoting=3)