In [23]:
import pandas as pd
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier

In [3]:
# Load the labeled training reviews from a tab-separated file.
# header=0 takes the column names from the first row; quoting=3 is passed
# through unchanged (it is the numeric value of csv.QUOTE_NONE).
train = pd.read_csv(
    "labeledTrainData.tsv",
    header=0,
    sep="\t",
    quoting=3,
)

In [6]:
# check the data: (rows, columns) followed by the column names.
# Single-argument, parenthesised print produces the same output on
# Python 2 and Python 3; the bare `print x` statement is Python 2 only.
print(train.shape)
print(train.columns.values)

(25000, 3)
['id' 'sentiment' 'review']


In [15]:
_ENGLISH_STOPWORDS = None  # filled lazily by _english_stopwords()


def _english_stopwords():
    '''Return NLTK's English stopword list as a frozenset, loading it only once.'''
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOPWORDS


def clean_text(review, stop_words=None):
    '''
    Converts an HTML review into clean text without numbers and stopwords

    Input:
        review     - a single string, possibly containing HTML markup
        stop_words - optional collection of lower-case words to drop;
                     defaults to NLTK's English stopword list
    Output: single string of the remaining lower-case words, separated by
            single spaces (empty string if nothing remains)
    '''

    # Remove HTML. The parser is named explicitly so the result does not
    # depend on which optional parser libraries happen to be installed.
    review_text = BeautifulSoup(review, "html.parser").get_text()

    # Remove punctuation and numbers: every non-ASCII-letter becomes a space
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)

    # convert to lower case and tokenize on whitespace
    words = letters_only.lower().split()

    # Remove stopwords. The local must NOT be called `stopwords`: assigning
    # to that name inside the function would make it a local variable and
    # the lookup of the imported `stopwords` module would then raise
    # UnboundLocalError.
    if stop_words is None:
        stop_words = _english_stopwords()
    meaningful_words = [w for w in words if w not in stop_words]

    return " ".join(meaningful_words)

In [17]:
# Number of training reviews, and an empty list that will collect the
# cleaned text of each one.
num_reviews = len(train['review'])
clean_reviews = list()

In [18]:
# Clean every training review, in order.
# Iterating the column directly avoids the Python 2-only `xrange` and the
# label-based `train['review'][i]` lookup (which only works with a default
# 0..n-1 index). Rebuilding the list here, rather than appending to one
# created in another cell, keeps the cell idempotent when it is re-run.
clean_reviews = [clean_text(raw_review) for raw_review in train['review']]

In [20]:
# Bag-of-words features: word-level counts, capped at 5000 vocabulary
# entries. No extra tokenizer, preprocessor or stop-word list is supplied
# to the vectorizer.
vectorizer = CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=5000,
)

# Learn the vocabulary from the cleaned training reviews and convert the
# resulting count matrix to a dense array.
train_features = vectorizer.fit_transform(clean_reviews).toarray()

In [25]:
# random forests
# random_state pins the bootstrap sampling and feature selection so that a
# fresh Restart & Run All reproduces the same model and the same submission.
forest = RandomForestClassifier(n_estimators = 200, random_state = 0)

# fit() returns the estimator itself, so `my_forest` and `forest` refer to
# the same fitted model.
my_forest = forest.fit(train_features, train["sentiment"])

In [26]:
# Load the test reviews from a tab-separated file, using the same
# header / separator / quoting options as the training file.
test = pd.read_csv(
    "testData.tsv",
    header=0,
    sep="\t",
    quoting=3,
)

In [28]:
# process the test data
num_reviews_test = len(test["review"])

# Clean every test review, in order. Iterating the column directly avoids
# the Python 2-only `xrange` and the label-based `test["review"][i]` lookup,
# which only works when the frame has a default 0..n-1 index.
clean_test_reviews = [clean_text(raw_review) for raw_review in test["review"]]

In [29]:
# bag of words of test data
# transform() (not fit_transform()) is used, so the test reviews are encoded
# by the same `vectorizer` object that was fitted on the training reviews.
# NOTE(review): unlike train_features, this result is not converted with
# .toarray() -- confirm the installed scikit-learn accepts it in predict().
test_features = vectorizer.transform(clean_test_reviews)

In [30]:
# random forest prediction
# Uses `forest`, the classifier fitted on train_features / train["sentiment"],
# to predict one label per row of test_features.
prediction = forest.predict(test_features)

In [31]:
# Build the Kaggle submission: the test ids next to the predicted sentiment,
# written as CSV without the index column and with quoting=3.
output = pd.DataFrame(
    data={
        "id": test["id"],
        "sentiment": prediction,
    }
)
output.to_csv("Bag_of_Words_model.csv", index=False, quoting=3)