In [47]:
import pandas as pd
from bs4 import BeautifulSoup
import re
import nltk

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier

In [12]:
# Reading data: load the labelled training reviews from a tab-separated file.
# header=0 takes the column names from the first row of the file.
# quoting=3 is the numeric value of csv.QUOTE_NONE, so the double quotes that
# appear inside the review text are kept as ordinary characters.
train = pd.read_csv('labeledTrainData.tsv', header=0, delimiter='\t', quoting=3)

In [5]:
# (rows, columns) of the loaded training frame
train.shape

(25000, 3)

In [7]:
# Column labels of the training frame
train.columns.values

array(['id', 'sentiment', 'review'], dtype=object)

In [11]:
# Peek at one raw review (row label 2400) to see what cleaning is needed
train['review'][2400]

'"There is no greater disservice to do to history than to misrepresent it. This takes the easiest and most shallow route, simply portraying him as a monster. Only showing his negative sides, and exaggerating them. \\"Those who are ignorant of the past doom us to repeat it\\". He was a human being. That may prove tough to some people to accept, but an important part of life is facing that which we don\'t want to. Rather than demonizing the man, we ought to try to understand him. Otherwise, we stand little chance of preventing anyone similar in the future, or possibly even the present, from succeeding at anything of remotely comparable scope, as far as damage and misery goes. Hate him and what he did, don\'t make him into something mythical, intentionally or otherwise. Frankly, far too much of this mini-series could play \\"dumb dumb *duuum*!\\" after or during scenes. The whole thing nods, nudges and winks at the audience, with a clear message of \\"was this guy evil or what\\", incorpo

In [18]:
# Data cleaning and preprocessing

# HTML tags: parse the first review so its markup can be stripped.
# The parser is named explicitly; otherwise BeautifulSoup picks whichever
# parser happens to be installed (and warns about it), so the parse could
# differ from one machine to another.
example1 = BeautifulSoup(train['review'][0], 'html.parser')

In [15]:
# Text content of the parsed review, without the markup
example1.get_text()

u'"With all this stuff going down at the moment with MJ i\'ve started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ\'s feeling towards the press and also the obvious message of drugs are bad m\'kay.Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.The actual feature film bit when it finally starts is only on for 

In [20]:
# Punctuation and numbers: replace every character that is not an ASCII
# letter with a space (stop words are handled in a later cell)
letters_only = re.sub(pattern='[^a-zA-Z]', repl=' ', string=example1.get_text())

In [54]:
# Inspect the letters-only text
letters_only

u' With all this stuff going down at the moment with MJ i ve started listening to his music  watching the odd documentary here and there  watched The Wiz and watched Moonwalker again  Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent  Moonwalker is part biography  part feature film which i remember going to see at the cinema when it was originally released  Some of it has subtle messages about MJ s feeling towards the press and also the obvious message of drugs are bad m kay Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring  Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him The actual feature film bit when it finally starts is only on for    

In [22]:
# Normalise case so that e.g. "The" and "the" become the same token
lower_case = letters_only.lower()

In [24]:
# Split on whitespace into a list of word tokens
words = lower_case.split()

In [31]:
# Fetch only the NLTK resources this notebook uses instead of opening the
# interactive downloader, which blocks a non-interactive "Run All".
nltk.download('stopwords')  # word lists read through nltk.corpus.stopwords
nltk.download('punkt')      # data behind 'tokenizers/punkt/english.pickle'

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [34]:
from nltk.corpus import stopwords

# Fetch the English stop-word list once. Calling stopwords.words() inside the
# comprehension below would re-evaluate it for every single word.
english_stopwords = stopwords.words('english')
print(english_stopwords)

# Remove stopwords from words; a set gives constant-time membership tests
# instead of a linear scan of the list per word.
stop_set = set(english_stopwords)
words = [w for w in words if w not in stop_set]
print(words)


[u'i', u'me', u'my', u'myself', u'we', u'our', u'ours', u'ourselves', u'you', u'your', u'yours', u'yourself', u'yourselves', u'he', u'him', u'his', u'himself', u'she', u'her', u'hers', u'herself', u'it', u'its', u'itself', u'they', u'them', u'their', u'theirs', u'themselves', u'what', u'which', u'who', u'whom', u'this', u'that', u'these', u'those', u'am', u'is', u'are', u'was', u'were', u'be', u'been', u'being', u'have', u'has', u'had', u'having', u'do', u'does', u'did', u'doing', u'a', u'an', u'the', u'and', u'but', u'if', u'or', u'because', u'as', u'until', u'while', u'of', u'at', u'by', u'for', u'with', u'about', u'against', u'between', u'into', u'through', u'during', u'before', u'after', u'above', u'below', u'to', u'from', u'up', u'down', u'in', u'out', u'on', u'off', u'over', u'under', u'again', u'further', u'then', u'once', u'here', u'there', u'when', u'where', u'why', u'how', u'all', u'any', u'both', u'each', u'few', u'more', u'most', u'other', u'some', u'such', u'no', u'nor', u

In [39]:
def review_to_words(raw_review):
    """Convert one raw review into a cleaned, space-separated string of words.

    Steps: strip HTML markup, replace every non-letter with a space,
    lower-case, split on whitespace, and drop English stop words.

    Args:
        raw_review: Review text, possibly containing HTML markup.

    Returns:
        A single string of the remaining lower-case words joined by spaces.
    """
    # 1. Remove HTML. Naming the parser stops BeautifulSoup from guessing one
    #    from whatever is installed, which warns and can differ between machines.
    review_text = BeautifulSoup(raw_review, 'html.parser').get_text()

    # 2. Remove non-letters
    letters_only = re.sub('[^a-zA-Z]', ' ', review_text)

    # 3. Convert to lowercase and split into individual words
    words = letters_only.lower().split()

    # 4. Search faster in set
    stops = set(stopwords.words('english'))

    # 5. Remove stop words
    meaningful_words = [w for w in words if w not in stops]

    # 6. Join strings back to one space separated string
    return " ".join(meaningful_words)

In [40]:
# Sanity-check the cleaning function on the first review
clean_review = review_to_words(train['review'][0])
print(clean_review)


stuff going moment mj started listening music watching odd documentary watched wiz watched moonwalker maybe want get certain insight guy thought really cool eighties maybe make mind whether guilty innocent moonwalker part biography part feature film remember going see cinema originally released subtle messages mj feeling towards press also obvious message drugs bad kay visually impressive course michael jackson unless remotely like mj anyway going hate find boring may call mj egotist consenting making movie mj fans would say made fans true really nice actual feature film bit finally starts minutes excluding smooth criminal sequence joe pesci convincing psychopathic powerful drug lord wants mj dead bad beyond mj overheard plans nah joe pesci character ranted wanted people know supplying drugs etc dunno maybe hates mj music lots cool things like mj turning car robot whole speed demon sequence also director must patience saint came filming kiddy bad sequence usually directors hate working

In [44]:
# Number of reviews, taken from the size of the 'review' column
num_reviews = train.review.size

In [45]:
# Clean every review in the training set, reporting progress every 1000 rows.
clean_train_reviews = []

# Iterate the column positionally rather than looking rows up by label, so the
# loop does not depend on the frame having a default 0..n-1 index. print() with
# a single pre-formatted string produces the same output on Python 2 and 3.
for position, raw_review in enumerate(train['review'], start=1):
    if position % 1000 == 0:
        print("Review %d of %d\n" % (position, num_reviews))
    clean_train_reviews.append(review_to_words(raw_review))


Review 1000 of 25000

Review 2000 of 25000

Review 3000 of 25000

Review 4000 of 25000

Review 5000 of 25000

Review 6000 of 25000

Review 7000 of 25000

Review 8000 of 25000

Review 9000 of 25000

Review 10000 of 25000

Review 11000 of 25000

Review 12000 of 25000

Review 13000 of 25000

Review 14000 of 25000

Review 15000 of 25000

Review 16000 of 25000

Review 17000 of 25000

Review 18000 of 25000

Review 19000 of 25000

Review 20000 of 25000

Review 21000 of 25000

Review 22000 of 25000

Review 23000 of 25000

Review 24000 of 25000

Review 25000 of 25000



In [48]:
# Creating features from a Bag Of Words.
# The reviews are already cleaned, so no extra tokenizer, preprocessor or
# stop-word list is supplied; max_features limits the number of feature columns.
vectorizer = CountVectorizer(
    analyzer='word',
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=5000,
)

In [49]:
# Fit the vectorizer on the cleaned reviews, transform them into count
# features, and convert the result to an array in one chained expression
train_data_features = vectorizer.fit_transform(clean_train_reviews).toarray()


In [50]:
# One row per review, one column per vocabulary word.
# print() as a call works on both Python 2 and Python 3 with identical output.
print(train_data_features.shape)

(25000, 5000)


In [53]:
# Words that make up the fitted vocabulary (one per feature column)
# NOTE(review): newer scikit-learn releases replace get_feature_names() with
# get_feature_names_out() — confirm against the installed version.
vocab = vectorizer.get_feature_names()


In [52]:
# Train a 100-tree random forest on the bag-of-words counts.
# random_state pins the forest's internal randomness so that re-running the
# notebook produces the same fitted model.
forest = RandomForestClassifier(n_estimators=100, random_state=42)
forest = forest.fit(train_data_features, train['sentiment'])

In [55]:
# Now using word vectors
# Load the English punkt tokenizer from the NLTK data directory.
# NOTE(review): presumably this is the object handed to review_to_sentences()
# as its `tokenizer` argument — confirm against the caller. The resource must
# already have been downloaded for this load to succeed.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [57]:
def review_to_wordlist(review, remove_stopwords=False):
    """Convert a document to a list of lower-case words.

    Args:
        review: Text to convert, possibly containing HTML markup.
        remove_stopwords: When true, English stop words are dropped from the
            result. Defaults to False.

    Returns:
        A list of words in their original order.
    """
    # 1. Remove HTML. Naming the parser stops BeautifulSoup from guessing one
    #    from whatever is installed, which warns and can differ between machines.
    review_text = BeautifulSoup(review, 'html.parser').get_text()

    # 2. Remove non-letters
    review_text = re.sub("[^a-zA-Z]", " ", review_text)

    # 3. Convert words to lower case and split them
    words = review_text.lower().split()

    # 4. Optionally remove stop words (false by default)
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]

    # 5. Return a list of words
    return words

In [60]:
# Split review into parsed sentences
def review_to_sentences(review, tokenizer, remove_stopwords=False):
    """Split a review into sentences and turn each sentence into a word list.

    Args:
        review: Review text; surrounding whitespace is stripped before it is
            handed to the tokenizer.
        tokenizer: Object whose tokenize(text) method returns the pieces the
            review is split into.
        remove_stopwords: Forwarded unchanged to review_to_wordlist().

    Returns:
        A list with one entry per non-empty piece, each entry being the value
        returned by review_to_wordlist() for that piece.
    """
    stripped_review = review.strip()
    return [
        review_to_wordlist(piece, remove_stopwords)
        for piece in tokenizer.tokenize(stripped_review)
        if len(piece) > 0  # skip empty pieces
    ]