# Kaggle IMDB Review Sentiment Analysis

### Import necessary libraries

In [1]:
import pandas as pd
import re
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score

### Read in data and check it out

In [2]:
# Load the labelled training set.  The file is tab-separated with a header
# row; quoting=3 is csv.QUOTE_NONE, so double quotes inside the reviews are
# kept as ordinary characters instead of being treated as field quoting.
train = pd.read_csv(
    "data/labeledTrainData.tsv",
    header=0,
    delimiter="\t",
    quoting=3,
)

# Quick checks to run interactively when exploring the data:
#   train.shape                         -> dimensions of the DataFrame
#   train.columns.values                -> column names
#   train.head(3)                       -> first few rows
#   train["sentiment"].values.tolist()  -> all values of the sentiment column

### Clean up the reviews

In [3]:
# Matches every character that is not an ASCII letter (digits, punctuation,
# whitespace, ...).  Compiled once because the function runs per review.
_NON_LETTERS = re.compile(r'[^a-zA-Z]')

# Lazily filled cache so the NLTK stop-word corpus is read only once instead
# of once per review.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stop words as a set, loading them on first use."""
    if "english" not in _STOPWORD_CACHE:
        _STOPWORD_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOPWORD_CACHE["english"]


def review_to_words(raw_review):
    """Convert a raw IMDB review into a cleaned, space-separated string.

    Steps: strip HTML markup, replace every non-letter character with a
    space, lower-case the text, split it into words and drop English stop
    words.

    Parameters:
        raw_review: the review text, possibly containing HTML markup.

    Returns:
        A single string with the remaining words joined by one space
        (an empty string if no words remain).
    """
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed and emits a warning, so results could
    # differ between machines.  "html.parser" ships with Python itself.
    text = BeautifulSoup(raw_review, "html.parser").get_text()
    # remove numbers and punctuation, then normalise case and tokenise
    tokens = _NON_LETTERS.sub(' ', text).lower().split()
    stops = _english_stopwords()
    return " ".join(word for word in tokens if word not in stops)

# Clean every training review, reporting progress every 5000 reviews.
clean_train_reviews = []
for count, rev in enumerate(train["review"], 1):
    clean_train_reviews.append(review_to_words(rev))
    if count % 5000 == 0:
        # Report only after the review has actually been cleaned.  A
        # parenthesised single-argument print behaves the same on
        # Python 2 and Python 3.
        print("{} reviews processed.".format(count))

5000 reviews processed.
10000 reviews processed.
15000 reviews processed.
20000 reviews processed.
25000 reviews processed.




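UserWarning: No parser was explicitly specified, so I'm using the best available HTML parser for this system ("lxml"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.

To get rid of this warning, change this:

 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))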


### Creating Features from a Bag of Words

In [4]:
# The reviews must become numeric vectors before a machine-learning model
# can use them.  The 'Bag of Words' approach does this in two steps: it
# learns a vocabulary from all of the documents, then describes each
# document by how many times each vocabulary word appears in it.

# scikit-learn's bag-of-words tool is CountVectorizer, from its
# "feature_extraction" module; max_features limits the vocabulary size.
vectorizer = CountVectorizer(max_features=1000)

# fit_transform() learns the vocabulary dictionary and returns the
# term-document matrix; toarray() turns that matrix into an array, which
# is easier to work with.
train_data_features = vectorizer.fit_transform(clean_train_reviews).toarray()

### Take a look at the vocabulary

In [5]:
# Feature names (the learned vocabulary) of the fitted vectorizer.
# NOTE(review): newer scikit-learn releases replace get_feature_names() with
# get_feature_names_out() -- confirm against the installed version before
# upgrading.
vocab = vectorizer.get_feature_names()
#print vocab

### Train a Random Forest Model

In [6]:
# Hold out 30% of the labelled reviews for validation.  The split is seeded
# so the same rows land in each part on every run.
X_train, X_test, y_train, y_test = train_test_split(
    train_data_features,
    train['sentiment'],
    test_size=0.3,
    random_state=42,
)

# Seed the forest as well: with a seeded split but an unseeded model the
# reported accuracy would still change from run to run.
forest = RandomForestClassifier(n_estimators=100, random_state=42)
forest.fit(X_train, y_train)

pred = forest.predict(X_test)

# Single-argument print() works on Python 2 and Python 3; the format string
# keeps the spacing the original print statement produced.
print("accuracy =  {}".format(accuracy_score(y_test, pred)))

accuracy =  0.846666666667


### Try a Simple Logistic Regression

In [7]:
# Fit a logistic regression with default settings on the same training
# split and score it on the same held-out split as the random forest.
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

# Single-argument print() works on Python 2 and Python 3; the format string
# keeps the spacing the original print statement produced.
print("accuracy =  {}".format(accuracy_score(y_test, logreg.predict(X_test))))

accuracy =  0.8748


### Make a Prediction and Submission File

In [8]:
# Read in the test data (tab-separated, header row, quoting=3 is
# csv.QUOTE_NONE so quote characters are kept as ordinary text).
test = pd.read_csv("data/testData.tsv", header=0,
                   delimiter="\t", quoting=3)

# Clean the test reviews exactly like the training reviews, reporting
# progress every 5000 reviews.
clean_test_reviews = []
for count, rev in enumerate(test["review"], 1):
    clean_test_reviews.append(review_to_words(rev))
    if count % 5000 == 0:
        # Report only after the review has actually been cleaned.  A
        # parenthesised single-argument print behaves the same on
        # Python 2 and Python 3.
        print("{} reviews processed.".format(count))

5000 reviews processed.
10000 reviews processed.
15000 reviews processed.
20000 reviews processed.
25000 reviews processed.


In [9]:
# Encode the cleaned test reviews as a bag of words with the already
# fitted vectorizer (transform only), then convert the result to an
# array since arrays are easier to work with.
test_data_features = vectorizer.transform(clean_test_reviews).toarray()

In [10]:
# Predict the sentiment of every test review with each fitted model.
# The two predictions are independent of one another.

# Logistic regression predictions
lr_pred = logreg.predict(test_data_features)

# Random Forest predictions
rf_pred = forest.predict(test_data_features)

In [11]:
# Random Forest submission: pair each test id with its predicted sentiment
# and write the table to a csv file without the index column.
resultsRF = pd.DataFrame({"id": test["id"], "sentiment": rf_pred})
resultsRF.to_csv("data/BagOfWords_RandomForest.csv", index=False, quoting=3)

# Logistic regression submission, same layout.
resultsLR = pd.DataFrame({"id": test["id"], "sentiment": lr_pred})
resultsLR.to_csv("data/BagOfWords_LogReg.csv", index=False, quoting=3)